In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.compose import ColumnTransformer
import xgboost as xg
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import datasets

# Set random seed
np.random.seed(0)

**K-FOLD CROSS VALIDATION**->For evaluating the model better.


Generally, linear algorithms have a high bias making them fast to learn and easier to understand but generally less flexible. In turn, they have lower predictive performance on complex problems that fail to meet the simplifying assumptions of the algorithms bias

    Low Bias: Suggests less assumptions about the form of the target function.
    High-Bias: Suggests more assumptions about the form of the target function.

Examples of low-bias machine learning algorithms include: Decision Trees, k-Nearest Neighbors and Support Vector Machines.

Examples of high-bias machine learning algorithms include: Linear Regression, Linear Discriminant Analysis and Logistic Regression.


    Linear machine learning algorithms often have a high bias but a low variance.
    Nonlinear machine learning algorithms often have a low bias but a high variance.

Goal->low bias and low var.

bias is inv prop to var

In [0]:
dataset=pd.read_csv('/content/drive/My Drive/Social_Network_Ads.csv')
X=dataset.iloc[:,[2,3]].values
y=dataset.iloc[:,-1].values

In [0]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [0]:
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.transform(X_test)

In [0]:
classifier3=SVC(kernel='rbf',random_state=0) # rbf gives better than linear
classifier3.fit(X_train,y_train)

In [0]:
y_pred=classifier3.predict(X_test)
accuracy_score(y_test,y_pred)

In [0]:
accuracies=cross_val_score(estimator=classifier3,X=X_train,y=y_train,cv=10) # if large daatset put n_jobs=-1 i.e to use all cpu's of system

In [0]:
accuracies.mean()  # relevant eval of model
accuracies.std()

In [0]:
# grid search
parameters=[{'C': [1,10,100,1000],'kernel':['linear']},
            {'C': [1,10,100,1000],'kernel':['rbf'],'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}]
grid_search=GridSearchCV(estimator=classifier3,param_grid=parameters,scoring='accuracy',cv=10,n_jobs=-1)
grid_search=grid_search.fit(X_train,y_train)

In [0]:
best=grid_search.best_score_
best_parameters=grid_search.best_params_
best_parameters

Performing grid search on models

In [0]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [0]:
# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create space of candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [LogisticRegression()],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]}]

In [0]:
# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)

In [0]:
best_model=clf.fit(X,y)

In [9]:
# View best model
best_model.best_estimator_.get_params()['classifier']
# Predict target vector
best_model.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

**XGBOOST**

Works best for large dataset.Feature scaling is unnecessary.
It has fast exec speed.It has high performance.

In [0]:
dataset=pd.read_csv('/content/drive/My Drive/Churn_Modelling.csv')
X=dataset.iloc[:,3:13].values
y=dataset.iloc[:,13].values

In [0]:
label_encoder_X1 = LabelEncoder()
X[:,1] = label_encoder_X1.fit_transform(X[:,1])
label_encoder_X2 = LabelEncoder()
X[:,2] = label_encoder_X2.fit_transform(X[:,2])
onehot_encoder = ColumnTransformer([('Geography', OneHotEncoder(), [1])], remainder='passthrough')
X = onehot_encoder.fit_transform(X)

In [0]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [0]:
# fitting xgboost
classifier=XGBClassifier()
classifier.fit(X_train,y_train)

In [0]:
y_pred=classifier.predict(X_test)

In [0]:
accuracies=cross_val_score(estimator=classifier,X=X_train,y=y_train,cv=10) # if large daatset put n_jobs=-1 i.e to use all cpu's of system
accuracies.mean()  # relevant eval of model
accuracies.std()

0.0100971840133772