# Improving the model performance through K-fold cross validation and Grid Search

- Predicting whether customers will Purchase SUV or not 

## 1. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Importing the dataset 

In [2]:
dataset = pd.read_csv(r"F:\Torrent Downloads\Machine Learning A-Z™ Hands-On Python & R In Data Science\[Tutsgalaxy.com] - Machine Learning A-Z™ Hands-On Python & R In Data Science\12. Logistic Regression\Data\Logistic_Regression\Social_Network_Ads.csv")

In [3]:
dataset.head(10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
5,15728773,Male,27,58000,0
6,15598044,Female,27,84000,0
7,15694829,Female,32,150000,1
8,15600575,Male,25,33000,0
9,15727311,Female,35,65000,0


In [4]:
X = dataset.iloc[:, [2,3]].values
y = dataset.iloc[:, 4].values

In [5]:
print("y Shape:", y.shape)
print("X Shape:", X.shape)

y Shape: (400,)
X Shape: (400, 2)


## 3. Splitting data into Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## 4. Feature Scaling


In [7]:
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler()
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

## 5. Fitting kernel SVC  classifier to the Training set

In [8]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state= 0)
classifier.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

## 6. Predicting the Test Set results

In [9]:
y_pred = classifier.predict(X_test)

## 7. Creating the confusion matrix 

-- showing correct and incorrect predictions made by the model

In [10]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[64,  4],
       [ 3, 29]], dtype=int64)

In [11]:
## predicting accuracy = (65+24)/(65+24+3+8)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.93

## 8. K-Fold Cross Validation 

In [12]:
from sklearn.model_selection import cross_val_score

## here, we are considering k = 10 
accuracies = cross_val_score(estimator= classifier, X = X_train, y = y_train, cv = 10, n_jobs= -1)
accuracies

array([0.80645161, 0.96666667, 0.8       , 0.93333333, 0.86666667,
       0.83333333, 0.93333333, 0.93333333, 0.96666667, 0.96551724])

Here, we can see 10 accuracies, which are obtained through corss validation

In [13]:
## optimum accuracy, i.e. mean of all the accuracies
accuracies.mean()

0.9005302187615868

In [14]:
## standard deviation between accuracies
accuracies.std()

0.06388957356626285

Here, we can see that standard deviation is not too large and mean accuracy is also quite good.

## 9. Grid Search
    - with the help of Grid Search, we can find improve the performance of the model because Grid Search helps in finding optimal hyperparameters for the model.

In [15]:

from sklearn. model_selection import GridSearchCV
parameters = [{'c': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'c': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]}]


In [16]:
grid_search = GridSearchCV(estimator= classifier,
                          param_grid= parameters,
                          scoring= 'accuracy',
                          cv = 10,
                          n_jobs= -1)

## Now we can see the attributes 

    - cv_results_ 
    - best_estimator_
    - best_score_
    - best_params_
    




In [20]:
sorted(grid_search.cv_results_.keys())


AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

## 10. Visualizing the training results

In [None]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('kernel SVC (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

## 11. Visualizing the test set

In [None]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('kernel SVC (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()