# GridSearchCV 

Using grid search with cross-validation to select the best hyperparameters.

# 1)-Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# Suppress every warning for the rest of the session so cell outputs stay short.
# NOTE(review): this also hides library deprecation warnings (the SVC repr further down
# shows gamma='auto_deprecated'); consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# For data processing and maths
import numpy as np
import pandas as pd
# For visuals
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to matplotlib figures
sns.set()
from matplotlib import rcParams
# Default figure size as (width, height)
rcParams['figure.figsize'] = 11, 8
# Render inline figures as SVG and show them directly below the cell that creates them
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# 2)- Loading Data

In [3]:
# Read the advertising data from a CSV file located in the working directory
dataset = pd.read_csv('Advertising_data.csv')
# Preview the first five rows
dataset.head()


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [4]:
# (rows, columns) of the raw data
dataset.shape

(400, 5)

### 2.1)- Checking data type

In [5]:
def _tbl_dtype(dataset):
    sum_dtype = pd.DataFrame(dataset.dtypes).sort_values(0).rename(columns = {0:'Data Type'})
    return sum_dtype

# Build the dtype summary for the loaded data and display it
table_dtype = _tbl_dtype(dataset)
table_dtype

Unnamed: 0,Data Type
User ID,int64
Purchased,int64
Age,float64
EstimatedSalary,float64
Gender,object


### 2.2)-Checking missing values

In [6]:
def find_missing_values(df, columns=None):
    """
    Counts the missing (NaN/None) entries in each requested column.
    ::param_df:: = target dataframe
    ::param_columns:: = list of columns to inspect; defaults to every column of df
    Returns a dict mapping each column name to its number of missing values
    (plain Python ints, so the dict prints cleanly).
    """
    if columns is None:
        columns = df.columns
    print("Number of missing or NaN values for each column:")
    # isna().sum() counts the missing entries directly; no need to build a
    # value_counts() table just to subtract its total from the frame length.
    return {column: int(df[column].isna().sum()) for column in columns}

In [7]:
# Count missing entries for every column of the data set and display the result
missing_values = find_missing_values(dataset, columns=dataset.columns)
missing_values

Number of missing or NaN values for each column:


{'User ID': 0, 'Gender': 0, 'Age': 0, 'EstimatedSalary': 0, 'Purchased': 0}

### 2.3)-Dealing with categorical variables

In [8]:
# Getting Dummies from all other categorical vars
for col in dataset.dtypes[dataset.dtypes == 'object'].index:
    for_dummy = dataset.pop(col)
    dataset = pd.concat([dataset, pd.get_dummies(for_dummy, prefix=col)], axis=1)

In [9]:
# (rows, columns) after one-hot encoding
dataset.shape

(400, 6)

In [10]:
# Preview the encoded frame
dataset.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19.0,19000.0,0,0,1
1,15810944,35.0,20000.0,0,0,1
2,15668575,26.0,43000.0,0,1,0
3,15603246,27.0,57000.0,0,1,0
4,15804002,19.0,76000.0,0,0,1


The only categorical variable was Gender, which is now one-hot encoded as `Gender_Female` and `Gender_Male`.

# 3)- Pre-processing for Model Building

### 3.1)- Splitting dataset into dependent and independent variables

In [11]:
# Feature matrix as a NumPy array: age, salary and the two one-hot gender columns.
# 'User ID' is not selected.
X = dataset[['Age','EstimatedSalary','Gender_Female','Gender_Male']].values
# Read the target by label instead of pop(): pop() deletes the column from `dataset`,
# so running this cell a second time would fail with a KeyError.
y = dataset['Purchased']

In [12]:
# First three rows of the feature matrix
X[:3]

array([[1.9e+01, 1.9e+04, 0.0e+00, 1.0e+00],
       [3.5e+01, 2.0e+04, 0.0e+00, 1.0e+00],
       [2.6e+01, 4.3e+04, 1.0e+00, 0.0e+00]])

In [13]:
# First three target labels
y[:3]

0    0
1    0
2    0
Name: Purchased, dtype: int64

### 3.2)-Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2019,
)

### 3.3)-Feature Scaling

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits so the test set does not influence the scaler.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# 4)- Model

Support Vector Machine

### 4.1)-Fitting a linear-kernel SVM to the Training set

In [16]:
from sklearn.svm import SVC

# Baseline model: linear-kernel SVM with every other setting left at its default.
# fit() returns the estimator itself, so the name is bound to the fitted model.
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
classifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

### 4.2)-Predicting the Test set

In [17]:
# Predicted labels of the baseline linear SVM on the scaled test set
y_pred = classifier.predict(X_test)

### 4.3)- Evaluation through Accuracy

In [18]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.82

### 4.4)- Making the Confusion Matrix

In [19]:
from sklearn.metrics import confusion_matrix
# scikit-learn convention: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred)
cm

array([[60,  5],
       [13, 22]], dtype=int64)

# 5)-Applying Grid Search

- to find the best model and the best parameters

In [20]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned over C and gamma.
parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
]

# Exhaustive search scored by accuracy with 10-fold cross-validation on the training
# set, using all available CPU cores (n_jobs=-1).
# NOTE(review): X_train was standardised before this search, so the scaler has already
# seen the rows each fold later validates on; wrapping StandardScaler and SVC in a
# Pipeline would keep the scaling inside each fold.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)




In [21]:
# Mean cross-validated accuracy of the best parameter combination. It is computed on
# folds of the training set, not on the held-out test set.
accuracy = grid_search.best_score_
accuracy

0.9133333333333333

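The best cross-validated accuracy on the training set is 0.913, compared with a test-set accuracy of 0.82 for the untuned linear SVM. These two figures are measured on different data, so the actual improvement still has to be confirmed on the test set below.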

In [22]:
# Parameter combination that achieved the best cross-validated score
grid_search.best_params_

{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}

In [23]:
# Estimator configured with the best parameter combination found by the search
grid_search.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

**Re-Construct model**

In [24]:
# Rebuild the tuned model from the search result instead of re-typing the winning
# values by hand, so this cell cannot drift from what GridSearchCV actually selected.
classifier = SVC(**grid_search.best_params_)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# Predicting the Test set results
y_pred_new = classifier.predict(X_test)

In [26]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_new)
cm

array([[57,  8],
       [ 4, 31]], dtype=int64)

In [27]:
# Test-set accuracy of the tuned model
# (accuracy_score was already imported in an earlier cell; this import repeats it)
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred_new)
accuracy

0.88

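**This is the improved test-set accuracy of the tuned model (0.88, up from 0.82). Note that it is not as high as the 0.913 we anticipated from GridSearchCV: that figure is a cross-validation average over the training data, not a score on the held-out test set. Further analysis could be made on each feature/variable. It is possible that some features are not needed, and removing them might increase accuracy further.**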