# RandomizedSearchCV

To select the best hyperparamter

# 1)-Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# For data processing and maths
import numpy as np
import pandas as pd
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# 2)- Loading Data

In [3]:
dataset = pd.read_csv('Advertising_data.csv')
dataset.head()


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [4]:
dataset.shape

(400, 5)

### 2.1)- Checking data type

In [5]:
def _tbl_dtype(dataset):
    sum_dtype = pd.DataFrame(dataset.dtypes).sort_values(0).rename(columns = {0:'Data Type'})
    return sum_dtype

table_dtype = _tbl_dtype(dataset)
table_dtype

Unnamed: 0,Data Type
User ID,int64
Purchased,int64
Age,float64
EstimatedSalary,float64
Gender,object


### 2.2)-Checking missing values

In [6]:
def find_missing_values(df, columns):
    """
    Finds number of rows where certain columns are missing values.
    ::param_df:: = target dataframe
    ::param_columns:: = list of columns
    """
    missing_vals = {}
    print("Number of missing or NaN values for each column:")
    df_length = len(df)
    for column in columns:
        total_column_values = df[column].value_counts().sum()
        missing_vals[column] = df_length-total_column_values
        #missing_vals.append(str(column)+ " column has {} missing or NaN values.".format())
    return missing_vals

In [7]:
missing_values = find_missing_values(dataset, columns=dataset.columns)
missing_values

Number of missing or NaN values for each column:


{'User ID': 0, 'Gender': 0, 'Age': 0, 'EstimatedSalary': 0, 'Purchased': 0}

### 2.3)-Dealing with categorical variables

In [8]:
# Getting Dummies from all other categorical vars
for col in dataset.dtypes[dataset.dtypes == 'object'].index:
    for_dummy = dataset.pop(col)
    dataset = pd.concat([dataset, pd.get_dummies(for_dummy, prefix=col)], axis=1)

In [9]:
dataset.shape

(400, 6)

In [10]:
dataset.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19.0,19000.0,0,0,1
1,15810944,35.0,20000.0,0,0,1
2,15668575,26.0,43000.0,0,1,0
3,15603246,27.0,57000.0,0,1,0
4,15804002,19.0,76000.0,0,0,1


Only categorical variable was Gender which is now encoded.

# 3)- Pre-processing for Model Building

### 3.1)- Splitting dataset into dependant and independant variables

In [11]:
X = dataset[['Age','EstimatedSalary','Gender_Female','Gender_Male']].values
y = dataset.pop('Purchased')

In [12]:
X[:3]

array([[1.9e+01, 1.9e+04, 0.0e+00, 1.0e+00],
       [3.5e+01, 2.0e+04, 0.0e+00, 1.0e+00],
       [2.6e+01, 4.3e+04, 1.0e+00, 0.0e+00]])

In [13]:
y[:3]

0    0
1    0
2    0
Name: Purchased, dtype: int64

### 3.2)-Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 2019)

### 3.3)-Feature Scaling

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 4)- Model

Random Forest

### 4.1)-Fitting Random Forest Classification to the Training set

In [16]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 50)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=50, verbose=0, warm_start=False)

### 4.2)-Predicting the Test set

In [17]:
y_pred = classifier.predict(X_test)

### 4.3)- Evaluation through Accuracy

In [18]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.86

### 4.4)- Making the Confusion Matrix

In [19]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[57,  8],
       [ 6, 29]], dtype=int64)

# 5)-Applying RandomizedSearchCV

- to find the best model and the best parameters

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [21]:
estimator = RandomForestClassifier(n_jobs=-1)
dist={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200,300,400,500],
              'max_features':randint(1,3),
               'criterion':['gini','entropy'],
               'bootstrap':[True,False],
               'min_samples_leaf':randint(1,4),
              }

In [22]:
rdmsearch = RandomizedSearchCV(estimator, param_distributions=dist,
                                  n_jobs=-1, n_iter=40, cv=10)
#CV = Cross-Validation ( here using Stratified KFold CV)

In [23]:
# fit model

rdmsearch.fit(X_train,y_train)



RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=40, n_jobs=-1,
          param_distributions={'max_depth': [3, 5, 10, None], 'n_estimators': [10, 100, 200, 300, 400, 500], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000027054470860>, 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000270544709B0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
       

In [24]:
rdmsearch.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 1,
 'min_samples_leaf': 3,
 'n_estimators': 100}

In [25]:
# Our accuarcy with RandomizedSearchCV
rdmsearch.best_score_

0.9266666666666666

In [26]:
rdmsearch.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=5, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# 6)-Re-Construct model

In [27]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=10,bootstrap= False,criterion='gini',max_depth=5,max_features=1,min_samples_leaf= 2,min_samples_split=2)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=5, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
# Predicting the Test set results
y_pred_new = rf.predict(X_test)

In [29]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_new)
cm

array([[57,  8],
       [ 6, 29]], dtype=int64)

In [30]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred_new)
accuracy

0.86

### 6.1)-Checking best features

In [31]:
dataset.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Gender_Female,Gender_Male
0,15624510,19.0,19000.0,0,1
1,15810944,35.0,20000.0,0,1
2,15668575,26.0,43000.0,1,0
3,15603246,27.0,57000.0,1,0
4,15804002,19.0,76000.0,0,1


In [32]:
independent_columns_names=dataset.copy()

In [33]:
independent_columns_names.shape

(400, 5)

In [34]:
independent_columns_names.drop(['User ID'], axis=1, inplace=True)

In [35]:
independent_columns_names.shape

(400, 4)

In [36]:
rdmsearch.best_estimator_.feature_importances_

array([0.55740135, 0.43847916, 0.00184605, 0.00227344])

In [37]:
pd.DataFrame({'feat': independent_columns_names,
              'coef': rdmsearch.best_estimator_.feature_importances_}).sort_values(by='coef', ascending=False)

Unnamed: 0,feat,coef
0,"(A, g, e)",0.557401
1,"(E, s, t, i, m, a, t, e, d, S, a, l, a, r, y)",0.438479
3,"(G, e, n, d, e, r, _, M, a, l, e)",0.002273
2,"(G, e, n, d, e, r, _, F, e, m, a, l, e)",0.001846


**If we drop Gender variable , we might get better results.**