In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset=pd.read_csv("Social_Network_Ads.csv")

In [3]:
data=pd.get_dummies(dataset,drop_first=True)

In [4]:
data.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [5]:
independent=data[['Age', 'EstimatedSalary','Gender_Male']]
dependent=data[['Purchased']]

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [7]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid={'criterion':['gini','entropy'],'max_features':['auto','sqrt','log2'], 'n_estimators':[10,100]}

grid=GridSearchCV(RandomForestClassifier(),param_grid,refit=True,verbose=3,n_jobs=-1,scoring='f1')

grid.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  26 out of  36 | elapsed:    1.7s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    1.8s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [10]:
re=grid.cv_results_

grid_pred=grid.predict(x_test)

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,grid_pred)

from sklearn.metrics import classification_report
clf_report=classification_report(y_test,grid_pred)

In [11]:
print(clf_report)


              precision    recall  f1-score   support

           0       0.94      0.94      0.94        79
           1       0.88      0.88      0.88        41

    accuracy                           0.92       120
   macro avg       0.91      0.91      0.91       120
weighted avg       0.92      0.92      0.92       120



In [12]:
print(cm)

[[74  5]
 [ 5 36]]


In [13]:
from sklearn.metrics import f1_score
f1_macro=f1_score(y_test,grid_pred,average='weighted')
print("The f1_macro value for best parameter{}:".format(grid.best_params_),f1_macro)

The f1_macro value for best parameter{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}: 0.9166666666666666


In [14]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,grid.predict_proba(x_test)[:,1])

0.9644952145723988

In [15]:
table=pd.DataFrame.from_dict(re)

In [16]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.020454,0.007968,0.002671,0.003778,gini,auto,10,"{'criterion': 'gini', 'max_features': 'auto', ...",0.776119,0.828571,0.865672,0.823285,0.036781,9
1,0.077933,0.001402,0.011641,0.006105,gini,auto,100,"{'criterion': 'gini', 'max_features': 'auto', ...",0.811594,0.833333,0.911765,0.852086,0.043014,4
2,0.010528,0.007674,0.005233,0.0074,gini,sqrt,10,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.776119,0.821918,0.84375,0.813794,0.028222,10
3,0.047633,0.00011,0.004388,0.000267,gini,sqrt,100,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.828571,0.849315,0.895522,0.857699,0.027988,1
4,0.015111,0.000747,0.000669,0.000946,gini,log2,10,"{'criterion': 'gini', 'max_features': 'log2', ...",0.761905,0.8,0.825397,0.795646,0.026124,12
5,0.076003,0.009727,0.008017,0.006549,gini,log2,100,"{'criterion': 'gini', 'max_features': 'log2', ...",0.794118,0.833333,0.878788,0.835265,0.034624,6
6,0.016586,0.000889,0.0,0.0,entropy,auto,10,"{'criterion': 'entropy', 'max_features': 'auto...",0.828571,0.816901,0.857143,0.834185,0.016878,7
7,0.066775,0.006924,0.010418,0.007367,entropy,auto,100,"{'criterion': 'entropy', 'max_features': 'auto...",0.794118,0.833333,0.895522,0.840824,0.04177,5
8,0.006213,0.006772,0.0,0.0,entropy,sqrt,10,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.818182,0.828571,0.84375,0.830125,0.010505,8
9,0.06302,0.000166,0.011375,0.008086,entropy,sqrt,100,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.828571,0.833333,0.895522,0.85239,0.030479,3
