In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset=pd.read_csv("Social_Network_Ads.csv")

In [3]:
data=pd.get_dummies(dataset,drop_first=True)

In [4]:
data.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [5]:
independent=data[['Age', 'EstimatedSalary','Gender_Male']]
dependent=data[['Purchased']]

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [7]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid={'criterion':['gini','entropy'],'max_features':['auto','sqrt','log2'], 'splitter':['best','random']}

grid=GridSearchCV(DecisionTreeClassifier(),param_grid,refit=True,verbose=3,n_jobs=-1,scoring='f1_weighted')

grid.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  26 out of  36 | elapsed:    1.7s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    1.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
       

In [10]:
re=grid.cv_results_

grid_pred=grid.predict(x_test)

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,grid_pred)

from sklearn.metrics import classification_report
clf_report=classification_report(y_test,grid_pred)

In [11]:
print(clf_report)


              precision    recall  f1-score   support

           0       0.92      0.91      0.92        79
           1       0.83      0.85      0.84        41

    accuracy                           0.89       120
   macro avg       0.88      0.88      0.88       120
weighted avg       0.89      0.89      0.89       120



In [12]:
print(cm)

[[72  7]
 [ 6 35]]


In [13]:
from sklearn.metrics import f1_score
f1_macro=f1_score(y_test,grid_pred,average='weighted')
print("The f1_macro value for best parameter{}:".format(grid.best_params_),f1_macro)

The f1_macro value for best parameter{'criterion': 'entropy', 'max_features': 'sqrt', 'splitter': 'best'}: 0.8919742664927225


In [14]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,grid.predict_proba(x_test)[:,1])

0.8825254708243284

In [15]:
table=pd.DataFrame.from_dict(re)

In [16]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005961,0.007044,0.001683,0.002381,gini,auto,best,"{'criterion': 'gini', 'max_features': 'auto', ...",0.811664,0.839189,0.844205,0.831615,0.01433,7
1,0.0,0.0,0.003019,0.002135,gini,auto,random,"{'criterion': 'gini', 'max_features': 'auto', ...",0.827213,0.79735,0.847226,0.823942,0.020458,11
2,0.000175,0.000248,0.0,0.0,gini,sqrt,best,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.827213,0.882677,0.836987,0.848881,0.024165,4
3,0.003276,0.003296,0.000774,0.001094,gini,sqrt,random,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.851929,0.79735,0.847226,0.832239,0.024679,6
4,0.010548,0.007463,0.0,0.0,gini,log2,best,"{'criterion': 'gini', 'max_features': 'log2', ...",0.772199,0.840013,0.881335,0.830972,0.045047,8
5,0.00517,0.007311,0.0,0.0,gini,log2,random,"{'criterion': 'gini', 'max_features': 'log2', ...",0.810528,0.828932,0.801769,0.813731,0.011301,12
6,0.0,0.0,0.0,0.0,entropy,auto,best,"{'criterion': 'entropy', 'max_features': 'auto...",0.828603,0.816608,0.847226,0.830805,0.012575,9
7,0.0,0.0,0.005212,0.007371,entropy,auto,random,"{'criterion': 'entropy', 'max_features': 'auto...",0.829787,0.851003,0.848436,0.843028,0.009471,5
8,0.006215,0.006774,0.005212,0.007371,entropy,sqrt,best,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.838689,0.860631,0.882072,0.860386,0.017727,1
9,0.001003,0.001418,0.0,0.0,entropy,sqrt,random,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.862123,0.819971,0.890876,0.857673,0.029068,2
