In [24]:
# Process Name : Data Collection & Data Preprocessing.pandas is a python library which is used for data handling and transformation
import pandas as pd 
data=pd.read_csv("Social_Network_Ads.csv") #read_csv is a fucntion inside pandas which will read/load the comma separated data.

In [25]:
data=data.drop("User ID",axis=1)
data=pd.get_dummies(data,drop_first=True)
data

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,True
1,35,20000,0,True
2,26,43000,0,False
3,27,57000,0,False
4,19,76000,0,True
...,...,...,...,...
395,46,41000,1,False
396,51,23000,1,True
397,50,20000,1,False
398,36,33000,0,True


In [26]:
value_counts=data['Purchased'].value_counts()
print(value_counts)
if value_counts.nunique() == 1:
    print("The dataset is balanced.")
else:
    print("The dataset is imbalanced.")


Purchased
0    257
1    143
Name: count, dtype: int64
The dataset is imbalanced.


In [27]:
#Process Name : Input Output Split
independent=data[['Gender_Male', 'Age', 'EstimatedSalary']] #Assigning the dataset column to independent which is input
dependent=data[['Purchased']]#Assigning the dataset column to dependent which is output/target

In [28]:
#Process Name : Train Test Split
from sklearn.model_selection import train_test_split #sklearn is a library where all ML algorithms are present.Here we are importing function called train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(independent, dependent, test_size=0.3, random_state=0) #Splitting the dataset in to test and train.test set being 30%(0.3)

Decision Tree Grid Search Model

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
               'criterion':['gini', 'entropy', 'log_loss'],
               'splitter':['best','random'],
               'min_samples_split':[2,5,10],
               'max_depth':[3,5,10]
             } 
Model = GridSearchCV(DecisionTreeClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted')
Model.fit(X_train,Y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [30]:
#Process Name : Prediction
Y_predicted=Model.predict(X_test)
Y_predicted

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [31]:
#Process Name : Evaluation metrics
from sklearn.metrics import confusion_matrix  #confusion_matrix fucntion in sklearn is used to evaluate model
cm=confusion_matrix(Y_test,Y_predicted)
print(cm)

from sklearn.metrics import classification_report  #confusion_matrix fucntion in sklearn is used to evaluate model
clf_report=classification_report(Y_test,Y_predicted)
print(clf_report)

from sklearn.metrics import roc_auc_score
roc_curve=roc_auc_score(Y_test,Model.predict_proba(X_test)[:,1])
print("The roc curve of the model for best parameter {}:".format(Model.best_params_),roc_curve)

from sklearn.metrics import accuracy_score
accuracy=accuracy_score(Y_test,Y_predicted)
print("The accuracy of the model for best parameter {}:".format(Model.best_params_),accuracy)

[[72  7]
 [ 3 38]]
              precision    recall  f1-score   support

           0       0.96      0.91      0.94        79
           1       0.84      0.93      0.88        41

    accuracy                           0.92       120
   macro avg       0.90      0.92      0.91       120
weighted avg       0.92      0.92      0.92       120

The roc curve of the model for best parameter {'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'best'}: 0.9546156221055881
The accuracy of the model for best parameter {'criterion': 'entropy', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'best'}: 0.9166666666666666


In [32]:
re=Model.cv_results_
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015546,0.003443,0.029689,0.004695,gini,3,2,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.838326,0.858503,0.821429,0.947015,0.982221,0.889499,0.063433,9
1,0.01299,0.002306,0.024666,0.003244,gini,3,2,random,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.802555,0.795918,0.804584,0.927778,0.480769,0.762321,0.149118,54
2,0.007432,0.001162,0.017348,0.002065,gini,3,5,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.838326,0.858503,0.821429,0.947015,0.982221,0.889499,0.063433,9
3,0.007358,0.001406,0.016304,0.001081,gini,3,5,random,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.747056,0.600649,0.802399,0.84161,0.964286,0.7912,0.119066,52
4,0.005897,0.000738,0.014048,0.000788,gini,3,10,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.838326,0.858503,0.87728,0.947015,0.982221,0.900669,0.054791,7
5,0.007625,0.001136,0.015296,0.002042,gini,3,10,random,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.746799,0.73125,0.858503,0.830519,0.824219,0.798258,0.049965,51
6,0.007758,0.001777,0.013121,0.001258,gini,5,2,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.819142,0.840114,0.821429,0.892857,0.909115,0.856531,0.037374,32
7,0.006957,0.001546,0.012328,0.001805,gini,5,2,random,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.855314,0.802399,0.747056,0.805644,0.859025,0.813888,0.041036,49
8,0.005525,0.000791,0.013374,0.001456,gini,5,5,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.819142,0.840114,0.821429,0.911105,0.909115,0.860181,0.041416,29
9,0.005647,0.000814,0.014691,0.001068,gini,5,5,random,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.910181,0.802399,0.723577,0.870721,0.929513,0.847278,0.075611,38


In [33]:
#Process Name : Saving the model
import pickle 
filename="decision_tree_classifier_grid_model.sav"  #model is saved in the filename
pickle.dump(Model,open(filename,'wb')) #assigning the model to this file given as write binary


Get User Input and Run prediction

In [34]:
age_user=float(input("Age:"))
estimated_salary_user=float(input("Salary:"))
gender_male_user=int(input("Sex Male 0 or 1:"))

User_Prediction=Model.predict([[age_user,estimated_salary_user,gender_male_user]])# change the paramter,play with it.
print("User_Prediction={}".format(User_Prediction))
if (User_Prediction==1):
    print("The user will purchase")
else:
    print("The user will not purchase")
 

Age: 30
Salary: 80000
Sex Male 0 or 1: 0


User_Prediction=[1]
The user will purchase


