In [1]:
# Process Name : Data Collection & Data Preprocessing.pandas is a python library which is used for data handling and transformation
import pandas as pd 
data=pd.read_csv("Social_Network_Ads.csv") #read_csv is a fucntion inside pandas which will read/load the comma separated data.

In [2]:
data=data.drop("User ID",axis=1)
data=pd.get_dummies(data,drop_first=True)
data

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,True
1,35,20000,0,True
2,26,43000,0,False
3,27,57000,0,False
4,19,76000,0,True
...,...,...,...,...
395,46,41000,1,False
396,51,23000,1,True
397,50,20000,1,False
398,36,33000,0,True


In [3]:
value_counts=data['Purchased'].value_counts()
print(value_counts)
if value_counts.nunique() == 1:
    print("The dataset is balanced.")
else:
    print("The dataset is imbalanced.")


Purchased
0    257
1    143
Name: count, dtype: int64
The dataset is imbalanced.


In [4]:
#Process Name : Input Output Split
independent=data[['Gender_Male', 'Age', 'EstimatedSalary']] #Assigning the dataset column to independent which is input
dependent=data[['Purchased']]#Assigning the dataset column to dependent which is output/target

In [5]:
#Process Name : Train Test Split
from sklearn.model_selection import train_test_split #sklearn is a library where all ML algorithms are present.Here we are importing function called train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(independent, dependent, test_size=0.3, random_state=0) #Splitting the dataset in to test and train.test set being 30%(0.3)

Random Forest Grid Search Model

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {  
               'n_estimators':[10,100],
               'criterion':['gini', 'entropy', 'log_loss'],
               'min_samples_split':[2,5,10],
               'max_depth':[3,5,10]
             } 
Model = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted')
Model.fit(X_train,Y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


  return fit_method(estimator, *args, **kwargs)


In [7]:
#Process Name : Prediction
Y_predicted=Model.predict(X_test)
Y_predicted

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [8]:
#Process Name : Evaluation metrics
from sklearn.metrics import confusion_matrix  #confusion_matrix fucntion in sklearn is used to evaluate model
cm=confusion_matrix(Y_test,Y_predicted)
print(cm)

from sklearn.metrics import classification_report  #confusion_matrix fucntion in sklearn is used to evaluate model
clf_report=classification_report(Y_test,Y_predicted)
print(clf_report)

from sklearn.metrics import roc_auc_score
roc_curve=roc_auc_score(Y_test,Model.predict_proba(X_test)[:,1])
print("The roc curve of the model for best parameter {}:".format(Model.best_params_),roc_curve)

from sklearn.metrics import accuracy_score
accuracy=accuracy_score(Y_test,Y_predicted)
print("The accuracy of the model for best parameter {}:".format(Model.best_params_),accuracy)

[[72  7]
 [ 4 37]]
              precision    recall  f1-score   support

           0       0.95      0.91      0.93        79
           1       0.84      0.90      0.87        41

    accuracy                           0.91       120
   macro avg       0.89      0.91      0.90       120
weighted avg       0.91      0.91      0.91       120

The roc curve of the model for best parameter {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}: 0.9660389008953381
The accuracy of the model for best parameter {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}: 0.9083333333333333


In [9]:
re=Model.cv_results_
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.372074,0.031384,0.136476,0.001043,gini,3,2,10,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.795918,0.858503,0.87728,0.929144,0.945469,0.881263,0.053353,48
1,2.470115,0.172262,0.158685,0.06777,gini,3,2,100,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.892857,0.858503,0.859435,0.947015,1.0,0.911562,0.0547,1
2,0.34787,0.04912,0.146157,0.017956,gini,3,5,10,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.855314,0.858503,0.876643,0.929144,0.982221,0.900365,0.048731,25
3,1.284499,0.315226,0.105534,0.102954,gini,3,5,100,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.892857,0.875644,0.859435,0.929144,0.982221,0.90786,0.043796,7
4,0.036147,0.00668,0.014909,0.001637,gini,3,10,10,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.892857,0.821429,0.841398,0.928571,0.982221,0.893295,0.058328,42
5,2.056563,0.173589,0.200606,0.017228,gini,3,10,100,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.892857,0.858503,0.87728,0.929144,0.982221,0.908001,0.043769,3
6,0.282314,0.106483,0.158099,0.0331,gini,5,2,10,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.855314,0.838326,0.859435,0.910254,0.946153,0.881896,0.040106,47
7,2.706469,0.106498,0.226355,0.031678,gini,5,2,100,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.874254,0.858503,0.841398,0.929144,0.964286,0.893517,0.046024,39
8,0.323798,0.036867,0.122879,0.017267,gini,5,5,10,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.874254,0.840114,0.859435,0.947015,0.946153,0.893394,0.044759,40
9,1.808389,0.246363,0.150709,0.136519,gini,5,5,100,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.874254,0.858503,0.859435,0.929144,0.982051,0.900677,0.048187,22


In [10]:
#Process Name : Saving the model
import pickle 
filename="random_forest_classifier_grid_model.sav"  #model is saved in the filename
pickle.dump(Model,open(filename,'wb')) #assigning the model to this file given as write binary


Get User Input and Run prediction

In [11]:
age_user=float(input("Age:"))
estimated_salary_user=float(input("Salary:"))
gender_male_user=int(input("Sex Male 0 or 1:"))

User_Prediction=Model.predict([[age_user,estimated_salary_user,gender_male_user]])# change the paramter,play with it.
print("User_Prediction={}".format(User_Prediction))
if (User_Prediction==1):
    print("The user will purchase")
else:
    print("The user will not purchase")
 

Age: 56
Salary: 100000
Sex Male 0 or 1: 1


User_Prediction=[1]
The user will purchase


