In [1]:
#import library
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#load the dataset
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
#changing categorical value to numerical value
dataset = pd.get_dummies(dataset, drop_first=True)
dataset=dataset.astype(int)
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [4]:
# to find the purchased not purchased count
dataset['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [5]:
dataset.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [6]:
independent = dataset[['User ID', 'Age', 'EstimatedSalary','Gender_Male']]
dependent = dataset[['Purchased']]

In [7]:
independent.shape

(400, 4)

In [8]:
dependent.shape

(400, 1)

In [9]:
#split model for training and testing
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(independent, dependent, test_size=1/3, random_state=0)

In [10]:
#Model Creation
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10,criterion='entropy',random_state = 0)
classifier.fit(X_train,Y_train)

  classifier.fit(X_train,Y_train)


In [11]:
# to test the data
test_Pred = classifier.predict(X_test)
test_Pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1])

In [12]:
#Evalution using confusion matrix
from sklearn.metrics import confusion_matrix
con_mat = confusion_matrix(Y_test,test_Pred)
con_mat

array([[79,  6],
       [ 6, 43]], dtype=int64)

In [13]:
#To create a classification report
from sklearn.metrics import classification_report
class_report = classification_report(Y_test,test_Pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        85
           1       0.88      0.88      0.88        49

    accuracy                           0.91       134
   macro avg       0.90      0.90      0.90       134
weighted avg       0.91      0.91      0.91       134



In [14]:
dataset.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [15]:
#to predict the user input
id_input=int(input("Enter the UserId: "))
age_input=int(input("Enter the age: "))
Salary= int(input("Enter the salary: "))
Gender = int(input("Enter the gender: "))

Enter the UserId: 36
Enter the age: 35
Enter the salary: 50000
Enter the gender: 0


In [16]:
classifier.predict([[id_input,age_input,Salary,Gender]])



array([0])

In [17]:
classifier.predict([[1,23,40000,1]])



array([0])

# Using Grid Search

In [18]:
#Model Creation
from sklearn.ensemble import RandomForestClassifier

In [19]:
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[10,50,100], 
              'criterion':['gini', 'entropy', 'log_loss'],# paramters are random forest from website
              'max_features':['sqrt', 'log2']}

grid = GridSearchCV(RandomForestClassifier(),parameters, refit = True, verbose =3, n_jobs=-1, scoring='f1') 
#refit true is for finding the best model,
#false will give last model
#https://scikit-learn.org/stable/modules/model_evaluation.html - for scoring 
grid.fit(X_train,Y_train) # can able to call directly without training and testing 

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [20]:
print(grid.best_params_)

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}


In [34]:
grid.cv_results_ 

{'mean_fit_time': array([0.10271626, 0.33360863, 0.6001421 , 0.06706972, 0.3002686 ,
        0.60383382, 0.06347504, 0.29787555, 0.59583406, 0.06470613,
        0.32215195, 0.59692054, 0.06606812, 0.30034642, 0.63072324,
        0.0750473 , 0.2758822 , 0.46095967]),
 'std_fit_time': array([0.0118188 , 0.01728265, 0.02860673, 0.00792529, 0.01520174,
        0.02206077, 0.00444233, 0.02715739, 0.01412245, 0.00839998,
        0.01777201, 0.01450563, 0.0047878 , 0.01232582, 0.02513508,
        0.00383816, 0.01453057, 0.01521427]),
 'mean_score_time': array([0.03057704, 0.04437418, 0.05694232, 0.02104921, 0.04651647,
        0.05379896, 0.02210898, 0.04865174, 0.0667912 , 0.024791  ,
        0.04134107, 0.05617442, 0.02534199, 0.04014711, 0.05707045,
        0.02786283, 0.03976421, 0.03096728]),
 'std_score_time': array([0.00708195, 0.00726281, 0.00721205, 0.00545948, 0.01260932,
        0.00951578, 0.00784817, 0.01307918, 0.01401362, 0.00838407,
        0.0135999 , 0.00733753, 0.00344413, 

In [21]:
classifier = grid.cv_results_ 
classifier
print('The value for best parameter: ',format(grid.best_params_))

The value for best parameter:  {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}


In [22]:
table = pd.DataFrame.from_dict(classifier)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.102716,0.011819,0.030577,0.007082,gini,sqrt,10,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.75,0.789474,0.780488,0.742857,0.888889,0.790342,0.052329,17
1,0.333609,0.017283,0.044374,0.007263,gini,sqrt,50,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.823529,0.842105,0.837209,0.9,0.918919,0.864353,0.037806,6
2,0.600142,0.028607,0.056942,0.007212,gini,sqrt,100,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.833333,0.789474,0.837209,0.9,0.918919,0.855787,0.047302,9
3,0.06707,0.007925,0.021049,0.005459,gini,log2,10,"{'criterion': 'gini', 'max_features': 'log2', ...",0.764706,0.820513,0.809524,0.9,0.848485,0.828645,0.044733,13
4,0.300269,0.015202,0.046516,0.012609,gini,log2,50,"{'criterion': 'gini', 'max_features': 'log2', ...",0.857143,0.85,0.837209,0.842105,0.944444,0.86618,0.039718,4
5,0.603834,0.022061,0.053799,0.009516,gini,log2,100,"{'criterion': 'gini', 'max_features': 'log2', ...",0.833333,0.842105,0.837209,0.9,0.918919,0.866313,0.035841,3
6,0.063475,0.004442,0.022109,0.007848,entropy,sqrt,10,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.6875,0.705882,0.780488,0.842105,0.857143,0.774624,0.068867,18
7,0.297876,0.027157,0.048652,0.013079,entropy,sqrt,50,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.857143,0.777778,0.837209,0.9,0.888889,0.852204,0.043402,10
8,0.595834,0.014122,0.066791,0.014014,entropy,sqrt,100,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.823529,0.820513,0.837209,0.926829,0.944444,0.870505,0.053766,1
9,0.064706,0.0084,0.024791,0.008384,entropy,log2,10,"{'criterion': 'entropy', 'max_features': 'log2...",0.787879,0.742857,0.8,0.810811,0.944444,0.817198,0.067697,16


In [23]:
grid_pred = grid.predict(X_test)
grid_pred


array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1])

In [24]:
#Evalution using confusion matrix
from sklearn.metrics import confusion_matrix
con_mat = confusion_matrix(Y_test,grid_pred)
con_mat

array([[78,  7],
       [ 4, 45]], dtype=int64)

In [25]:
#To create a classification report
from sklearn.metrics import classification_report
class_report = classification_report(Y_test,grid_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.95      0.92      0.93        85
           1       0.87      0.92      0.89        49

    accuracy                           0.92       134
   macro avg       0.91      0.92      0.91       134
weighted avg       0.92      0.92      0.92       134



In [26]:
from sklearn.metrics import f1_score
f1_macro=f1_score(Y_test,grid_pred,average='weighted')
print("The f1_macro value for best parameter {}:".format(grid.best_params_),f1_macro)

The f1_macro value for best parameter {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 100}: 0.9183922682195829


In [27]:
from sklearn.metrics import roc_auc_score #Receiver Operating Characteristic Curve, Area Under the curve
roc_auc_score(Y_test,grid.predict_proba(X_test)[:,1]) # to get a greater probability value, [:,1][R,C]

0.966266506602641

In [28]:
grid.predict_proba(X_test)[:,1]

array([0.06, 0.01, 0.02, 0.03, 0.01, 0.  , 0.01, 0.92, 0.  , 0.6 , 0.  ,
       0.  , 0.07, 0.31, 0.  , 0.45, 0.19, 0.01, 0.86, 0.  , 0.01, 0.88,
       0.  , 1.  , 0.03, 0.88, 0.05, 0.01, 0.  , 0.1 , 0.12, 0.13, 0.61,
       0.01, 0.  , 0.  , 0.  , 0.01, 0.02, 0.93, 0.05, 0.  , 0.03, 0.  ,
       0.69, 0.01, 0.13, 0.93, 0.  , 0.78, 0.95, 0.03, 0.12, 0.8 , 0.87,
       0.89, 0.  , 0.14, 0.68, 0.02, 0.01, 0.48, 0.  , 0.91, 0.03, 0.91,
       0.08, 0.  , 0.01, 0.23, 0.97, 0.05, 0.01, 0.77, 0.03, 0.  , 0.04,
       0.02, 1.  , 0.65, 0.85, 0.85, 0.  , 0.02, 0.96, 0.07, 0.17, 0.85,
       0.56, 0.  , 0.01, 0.92, 0.01, 0.02, 0.  , 0.61, 0.02, 0.65, 0.8 ,
       0.87, 0.56, 0.94, 0.  , 0.83, 0.94, 0.95, 0.  , 0.  , 0.01, 0.  ,
       0.  , 0.  , 0.02, 0.66, 0.71, 0.57, 0.57, 0.07, 0.53, 0.61, 0.  ,
       0.94, 0.73, 0.68, 0.03, 0.  , 0.53, 0.6 , 0.63, 0.78, 0.  , 0.67,
       0.  , 0.91])

In [29]:
dataset.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [30]:
id_input = int(input("UserId: "))
age_input = int(input("Age: "))
Salary_input = int(input("Salary: "))
Gender_input = int(input("Gender: "))

UserId: 1
Age: 60
Salary: 89000
Gender: 1


In [33]:
Future_pred = grid.predict([[id_input,age_input,Salary_input,Gender_input]])
print("Future Prediction ={}".format(Future_pred))

Future Prediction =[1]




In [35]:
# Saving the model
import pickle
filename= "RandomForest_SN.sav"

In [40]:
pickle.dump(grid,open(filename,'wb'))

In [41]:
load_model=pickle.load(open("RandomForest_SN.sav",'rb'))

In [42]:
result=load_model.predict([[12,67,987000,0]])
result




array([1])