# Import Required Library

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data Set

In [3]:
data=pd.read_csv('suv_data.xls')
data.tail()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0
399,15594041,Female,49,36000,1


# Clean Data

In [21]:
data.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [22]:
data.shape

(400, 5)

In [23]:
data.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

# Create X and y

In [4]:
X=data[['Age','EstimatedSalary']]
y=data['Purchased']

# Train The Model

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=91)

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
model= RandomForestClassifier()
model.fit(X_train,y_train)

RandomForestClassifier()

# Check Score and Confusion Matrix

In [9]:
# Score
model.score(X_test, y_test)

0.86

In [11]:
y_pred = model.predict(X_test)

In [12]:
y_pred

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [13]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# confusion matrix
matrix = confusion_matrix(y_test,y_pred)
print('Confusion matrix : \n',matrix)

# outcome values order in sklearn
tp, fn, fp, tn = confusion_matrix(y_test,y_pred).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,y_pred)
print('Classification report : \n',matrix)

Confusion matrix : 
 [[56  9]
 [ 5 30]]
Outcome values : 
 56 9 5 30
Classification report : 
               precision    recall  f1-score   support

           0       0.92      0.86      0.89        65
           1       0.77      0.86      0.81        35

    accuracy                           0.86       100
   macro avg       0.84      0.86      0.85       100
weighted avg       0.87      0.86      0.86       100



# Hyperparameter Tunning

In [18]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [20]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.4min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [24]:
rf_randomcv.best_params_

{'n_estimators': 600,
 'min_samples_split': 14,
 'min_samples_leaf': 8,
 'max_features': 'auto',
 'max_depth': 890,
 'criterion': 'gini'}

In [25]:
rf_randomcv.best_estimator_

RandomForestClassifier(max_depth=890, min_samples_leaf=8, min_samples_split=14,
                       n_estimators=600)

# Fit Model Using New Parameters

In [30]:
RF_model = RandomForestClassifier(n_estimators = 600,min_samples_split = 14,
 min_samples_leaf = 8,max_features = 'auto',max_depth = 890,criterion ='gini')

RF_model.fit(X_train,y_train)

RandomForestClassifier(max_depth=890, min_samples_leaf=8, min_samples_split=14,
                       n_estimators=600)

# Check Score and Confusion Matrix

In [31]:
RF_model.score(X_test, y_test)

0.88

In [32]:
# confusion matrix
matrix = confusion_matrix(y_test,y_pred)
print('Confusion matrix : \n',matrix)

# outcome values order in sklearn
tp, fn, fp, tn = confusion_matrix(y_test,y_pred).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,y_pred)
print('Classification report : \n',matrix)

Confusion matrix : 
 [[56  9]
 [ 5 30]]
Outcome values : 
 56 9 5 30
Classification report : 
               precision    recall  f1-score   support

           0       0.92      0.86      0.89        65
           1       0.77      0.86      0.81        35

    accuracy                           0.86       100
   macro avg       0.84      0.86      0.85       100
weighted avg       0.87      0.86      0.86       100



# Save/Dump Model

In [33]:
import pickle
filename = 'finalized_RF_model'
pickle.dump(RF_model, open(filename, 'wb'))