## Random Forest

In [3]:
#Reading data
import pandas as pd
train = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/train.csv', index_col = 0)
X = train.loc[:, train.columns != 'CARAVAN']
y = train['CARAVAN'].astype('category')

### Random Search CV

In [8]:
import numpy as np
from pprint import pprint

#For maximum depth 
d_max = [int(a) for a in np.linspace(10, 110, num = 11)]
d_max.append(None)

#Making a random grid for implementing random search CV
rg = {
    'n_estimators' : [int(b) for b in np.linspace(start = 10, stop = 510, num = 50)],
    'max_features' : ['auto', 'sqrt'],
    'max_depth' : max_depth,
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4],
    'bootstrap' : [True, False]
}

In [10]:
pprint(rg)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [10,
                  36,
                  62,
                  88,
                  115,
                  141,
                  167,
                  194,
                  220,
                  246,
                  273,
                  299,
                  325,
                  352,
                  378,
                  404,
                  431,
                  457,
                  483,
                  510]}


In [35]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state = 79)

rf_model = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_model_randomcv = RandomizedSearchCV(estimator = rf_model, 
                                       param_distributions = rg, #passing the random grid
                                       n_iter = 100, #100 iterations
                                       cv = 3, #3 cross validations
                                       verbose=2, 
                                       random_state= 79, 
                                       n_jobs = -1) 
# Fit the random search model
rf_model_randomcv.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.8min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 36, 62, 88, 115,
                                                         141, 167, 194, 220,
                                                         246, 273, 299, 325,
                                                         352, 378, 404, 431,
                                                         457, 483, 510]},
             

In [36]:
#Printing the best parameters obtained
rf_model_randomcv.best_params_

{'n_estimators': 62,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [37]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

#Predicting on train data using the best parameters obtained from random search CV
model = RandomForestClassifier(n_estimators= 483, min_samples_split= 2, min_samples_leaf= 4, 
                              max_features= "sqrt", max_depth= 10, bootstrap= True)
model.fit(X, y)
y_pred = model.predict(X)

In [40]:
metrics.f1_score(y, y_pred)
print(confusion_matrix(y, y_pred))
#Worse than baseline

[[5474    0]
 [ 347    1]]


In [5]:
#Reading test data
test = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/test.csv', index_col = 0)
X_test = test.loc[:, test.columns != 'CARAVAN']
y_test = test['CARAVAN'].astype('category')

### Grid Search CV

In [45]:
from sklearn.model_selection import GridSearchCV
#Grid for CV using base accuracy score
g = {'n_estimators' : [int(x) for x in np.linspace(start = 10, stop = 510, num = 20)],
    'max_features' : ['auto', 'sqrt'],
    'max_depth' : max_depth,
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4],
    'bootstrap' : [True, False]
}

model = RandomForestClassifier()
# Instantiate the grid search model
gscv_model = GridSearchCV(estimator = model, param_grid = g, 
                    cv = 3, n_jobs = -1, verbose = 2)

In [46]:
#Fitting the model on train data
gscv_model.fit(X, y)

Fitting 3 folds for each of 8640 candidates, totalling 25920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 43.5min
[Parallel(n_jobs=-1)]: Done 9097 tasks      | e

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                                       110, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [10, 36, 62, 88, 115, 141, 167, 194,
                                          220, 246, 273, 299, 325, 352, 378,
                                          404, 431, 457, 483, 510]},
             verbose=2)

In [47]:
#Displaying best parameters
gscv_model.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 10}

In [49]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

#Using best parameters and fitting the model
model = RandomForestClassifier(n_estimators= 10, min_samples_split= 10, min_samples_leaf= 1, 
                              max_features= "sqrt", max_depth= 10, bootstrap= True)
model.fit(X, y)
y_pred = model.predict(X)
metrics.f1_score(y, y_pred)
print(confusion_matrix(y, y_pred))

[[5471    3]
 [ 328   20]]


In [56]:
#New model to optimize F score
max_depth = [int(x) for x in np.linspace(10, 90, num = 10)]
max_depth.append(None)
max_depth

[10, 65, 121, 176, 232, 287, 343, 398, 454, 510]

In [57]:
### Grid Search using score = f1
from sklearn.model_selection import GridSearchCV
g = {'n_estimators' : [int(x) for x in np.linspace(start = 10, stop = 510, num = 10)],
    'max_features' : ['sqrt'],
    'max_depth' : max_depth,
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4],
    'bootstrap' : [True]
}
# Create a based model
model_f1 = RandomForestClassifier()
# Instantiate the grid search model
modelf1_cv = GridSearchCV(estimator = model_f1, param_grid = g, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring = 'f1')
modelf1_cv.fit(X,y)
print(f'Best F1 score: {modelf1_cv.best_score_} with parameters: {modelf1_cv.best_params_}')

Fitting 3 folds for each of 990 candidates, totalling 2970 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 2970 out of 2970 | elapsed: 14.3min finished


Best score: 0.10384615384615385 with param: {'bootstrap': True, 'max_depth': 36, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}


In [7]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

model_f1 = RandomForestClassifier(n_estimators= 10, min_samples_split= 2, min_samples_leaf= 1, 
                              max_features= "sqrt", max_depth= 36, bootstrap= True)
model_f1.fit(X, y)
y_pred = model.predict(X_test)

print(f'Random Forest Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Random Forest Accuracy: {accuracy_score(y_test,y_pred)}')
print(f'Random Forest Area Under Curve: {roc_auc_score(y_test, y_pred)}') 
print(f'Random Forest Recall: {recall_score(y_test,y_pred)}')
print(f'Random Forest Precision score: {precision_score(y_test,y_pred)}')
print(f'Random Forest F1 score: {metrics.f1_score(y_test,y_pred)}')

Random Forest Confusion Matrix: 
[[3718   44]
 [ 223   15]]
Random Forest Accuracy: 0.93325
Random Forest Area Under Curve: 0.5256646518256425
Random Forest Recall: 0.06302521008403361
Random Forest Precision score: 0.2542372881355932
Random Forest F1 score: 0.10101010101010101


In [63]:
#Checking the same on SMOTE data
smote_data = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/SMOTE_traindata.csv', index_col = 0)
X_smote = smote_data.loc[:, smote_data.columns != 'CARAVAN']
y_smote = smote_data['CARAVAN'].astype('category')

from sklearn.model_selection import GridSearchCV
g = {'n_estimators' : [int(x) for x in np.linspace(start = 10, stop = 510, num = 10)],
    'max_features' : ['sqrt'],
    'max_depth' : max_depth,
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4],
    'bootstrap' : [True]
}
# Create a based model
mod = RandomForestClassifier()
# Instantiate the grid search model
mod_gs = GridSearchCV(estimator = mod, param_grid = g, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring = 'f1')
mod_gs.fit(X_smote, y_smote)
print(f'Best F1 score: {mod_gs.best_score_} parameters: {mod_gs.best_params_}')

Fitting 3 folds for each of 990 candidates, totalling 2970 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 34.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 45.6min
[Parallel(n_jobs=-1)]: Done 2970 out of 2970 | elapsed: 53.0min finished


Best score: 0.9643397070180489 with param: {'bootstrap': True, 'max_depth': 81, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 343}


In [64]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix

model = RandomForestClassifier(n_estimators= 343, min_samples_split= 5, min_samples_leaf= 1, 
                              max_features= "sqrt", max_depth= 81, bootstrap= True)
model.fit(X_smote, y_smote)

#prediction on train data
y_pred = model.predict(X)

print(f'Accuracy Score: {accuracy_score(y,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y, y_pred)}') 
print(f'Recall score: {recall_score(y,y_pred)}')
print(f'F1 score: {metrics.f1_score(y,y_pred)}')
print(f'Precision score: {precision_score(y,y_pred)}')

Accuracy Score: 0.9385
Confusion Matrix: 
[[3744   18]
 [ 228   10]]
Area Under Curve: 0.5186160588637369
Recall score: 0.04201680672268908
F1 score: 0.07518796992481203
Precision score: 0.35714285714285715
