# Modeling with Upsampling

Import necessary modules and metrics.

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 300)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import resample
# Import metrics used
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix
import pickle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

Import the California Wildfires dataset.

In [2]:
df = pd.read_csv('../data/california_wildfires.csv')

## Creating Dummy Variables for Categorical Features

There are two features that are categorical. The counties and the month of the year column that we engineered.

In [3]:
# Create dummy variables for the county column
counties = pd.get_dummies(df.county, drop_first = True)
# Drop county column along with unnecessary columns (Unnamed columns, year, and acres burned)
df2 = df.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1', 'county', 'year', 'acres_burned'], axis = 1)

In [4]:
# Feature engineer month column from the date column
df2['month'] = pd.DatetimeIndex(df2['date']).month
# Drop the date column
df2.drop(columns = ['date'], axis = 1, inplace = True)

In [5]:
# Create dummy variables for the months
month = pd.get_dummies(df2.month, drop_first = True)
# Drop the month column
df2.drop(columns = 'month', axis =1, inplace = True)

In [6]:
# Combine the original dataframe with the dummy variables
df2 = pd.concat([df2, counties, month], axis = 1)

In [7]:
df2.head()

Unnamed: 0,fire_started,Alfalfa & Hay_acres,Alfalfa & Hay_percentage,Almonds_acres,Almonds_percentage,Barren_acres,Barren_percentage,Corn_acres,Corn_percentage,Cotton_acres,Cotton_percentage,Deciduous Forest_acres,Deciduous Forest_percentage,Evergreen Forest_acres,Evergreen Forest_percentage,Fallow_acres,Fallow_percentage,Fruit Trees_acres,Fruit Trees_percentage,Grain Crops_acres,Grain Crops_percentage,Grapes_acres,Grapes_percentage,Grassland_acres,Grassland_percentage,High Intensity Developed_acres,High Intensity Developed_percentage,Low Intensity Developed_acres,Low Intensity Developed_percentage,Mixed Forest_acres,Mixed Forest_percentage,Other Ocean/Mexico_acres,Other Ocean/Mexico_percentage,Other Tree Crops_acres,Other Tree Crops_percentage,Other_acres,Other_percentage,Rice_acres,Rice_percentage,Shrubland_acres,Shrubland_percentage,Tomatoes_acres,Tomatoes_percentage,Vegs & Fruits_acres,Vegs & Fruits_percentage,Walnuts_acres,Walnuts_percentage,Water_acres,Water_percentage,Wetlands_acres,Wetlands_percentage,Winter Wheat_acres,Winter Wheat_percentage,max_elevation,min_elevation,Avg Air Temp (F)_Weekly,Avg Rel Hum (%)_Weekly,Avg Wind Speed (mph)_Weekly,Dew Point (F)_Weekly,Max Air Temp (F)_Weekly,Max Rel Hum (%)_Weekly,Min Air Temp (F)_Weekly,Min Rel Hum (%)_Weekly,Precip (in)_Weekly,Avg Air Temp (F)_month,Avg Rel Hum (%)_month,Avg Wind Speed (mph)_month,Dew Point (F)_month,Max Air Temp (F)_month,Max Rel Hum (%)_month,Min Air Temp (F)_month,Min Rel Hum (%)_month,Precip (in)_month,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,Del Norte,El Dorado,Fresno,Glenn,Humboldt,Imperial,Inyo,Kern,Kings,Lake,Lassen,Los Angeles,Madera,Marin,Mariposa,Mendocino,Merced,Modoc,Mono,Monterey,Napa,Nevada,Orange,Placer,Plumas,Riverside,Sacramento,San Benito,San Bernardino,San Diego,San Francisco,San Joaquin,San Luis Obispo,San Mateo,Santa Barbara,Santa Clara,Santa Cruz,Shasta,Sierra,Siskiyou,Solano,Sonoma,Stanislaus,Sutter,Tehama,Trinity,Tulare,Tuolumne,Ventura,Yolo,Yuba,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1102.856805,0.300074,4.225505,0.00115,194.595625,0.052947,4.670295,0.001271,0.0,0.0,5.33748,0.001452,7838.756565,2.132827,1536.74945,0.41813,1.77916,0.000484,991.214515,0.269697,3722.44751,1.012831,153671.38668,41.812059,28431.42159,7.735834,39470.886995,10.739534,74885.956375,20.375531,0.0,0.0,8.673405,0.00236,0.0,0.0,0.88958,0.000242,30958.051185,8.423298,4.670295,0.001271,164.12751,0.044657,4.670295,0.001271,19403.51896,5.279454,4497.494085,1.223712,624.48516,0.169915,1242,-42,44.214286,82.785714,2.392857,39.321429,54.157143,96.5,35.771429,60.785714,0.095714,45.506897,78.189655,2.915517,38.932759,55.896552,95.448276,35.725862,55.810345,0.130172,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,189.03575,0.04008,0.0,0.0,15482.472715,3.28265,0.0,0.0,0.0,0.0,194.595625,0.041259,195088.00753,41.363269,0.44479,9.4e-05,0.222395,4.7e-05,0.0,0.0,0.0,0.0,5644.82989,1.196837,121.42767,0.025746,3192.480225,0.676881,0.667185,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247783.390805,52.535935,0.0,0.0,0.0,0.0,0.0,0.0,2650.50361,0.561969,1297.45243,0.275091,0.0,0.0,3556,1442,29.657143,76.514286,3.228571,21.328571,34.428571,91.857143,22.857143,55.428571,0.0,30.789655,68.162069,4.968966,19.6,39.344828,86.0,22.758621,46.344828,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,1326.80857,0.41429,16.679625,0.005208,1873.01069,0.58484,242.632945,0.075761,0.0,0.0,17190.911105,5.367789,114386.866695,35.71681,168.13062,0.052498,12.00933,0.00375,120.0933,0.037499,2587.34343,0.807887,112912.61024,35.25648,440.119705,0.137425,8263.975805,2.580391,1727.11957,0.539286,0.0,0.0,1.33437,0.000417,0.0,0.0,1.111975,0.000347,52457.865415,16.379744,0.0,0.0,1.77916,0.000556,122.094855,0.038124,5822.74589,1.818128,105.86002,0.033054,479.48362,0.149717,3121,43,34.114286,83.571429,3.157143,29.585714,40.071429,96.0,27.757143,66.571429,0.141429,34.289655,76.724138,3.606897,27.410345,41.2,93.172414,27.768966,58.310345,0.155517,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,3777.15668,0.374865,46196.556585,4.584787,1869.45237,0.185534,2023.34971,0.200808,9.118195,0.000905,33181.556395,3.293111,408193.790775,40.511281,56434.51041,5.600855,10563.31771,1.048359,2628.041715,0.260821,247.525635,0.024566,170758.216925,16.946936,4421.65739,0.438828,25520.048645,2.532743,165.684275,0.016443,0.0,0.0,790.39183,0.078443,0.0,0.0,105624.2813,10.482705,55372.129495,5.495419,94.29548,0.009358,469.475845,0.046593,42057.340845,4.17399,21360.372565,2.119915,11589.89303,1.150241,4257.08509,0.422495,2192,-1,40.985714,81.285714,3.142857,35.557143,50.114286,91.285714,32.171429,62.857143,0.117143,42.389655,77.448276,3.848276,35.586207,52.455172,88.965517,33.365517,58.862069,0.175517,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,31.802485,0.005011,28.46656,0.004485,218.39189,0.034409,0.88958,0.00014,0.0,0.0,34882.878145,5.495994,255438.00431,40.245698,28.688955,0.00452,12.231725,0.001927,2.22395,0.00035,522.18346,0.082273,207502.763615,32.69323,465.027945,0.073268,12257.745215,1.931277,3351.270255,0.528012,0.0,0.0,0.88958,0.00014,0.0,0.0,0.0,0.0,106091.088405,16.715249,0.0,0.0,0.667185,0.000105,425.66403,0.067066,13178.68291,2.076376,245.07929,0.038614,11.786935,0.001857,3522,787,41.928571,93.014286,5.657143,39.0,50.142857,100.0,35.571429,74.142857,0.0,42.931034,87.017241,6.268966,37.196552,52.827586,97.551724,34.344828,61.275862,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Train-Test Split

We want to split our data into a training dataset and a test dataset. We want to do this before we work on the class imbalance so that we have a holdout set of data to test the model on.

In [8]:
# Split dataset into target variable and features
y = df2.fire_started
X = df2.drop(columns = ['fire_started'], axis = 1)

In [9]:
# Use train_test_split to create a training dataset and test dataset
X_train, X_test, y_train, y_test = train_test_split(X,  y, test_size = 0.25, random_state = 0)

## Upsampling Minority Class

There is a high class imbalance in the target variable. As seen in the EDA, the grand majority of the target variable are instances of no wildfire. In this case, we use upsampling the instances of wildfire to resolve the class imbalance issue. The first step to combine the `X_train` and `y_train` dataframes back into one dataframe. 

In [10]:
# Concatenate the X_train and y_train back into one training dataframe
training = pd.concat([X_train, y_train], axis=1)

The second step is to split the dataframe into the majority class and the minority class. In this case, the majority class are observations of weeks with no wildfire incidents. The minority class are observations of weeks with one or more wildfire incidents.

In [11]:
# Split the target variable by class into two dataframes
no_fire = training[training.fire_started == 0] # 0 = No Wildfire
fire = training[training.fire_started == 1] # 1 = Wildfire

The third step is to upsample the minority class with replacement. The minority class should be equal to the same number of observations and then recombine the two class dataframes back into one.

In [12]:
# Resample the minority class (wildfire)
fire_upsampled = resample(fire,
                          replace=True, # sample with replacement
                          n_samples=no_fire.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [13]:
# Create a new dataframe combining the target classes
resampled_df = pd.concat([no_fire, fire_upsampled])

In [14]:
# Double-check the different target classes
resampled_df.fire_started.value_counts()

1.0    12812
0.0    12812
Name: fire_started, dtype: int64

There is no longer any class imbalance between incidents of wildfire and incidents with no wildfires. We split the dataframe back into training target variable and the training features.

In [15]:
y_train = resampled_df.fire_started
X_train = resampled_df.drop(columns = ['fire_started'], axis = 1)

## Logistic Regression

The first type of model we tried was logistic regression

### Base Model

For our base model, we run the resampled training data through a logistic regression model with default settings

In [16]:
# Instantiate a logistic regression model
logreg = LogisticRegression(random_state = 0) # random state for consistant results
# Train model on resampled training data
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)

In [17]:
# Use the model to predict the target variable on the training dataset
y_hat_train = logreg.predict(X_train)
# Use the model to predict the target variable on the test dataset
y_hat_test = logreg.predict(X_test)

In [18]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print('F1 Score: Training', f1_score(y_train, y_hat_train), 'Test', f1_score(y_test, y_hat_test))
# Print the accuracy score metric on both the training and test predictions to check for overfitting
print('Accuracy Score: Training', accuracy_score(y_train, y_hat_train), 'Test', accuracy_score(y_test, y_hat_test))
# Print the recall score metric on both the training and test predictions to check for overfitting
print('Recall Score: Training', recall_score(y_train, y_hat_train), 'Test', recall_score(y_test, y_hat_test))

F1 Score: Training 0.680963106303334 Test 0.1476340694006309
Accuracy Score: Training 0.6075163908835467 Test 0.40471469486671074
Recall Score: Training 0.8377302528879176 Test 0.8068965517241379


The results of our metrics show that the recall score is higher than the accuracy score, and we have a low f1 score. We use a confusion matrix to find the false positive and false negative values.

In [19]:
confusion_matrix(y_test, y_hat_test)

array([[1603, 2646],
       [  56,  234]])

From the confusion matrix, the majority of predictions are false positives. We want to try to decrease the number of false positives in our next model.

### Model - Scaled Data

In our first iteration we want to check how normalizing the features will change our score. To normalize our data, we will use a Standard Scaler.

In [20]:
# Insantiate the StandardScaler()
ss = StandardScaler()
# Fit the feature training data
ss.fit(X_train)

# Transform both the training and test features
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [21]:
# Pickle Standar Scaler
pickle_out = open("ss.pickle","wb")
pickle.dump(ss, pickle_out)
pickle_out.close()

In [22]:
# Instantiate a new logistic regression model
logreg1 = LogisticRegression(solver = 'liblinear')
# Fit the data to the new scaled data
logreg1.fit(X_train_scaled, y_train)

LogisticRegression(solver='liblinear')

In [23]:
# Use model to predict target variable on the training dataset
y_hat_train1 = logreg1.predict(X_train_scaled)
# Use model to predict target variable on the test dataset
y_hat_test1 = logreg1.predict(X_test_scaled)

In [24]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print('F1 Score: Training', f1_score(y_train, y_hat_train1), 'Test', f1_score(y_test, y_hat_test1))
# Print the accuracy score metric on both the training and test predictions to check for overfitting
print('Accuracy Score: Training', accuracy_score(y_train, y_hat_train1), 'Test', accuracy_score(y_test, y_hat_test1))
# Print the recall score metric on both the training and test predictions to check for overfitting
print('Recall Score: Training', recall_score(y_train, y_hat_train1), 'Test', recall_score(y_test, y_hat_test1))

F1 Score: Training 0.8126948775055679 Test 0.28343558282208586
Accuracy Score: Training 0.8030752419606619 Test 0.7426745979290592
Recall Score: Training 0.8544333437402435 Test 0.7965517241379311


The results from our metrics show an increase in score across the board. However, the recall and accuracy score are still much better than the f1 score. The f1 score is highly overfit, while the recall and accuracy scores are slightly overfit. We check the confusion matrix next to check the value counts for false positive and false negatives.

In [25]:
confusion_matrix(y_test, y_hat_test1)

array([[3140, 1109],
       [  59,  231]])

The confusion matrix shows as in the previous model, the most error occurs from the model predicting false positives, however we have decrease the number of false positives by more than half.

In [26]:
# pickle logistic model
with open("../models/best_logistic.pickle", "wb") as best_logistic:
    pickle.dump(logreg1, best_logistic)

### Validation

In [96]:
scores = cross_val_score(logreg1, X_train_scaled, y_train, cv=10)
print(scores.mean())

0.8011629344880593


## K Nearest Neighbor (KNN) Models

The second type of model we used was K Nearest Neighbor

### Base KNN Model

For our base knn model, we chose k as 3. Due to how knn models function, the number of nearest neighbors should always be odd.

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# Instantiate a knn model using 3 nearest neighbors
knn = KNeighborsClassifier(n_neighbors = 3)

In [29]:
# Fit knn model using the scaled data from the previous scaled logistic model
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=3)

In [30]:
# Predict target variable for both the train and test datasets.
knn_train = knn.predict(X_train_scaled)
knn_test = knn.predict(X_test_scaled)

In [31]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print('F1 Score: Training', f1_score(y_train, knn_train), 'Test', f1_score(y_test, knn_test))
# Print the sccuracy score metric on both the training and test predictions to check for overfitting
print('Accuracy Score: Training', accuracy_score(y_train, knn_train), 'Test', accuracy_score(y_test, knn_test))
# Print the recall score metric on both the training and test predictions to check for overfitting
print('Recall Score: Training', recall_score(y_train, knn_train), 'Test', recall_score(y_test, knn_test))

F1 Score: Training 0.975706343766659 Test 0.2788339670468948
Accuracy Score: Training 0.9751014673743366 Test 0.874641991628112
Recall Score: Training 1.0 Test 0.3793103448275862


The results show that while the test accuracy is high, the recall score and the f1 score are highly overfit. We check the confusion matrix to check the values of the false positive and false negatives.

In [32]:
confusion_matrix(y_test, knn_test)

array([[3860,  389],
       [ 180,  110]])

As seen in previous models, the f1 score and recall are highly overfit. The main error is in the prediction of false positives.

### Tuning Number of Nearest Neighbors

To tune the number (k) nearest neighbors, want to find the value of k that will return the max value for a given metric. Due to the business problem of wildfires, we want to reduce the number of false positives and thus maximise recall. 

In [33]:
# Create a function to find the max recall score and return the score along with the k value
def max_value(l):
    max_val = max(l)
    max_idx = l.index(max_val)
    return max_idx, max_val

In [34]:
# Create an empty list for recall scores
k_scores = []
# Choose a range of k values to test
k_range = list(range(1, 21))
# Iterate through the different k values
for k in k_range:
    # Instantiate new knn model with k nearest neighbors
    knn = KNeighborsClassifier(n_neighbors = k)
    # Fit knn model on scaled training data
    knn.fit(X_train_scaled, y_train)
    # Use model to predict target variable on testing set
    y_pred = knn.predict(X_test_scaled)
    # Find the recall score
    recall = recall_score(y_test, y_pred)
    # Append recall score to list of recall scorees
    k_scores.append(recall)

# Find max recall score
idx, val = max_value(k_scores)
# Print max recall score and it corresponding k value
print(idx + 1, val)

19 0.7413793103448276


The best k value is 19 with a recall score of 74%. We rerun the model using k=19 and check the f1 and accuracy metrics of the model

In [35]:
knn = KNeighborsClassifier(n_neighbors = 19)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
knn_train1 = knn.predict(X_train_scaled)
print('F1 Score: Training', f1_score(y_train, knn_train1), 'Test', f1_score(y_test, y_pred))
print('Accuracy Score: Training', accuracy_score(y_train, knn_train1), 'Test', accuracy_score(y_test, y_pred))
print('Recall Score: Training', recall_score(y_train, knn_train1), 'Test', recall_score(y_test, y_pred))

F1 Score: Training 0.883275080458179 Test 0.24826789838337182
Accuracy Score: Training 0.8683655947549173 Test 0.7131526768010575
Recall Score: Training 0.9960974086793631 Test 0.7413793103448276


This model has a highly increased test scores for recall, however the accuracy is much worse and the f1 score slight worse. The model is still highly overfit.

In [36]:
confusion_matrix(y_test, y_pred)

array([[3022, 1227],
       [  75,  215]])

The majority of error from this model is still predicting false positives.

In [37]:
# Pickle KNN model
with open("../models/best_knn.pickle", "wb") as best_knn:
    pickle.dump(knn, best_knn)

### Validation

In [97]:
scores_knn = cross_val_score(knn, X_train_scaled, y_train, cv=10)
print(scores_knn.mean())

0.8476827201973194


## Decision Tree

### Base Model

For our base model we will instantiate a Decision Tree Classifier with default settings.

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
dt = DecisionTreeClassifier(random_state = 0)
dt.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [40]:
dt_train = dt.predict(X_train)
dt_test = dt.predict(X_test)

print('F1 Score: Training', f1_score(y_train, dt_train), 'Test', f1_score(y_test, dt_test))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train), 'Test', accuracy_score(y_test, dt_test))
print('Recall Score: Training', recall_score(y_train, dt_train), 'Test', recall_score(y_test, dt_test))

F1 Score: Training 1.0 Test 0.24087591240875914
Accuracy Score: Training 1.0 Test 0.9083498567966513
Recall Score: Training 1.0 Test 0.22758620689655173


The base model has a high accuracy score, but a low f1 and recall score. The model is also highly overfit.

In [41]:
confusion_matrix(y_test, dt_test)

array([[4057,  192],
       [ 224,   66]])

The majority of the error is from predicting false negatives, while the false positives are also high.

### Tuning Parameters - GridSearch

In [16]:
from sklearn.model_selection import GridSearchCV

For the first tuning, we want to check a wide range for the parameters of max depth, max_features and the min_sample_split.

In [43]:
parameters = {'max_depth' : range(1, 21, 1), 'max_features' : range(55, 75, 1), 'min_samples_split' : range(15, 25, 1)}

In the grid search, we use the f1 scoring metric so that ideally both accuracy and recall scores will increase.

In [44]:
dtg = DecisionTreeClassifier(random_state = 0)
grid_model = GridSearchCV(dtg, parameters, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

grid_model.fit(X_train, y_train)

Fitting 10 folds for each of 4000 candidates, totalling 40000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 11218 tasks      

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(1, 21),
                         'max_features': range(55, 75),
                         'min_samples_split': range(15, 25)},
             scoring='f1', verbose=1)

In [45]:
print(grid_model.best_score_)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

0.9471360686532485
{'max_depth': 20, 'max_features': 66, 'min_samples_split': 15}
DecisionTreeClassifier(max_depth=20, max_features=66, min_samples_split=15,
                       random_state=0)


Comparing the parameters to the range tested, the `max_depth` parameter is at the top end, while `min_samples_split` is at the bottoom of the ranges tested. The `max_features` is in the center of the range tested.

In [46]:
dt_train2 = dt.predict(X_train)
dt_test2 = dt.predict(X_test)

print('F1 Score: Training', f1_score(y_train, dt_train2), 'Test', f1_score(y_test, dt_test2))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train2), 'Test', accuracy_score(y_test, dt_test2))
print('Recall Score: Training', recall_score(y_train, dt_train2), 'Test', recall_score(y_test, dt_test2))

F1 Score: Training 1.0 Test 0.24087591240875914
Accuracy Score: Training 1.0 Test 0.9083498567966513
Recall Score: Training 1.0 Test 0.22758620689655173


The model is still highly overfit. The test recall and f1 score are still much worse than the test score.

In [47]:
confusion_matrix(y_test, dt_test2)

array([[4057,  192],
       [ 224,   66]])

The model the majority of the error is still in predicting false negatives, hence the low recall score.

### Tuning Parameters - GridSearch 2

In the previous model, the results showed that it was highly overfit and could not generalize well. By reducing the `max_depth`, it should reduce the overfitting. In the previous model the `min_samples_split` was at a low end of the range, so for model we reduce the range. The `max_features` will test the same range of features.

In [48]:
parameters2 = {'max_depth' : range(1, 15, 1), 'max_features' : range(55, 75, 1), 'min_samples_split' : range(10, 20, 1)}

In [49]:
dtg2 = DecisionTreeClassifier(random_state = 0)
dtg2_model = GridSearchCV(dtg2, parameters2, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

dtg2_model.fit(X_train, y_train)

Fitting 10 folds for each of 2800 candidates, totalling 28000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 4804 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 6304 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 8004 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 9904 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 12004 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 14304 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 16804 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 18728 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 20178 tasks 

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(1, 15),
                         'max_features': range(55, 75),
                         'min_samples_split': range(10, 20)},
             scoring='f1', verbose=1)

In [50]:
print(dtg2_model.best_score_)
print(dtg2_model.best_params_)
print(dtg2_model.best_estimator_)

0.9242727259779653
{'max_depth': 14, 'max_features': 73, 'min_samples_split': 13}
DecisionTreeClassifier(max_depth=14, max_features=73, min_samples_split=13,
                       random_state=0)


In [51]:
dt_train3 = dtg2_model.predict(X_train)
dt_test3 = dtg2_model.predict(X_test)

print('F1 Score: Training', f1_score(y_train, dt_train3), 'Test', f1_score(y_test, dt_test3))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train3), 'Test', accuracy_score(y_test, dt_test3))
print('Recall Score: Training', recall_score(y_train, dt_train3), 'Test', recall_score(y_test, dt_test3))

F1 Score: Training 0.9281367719501593 Test 0.2643312101910828
Accuracy Score: Training 0.9225725881985638 Test 0.7964309319233311
Recall Score: Training 1.0 Test 0.5724137931034483


By reducing the max depth, we were able to reduce overfitting in our recall score. From the confusion matrix, we can sse that while the number of false. negatives decreased, the number of false positives increased.

In [52]:
confusion_matrix(y_test, dt_test3)

array([[3449,  800],
       [ 124,  166]])

### Tuning Parameters - GridSearch 3

Due to the increase in test recall score and increase in the test f1 score, we reduce the possible maximum depth by a couple. We rerun the gridsearch with the same ranges for the other parameters.

In [53]:
parameters3 = {'max_depth' : range(10, 13, 1), 'max_features' : range(35, 65, 1), 'min_samples_split' : range(14, 30, 1)}

In [54]:
dtg3 = DecisionTreeClassifier(random_state = 0)
dtg3_model = GridSearchCV(dtg3, parameters3, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

dtg3_model.fit(X_train, y_train)

Fitting 10 folds for each of 1440 candidates, totalling 14400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 11218 tasks      

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(10, 13),
                         'max_features': range(35, 65),
                         'min_samples_split': range(14, 30)},
             scoring='f1', verbose=1)

In [55]:
print(dtg3_model.best_score_)
print(dtg3_model.best_params_)
print(dtg3_model.best_estimator_)

0.9101123648575825
{'max_depth': 12, 'max_features': 43, 'min_samples_split': 14}
DecisionTreeClassifier(max_depth=12, max_features=43, min_samples_split=14,
                       random_state=0)


In [56]:
dt_train4 = dtg3_model.predict(X_train)
dt_test4 = dtg3_model.predict(X_test)

print('F1 Score: Training', f1_score(y_train, dt_train4), 'Test', f1_score(y_test, dt_test4))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train4), 'Test', accuracy_score(y_test, dt_test4))
print('Recall Score: Training', recall_score(y_train, dt_train4), 'Test', recall_score(y_test, dt_test4))

F1 Score: Training 0.9239783001808318 Test 0.2807843137254902
Accuracy Score: Training 0.9179675304402123 Test 0.7979731218330028
Recall Score: Training 0.9970340305963159 Test 0.6172413793103448


The test score for recall and f1 both increased and the test accuracy remained the same. This successfully reduced overfitting.

In [57]:
confusion_matrix(y_test, dt_test4)

array([[3443,  806],
       [ 111,  179]])

The confusion matrix shows the predicted false negatives reduced by 13 predictions, however the false positives increased by 6 predictions since the previous model.

### Tuning Parameters - GridSearch 4

In this iteration, we increased the `min_samples_split` to decrease the overfitting. Considering the parameter has always chosen the smallest in the range tested, we tested multiple minimums.

In [58]:
parameters4 = {
    'max_depth' : range(8, 12, 1), 
    'max_features' : range(64, 75, 1), 
    'min_samples_split' : range(400, 402, 1)
}

In [59]:
dtg4 = DecisionTreeClassifier(random_state = 0)
dtg4_model = GridSearchCV(dtg4, parameters4, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

dtg4_model.fit(X_train, y_train)

Fitting 10 folds for each of 88 candidates, totalling 880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 880 out of 880 | elapsed:   25.3s finished


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(8, 12),
                         'max_features': range(64, 75),
                         'min_samples_split': range(400, 402)},
             scoring='f1', verbose=1)

In [60]:
print(dtg4_model.best_score_)
print(dtg4_model.best_params_)
print(dtg4_model.best_estimator_)

0.836401284806693
{'max_depth': 10, 'max_features': 74, 'min_samples_split': 400}
DecisionTreeClassifier(max_depth=10, max_features=74, min_samples_split=400,
                       random_state=0)


In [61]:
dt_train5 = dtg4_model.best_estimator_.predict(X_train)
dt_test5 = dtg4_model.best_estimator_.predict(X_test)

print('F1 Score: Training', f1_score(y_train, dt_train5), 'Test', f1_score(y_test, dt_test5))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train5), 'Test', accuracy_score(y_test, dt_test5))
print('Recall Score: Training', recall_score(y_train, dt_train5), 'Test', recall_score(y_test, dt_test5))

F1 Score: Training 0.8480333945086185 Test 0.25458996328029376
Accuracy Score: Training 0.8351935685295035 Test 0.7316589557171183
Recall Score: Training 0.9196846706212926 Test 0.7172413793103448


The model found diminishing returns with increasing the minimum sample at 400 samples. The model is still overfit, but the overfitting seen with the recall score was reduced by 18%. The difference between the training and test score for accuracy also decreased.

In [62]:
confusion_matrix(y_test, dt_test5)

array([[3113, 1136],
       [  82,  208]])

From the previous model, the number of false negatives decreased, while the number of false positives increased.

In [63]:
# Pickle Decision Tree Model
with open("../models/best_decision_tree.pickle", "wb") as best_decision_tree:
    pickle.dump(dtg4_model.best_estimator_, best_decision_tree)

## Random Forest

### Base Model

For the base model we instantiate a Random Forest Classifier with a set `random_state` for reproducibility and kept everything else at their default settings.

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
rfc = RandomForestClassifier(random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [66]:
rfc_train = rfc.predict(X_train)
rfc_test = rfc.predict(X_test)

In [67]:
print('F1 Score: Training', f1_score(y_train, rfc_train), f1_score(y_test, rfc_test))
print('Accuracy Score: Training', accuracy_score(y_train, rfc_train), accuracy_score(y_test, rfc_test))
print('Recall Score: Training', recall_score(y_train, rfc_train), recall_score(y_test, rfc_test))

F1 Score: Training 1.0 0.27692307692307694
Accuracy Score: Training 1.0 0.9275170742454285
Recall Score: Training 1.0 0.21724137931034482


As seen in previous models, the base model was able to have perfect training scores, however from the f1 score and recall, the model is highly overfit.

In [68]:
confusion_matrix(y_test, rfc_test)

array([[4147,  102],
       [ 227,   63]])

The majority of error from this model occurs in the predictions of false negatives. We want to maximize the recall score, while not boosting the number of false positives too high.

### Tune Parameters - GridSearch

We use gridSearch to reduce the overfitting of the model and increase the test scores. The chosen parameters to be tuned are `n_estimators`, `criterion`, `min_samples_split`, and `max_features`.

In [69]:
param_grid = { 
    'n_estimators': [100,300],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': list(range(2,10)),
    'max_features': list(range(3,7))
}

In [70]:
grid_tree=GridSearchCV(RandomForestClassifier(random_state = 0), param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

In [71]:
grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:  6.2min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [100, 300]},
             scoring='f1', verbose=1)

In [72]:
print(grid_tree.best_score_)
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)

0.9842912884846985
{'criterion': 'gini', 'max_features': 5, 'min_samples_split': 3, 'n_estimators': 100}
RandomForestClassifier(max_features=5, min_samples_split=3, random_state=0)


From the best paramters, the number of estimators is 100 and we can further tune this parameter by also checking 50 and 150. The max features if 5, which is at the high end of the range tested. The `min_samples_split` is at the low end of the range checked.

In [73]:
grid_tree_train = grid_tree.best_estimator_.predict(X_train)
grid_tree_test = grid_tree.best_estimator_.predict(X_test)
print('F1 Score: Training', f1_score(y_train, grid_tree_train), f1_score(y_test, grid_tree_test))
print('Accuracy Score: Training', accuracy_score(y_train, grid_tree_train), accuracy_score(y_test, grid_tree_test))
print('Recall Score: Training', recall_score(y_train, grid_tree_train), recall_score(y_test, grid_tree_test))

F1 Score: Training 1.0 0.28389830508474573
Accuracy Score: Training 1.0 0.9255342586472791
Recall Score: Training 1.0 0.23103448275862068


The model is still highly overfit, however the test scores did increase slightly.

In [74]:
confusion_matrix(y_test, grid_tree_test)

array([[4134,  115],
       [ 223,   67]])

The majority of the error continues to be from the number of false negatives.

### Tuning Parameters - GridSearch2

For the second iteration of tuning, we are trying maximize the test metrics by finding the best `n_estimators` and increasing the range of `max_features`, while tuning the other parameters using the same range as the previous model.

In [75]:
param_grid2 = { 
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': list(range(2,10)),
    'max_features': list(range(4,10))
}

In [76]:
grid_tree2 =GridSearchCV(RandomForestClassifier(random_state = 0), 
                         param_grid2, 
                         cv=5, 
                         scoring='f1', 
                         verbose=1, 
                         n_jobs=-1)

In [77]:
grid_tree2.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   50.3s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  8.9min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [4, 5, 6, 7, 8, 9],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [50, 100, 150]},
             scoring='f1', verbose=1)

In [78]:
print(grid_tree2.best_score_)
print(grid_tree2.best_params_)
print(grid_tree2.best_estimator_)

0.9848576101055144
{'criterion': 'entropy', 'max_features': 9, 'min_samples_split': 3, 'n_estimators': 50}
RandomForestClassifier(criterion='entropy', max_features=9, min_samples_split=3,
                       n_estimators=50, random_state=0)


The number of estimators decreased and the number of max features incresed. The `min_samples_split` remained constant compared to the previous model.

In [79]:
grid_tree_train2 = grid_tree2.best_estimator_.predict(X_train)
grid_tree_test2 = grid_tree2.best_estimator_.predict(X_test)
print('F1 Score: Training', f1_score(y_train, grid_tree_train2), f1_score(y_test, grid_tree_test2))
print('Accuracy Score: Training', accuracy_score(y_train, grid_tree_train2), accuracy_score(y_test, grid_tree_test2))
print('Recall Score: Training', recall_score(y_train, grid_tree_train2), recall_score(y_test, grid_tree_test2))

F1 Score: Training 1.0 0.2733188720173536
Accuracy Score: Training 1.0 0.9261951971799955
Recall Score: Training 1.0 0.21724137931034482


The F1 and Recall Test Score both did slightly worse, however the number of features was at the high end of the range, so by increasing the range of max features should boost the scores.

### Tune Parameters - GridSearch 3

Based on the previous model, we will check another range around 50 for `n_estimators` and we will increase in the range of the `max_features`.

In [80]:
param_grid3 = { 
    'n_estimators': [35, 50, 65],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': list(range(2,10)),
    'max_features': list(range(10,25))
}

In [81]:
grid_tree3 =GridSearchCV(RandomForestClassifier(random_state = 0), 
                         param_grid3, 
                         cv=5, 
                         scoring='f1', 
                         verbose=1, 
                         n_jobs=-1)

In [82]:
grid_tree3.fit(X_train, y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   46.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed: 20.8min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 24.3min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [10, 11, 12, 13, 14, 15, 16, 17, 18,
                                          19, 20, 21, 22, 23, 24],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [35, 50, 65]},
             scoring='f1', verbose=1)

In [83]:
print(grid_tree3.best_score_)
print(grid_tree3.best_params_)
print(grid_tree3.best_estimator_)

0.9854639621231124
{'criterion': 'entropy', 'max_features': 20, 'min_samples_split': 3, 'n_estimators': 50}
RandomForestClassifier(criterion='entropy', max_features=20,
                       min_samples_split=3, n_estimators=50, random_state=0)


In [84]:
grid_tree_train3 = grid_tree3.best_estimator_.predict(X_train)
grid_tree_test3 = grid_tree3.best_estimator_.predict(X_test)
print('F1 Score: Training', f1_score(y_train, grid_tree_train3), f1_score(y_test, grid_tree_test3))
print('Accuracy Score: Training', accuracy_score(y_train, grid_tree_train3), accuracy_score(y_test, grid_tree_test3))
print('Recall Score: Training', recall_score(y_train, grid_tree_train3), recall_score(y_test, grid_tree_test3))

F1 Score: Training 1.0 0.2538293216630197
Accuracy Score: Training 1.0 0.9248733201145627
Recall Score: Training 1.0 0.2


The test scores were worse than the previous model. This means that the overfitting was greater. In the next model we will try to reduce overfitting by increasing the `min_samples_split`.

### Tune Parameters - GridSearch 4

For this gridSearch, we checked increasing values of `min_samples_split` and `max_features` to reduce overfitting in the model.

In [85]:
param_grid4 = { 
    'n_estimators': [75, 100, 125],
    'criterion': ['gini'],
    'min_samples_split': list(range(1200, 1202)),
    'max_features': list(range(30, 40))
}

In [86]:
grid_tree4 =GridSearchCV(RandomForestClassifier(random_state = 0), 
                         param_grid4, 
                         cv=5, 
                         scoring='f1', 
                         verbose=1, 
                         n_jobs=-1)

In [87]:
grid_tree4.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.6min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'criterion': ['gini'],
                         'max_features': [30, 31, 32, 33, 34, 35, 36, 37, 38,
                                          39],
                         'min_samples_split': [1200, 1201],
                         'n_estimators': [75, 100, 125]},
             scoring='f1', verbose=1)

In [88]:
print(grid_tree4.best_score_)
print(grid_tree4.best_params_)
print(grid_tree4.best_estimator_)

0.802184679073717
{'criterion': 'gini', 'max_features': 36, 'min_samples_split': 1200, 'n_estimators': 100}
RandomForestClassifier(max_features=36, min_samples_split=1200, random_state=0)


In [89]:
grid_tree_train4 = grid_tree4.best_estimator_.predict(X_train)
grid_tree_test4 = grid_tree4.best_estimator_.predict(X_test)
print('F1 Score: Training', f1_score(y_train, grid_tree_train4), f1_score(y_test, grid_tree_test4))
print('Accuracy Score: Training', accuracy_score(y_train, grid_tree_train4), accuracy_score(y_test, grid_tree_test4))
print('Recall Score: Training', recall_score(y_train, grid_tree_train4), recall_score(y_test, grid_tree_test4))

F1 Score: Training 0.8202098246411393 0.28629304523970295
Accuracy Score: Training 0.8147439900093663 0.7671293236395682
Recall Score: Training 0.8451451763971277 0.7310344827586207


From this tuning, we found that there was a decreasing return on overfitting of recall or accuracy after the min_samples_split reached over 1000 samples. The f1 score did not move above 30% throughout the different tunings.

In [90]:
confusion_matrix(y_test, grid_tree_test4)

array([[3270,  979],
       [  78,  212]])

The majority of error is due to predicting false positives.

In [91]:
# Pickle Random Forest model
with open("../models/best_random_forest.pickle", "wb") as best_random_forest:
    pickle.dump(grid_tree4.best_estimator_, best_random_forest)

## Adaboost Modeling

### Base Model

In [16]:
adaboost_clf = AdaBoostClassifier(random_state = 0)
adaboost_clf.fit(X_train, y_train)

AdaBoostClassifier(random_state=0)

In [17]:
adaboost_train_preds = adaboost_clf.predict(X_train)
adaboost_test_preds = adaboost_clf.predict(X_test)

In [18]:
print('Train:')
print('Accuracy: {}'.format(accuracy_score(y_train, adaboost_train_preds)))
print('Recall: {}'.format(recall_score(y_train, adaboost_train_preds)))
print('Test:')
print('Accuracy: {}'.format(accuracy_score(y_test, adaboost_test_preds)))
print('Recall: {}'.format(recall_score(y_test, adaboost_test_preds)))

Train:
Accuracy: 0.8085388698095536
Recall: 0.8634093037777084
Test:
Accuracy: 0.7400308437981934
Recall: 0.7620689655172413


### Tune Parameters - GridSearch

In [20]:
para_adaboost1 = {
    'n_estimators' : [50, 75, 100],
    'learning_rate' : [0.5, .75, 1.0, 1.25, 1.5]
}

In [23]:
adaboost_clf1 = AdaBoostClassifier(random_state = 0)
adaboost_model1 = GridSearchCV(adaboost_clf1, para_adaboost1, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)
adaboost_model1.fit(X_train, y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.7min finished


GridSearchCV(cv=10, estimator=AdaBoostClassifier(random_state=0), n_jobs=-1,
             param_grid={'learning_rate': [0.5, 0.75, 1.0, 1.25, 1.5],
                         'n_estimators': [50, 75, 100]},
             scoring='f1', verbose=1)

In [26]:
print(adaboost_model1.best_score_)
print(adaboost_model1.best_params_)
print(adaboost_model1.best_estimator_)

0.8259039750485042
{'learning_rate': 1.0, 'n_estimators': 100}
AdaBoostClassifier(n_estimators=100, random_state=0)


In [28]:
adaboost_train_preds1 = adaboost_model1.best_estimator_.predict(X_train)
adaboost_test_preds1 = adaboost_model1.best_estimator_.predict(X_test)

In [30]:
print('Train:')
print('Accuracy: {}'.format(accuracy_score(y_train, adaboost_train_preds1)))
print('Recall: {}'.format(recall_score(y_train, adaboost_train_preds1)))
print('Test:')
print('Accuracy: {}'.format(accuracy_score(y_test, adaboost_test_preds1)))
print('Recall: {}'.format(recall_score(y_test, adaboost_test_preds1)))

Train:
Accuracy: 0.8217686543865127
Recall: 0.8726974711208242
Test:
Accuracy: 0.7565543071161048
Recall: 0.7655172413793103


In [31]:
confusion_matrix(y_test, adaboost_test_preds1)

array([[3212, 1037],
       [  68,  222]])

### Tuning Parameters - GridSearch 2

In [32]:
para_adaboost2 = {
    'n_estimators' : [100, 125, 150],
    'learning_rate' : [0.5, .75, 1.0, 1.25, 1.5]
}

In [33]:
adaboost_clf2 = AdaBoostClassifier(random_state = 0)
adaboost_model2 = GridSearchCV(adaboost_clf2, para_adaboost2, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)
adaboost_model2.fit(X_train, y_train)

Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.1min finished


GridSearchCV(cv=10, estimator=AdaBoostClassifier(random_state=0), n_jobs=-1,
             param_grid={'learning_rate': [0.5, 0.75, 1.0, 1.25, 1.5],
                         'n_estimators': [100, 125, 150]},
             scoring='f1', verbose=1)

In [34]:
print(adaboost_model2.best_score_)
print(adaboost_model2.best_params_)
print(adaboost_model2.best_estimator_)

0.8404693396132688
{'learning_rate': 1.25, 'n_estimators': 150}
AdaBoostClassifier(learning_rate=1.25, n_estimators=150, random_state=0)


In [35]:
adaboost_train_preds2 = adaboost_model2.best_estimator_.predict(X_train)
adaboost_test_preds2 = adaboost_model2.best_estimator_.predict(X_test)

In [36]:
print('Train:')
print('Accuracy: {}'.format(accuracy_score(y_train, adaboost_train_preds2)))
print('Recall: {}'.format(recall_score(y_train, adaboost_train_preds2)))
print('Test:')
print('Accuracy: {}'.format(accuracy_score(y_test, adaboost_test_preds2)))
print('Recall: {}'.format(recall_score(y_test, adaboost_test_preds2)))

Train:
Accuracy: 0.8359740867936309
Recall: 0.880502653762098
Test:
Accuracy: 0.7779246530072703
Recall: 0.7


In [37]:
confusion_matrix(y_test, adaboost_test_preds2)

array([[3328,  921],
       [  87,  203]])

## Gradient Boost Modeling

### Base Model

In [17]:
gbt_clf = GradientBoostingClassifier(random_state = 0)
gbt_clf.fit(X_train, y_train)

GradientBoostingClassifier(random_state=0)

In [18]:
gbt_train_preds = gbt_clf.predict(X_train)
gbt_test_preds = gbt_clf.predict(X_test)

In [29]:
print('Train:')
print('Accuracy: {}'.format(accuracy_score(y_train, gbt_train_preds)))
print('Recall: {}'.format(recall_score(y_train, gbt_train_preds)))
print('F1: {}'.format(f1_score(y_train, gbt_train_preds)))
print('Test:')
print('Accuracy: {}'.format(accuracy_score(y_test, gbt_test_preds)))
print('Recall: {}'.format(recall_score(y_test, gbt_test_preds)))
print('F1: {}'.format(f1_score(y_test, gbt_test_preds)))

Train:
Accuracy: 0.8481891976272244
Recall: 0.9120355916328442
F1: 0.8573000733675715
Test:
Accuracy: 0.7684512007050011
Recall: 0.7586206896551724
F1: 0.29510395707578807


In [28]:
confusion_matrix(y_test, gbt_test_preds)

array([[3268,  981],
       [  70,  220]])

The base score is similar to the tuned models from other types of models. In Gradient Boosting the accuracy is slightly higher than the recall score. There is some overfitting in the model and the majority of the error is in the prediction of false positives as seen in all previous models. The f1 score however, is slightly higher than the previous models.

To increase the metric scores, GridSearchCV will be used to tune the parameters of gradient boosting and to validate the results.

### Tuning Parameters - GridSearchCV

The three parameters that will be tuned are `n_estimators`, `learning_rate` and `max_depth`. The default values for these are 100, 0.1, and 3 respectively.

In [21]:
para_gbt1 = {
    'n_estimators' : [75, 100, 125],
    'learning_rate' : [0.1, 0.5, .75, 1.0, 1.25, 1.5],
    'max_depth' : [2, 3, 4, 5]    
}

In [22]:
gbt_clf1 = GradientBoostingClassifier(random_state = 0)
gbt_model1 = GridSearchCV(gbt_clf1, para_gbt1, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)
gbt_model1.fit(X_train, y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   32.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 22.8min finished


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(random_state=0),
             n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.5, 0.75, 1.0, 1.25, 1.5],
                         'max_depth': [2, 3, 4, 5],
                         'n_estimators': [75, 100, 125]},
             scoring='f1', verbose=1)

In [23]:
print(gbt_model1.best_score_)
print(gbt_model1.best_params_)
print(gbt_model1.best_estimator_)

0.9728599745423188
{'learning_rate': 1.5, 'max_depth': 5, 'n_estimators': 125}
GradientBoostingClassifier(learning_rate=1.5, max_depth=5, n_estimators=125,
                           random_state=0)


For the best parameters, the `max_depth` and `n_estimators` are both at the highest range.

In [24]:
gbt_train_preds1 = gbt_model1.best_estimator_.predict(X_train)
gbt_test_preds1 = gbt_model1.best_estimator_.predict(X_test)

In [27]:
print('Train:')
print('Accuracy: {}'.format(accuracy_score(y_train, gbt_train_preds1)))
print('Recall: {}'.format(recall_score(y_train, gbt_train_preds1)))
print('F1: {}'.format(f1_score(y_train, gbt_train_preds1)))
print('Test:')
print('Accuracy: {}'.format(accuracy_score(y_test, gbt_test_preds1)))
print('Recall: {}'.format(recall_score(y_test, gbt_test_preds1)))
print('F1: {}'.format(f1_score(y_test, gbt_test_preds1)))

Train:
Accuracy: 1.0
Recall: 1.0
F1: 1.0
Test:
Accuracy: 0.9041639127561136
Recall: 0.3
F1: 0.28571428571428564


In [26]:
confusion_matrix(y_test, gbt_test_preds1)

array([[4017,  232],
       [ 203,   87]])

The training score is much higher than default model, however the except for accuracy, the other metric scores are much lower. The model is highly overfit to the training data. The most likely culprit for this is the `max_depth` parameter. The higher the number of splits, the better the model will do on the training data, but it will also increase the overfitting of the model. The best parameter for the `max_depth` will usually be the max number in the range. So for the next tuning, we will decrease the possible `max_depth`.

### Tuning Parameters - GridSearchCV 2

In [48]:
para_gbt2 = {
    'n_estimators' : [200, 250, 275],
    'learning_rate' : [0.1, 0.5, .75, 1.0, 1.25, 1.5],
    'max_depth' : [3]    
}

In [53]:
gbt_clf2 = GradientBoostingClassifier(random_state = 0)
gbt_model2 = GridSearchCV(gbt_clf2, para_gbt2, cv = 10, scoring = 'recall', verbose = 1, n_jobs = -1)
gbt_model2.fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 11.8min finished


GridSearchCV(cv=10, estimator=GradientBoostingClassifier(random_state=0),
             n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.5, 0.75, 1.0, 1.25, 1.5],
                         'max_depth': [3], 'n_estimators': [200, 250, 275]},
             scoring='recall', verbose=1)

In [54]:
print(gbt_model2.best_score_)
print(gbt_model2.best_params_)
print(gbt_model2.best_estimator_)

1.0
{'learning_rate': 0.75, 'max_depth': 3, 'n_estimators': 250}
GradientBoostingClassifier(learning_rate=0.75, n_estimators=250, random_state=0)


In [55]:
gbt_train_preds2 = gbt_model2.best_estimator_.predict(X_train)
gbt_test_preds2 = gbt_model2.best_estimator_.predict(X_test)

In [56]:
print('Train:')
print('Accuracy: {}'.format(accuracy_score(y_train, gbt_train_preds2)))
print('Recall: {}'.format(recall_score(y_train, gbt_train_preds2)))
print('F1: {}'.format(f1_score(y_train, gbt_train_preds2)))
print('Test:')
print('Accuracy: {}'.format(accuracy_score(y_test, gbt_test_preds2)))
print('Recall: {}'.format(recall_score(y_test, gbt_test_preds2)))
print('F1: {}'.format(f1_score(y_test, gbt_test_preds2)))

Train:
Accuracy: 0.9908289103965032
Recall: 1.0
F1: 0.9909122549209173
Test:
Accuracy: 0.8960123375192773
Recall: 0.36551724137931035
F1: 0.30994152046783624
