# Modeling with Upsampling

Import necessary modules and metrics.

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 300)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import resample

Import the California Wildfires dataset.

In [3]:
df = pd.read_csv('data/california_wildfires.csv')

## Creating Dummy Variables for Categorical Features

There are two features that are categorical. The counties and the month of the year column that we engineered.

In [4]:
# Create dummy variables for the county column
counties = pd.get_dummies(df.county, drop_first = True)
# Drop county column along with unnecessary columns (Unnamed columns, year, and acres burned)
df2 = df.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1', 'county', 'year', 'acres_burned'], axis = 1)

In [5]:
# Feature engineer month column from the date column
df2['month'] = pd.DatetimeIndex(df2['date']).month
# Drop the date column
df2.drop(columns = ['date'], axis = 1, inplace = True)

In [6]:
# Create dummy variables for the months
month = pd.get_dummies(df2.month, drop_first = True)
# Drop the month column
df2.drop(columns = 'month', axis =1, inplace = True)

In [7]:
# Combine the original dataframe with the dummy variables
df2 = pd.concat([df2, counties, month], axis = 1)

In [8]:
df2.head()

Unnamed: 0,fire_started,Alfalfa & Hay_acres,Alfalfa & Hay_percentage,Almonds_acres,Almonds_percentage,Barren_acres,Barren_percentage,Corn_acres,Corn_percentage,Cotton_acres,Cotton_percentage,Deciduous Forest_acres,Deciduous Forest_percentage,Evergreen Forest_acres,Evergreen Forest_percentage,Fallow_acres,Fallow_percentage,Fruit Trees_acres,Fruit Trees_percentage,Grain Crops_acres,Grain Crops_percentage,Grapes_acres,Grapes_percentage,Grassland_acres,Grassland_percentage,High Intensity Developed_acres,High Intensity Developed_percentage,Low Intensity Developed_acres,Low Intensity Developed_percentage,Mixed Forest_acres,Mixed Forest_percentage,Other Ocean/Mexico_acres,Other Ocean/Mexico_percentage,Other Tree Crops_acres,Other Tree Crops_percentage,Other_acres,Other_percentage,Rice_acres,Rice_percentage,Shrubland_acres,Shrubland_percentage,Tomatoes_acres,Tomatoes_percentage,Vegs & Fruits_acres,Vegs & Fruits_percentage,Walnuts_acres,Walnuts_percentage,Water_acres,Water_percentage,Wetlands_acres,Wetlands_percentage,Winter Wheat_acres,Winter Wheat_percentage,max_elevation,min_elevation,Avg Air Temp (F)_Weekly,Avg Rel Hum (%)_Weekly,Avg Wind Speed (mph)_Weekly,Dew Point (F)_Weekly,Max Air Temp (F)_Weekly,Max Rel Hum (%)_Weekly,Min Air Temp (F)_Weekly,Min Rel Hum (%)_Weekly,Precip (in)_Weekly,Avg Air Temp (F)_month,Avg Rel Hum (%)_month,Avg Wind Speed (mph)_month,Dew Point (F)_month,Max Air Temp (F)_month,Max Rel Hum (%)_month,Min Air Temp (F)_month,Min Rel Hum (%)_month,Precip (in)_month,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,Del Norte,El Dorado,Fresno,Glenn,Humboldt,Imperial,Inyo,Kern,Kings,Lake,Lassen,Los Angeles,Madera,Marin,Mariposa,Mendocino,Merced,Modoc,Mono,Monterey,Napa,Nevada,Orange,Placer,Plumas,Riverside,Sacramento,San Benito,San Bernardino,San Diego,San Francisco,San Joaquin,San Luis Obispo,San Mateo,Santa Barbara,Santa Clara,Santa Cruz,Shasta,Sierra,Siskiyou,Solano,Sonoma,Stanislaus,Sutter,Tehama,Trinity,Tulare,Tuolumne,Ventura,Yolo,Yuba,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1102.856805,0.300074,4.225505,0.00115,194.595625,0.052947,4.670295,0.001271,0.0,0.0,5.33748,0.001452,7838.756565,2.132827,1536.74945,0.41813,1.77916,0.000484,991.214515,0.269697,3722.44751,1.012831,153671.38668,41.812059,28431.42159,7.735834,39470.886995,10.739534,74885.956375,20.375531,0.0,0.0,8.673405,0.00236,0.0,0.0,0.88958,0.000242,30958.051185,8.423298,4.670295,0.001271,164.12751,0.044657,4.670295,0.001271,19403.51896,5.279454,4497.494085,1.223712,624.48516,0.169915,1242,-42,44.214286,82.785714,2.392857,39.321429,54.157143,96.5,35.771429,60.785714,0.095714,45.506897,78.189655,2.915517,38.932759,55.896552,95.448276,35.725862,55.810345,0.130172,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,189.03575,0.04008,0.0,0.0,15482.472715,3.28265,0.0,0.0,0.0,0.0,194.595625,0.041259,195088.00753,41.363269,0.44479,9.4e-05,0.222395,4.7e-05,0.0,0.0,0.0,0.0,5644.82989,1.196837,121.42767,0.025746,3192.480225,0.676881,0.667185,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247783.390805,52.535935,0.0,0.0,0.0,0.0,0.0,0.0,2650.50361,0.561969,1297.45243,0.275091,0.0,0.0,3556,1442,29.657143,76.514286,3.228571,21.328571,34.428571,91.857143,22.857143,55.428571,0.0,30.789655,68.162069,4.968966,19.6,39.344828,86.0,22.758621,46.344828,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,1326.80857,0.41429,16.679625,0.005208,1873.01069,0.58484,242.632945,0.075761,0.0,0.0,17190.911105,5.367789,114386.866695,35.71681,168.13062,0.052498,12.00933,0.00375,120.0933,0.037499,2587.34343,0.807887,112912.61024,35.25648,440.119705,0.137425,8263.975805,2.580391,1727.11957,0.539286,0.0,0.0,1.33437,0.000417,0.0,0.0,1.111975,0.000347,52457.865415,16.379744,0.0,0.0,1.77916,0.000556,122.094855,0.038124,5822.74589,1.818128,105.86002,0.033054,479.48362,0.149717,3121,43,34.114286,83.571429,3.157143,29.585714,40.071429,96.0,27.757143,66.571429,0.141429,34.289655,76.724138,3.606897,27.410345,41.2,93.172414,27.768966,58.310345,0.155517,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.0,3777.15668,0.374865,46196.556585,4.584787,1869.45237,0.185534,2023.34971,0.200808,9.118195,0.000905,33181.556395,3.293111,408193.790775,40.511281,56434.51041,5.600855,10563.31771,1.048359,2628.041715,0.260821,247.525635,0.024566,170758.216925,16.946936,4421.65739,0.438828,25520.048645,2.532743,165.684275,0.016443,0.0,0.0,790.39183,0.078443,0.0,0.0,105624.2813,10.482705,55372.129495,5.495419,94.29548,0.009358,469.475845,0.046593,42057.340845,4.17399,21360.372565,2.119915,11589.89303,1.150241,4257.08509,0.422495,2192,-1,40.985714,81.285714,3.142857,35.557143,50.114286,91.285714,32.171429,62.857143,0.117143,42.389655,77.448276,3.848276,35.586207,52.455172,88.965517,33.365517,58.862069,0.175517,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.0,31.802485,0.005011,28.46656,0.004485,218.39189,0.034409,0.88958,0.00014,0.0,0.0,34882.878145,5.495994,255438.00431,40.245698,28.688955,0.00452,12.231725,0.001927,2.22395,0.00035,522.18346,0.082273,207502.763615,32.69323,465.027945,0.073268,12257.745215,1.931277,3351.270255,0.528012,0.0,0.0,0.88958,0.00014,0.0,0.0,0.0,0.0,106091.088405,16.715249,0.0,0.0,0.667185,0.000105,425.66403,0.067066,13178.68291,2.076376,245.07929,0.038614,11.786935,0.001857,3522,787,41.928571,93.014286,5.657143,39.0,50.142857,100.0,35.571429,74.142857,0.0,42.931034,87.017241,6.268966,37.196552,52.827586,97.551724,34.344828,61.275862,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Upsampling Minority Class

There is a high class imbalance in the target variable. As seen in the EDA, the grand majority of the target variable are instances of no wildfire. In this case, we use upsampling the instances of wildfire to resolve the class imbalance issue.

In [9]:
# Split the target variable by class into two dataframes
no_fire = df2[df2.fire_started == 0] # 0 = No Wildfire
fire = df2[df2.fire_started == 1] # 1 = Wildfire

In [10]:
# Resample the minority class (wildfire)
fire_resample = resample(fire,
                          replace=True, # sample with replacement
                          n_samples=no_fire.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [11]:
# Create a new dataframe combining the target classes
resampled_df = pd.concat([no_fire, fire_resample])

In [12]:
# Double-check the different target classes
resampled_df.fire_started.value_counts()

1.0    17061
0.0    17061
Name: fire_started, dtype: int64

## Train-Test Split

We want to split our resampled data into a training dataset and a test dataset. We should do the same for the original class imbalance to test that models have a similar metric when predicting from the original dataset.

In [13]:
# Split resampled dataset into target variable and features
y = resampled_df.fire_started
X = resampled_df.drop(columns = ['fire_started'], axis = 1)

In [14]:
# Split original dataset into target variable and features
y2 = df2.fire_started
X2 = df2.drop(columns = ['fire_started'], axis = 1)

In [15]:
# Use train_test_split to create a training dataset and test dataset
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,  y2, test_size = 0.25, random_state = 0)

In [16]:
# Use train_test_split to create a training dataset and test dataset
X_train, X_test, y_train, y_test = train_test_split(X,  y, test_size = 0.25, random_state = 0)

## Logistic Regression

The first type of model we tried was logistic regression

### Base Model

For our base model, we run the resampled training data through a logistic regression model with default settings

In [21]:
# Instantiate a logistic regression model
logreg = LogisticRegression(random_state = 0) # random state for consistant results
# Train model on resampled training data
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)

In [22]:
# Use the model to predict the target variable on the training dataset
y_hat_train = logreg.predict(X_train)
# Use the model to predict the target variable on the test dataset
y_hat_test = logreg.predict(X_test)

In [23]:
# Import metrics used
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix
# Print the f1 score metric on both the training and test predictions to check for overfitting
print(f1_score(y_train, y_hat_train), f1_score(y_test, y_hat_test))
# Print the accuracy score metric on both the training and test predictions to check for overfitting
print(accuracy_score(y_train, y_hat_train), accuracy_score(y_test, y_hat_test))
# Print the recall score metric on both the training and test predictions to check for overfitting
print(recall_score(y_train, y_hat_train), recall_score(y_test, y_hat_test))

0.6618738558828424 0.6610761705101327
0.6030245008010628 0.6020396202086508
0.7786217697729052 0.7716150081566069


The results of our metrics show that the recall and f1 score are both higher than the accuracy score, which is unusual. We use a confusion matrix to find the false positive and false negative values

In [53]:
confusion_matrix(y_test, y_hat_test)

array([[1825, 2415],
       [ 980, 3311]])

From the confusion matrix, there is a high number of false positives.

### Model - Scaled Data

In our first iteration we want to check how normalizing the features will change our score. Due to the resampled data already having class balance, we don't have to change anything to do with `class_weight`. To normalize our data, we will use a Standard Scaler.

In [54]:
# Insantiate the StandardScaler()
ss = StandardScaler()
# Fit the feature training data
ss.fit(X_train)
ss.fit(X_train2)

# Transform both the training and test features
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)
X_train_scaled2 = ss.transform(X_train2)
X_test_scaled2 = ss.transform(X_test2)

In [55]:
# Instantiate a new logistic regression model
logreg1 = LogisticRegression(solver = 'liblinear')
# Fit the data to the new scaled data
logreg1.fit(X_train_scaled, y_train)

LogisticRegression(solver='liblinear')

In [56]:
# Use model to predict target variable on the training dataset
y_hat_train1 = logreg1.predict(X_train_scaled)
# Use model to predict target variable on the test dataset
y_hat_test1 = logreg1.predict(X_test_scaled)

In [57]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print(f1_score(y_train, y_hat_train1), f1_score(y_test, y_hat_test1))
# Print the accuracy score metric on both the training and test predictions to check for overfitting
print(accuracy_score(y_train, y_hat_train1), accuracy_score(y_test, y_hat_test1))
# Print the recall score metric on both the training and test predictions to check for overfitting
print(recall_score(y_train, y_hat_train1), recall_score(y_test, y_hat_test1))

0.810233592880979 0.8075346260387812
0.8000078152475479 0.7963896377915837
0.8555990602975725 0.8492192962013516


The results from our metrics show an increase in score across the board. However, the f1 and recall score are still slightly better than the accuracy score. We check the confusion matrix next to check the value counts for false positive and false negatives.

In [58]:
confusion_matrix(y_test, y_hat_test1)

array([[3150, 1090],
       [ 647, 3644]])

The confusion matrix shows as in the previous model, the most error occurs from the model predicting false positives.

## K Nearest Neighbor (KNN) Models

The second type of model we used was K Nearest Neighbor

### Base KNN Model

For our base knn model, we chose k as 3. Due to how knn models function, the number of nearest neighbors should always be negative.

In [1]:
from sklearn.neighbors import KNeighborsClassifier

In [59]:
# Instantiate a knn model using 3 nearest neighbors
knn = KNeighborsClassifier(n_neighbors = 3)

In [60]:
# Fit knn model using the scaled data from the previous scaled logistic model
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=3)

In [61]:
# Predict target variable for both the train and test datasets.
knn_train = knn.predict(X_train_scaled)
knn_test = knn.predict(X_test_scaled)

In [62]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print(f1_score(y_train, knn_train), f1_score(y_test, knn_test))
# Print the sccuracy score metric on both the training and test predictions to check for overfitting
print(accuracy_score(y_train, knn_train), accuracy_score(y_test, knn_test))
# Print the recall score metric on both the training and test predictions to check for overfitting
print(recall_score(y_train, knn_train), recall_score(y_test, knn_test))

0.9695911316958354 0.9448420125509193
0.9686999335703959 0.9412730043371235
1.0 1.0


The results show that this model is strong in reducing false negatives. The errors we will see in the confusion matrix will be false positives.

In [63]:
confusion_matrix(y_test, knn_test)

array([[3739,  501],
       [   0, 4291]])

### Tuning Number of Nearest Neighbors

To tune the number (k) nearest neighbors, want to find the value of k that will return the max value for a given metric. In our base model, it was able to predict the target variable with no false negatives. Thus to gain the best model the metric we used to tune our model was the f1 score.

In [41]:
# Create a function to find the max f1 score and return the score along with the k value
def max_value(l):
    max_val = max(l)
    max_idx = l.index(max_val)
    return max_idx, max_val

In [42]:
# Create an empty list for f1 scores
k_scores = []
# Choose a range of k values to test
k_range = list(range(1, 21))
# Iterate through the different k values
for k in k_range:
    # Instantiate new knn model with k nearest neighbors
    knn = KNeighborsClassifier(n_neighbors = k)
    # Fit knn model on scaled training data
    knn.fit(X_train_scaled, y_train)
    # Use model to predict target variable on testing set
    y_pred = knn.predict(X_test_scaled)
    # Find the f1 score
    f1 = f1_score(y_test, y_pred)
    # Append f1 score to list of f1 scorees
    k_scores.append(f1)

# Find max f1 score
idx, val = max_value(k_scores)
# Print max f1 score and it corresponding k value
print(idx + 1, val)

1 0.9685137117706805


The best k value is 1 with an f1 score of 96%. We rerun the model using k=1 and check the recall and accuracy metrics of the model

In [43]:
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
knn_train1 = knn.predict(X_train_scaled)
print(f1_score(y_train, knn_train1), f1_score(y_test, y_pred))
print(accuracy_score(y_train, knn_train1), accuracy_score(y_test, y_pred))
print(recall_score(y_train, knn_train1), recall_score(y_test, y_pred))

1.0 0.9685137117706805
1.0 0.9672957449302544
1.0 1.0


This model has a slightly higher f1 and accuracy score than the base model and recall is still 100%.

In [44]:
confusion_matrix(y_test, y_pred)

array([[3961,  279],
       [   0, 4291]])

The error from this model is predicting false positives.

Next we want to check how the model does on the original dataset.

In [45]:
knn_pred_test = knn.predict(X_test_scaled2)
knn_pred_train = knn.predict(X_train_scaled2)
print(f1_score(y_train2, knn_pred_train), f1_score(y_test2, knn_pred_test))
print(accuracy_score(y_train2, knn_pred_train), accuracy_score(y_test2, knn_pred_test))
print(recall_score(y_train2, knn_pred_train), recall_score(y_test2, knn_pred_test))

0.8863134657836645 0.888208269525268
0.9848696290855674 0.9839171623705663
1.0 1.0


The training and test accuracy, and training f1 score did slightly worse than with our resampled data.

In [46]:
confusion_matrix(y_test2, knn_pred_test)

array([[4176,   73],
       [   0,  290]])

## Decision Tree

### Base Model

In [47]:
from sklearn.tree import DecisionTreeClassifier

In [48]:
dt = DecisionTreeClassifier(random_state = 0, class_weight = 'balanced')
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', random_state=0)

In [49]:
dt_train = dt.predict(X_train)
dt_test = dt.predict(X_test)

print(f1_score(y_train, dt_train), f1_score(y_test, dt_test))
print(accuracy_score(y_train, dt_train), accuracy_score(y_test, dt_test))
print(recall_score(y_train, dt_train), recall_score(y_test, dt_test))

1.0 0.9688417249943553
1.0 0.9676474035869183
1.0 1.0


In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
parameters = {'max_depth' : range(1, 21, 1), 'max_features' : range(55, 75, 1), 'min_samples_split' : range(15, 25, 1)}

In [52]:
dtg = DecisionTreeClassifier(random_state = 0, class_weight = 'balanced')
grid_model = GridSearchCV(dtg, parameters, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

grid_model.fit(X_train, y_train)

Fitting 10 folds for each of 4000 candidates, totalling 40000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 11218 tasks      

GridSearchCV(cv=10,
             estimator=DecisionTreeClassifier(class_weight='balanced',
                                              random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': range(1, 21),
                         'max_features': range(55, 75),
                         'min_samples_split': range(15, 25)},
             scoring='f1', verbose=1)

In [64]:
print(grid_model.best_score_)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

0.9403985380964534
{'max_depth': 20, 'max_features': 55, 'min_samples_split': 15}
DecisionTreeClassifier(class_weight='balanced', max_depth=20, max_features=55,
                       min_samples_split=15, random_state=0)


In [65]:
y_pred = grid_model.best_estimator_.predict(X_test)
print(f1_score(y_test, y_pred), accuracy_score(y_test, y_pred), recall_score(y_test, y_pred))
dt_train2 = grid_model.best_estimator_.predict(X_train)
print(f1_score(y_train, dt_train2), accuracy_score(y_train, dt_train2), recall_score(y_train, dt_train2))
confusion_matrix(y_test, y_pred)

0.938507070042749 0.9342398312038448 0.9976695408995572
0.9569065746540149 0.9551014028369349 0.998981989036805


array([[3689,  551],
       [  10, 4281]])

In [161]:
dtg_pred_test = grid_model.best_estimator_.predict(X_test2)
dtg_pred_train = grid_model.best_estimator_.predict(X_train2)
print(f1_score(y_train2, dtg_pred_train), f1_score(y_test2, dtg_pred_test))
print(accuracy_score(y_train2, dtg_pred_train), accuracy_score(y_test2, dtg_pred_test))
print(recall_score(y_train2, dtg_pred_train), recall_score(y_test2, dtg_pred_test))
confusion_matrix(y_test2, dtg_pred_test)

0.5848506919155134 0.5806451612903225
0.9162688211531399 0.9083498567966513
1.0 0.993103448275862


array([[3835,  414],
       [   2,  288]])

In [64]:
parameters2 = {'max_depth' : range(1, 15, 1), 'max_features' : range(45, 65, 1), 'min_samples_split' : range(10, 20, 1)}

In [65]:
dtg2 = DecisionTreeClassifier(random_state = 0, class_weight = 'balanced')
dtg2_model = GridSearchCV(dtg2, parameters2, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

dtg2_model.fit(X_train, y_train)

Fitting 10 folds for each of 2800 candidates, totalling 28000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 1376 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 2276 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 3376 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 4676 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 6176 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 7876 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 9776 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 11876 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 14176 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 16676 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 19376 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 22276 tasks 

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=0,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': range(1, 15),
                         'max_features': range(45, 65),
                         '

In [66]:
print(dtg2_model.best_score_)
print(dtg2_model.best_params_)
print(dtg2_model.best_estimator_)

0.284097496138901
{'max_depth': 14, 'max_features': 59, 'min_samples_split': 14}
DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=14,
                       max_features=59, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=14,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')


In [67]:
y_pred2 = dtg2_model.best_estimator_.predict(X_test)
print(f1_score(y_test, y_pred2), accuracy_score(y_test, y_pred2), recall_score(y_test, y_pred2))
dtg_train2 = grid_model.best_estimator_.predict(X_train)
print(f1_score(y_train, dtg_train2), accuracy_score(y_train, dtg_train2), recall_score(y_train, dtg_train2))
confusion_matrix(y_test, y_pred2)

0.2611731843575419 0.7669090107953294 0.6448275862068965
0.5128040973111395 0.8882115313991921 0.9975093399750934


array([[3294,  955],
       [ 103,  187]])

In [76]:
parameters3 = {'max_depth' : range(1, 10, 1), 'max_features' : range(35, 65, 1), 'min_samples_split' : range(14, 30, 1)}

In [77]:
dtg3 = DecisionTreeClassifier(random_state = 0, class_weight = 'balanced')
dtg3_model = GridSearchCV(dtg3, parameters3, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

dtg3_model.fit(X_train, y_train)

Fitting 10 folds for each of 4320 candidates, totalling 43200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 2404 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 4804 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 6304 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 8004 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 9904 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done 12004 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 14304 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 16804 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 19504 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 22404 tasks 

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=0,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': range(1, 10),
                         'max_features': range(35, 65),
                         '

In [78]:
print(dtg3_model.best_score_)
print(dtg3_model.best_params_)
print(dtg3_model.best_estimator_)

0.2675162538246359
{'max_depth': 9, 'max_features': 47, 'min_samples_split': 25}
DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=9,
                       max_features=47, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=25,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')


In [79]:
y_pred3 = dtg3_model.best_estimator_.predict(X_test)
print(f1_score(y_test, y_pred3), accuracy_score(y_test, y_pred3), recall_score(y_test, y_pred3))
dtg_train3 = grid_model.best_estimator_.predict(X_train)
print(f1_score(y_train, dtg_train3), accuracy_score(y_train, dtg_train3), recall_score(y_train, dtg_train3))
confusion_matrix(y_test, y_pred3)

0.2469135802469136 0.7177792465300727 0.7241379310344828
0.5128040973111395 0.8882115313991921 0.9975093399750934


array([[3048, 1201],
       [  80,  210]])

In [82]:
n = 100 # of iteration
# value intialization
train_recall_sum = 0
test_recall_sum = 0
# f1 is same as recall when using micro as average value

for i in range(0, n): 
    # new split
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    #predict on new split
    y_tree_train_pred = dtg3_model.best_estimator_.predict(X_train)
    y_tree_test_pred = dtg3_model.best_estimator_.predict(X_test)
    
    #calculate recall score on new prediction
    train_recall_sum += recall_score(y_train, y_tree_train_pred)
    test_recall_sum += recall_score(y_test, y_tree_test_pred)
    #f1
    # print("Predicted", i+1, "times") #sanity check

# output average    
print(f"Check for Overfitting with {n} iterations")
print("Decision Tree Train recall: ", train_recall_sum/n)
print("Decision Tree Test recall: ", test_recall_sum/n)

Check for Overfitting with 100 iterations
Decision Tree Train recall:  0.9114670787680388
Decision Tree Test recall:  0.9105734780901449


In [83]:
n = 100 # of iteration
# value intialization
train_accuracy_sum = 0
test_accuracy_sum = 0
# f1 is same as recall when using micro as average value

for i in range(0, n): 
    # new split
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    #predict on new split
    y_tree_train_pred = dtg3_model.best_estimator_.predict(X_train)
    y_tree_test_pred = dtg3_model.best_estimator_.predict(X_test)
    
    #calculate recall score on new prediction
    train_accuracy_sum += accuracy_score(y_train, y_tree_train_pred)
    test_accuracy_sum += accuracy_score(y_test, y_tree_test_pred)
    #f1
    # print("Predicted", i+1, "times") #sanity check

# output average    
print(f"Check for Overfitting with {n} iterations")
print("Decision Tree Train accuracy: ", train_accuracy_sum/n)
print("Decision Tree Test accuracy: ", test_accuracy_sum/n)

Check for Overfitting with 100 iterations
Decision Tree Train accuracy:  0.7462005141388174
Decision Tree Test accuracy:  0.7460850407578764


## Different Class Imbalance Method

In [163]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 2.4 MB/s eta 0:00:01
Collecting scikit-learn>=0.23
  Downloading scikit_learn-0.23.2-cp36-cp36m-macosx_10_9_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 4.6 MB/s eta 0:00:01     |█████████████████████████▎      | 5.7 MB 4.6 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.21.3
    Uninstalling scikit-learn-0.21.3:
      Successfully uninstalled scikit-learn-0.21.3
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0 scikit-learn-0.23.2 threadpoolctl-2.1.0


In [1]:
from imblearn.over_sampling import SMOTE