# Optimizing Models


#### Import Tools

In [16]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, \
    validation_curve
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, \
    precision_recall_curve, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

#### Import Data

In [17]:
avi = pd.read_csv('SnowWeatherCleanFE.csv')
avi.head()

Unnamed: 0.1,Unnamed: 0,avi_danger,avg_wind,temp_max_swing,temp_max_swing_from_avg,year,month,day,temp_max,temp_min,...,prevailing_wind_N_2,prevailing_wind_NE_2,prevailing_wind_NW_2,prevailing_wind_S_2,prevailing_wind_SE_2,prevailing_wind_SW_2,prevailing_wind_W_2,three_day_snow_2,five_day_snow_2,next_day_avi_danger
0,0,1.0,20.58,0.0,0.0,2010.0,12.0,20.0,15,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.2,0.2,3.0
1,1,3.0,35.12,3.0,0.0,2010.0,12.0,21.0,18,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.3,0.3,2.0
2,2,2.0,33.78,-3.0,0.0,2010.0,12.0,22.0,15,7,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,3.0
3,3,3.0,31.32,0.0,0.0,2010.0,12.0,23.0,15,6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,4.5,2.0
4,4,2.0,32.44,2.0,1.4,2010.0,12.0,24.0,17,9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3,8.6,2.0


In [18]:
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224 entries, 0 to 1223
Data columns (total 74 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 1224 non-null   int64  
 1   avi_danger                 1224 non-null   float64
 2   avg_wind                   1224 non-null   float64
 3   temp_max_swing             1224 non-null   float64
 4   temp_max_swing_from_avg    1224 non-null   float64
 5   year                       1224 non-null   float64
 6   month                      1224 non-null   float64
 7   day                        1224 non-null   float64
 8   temp_max                   1224 non-null   int64  
 9   temp_min                   1224 non-null   int64  
 10  water_equivalent           1224 non-null   float64
 11  snow_fall                  1224 non-null   float64
 12  snow_depth_6am             1224 non-null   float64
 13  wind_speed_sum             1224 non-null   int64

#### filter data

In [19]:
avi = avi.drop(['Unnamed: 0'], axis=1)

In [20]:
# avi = avi.drop(['prevailing_wind_E', 'prevailing_wind_N', 'prevailing_wind_NE', 'prevailing_wind_NW',
#                 'prevailing_wind_S', 'prevailing_wind_SE', 'prevailing_wind_SW', 'prevailing_wind_W',
#                 'prevailing_wind_E_1', 'prevailing_wind_N_1', 'prevailing_wind_NE_1', 'prevailing_wind_NW_1',
#                 'prevailing_wind_S_1', 'prevailing_wind_SE_1', 'prevailing_wind_SW_1', 'prevailing_wind_W_1',
#                 'prevailing_wind_E_2', 'prevailing_wind_N_2', 'prevailing_wind_NE_2', 'prevailing_wind_NW_2',
#                 'prevailing_wind_S_2', 'prevailing_wind_SE_2', 'prevailing_wind_SW_2', 'prevailing_wind_W_2'], axis=1)

#### Splitting Dataset

In [21]:
x = avi.iloc[:, 0:avi.shape[1]-1]
y = avi.iloc[:, avi.shape[1]-1]

print(x.shape)
print(y.shape)

(1224, 72)
(1224,)


In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#### Create Class Weights Dictionary

In [35]:
class_weight = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

In [36]:
weight_dict = {}
for class_num in range(0, len(class_weight)):
    weight_dict[class_num + 1] = class_weight[class_num]
print(weight_dict)

{1: 0.8861003861003861, 2: 0.6428571428571429, 3: 0.9405737704918032, 4: 3.956896551724138}


#### Standardizing Function

In [37]:
#standardizer 
def standardize(X_train, X_test):
    scaler = StandardScaler()
    # Fitting and transforming training data
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    # Tranforming testing data based on traning fit (prevent data leakage)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [38]:
x_train, x_test = standardize(x_train, x_test)

In [39]:
print(y_train)

(unique, counts) = np.unique(y_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

30      3.0
242     1.0
465     1.0
136     3.0
1031    1.0
       ... 
1044    2.0
1095    1.0
1130    1.0
860     1.0
1126    1.0
Name: next_day_avi_danger, Length: 918, dtype: float64
[[  1. 259.]
 [  2. 357.]
 [  3. 244.]
 [  4.  58.]]


### Random Forest w/ Class Weights

In [40]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


rf = RandomForestClassifier(class_weight=weight_dict)
rf_cv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5, scoring='f1_weighted')
rf_cv.fit(x_train, y_train)
y_pred_rf = rf_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_rf))
print(cross_val_score(rf, x_train, y_train, cv=3))

0.6274509803921569
[0.64052288 0.58496732 0.64052288]


In [124]:
print(rf_cv.best_params_)

{'n_estimators': 800, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}


In [125]:
print(confusion_matrix(y_test, y_pred_rf))

[[70 21  3  0]
 [11 79 21  1]
 [ 9 35 40  4]
 [ 0  1  8  3]]


In [126]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

         1.0       0.78      0.74      0.76        94
         2.0       0.58      0.71      0.64       112
         3.0       0.56      0.45      0.50        88
         4.0       0.38      0.25      0.30        12

    accuracy                           0.63       306
   macro avg       0.57      0.54      0.55       306
weighted avg       0.63      0.63      0.62       306



In [127]:
# train_scores, test_scores = validation_curve(RandomForestClassifier(rf_cv.best_params_),
#                                              x_train,
#                                              y_train,
#                                              param_name="n_estimators",
#                                              param_range=param_range,
#                                              cv=3,
#                                              scoring="accuracy",
#                                              n_jobs=-1)

In [128]:
# plt.plot(param_range, train_mean, label="Training score", color="black")
# plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")
# plt.title("Validation Curve With Random Forest")
# plt.xlabel("Number Of Trees")
# plt.ylabel("Accuracy Score")
# plt.tight_layout()
# plt.legend(loc="best")
# plt.show()

In [129]:
# train_scores, test_scores = validation_curve(RandomForestClassifier(rf_cv.best_params_),
#                                              x_train,
#                                              y_train,
#                                              param_name="max_depth",
#                                              param_range=param_range,
#                                              cv=3,
#                                              scoring="accuracy",
#                                              n_jobs=-1)

In [130]:
# plt.plot(param_range, train_mean, label="Training score", color="black")
# plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")
# plt.title("Validation Curve With Random Forest")
# plt.xlabel("Number Of Trees")
# plt.ylabel("Accuracy Score")
# plt.tight_layout()
# plt.legend(loc="best")
# plt.show()

### Random Forest w/ SMOTE

In [26]:
sm = SMOTE(random_state=12, sampling_strategy={4:250})
x_res, y_res = sm.fit_sample(x_train, y_train)



In [37]:
#smk = SMOTETomek(random_state=12, sampling_strategy=1.0)
#x_res, y_res = smk.fit_sample(x_train, y_train)

In [33]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


rf = RandomForestClassifier()
rf_cv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5, scoring='f1_weighted')
rf_cv.fit(x_res, y_res)
y_pred_rf = rf_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_rf))
print(cross_val_score(rf, x_train, y_train, cv=3))

0.6241830065359477
[0.62091503 0.58496732 0.6503268 ]


In [34]:
print(rf_cv.best_params_)

{'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 90, 'bootstrap': False}


In [35]:
print(confusion_matrix(y_test, y_pred_rf))

[[66 24  3  1]
 [10 88 13  1]
 [ 7 41 33  7]
 [ 0  4  4  4]]


In [36]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

         1.0       0.80      0.70      0.75        94
         2.0       0.56      0.79      0.65       112
         3.0       0.62      0.38      0.47        88
         4.0       0.31      0.33      0.32        12

    accuracy                           0.62       306
   macro avg       0.57      0.55      0.55       306
weighted avg       0.64      0.62      0.62       306



### Extra Forest

In [44]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

et = ExtraTreesClassifier(class_weight = weight_dict)
et_cv = RandomizedSearchCV(estimator=et, param_distributions=random_grid, n_iter=5, scoring='f1_weighted')
et_cv.fit(x_train, y_train)
y_pred_et = et_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_et))
print(cross_val_score(et, x_train, y_train, cv=3))

0.6405228758169934
[0.59150327 0.59477124 0.64052288]


In [45]:
print(et_cv.best_params_)

{'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}


In [46]:
print(confusion_matrix(y_test, y_pred_et))

[[71 19  4  0]
 [10 81 20  1]
 [10 31 42  5]
 [ 1  2  7  2]]


In [47]:
print(classification_report(y_test, y_pred_et))

              precision    recall  f1-score   support

         1.0       0.77      0.76      0.76        94
         2.0       0.61      0.72      0.66       112
         3.0       0.58      0.48      0.52        88
         4.0       0.25      0.17      0.20        12

    accuracy                           0.64       306
   macro avg       0.55      0.53      0.54       306
weighted avg       0.64      0.64      0.63       306



## Gradient Boosting Classifier

In [27]:
param_distributions = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 
                       'n_estimators':[100,250,500,750,1000,1250,1500,1750],
                       'max_depth':[2,3,4,5,6,7] }

gbc = GradientBoostingClassifier(n_estimators=500, max_depth=4, learning_rate=0.05)
# gbc_cv = RandomizedSearchCV(estimator=gbc, param_distributions= param_distributions, n_iter=5, scoring='f1_weighted')
gbc.fit(x_res, y_res)
y_pred_gbc = gbc.predict(x_test)
print(accuracy_score(y_test, y_pred_gbc))
print(cross_val_score(gbc, x_train, y_train, cv=3))

# print(gbc_cv.best_params_)
# {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.05}

0.6045751633986928
[0.61764706 0.58169935 0.61111111]


In [28]:
print(confusion_matrix(y_test, y_pred_gbc))

[[64 22  8  0]
 [ 9 78 22  3]
 [10 29 40  9]
 [ 1  3  5  3]]


In [29]:
print(classification_report(y_test, y_pred_gbc))

              precision    recall  f1-score   support

         1.0       0.76      0.68      0.72        94
         2.0       0.59      0.70      0.64       112
         3.0       0.53      0.45      0.49        88
         4.0       0.20      0.25      0.22        12

    accuracy                           0.60       306
   macro avg       0.52      0.52      0.52       306
weighted avg       0.61      0.60      0.60       306



### Decision trees with class weights balanced

In [44]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              }


ds = DecisionTreeClassifier(class_weight = weight_dict)
ds_cv = RandomizedSearchCV(estimator=ds, param_distributions=random_grid, n_iter=100, scoring='f1_weighted')
ds_cv.fit(x_train, y_train)
y_pred_ds = ds_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_ds))
print(cross_val_score(ds, x_train, y_train, cv=3))
print(ds_cv.best_params_)

0.5294117647058824
[0.49019608 0.50326797 0.47712418]
{'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}


In [45]:
print(confusion_matrix(y_test, y_pred_ds))

[[55 13 24  2]
 [11 57 41  3]
 [12 23 46  7]
 [ 1  6  1  4]]


In [46]:
print(classification_report(y_test, y_pred_ds))

              precision    recall  f1-score   support

         1.0       0.70      0.59      0.64        94
         2.0       0.58      0.51      0.54       112
         3.0       0.41      0.52      0.46        88
         4.0       0.25      0.33      0.29        12

    accuracy                           0.53       306
   macro avg       0.48      0.49      0.48       306
weighted avg       0.55      0.53      0.54       306



### Logistic Regression with class weights balanced

In [49]:
penalty = ['l2']
tol = [0.0001, 0.001, 0.01, 0.1]
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000] 
solver = ['newton-cg', 'lbfgs', 'sag']
param_distributions = dict(penalty=penalty,
                           tol=tol,
                           C=C,
                           solver = solver)


lr = LogisticRegression(class_weight = weight_dict, max_iter = 10000)
lr_cv = RandomizedSearchCV(estimator=lr, param_distributions= param_distributions, n_iter=5, scoring='f1_weighted')
lr_cv.fit(x_train, y_train)
y_pred_lr = lr_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_lr))
print(cross_val_score(lr, x_train, y_train, cv=3))
print(lr_cv.best_params_)

0.5522875816993464
[0.50653595 0.5620915  0.56535948]
{'tol': 0.0001, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.1}


### SVC with class weights balanced 

In [50]:
param_distributions = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  


svm = SVC(class_weight = weight_dict)
svm_cv = RandomizedSearchCV(estimator=svm, param_distributions= param_distributions, n_iter=5, scoring='f1_weighted')
svm_cv.fit(x_train, y_train)
y_pred_svm = svm_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_svm))
print(cross_val_score(svm, x_train, y_train, cv=3))
print(svm_cv.best_params_)

0.5620915032679739
[0.56862745 0.56862745 0.64705882]
{'kernel': 'rbf', 'gamma': 0.01, 'C': 10}


### KNN 

In [51]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
param_distributions = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

knn = KNeighborsClassifier()
knn_cv = RandomizedSearchCV(estimator=knn, param_distributions= param_distributions, n_iter=5, scoring='f1_weighted')
knn_cv.fit(x_train, y_train)
y_pred_knn = knn_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_knn))
print(cross_val_score(knn, x_train, y_train, cv=3))
print(knn_cv.best_params_)

0.5620915032679739
[0.53267974 0.51633987 0.54248366]
{'p': 1, 'n_neighbors': 25, 'leaf_size': 21}
