### Danger Level Forecasting Preliminary Models

Testing out some of the classification models

#### Import Tools

In [70]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

#### Import Data

In [71]:
avi = pd.read_csv('snowweatheModel.csv')
avi.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,AVY_DANGER,AWND,SNOW,FIVE_DAY_SNOWFALL,TMAX_SWING,TMAX_SWING_FROM_AVE,WDF5,...,day_y,prevailing_wind_E,prevailing_wind_N,prevailing_wind_NE,prevailing_wind_NW,prevailing_wind_S,prevailing_wind_SE,prevailing_wind_SW,prevailing_wind_W,prevailing_wind_na
0,0,0,0,2.0,10.74,0.2,,,,320.0,...,18,0,0,0,1,0,0,0,0,0
1,1,1,1,1.0,9.4,0.1,,3.0,,180.0,...,19,0,0,0,1,0,0,0,0,0
2,2,2,2,1.0,20.58,2.2,,0.0,,360.0,...,20,0,1,0,0,0,0,0,0,0
3,3,3,3,3.0,35.12,2.0,,3.0,,360.0,...,21,0,1,0,0,0,0,0,0,0
4,4,4,4,2.0,33.78,4.1,8.6,-3.0,,360.0,...,22,0,1,0,0,0,0,0,0,0


#### filter data

In [72]:
avi = avi.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)

In [73]:
avi = avi[avi['AVY_DANGER'].notnull()]
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1254 entries, 0 to 1356
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   AWND                  1225 non-null   float64
 2   SNOW                  1253 non-null   float64
 3   FIVE_DAY_SNOWFALL     1244 non-null   float64
 4   TMAX_SWING            1251 non-null   float64
 5   TMAX_SWING_FROM_AVE   1242 non-null   float64
 6   WDF5                  1225 non-null   float64
 7   year_x                1254 non-null   float64
 8   month_x               1254 non-null   float64
 9   day_x                 1254 non-null   float64
 10  temp_max              1254 non-null   int64  
 11  temp_min              1254 non-null   int64  
 12  water_equivalent      1254 non-null   float64
 13  snow_fall             1254 non-null   float64
 14  snow_depth_6am        1254 non-null   float64
 15  wind_speed_sum       

#### Fill Remaing NA's

In [74]:
avi = avi.fillna(0)
avi = avi[avi.AVY_DANGER != 5]
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1252 entries, 0 to 1356
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1252 non-null   float64
 1   AWND                  1252 non-null   float64
 2   SNOW                  1252 non-null   float64
 3   FIVE_DAY_SNOWFALL     1252 non-null   float64
 4   TMAX_SWING            1252 non-null   float64
 5   TMAX_SWING_FROM_AVE   1252 non-null   float64
 6   WDF5                  1252 non-null   float64
 7   year_x                1252 non-null   float64
 8   month_x               1252 non-null   float64
 9   day_x                 1252 non-null   float64
 10  temp_max              1252 non-null   int64  
 11  temp_min              1252 non-null   int64  
 12  water_equivalent      1252 non-null   float64
 13  snow_fall             1252 non-null   float64
 14  snow_depth_6am        1252 non-null   float64
 15  wind_speed_sum       

#### Splitting Dataset

In [75]:
x = avi.iloc[:, 1:avi.shape[1]]
y = avi.iloc[:, 0]

print(x.shape)
print(y.shape)

(1252, 30)
(1252,)


#### Standardizing

In [76]:
#standardizer 
def standardize(X_train, X_test):
    scaler = StandardScaler()
    # Fitting and transforming training data
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    # Tranforming testing data based on traning fit (prevent data leakage)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [77]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Scaling
x_train, x_test = standardize(x_train, x_test)

NameError: name 'StandardScaler' is not defined

In [78]:
print(y)

(unique, counts) = np.unique(y, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)


0       2.0
1       1.0
2       1.0
3       3.0
4       2.0
       ... 
1352    3.0
1353    3.0
1354    3.0
1355    2.0
1356    1.0
Name: AVY_DANGER, Length: 1252, dtype: float64
[[  1. 371.]
 [  2. 474.]
 [  3. 339.]
 [  4.  68.]]


### Descision Trees 

In [79]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              }


ds = DecisionTreeClassifier()
ds_cv = RandomizedSearchCV(estimator=ds, param_distributions=random_grid, n_iter=100, scoring='f1_weighted')
ds_cv.fit(x_train, y_train)
y_pred_ds = ds_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_ds))
print(cross_val_score(ds, x_train, y_train, cv=3))

0.5559105431309904
[0.53354633 0.53354633 0.50798722]


In [80]:
print(confusion_matrix(y_test, y_pred_ds))

[[64 13 19  1]
 [18 67 34  1]
 [13 18 39  5]
 [ 2  4 11  4]]


In [81]:

print(classification_report(y_test, y_pred_ds))

              precision    recall  f1-score   support

         1.0       0.66      0.66      0.66        97
         2.0       0.66      0.56      0.60       120
         3.0       0.38      0.52      0.44        75
         4.0       0.36      0.19      0.25        21

    accuracy                           0.56       313
   macro avg       0.51      0.48      0.49       313
weighted avg       0.57      0.56      0.56       313



### Random Forest

In [82]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


rf = RandomForestClassifier(class_weight = "balanced")
rf_cv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5, scoring='f1_weighted')
rf_cv.fit(x_train, y_train)
y_pred_rf = rf_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_rf))
print(cross_val_score(rf, x_train, y_train, cv=3))

0.6325878594249201
[0.61341853 0.61022364 0.60383387]


In [83]:
print(confusion_matrix(y_test, y_pred_rf))

[[76 12  9  0]
 [13 74 32  1]
 [ 4 20 41 10]
 [ 0  1 13  7]]


In [59]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

         1.0       0.83      0.74      0.78        87
         2.0       0.67      0.75      0.71       115
         3.0       0.56      0.57      0.56        86
         4.0       0.50      0.33      0.40        15

    accuracy                           0.67       303
   macro avg       0.64      0.60      0.61       303
weighted avg       0.68      0.67      0.67       303



### Extra Forest

In [60]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

et = ExtraTreesClassifier(class_weight = "balanced")
et_cv = RandomizedSearchCV(estimator=et, param_distributions=random_grid, n_iter=5, scoring='f1_weighted')
et_cv.fit(x_train, y_train)
y_pred_et = et_cv.predict(x_test)
print(accuracy_score(y_test, y_pred_et))
print(cross_val_score(rf, x_train, y_train, cv=3))

0.6336633663366337
[0.62706271 0.62046205 0.64686469]


In [61]:
print(confusion_matrix(y_test, y_pred_et))

[[54 25  8  0]
 [11 89 15  0]
 [ 6 32 48  0]
 [ 0  1 13  1]]


In [62]:
print(classification_report(y_test, y_pred_et))

              precision    recall  f1-score   support

         1.0       0.76      0.62      0.68        87
         2.0       0.61      0.77      0.68       115
         3.0       0.57      0.56      0.56        86
         4.0       1.00      0.07      0.12        15

    accuracy                           0.63       303
   macro avg       0.73      0.50      0.51       303
weighted avg       0.66      0.63      0.62       303

