# Avalanche Danger Level Forecast: Preliminary Modeling

I am going to plug out newly created model-ready dataset into a few out-of-the-box models

## Import Tools

In [76]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# pandas
from pandas.plotting import scatter_matrix

## Import Data

In [24]:
avi = pd.read_csv('snowweatheModel.csv')
print(avi)

      Unnamed: 0  Unnamed: 0.1  Unnamed: 0_x  AVY_DANGER   AWND  SNOW  \
0              0             0             0         2.0  10.74   0.2   
1              1             1             1         1.0   9.40   0.1   
2              2             2             2         1.0  20.58   2.2   
3              3             3             3         3.0  35.12   2.0   
4              4             4             4         2.0  33.78   4.1   
...          ...           ...           ...         ...    ...   ...   
1352        1352          1352          1252         3.0    NaN   1.1   
1353        1353          1353          1253         3.0    NaN   0.0   
1354        1354          1354          1254         3.0    NaN   0.4   
1355        1355          1355          1255         2.0    NaN   0.7   
1356        1356          1356          1256         1.0    NaN   0.0   

      FIVE_DAY_SNOWFALL  TMAX_SWING  TMAX_SWING_FROM_AVE   WDF5  ...  day_y  \
0                   NaN         NaN         

## Dataset at a Glance

In [25]:
avi.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,AVY_DANGER,AWND,SNOW,FIVE_DAY_SNOWFALL,TMAX_SWING,TMAX_SWING_FROM_AVE,WDF5,...,day_y,prevailing_wind_E,prevailing_wind_N,prevailing_wind_NE,prevailing_wind_NW,prevailing_wind_S,prevailing_wind_SE,prevailing_wind_SW,prevailing_wind_W,prevailing_wind_na
count,1357.0,1357.0,1357.0,1254.0,1328.0,1356.0,1347.0,1354.0,1345.0,1328.0,...,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0
mean,678.0,678.0,667.768607,2.087719,43.658607,1.528392,7.74833,0.044313,0.05026,268.185241,...,15.831245,0.016949,0.048637,0.021371,0.281503,0.064112,0.024318,0.06927,0.473839,0.0
std,391.876469,391.876469,373.368486,0.889035,16.464729,2.490871,6.359469,11.350798,13.005103,63.718759,...,8.647401,0.129128,0.215187,0.14467,0.449898,0.245043,0.154092,0.254007,0.499499,0.0
min,0.0,0.0,0.0,1.0,4.25,0.0,0.0,-44.0,-53.8,10.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,339.0,339.0,341.0,1.0,31.54,0.0,3.1,-5.0,-8.4,260.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,678.0,678.0,683.0,2.0,43.62,0.5,6.2,1.0,0.4,280.0,...,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1017.0,1017.0,1028.0,3.0,55.48,1.9,10.8,7.0,9.4,310.0,...,23.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1356.0,1356.0,1256.0,5.0,110.06,18.9,46.0,39.0,38.0,360.0,...,31.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [26]:
avi.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0_x', 'AVY_DANGER', 'AWND',
       'SNOW', 'FIVE_DAY_SNOWFALL', 'TMAX_SWING', 'TMAX_SWING_FROM_AVE',
       'WDF5', 'year_x', 'month_x', 'day_x', 'Unnamed: 0_y', 'temp_max',
       'temp_min', 'water_equivalent', 'snow_fall', 'snow_depth_6am',
       'wind_speed_sum', 'sunshine_percent', 'west_wind_hours',
       'northwest_wind_hours', 'year_y', 'month_y', 'day_y',
       'prevailing_wind_E', 'prevailing_wind_N', 'prevailing_wind_NE',
       'prevailing_wind_NW', 'prevailing_wind_S', 'prevailing_wind_SE',
       'prevailing_wind_SW', 'prevailing_wind_W', 'prevailing_wind_na'],
      dtype='object')

In [27]:
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1357 entries, 0 to 1356
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1357 non-null   int64  
 1   Unnamed: 0.1          1357 non-null   int64  
 2   Unnamed: 0_x          1357 non-null   int64  
 3   AVY_DANGER            1254 non-null   float64
 4   AWND                  1328 non-null   float64
 5   SNOW                  1356 non-null   float64
 6   FIVE_DAY_SNOWFALL     1347 non-null   float64
 7   TMAX_SWING            1354 non-null   float64
 8   TMAX_SWING_FROM_AVE   1345 non-null   float64
 9   WDF5                  1328 non-null   float64
 10  year_x                1357 non-null   float64
 11  month_x               1357 non-null   float64
 12  day_x                 1357 non-null   float64
 13  Unnamed: 0_y          1357 non-null   int64  
 14  temp_max              1357 non-null   int64  
 15  temp_min             

## Filter Dataset

In [34]:
avi = avi.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)

In [35]:
avi = avi[avi['AVY_DANGER'].notnull()]
avi.info()

      AVY_DANGER   AWND  SNOW  FIVE_DAY_SNOWFALL  TMAX_SWING  \
0            2.0  10.74   0.2                NaN         NaN   
1            1.0   9.40   0.1                NaN         3.0   
2            1.0  20.58   2.2                NaN         0.0   
3            3.0  35.12   2.0                NaN         3.0   
4            2.0  33.78   4.1                8.6        -3.0   
...          ...    ...   ...                ...         ...   
1352         3.0    NaN   1.1                9.5         2.0   
1353         3.0    NaN   0.0                9.2        -1.0   
1354         3.0    NaN   0.4                9.4         8.0   
1355         2.0    NaN   0.7               10.1        -5.0   
1356         1.0    NaN   0.0               10.1         5.0   

      TMAX_SWING_FROM_AVE   WDF5  year_x  month_x  day_x  ...  day_y  \
0                     NaN  320.0  2010.0     12.0   18.0  ...     18   
1                     NaN  180.0  2010.0     12.0   19.0  ...     19   
2              

## Fill Remaing NA's

In [41]:
avi = avi.fillna(0)
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1254 entries, 0 to 1356
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   AWND                  1254 non-null   float64
 2   SNOW                  1254 non-null   float64
 3   FIVE_DAY_SNOWFALL     1254 non-null   float64
 4   TMAX_SWING            1254 non-null   float64
 5   TMAX_SWING_FROM_AVE   1254 non-null   float64
 6   WDF5                  1254 non-null   float64
 7   year_x                1254 non-null   float64
 8   month_x               1254 non-null   float64
 9   day_x                 1254 non-null   float64
 10  temp_max              1254 non-null   int64  
 11  temp_min              1254 non-null   int64  
 12  water_equivalent      1254 non-null   float64
 13  snow_fall             1254 non-null   float64
 14  snow_depth_6am        1254 non-null   float64
 15  wind_speed_sum       

## Splitting Dataset

In [55]:
x = avi.iloc[:, 1:avi.shape[1]]
y = avi.iloc[:, 0]

print(x.shape)
print(y.shape)

(1254, 30)
(1254,)


In [56]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [57]:
print(y)

0       2.0
1       1.0
2       1.0
3       3.0
4       2.0
       ... 
1352    3.0
1353    3.0
1354    3.0
1355    2.0
1356    1.0
Name: AVY_DANGER, Length: 1254, dtype: float64


# Logistic Regression

In [58]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
print(accuracy_score(y_test, y_pred_lr))
print(cross_val_score(lr, x_train, y_train, cv=3))

0.535031847133758
[0.55414013 0.47284345 0.50798722]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
print(confusion_matrix(y_test, y_pred_lr))

[[61 21 10  0]
 [20 66 34  0]
 [ 4 37 41  0]
 [ 2  3 15  0]]


In [61]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

         1.0       0.70      0.66      0.68        92
         2.0       0.52      0.55      0.53       120
         3.0       0.41      0.50      0.45        82
         4.0       0.00      0.00      0.00        20

    accuracy                           0.54       314
   macro avg       0.41      0.43      0.42       314
weighted avg       0.51      0.54      0.52       314



  _warn_prf(average, modifier, msg_start, len(result))


# Support Vector Machine (SVM)

In [62]:
svm = SVC()
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)
print(accuracy_score(y_test, y_pred_svm))
print(cross_val_score(svm, x_train, y_train, cv=3))

0.3821656050955414
[0.37579618 0.37699681 0.37699681]




In [64]:
print(confusion_matrix(y_test, y_pred_svm))

[[  0  92   0   0]
 [  0 120   0   0]
 [  0  82   0   0]
 [  0  20   0   0]]


In [65]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        92
         2.0       0.38      1.00      0.55       120
         3.0       0.00      0.00      0.00        82
         4.0       0.00      0.00      0.00        20

    accuracy                           0.38       314
   macro avg       0.10      0.25      0.14       314
weighted avg       0.15      0.38      0.21       314



  _warn_prf(average, modifier, msg_start, len(result))


# K-Nearest Neighbors

In [69]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)
print(accuracy_score(y_test, y_pred_knn))
print(cross_val_score(knn, x_train, y_train, cv=3))

0.43630573248407645
[0.40764331 0.38658147 0.41533546]




In [72]:
print(confusion_matrix(y_test, y_pred_knn))

[[55 31  6  0]
 [43 58 19  0]
 [33 26 22  1]
 [ 3  8  7  2]]


In [73]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

         1.0       0.41      0.60      0.49        92
         2.0       0.47      0.48      0.48       120
         3.0       0.41      0.27      0.32        82
         4.0       0.67      0.10      0.17        20

    accuracy                           0.44       314
   macro avg       0.49      0.36      0.37       314
weighted avg       0.45      0.44      0.42       314



In [83]:
knn2 = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
knn_gscv.fit(x_train, y_train)
print(knn_gscv.best_params_)
print(knn_gscv.best_score_)



{'n_neighbors': 5}
0.43510638297872334


# Gradient Boosting Classifier

In [77]:
gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)
y_pred_gbc = gbc.predict(x_test)
print(accuracy_score(y_test, y_pred_gbc))
print(cross_val_score(gbc, x_train, y_train, cv=3))

0.6210191082802548




[0.61783439 0.59105431 0.61341853]


In [78]:
print(confusion_matrix(y_test, y_pred_gbc))

[[73 15  3  0  1]
 [15 80 25  0  0]
 [ 9 29 39  5  0]
 [ 0  1 16  3  0]
 [ 0  0  0  0  0]]


In [79]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

         1.0       0.41      0.60      0.49        92
         2.0       0.47      0.48      0.48       120
         3.0       0.41      0.27      0.32        82
         4.0       0.67      0.10      0.17        20

    accuracy                           0.44       314
   macro avg       0.49      0.36      0.37       314
weighted avg       0.45      0.44      0.42       314

