# Snow Data: Feature Engineering

I will be creating features that hold that past few days of data for each variable to hopefully increase model robustness

## Import Tools

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

# pandas
from pandas.plotting import scatter_matrix

In [2]:
avi = pd.read_csv('SnowWeatherModel.csv')
print(avi)

      Unnamed: 0  Unnamed: 0.1  Unnamed: 0_x  AVY_DANGER   AWND  SNOW  \
0              0             0             0         2.0  10.74   0.2   
1              1             1             1         1.0   9.40   0.1   
2              2             2             2         1.0  20.58   2.2   
3              3             3             3         3.0  35.12   2.0   
4              4             4             4         2.0  33.78   4.1   
...          ...           ...           ...         ...    ...   ...   
1352        1352          1352          1252         3.0    NaN   1.1   
1353        1353          1353          1253         3.0    NaN   0.0   
1354        1354          1354          1254         3.0    NaN   0.4   
1355        1355          1355          1255         2.0    NaN   0.7   
1356        1356          1356          1256         1.0    NaN   0.0   

      FIVE_DAY_SNOWFALL  TMAX_SWING  TMAX_SWING_FROM_AVE   WDF5  ...  day_y  \
0                   NaN         NaN         

## Dataset at a Glance

In [3]:
avi.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,AVY_DANGER,AWND,SNOW,FIVE_DAY_SNOWFALL,TMAX_SWING,TMAX_SWING_FROM_AVE,WDF5,...,day_y,prevailing_wind_E,prevailing_wind_N,prevailing_wind_NE,prevailing_wind_NW,prevailing_wind_S,prevailing_wind_SE,prevailing_wind_SW,prevailing_wind_W,prevailing_wind_na
count,1357.0,1357.0,1357.0,1254.0,1328.0,1356.0,1347.0,1354.0,1345.0,1328.0,...,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0,1357.0
mean,678.0,678.0,667.768607,2.087719,43.658607,1.528392,7.74833,0.044313,0.05026,268.185241,...,15.831245,0.016949,0.048637,0.021371,0.281503,0.064112,0.024318,0.06927,0.473839,0.0
std,391.876469,391.876469,373.368486,0.889035,16.464729,2.490871,6.359469,11.350798,13.005103,63.718759,...,8.647401,0.129128,0.215187,0.14467,0.449898,0.245043,0.154092,0.254007,0.499499,0.0
min,0.0,0.0,0.0,1.0,4.25,0.0,0.0,-44.0,-53.8,10.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,339.0,339.0,341.0,1.0,31.54,0.0,3.1,-5.0,-8.4,260.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,678.0,678.0,683.0,2.0,43.62,0.5,6.2,1.0,0.4,280.0,...,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1017.0,1017.0,1028.0,3.0,55.48,1.9,10.8,7.0,9.4,310.0,...,23.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1356.0,1356.0,1256.0,5.0,110.06,18.9,46.0,39.0,38.0,360.0,...,31.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [4]:
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1357 entries, 0 to 1356
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1357 non-null   int64  
 1   Unnamed: 0.1          1357 non-null   int64  
 2   Unnamed: 0_x          1357 non-null   int64  
 3   AVY_DANGER            1254 non-null   float64
 4   AWND                  1328 non-null   float64
 5   SNOW                  1356 non-null   float64
 6   FIVE_DAY_SNOWFALL     1347 non-null   float64
 7   TMAX_SWING            1354 non-null   float64
 8   TMAX_SWING_FROM_AVE   1345 non-null   float64
 9   WDF5                  1328 non-null   float64
 10  year_x                1357 non-null   float64
 11  month_x               1357 non-null   float64
 12  day_x                 1357 non-null   float64
 13  Unnamed: 0_y          1357 non-null   int64  
 14  temp_max              1357 non-null   int64  
 15  temp_min             

## Filter Dataset

In [5]:
avi = avi.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)

In [6]:
avi = avi[avi['AVY_DANGER'].notnull()]
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1254 entries, 0 to 1356
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   AWND                  1225 non-null   float64
 2   SNOW                  1253 non-null   float64
 3   FIVE_DAY_SNOWFALL     1244 non-null   float64
 4   TMAX_SWING            1251 non-null   float64
 5   TMAX_SWING_FROM_AVE   1242 non-null   float64
 6   WDF5                  1225 non-null   float64
 7   year_x                1254 non-null   float64
 8   month_x               1254 non-null   float64
 9   day_x                 1254 non-null   float64
 10  temp_max              1254 non-null   int64  
 11  temp_min              1254 non-null   int64  
 12  water_equivalent      1254 non-null   float64
 13  snow_fall             1254 non-null   float64
 14  snow_depth_6am        1254 non-null   float64
 15  wind_speed_sum       

In [7]:
avi.drop('FIVE_DAY_SNOWFALL', axis=1, inplace=True)
avi.drop('SNOW', axis=1, inplace=True)
avi.drop('WDF5', axis=1, inplace=True)
avi.drop('AWND', axis=1, inplace=True)
avi.drop('year_y', axis=1, inplace=True)
avi.drop('month_y', axis=1, inplace=True)
avi.drop('day_y', axis=1, inplace=True)

In [8]:
avi['TMAX_SWING'].fillna(0, inplace=True)
avi['TMAX_SWING_FROM_AVE'].fillna(0, inplace=True)
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1254 entries, 0 to 1356
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   TMAX_SWING            1254 non-null   float64
 2   TMAX_SWING_FROM_AVE   1254 non-null   float64
 3   year_x                1254 non-null   float64
 4   month_x               1254 non-null   float64
 5   day_x                 1254 non-null   float64
 6   temp_max              1254 non-null   int64  
 7   temp_min              1254 non-null   int64  
 8   water_equivalent      1254 non-null   float64
 9   snow_fall             1254 non-null   float64
 10  snow_depth_6am        1254 non-null   float64
 11  wind_speed_sum        1254 non-null   int64  
 12  sunshine_percent      1254 non-null   int64  
 13  west_wind_hours       1254 non-null   int64  
 14  northwest_wind_hours  1254 non-null   int64  
 15  prevailing_wind_E    

## Reset Dataframe Index

In [9]:
avi.reset_index(inplace=True)
avi.drop(columns=['index'], inplace=True) # Drop previous index created as column after redet_index() called
print(avi.index)

RangeIndex(start=0, stop=1254, step=1)


## Make 3-day and 5-day snowfall variables

In [50]:
print(avi.index)

RangeIndex(start=0, stop=1254, step=1)


In [10]:
three_day_snow = []

for day in range(0, avi.shape[0]):
    if (day == 0):
        three_day_snow.insert(day, avi['snow_fall'][day])
    elif (day == 1):
        three_day_snow.insert(day, avi['snow_fall'][day] + avi['snow_fall'][day - 1])
    else:
        three_day_snow.insert(day, avi['snow_fall'][day] + avi['snow_fall'][day - 1] + avi['snow_fall'][day - 2])

# Format elements
three_day_snow = ["%.2f" % row for row in three_day_snow]

In [11]:
avi['three_day_snow'] = three_day_snow

In [12]:
print(avi.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1254 entries, 0 to 1253
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   TMAX_SWING            1254 non-null   float64
 2   TMAX_SWING_FROM_AVE   1254 non-null   float64
 3   year_x                1254 non-null   float64
 4   month_x               1254 non-null   float64
 5   day_x                 1254 non-null   float64
 6   temp_max              1254 non-null   int64  
 7   temp_min              1254 non-null   int64  
 8   water_equivalent      1254 non-null   float64
 9   snow_fall             1254 non-null   float64
 10  snow_depth_6am        1254 non-null   float64
 11  wind_speed_sum        1254 non-null   int64  
 12  sunshine_percent      1254 non-null   int64  
 13  west_wind_hours       1254 non-null   int64  
 14  northwest_wind_hours  1254 non-null   int64  
 15  prevailing_wind_E    

In [28]:
five_day_snow = []

for day in range(0, avi.shape[0]):
    if (day == 0):
        five_day_snow.insert(day, avi['snow_fall'][day])
    elif (day == 1):
        five_day_snow.insert(day, avi['snow_fall'][day] + avi['snow_fall'][day - 1])
    elif (day == 2):
        five_day_snow.insert(day, avi['snow_fall'][day] + avi['snow_fall'][day - 1] + avi['snow_fall'][day - 2])
    elif (day == 3):
        five_day_snow.insert(day, avi['snow_fall'][day] + avi['snow_fall'][day - 1] + avi['snow_fall'][day - 2] + \
                                    avi['snow_fall'][day - 3])
    else:
        five_day_snow.insert(day, avi['snow_fall'][day] + avi['snow_fall'][day - 1] + avi['snow_fall'][day - 2] + \
                                    avi['snow_fall'][day - 3] + avi['snow_fall'][day - 4])

# Format elements
five_day_snow = ["%.2f" % row for row in five_day_snow]

In [29]:
avi['five_day_snow'] = five_day_snow

In [30]:
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1254 entries, 0 to 1253
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   TMAX_SWING            1254 non-null   float64
 2   TMAX_SWING_FROM_AVE   1254 non-null   float64
 3   year_x                1254 non-null   float64
 4   month_x               1254 non-null   float64
 5   day_x                 1254 non-null   float64
 6   temp_max              1254 non-null   int64  
 7   temp_min              1254 non-null   int64  
 8   water_equivalent      1254 non-null   float64
 9   snow_fall             1254 non-null   float64
 10  snow_depth_6am        1254 non-null   float64
 11  wind_speed_sum        1254 non-null   int64  
 12  sunshine_percent      1254 non-null   int64  
 13  west_wind_hours       1254 non-null   int64  
 14  northwest_wind_hours  1254 non-null   int64  
 15  prevailing_wind_E    

## Creating Features for Previous Days

In [32]:
prev_days_features = avi
print(prev_days_features.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1254 entries, 0 to 1253
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   TMAX_SWING            1254 non-null   float64
 2   TMAX_SWING_FROM_AVE   1254 non-null   float64
 3   year_x                1254 non-null   float64
 4   month_x               1254 non-null   float64
 5   day_x                 1254 non-null   float64
 6   temp_max              1254 non-null   int64  
 7   temp_min              1254 non-null   int64  
 8   water_equivalent      1254 non-null   float64
 9   snow_fall             1254 non-null   float64
 10  snow_depth_6am        1254 non-null   float64
 11  wind_speed_sum        1254 non-null   int64  
 12  sunshine_percent      1254 non-null   int64  
 13  west_wind_hours       1254 non-null   int64  
 14  northwest_wind_hours  1254 non-null   int64  
 15  prevailing_wind_E    

In [34]:
prev_days_features.drop(columns=['year_x', 'month_x', 'day_x'], inplace=True)
print(prev_days_features.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1254 entries, 0 to 1253
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AVY_DANGER            1254 non-null   float64
 1   TMAX_SWING            1254 non-null   float64
 2   TMAX_SWING_FROM_AVE   1254 non-null   float64
 3   temp_max              1254 non-null   int64  
 4   temp_min              1254 non-null   int64  
 5   water_equivalent      1254 non-null   float64
 6   snow_fall             1254 non-null   float64
 7   snow_depth_6am        1254 non-null   float64
 8   wind_speed_sum        1254 non-null   int64  
 9   sunshine_percent      1254 non-null   int64  
 10  west_wind_hours       1254 non-null   int64  
 11  northwest_wind_hours  1254 non-null   int64  
 12  prevailing_wind_E     1254 non-null   int64  
 13  prevailing_wind_N     1254 non-null   int64  
 14  prevailing_wind_NE    1254 non-null   int64  
 15  prevailing_wind_NW   

In [None]:
avi_columns = ['AVY_DANGER_0', 'AVY_DANGER_1', 'AVY_DANGER_2', 'TMAX_SWING_0', 'TMAX_SWING_1', 'TMAX_SWING_2']

for column in range(0, prev_days_features.shape[1]):
    name = prev_days_features.columns[column]
    prev_days_features[str(name + '_1')] = []