# Feature Engineering

I will be creating features that hold the past few days of data for each variable, to hopefully increase model robustness.

## Import Tools

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

# pandas
from pandas.plotting import scatter_matrix

In [2]:
# Load the cleaned snow/weather table. The path is relative, so the CSV is
# expected in the notebook's working directory.
avi = pd.read_csv('SnowWeatherClean.csv')
# Plain-text dump of the frame (pandas truncates it to the first/last rows)
print(avi)

      Unnamed: 0  avi_danger   avg_wind  temp_max_swing  \
0              0         2.0  10.740000             0.0   
1              1         1.0   9.400000             3.0   
2              2         1.0  20.580000             0.0   
3              3         3.0  35.120000             3.0   
4              4         2.0  33.780000            -3.0   
...          ...         ...        ...             ...   
1248        1248         1.0  27.125000            13.0   
1249        1249         3.0  28.500000             2.0   
1250        1250         3.0  11.833333            -1.0   
1251        1251         3.0  24.416667             8.0   
1252        1252         2.0  56.875000            -5.0   

      temp_max_swing_from_avg    year  month   day  temp_max  temp_min  ...  \
0                         0.0  2010.0   12.0  18.0        12         4  ...   
1                         0.0  2010.0   12.0  19.0        15         3  ...   
2                         0.0  2010.0   12.0  20.0    

## Dataset at a Glance

In [3]:
# Summary statistics per column: count, mean, std, min, quartiles, max
avi.describe()

Unnamed: 0.1,Unnamed: 0,avi_danger,avg_wind,temp_max_swing,temp_max_swing_from_avg,year,month,day,temp_max,temp_min,...,northwest_wind_hours,prevailing_wind_E,prevailing_wind_N,prevailing_wind_NE,prevailing_wind_NW,prevailing_wind_S,prevailing_wind_SE,prevailing_wind_SW,prevailing_wind_W,next_day_avi_danger
count,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,...,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0
mean,626.0,2.088587,43.398746,-0.03352,-0.056345,2015.58739,3.499601,16.616919,19.794094,3.831604,...,6.521149,0.014366,0.049481,0.018356,0.276137,0.066241,0.026337,0.072626,0.476457,2.087789
std,361.854252,0.888858,16.51287,11.28772,13.006894,2.960021,3.110999,8.432409,14.034897,14.988959,...,7.619898,0.11904,0.216957,0.134288,0.447264,0.248802,0.160199,0.259625,0.499645,0.889387
min,0.0,1.0,4.25,-44.0,-53.8,2010.0,1.0,1.0,-26.0,-40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,313.0,1.0,31.09,-5.0,-8.4,2013.0,2.0,10.0,11.0,-7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,626.0,2.0,43.4,1.0,0.4,2016.0,3.0,17.0,19.0,5.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,939.0,3.0,55.48,7.0,9.2,2018.0,4.0,24.0,30.0,13.0,...,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0
max,1252.0,5.0,110.06,39.0,34.6,2020.0,12.0,31.0,57.0,47.0,...,24.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0


In [4]:
# Column names, non-null counts and dtypes of the frame as loaded
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               1253 non-null   int64  
 1   avi_danger               1253 non-null   float64
 2   avg_wind                 1253 non-null   float64
 3   temp_max_swing           1253 non-null   float64
 4   temp_max_swing_from_avg  1253 non-null   float64
 5   year                     1253 non-null   float64
 6   month                    1253 non-null   float64
 7   day                      1253 non-null   float64
 8   temp_max                 1253 non-null   int64  
 9   temp_min                 1253 non-null   int64  
 10  water_equivalent         1253 non-null   float64
 11  snow_fall                1253 non-null   float64
 12  snow_depth_6am           1253 non-null   float64
 13  wind_speed_sum           1253 non-null   int64  
 14  sunshine_percent        

## Filter Dataset

In [5]:
# 'Unnamed: 0' only repeats the row number (0, 1, 2, ...), so it carries no
# information; remove it and re-check the remaining columns.
avi = avi.drop(columns=['Unnamed: 0'])
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avi_danger               1253 non-null   float64
 1   avg_wind                 1253 non-null   float64
 2   temp_max_swing           1253 non-null   float64
 3   temp_max_swing_from_avg  1253 non-null   float64
 4   year                     1253 non-null   float64
 5   month                    1253 non-null   float64
 6   day                      1253 non-null   float64
 7   temp_max                 1253 non-null   int64  
 8   temp_min                 1253 non-null   int64  
 9   water_equivalent         1253 non-null   float64
 10  snow_fall                1253 non-null   float64
 11  snow_depth_6am           1253 non-null   float64
 12  wind_speed_sum           1253 non-null   int64  
 13  sunshine_percent         1253 non-null   int64  
 14  west_wind_hours         

In [6]:
# avi = avi[avi['AVY_DANGER'].notnull()]
# avi.info()

In [7]:
# avi.drop('FIVE_DAY_SNOWFALL', axis=1, inplace=True)
# avi.drop('SNOW', axis=1, inplace=True)
# avi.drop('WDF5', axis=1, inplace=True)
# avi.drop('year_y', axis=1, inplace=True)
# avi.drop('month_y', axis=1, inplace=True)
# avi.drop('day_y', axis=1, inplace=True)

In [8]:
# avi['TMAX_SWING'].fillna(0, inplace=True)
# avi['TMAX_SWING_FROM_AVE'].fillna(0, inplace=True)
# avi.info()

## Reset Dataframe Index

In [9]:
# avi.reset_index(inplace=True)
# avi.drop(columns=['index'], inplace=True) # Drop the previous index, which becomes a column after reset_index() is called
# print(avi.index)

## Make 3-day and 5-day snowfall variables

In [10]:
# Inspect the row index before building the multi-day snowfall features
print(avi.index)

RangeIndex(start=0, stop=1253, step=1)


In [11]:
# Rolling 3-day snowfall total: each row's snowfall plus the two rows before it.
#
# shift() moves values by row *position*, so the result does not depend on the
# index labels being exactly 0..n-1. fill_value=0 means the first two rows,
# which have fewer than three days of history, sum only the days that exist
# (a 1-day total for row 0, a 2-day total for row 1).
snow_fall = avi['snow_fall']
three_day_total = (
    snow_fall
    + snow_fall.shift(1, fill_value=0)
    + snow_fall.shift(2, fill_value=0)
)

# Round to 2 decimals to strip floating-point noise left by the additions.
# tolist() yields plain Python floats, one per row, in row order.
three_day_snow = [round(total, 2) for total in three_day_total.tolist()]

In [12]:
# Attach the 3-day totals as a new column (one value per row, in row order)
avi['three_day_snow'] = three_day_snow

In [13]:
# info() prints its report itself and returns None, so call it directly;
# wrapping it in print() would append a stray "None" line to the output.
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avi_danger               1253 non-null   float64
 1   avg_wind                 1253 non-null   float64
 2   temp_max_swing           1253 non-null   float64
 3   temp_max_swing_from_avg  1253 non-null   float64
 4   year                     1253 non-null   float64
 5   month                    1253 non-null   float64
 6   day                      1253 non-null   float64
 7   temp_max                 1253 non-null   int64  
 8   temp_min                 1253 non-null   int64  
 9   water_equivalent         1253 non-null   float64
 10  snow_fall                1253 non-null   float64
 11  snow_depth_6am           1253 non-null   float64
 12  wind_speed_sum           1253 non-null   int64  
 13  sunshine_percent         1253 non-null   int64  
 14  west_wind_hours         

In [14]:
# Rolling 5-day snowfall total: each row's snowfall plus the four rows before it.
#
# shift() moves values by row *position*, so the result does not depend on the
# index labels being exactly 0..n-1. fill_value=0 means the first four rows,
# which have fewer than five days of history, sum only the days that exist.
snow_fall = avi['snow_fall']
five_day_total = sum(
    snow_fall.shift(days_back, fill_value=0) for days_back in range(5)
)

# Round to 2 decimals to strip floating-point noise left by the additions.
# tolist() yields plain Python floats, one per row, in row order.
five_day_snow = [round(total, 2) for total in five_day_total.tolist()]

In [15]:
# Attach the 5-day totals as a new column (one value per row, in row order)
avi['five_day_snow'] = five_day_snow

In [16]:
# Confirm both snowfall-total columns are present and fully populated
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avi_danger               1253 non-null   float64
 1   avg_wind                 1253 non-null   float64
 2   temp_max_swing           1253 non-null   float64
 3   temp_max_swing_from_avg  1253 non-null   float64
 4   year                     1253 non-null   float64
 5   month                    1253 non-null   float64
 6   day                      1253 non-null   float64
 7   temp_max                 1253 non-null   int64  
 8   temp_min                 1253 non-null   int64  
 9   water_equivalent         1253 non-null   float64
 10  snow_fall                1253 non-null   float64
 11  snow_depth_6am           1253 non-null   float64
 12  wind_speed_sum           1253 non-null   int64  
 13  sunshine_percent         1253 non-null   int64  
 14  west_wind_hours         

## Creating Features for Previous Days

In [17]:
# Base columns that get lagged ("previous day") copies
avi_columns = [
    'avi_danger',
    'avg_wind',
    'temp_max_swing',
    'temp_max_swing_from_avg',
    'temp_max',
    'temp_min',
    'water_equivalent',
    'snow_fall',
    'snow_depth_6am',
    'wind_speed_sum',
    'sunshine_percent',
    'west_wind_hours',
    'northwest_wind_hours',
    'prevailing_wind_E',
    'prevailing_wind_N',
    'prevailing_wind_NE',
    'prevailing_wind_NW',
    'prevailing_wind_S',
    'prevailing_wind_SE',
    'prevailing_wind_SW',
    'prevailing_wind_W',
    'three_day_snow',
    'five_day_snow',
]

# Names for the 1-day-ago and 2-days-ago copies: the base name plus a
# "_1" / "_2" suffix, in the same order as avi_columns.
avi_columns_1 = [f'{name}_1' for name in avi_columns]
avi_columns_2 = [f'{name}_2' for name in avi_columns]

### 1-Day Ago

In [18]:
# 1-day-ago features: for every base column, add a copy holding the value
# from the previous row. shift(1) moves values down one row by position and
# leaves NaN in the first row, which has no previous day; the NaN also makes
# every lagged column float64.
assert len(avi_columns) == len(avi_columns_1), \
    "each base column needs exactly one 1-day-ago column name"

for source_name, lagged_name in zip(avi_columns, avi_columns_1):
    avi[lagged_name] = avi[source_name].shift(1)

avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 50 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   avi_danger                 1253 non-null   float64
 1   avg_wind                   1253 non-null   float64
 2   temp_max_swing             1253 non-null   float64
 3   temp_max_swing_from_avg    1253 non-null   float64
 4   year                       1253 non-null   float64
 5   month                      1253 non-null   float64
 6   day                        1253 non-null   float64
 7   temp_max                   1253 non-null   int64  
 8   temp_min                   1253 non-null   int64  
 9   water_equivalent           1253 non-null   float64
 10  snow_fall                  1253 non-null   float64
 11  snow_depth_6am             1253 non-null   float64
 12  wind_speed_sum             1253 non-null   int64  
 13  sunshine_percent           1253 non-null   int64

### 2-Days Ago

In [19]:
# 2-days-ago features: for every base column, add a copy holding the value
# from two rows earlier. shift(2) moves values down two rows by position and
# leaves NaN in the first two rows, which have no such history; the NaN also
# makes every lagged column float64.
assert len(avi_columns) == len(avi_columns_2), \
    "each base column needs exactly one 2-days-ago column name"

for source_name, lagged_name in zip(avi_columns, avi_columns_2):
    avi[lagged_name] = avi[source_name].shift(2)

avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 73 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   avi_danger                 1253 non-null   float64
 1   avg_wind                   1253 non-null   float64
 2   temp_max_swing             1253 non-null   float64
 3   temp_max_swing_from_avg    1253 non-null   float64
 4   year                       1253 non-null   float64
 5   month                      1253 non-null   float64
 6   day                        1253 non-null   float64
 7   temp_max                   1253 non-null   int64  
 8   temp_min                   1253 non-null   int64  
 9   water_equivalent           1253 non-null   float64
 10  snow_fall                  1253 non-null   float64
 11  snow_depth_6am             1253 non-null   float64
 12  wind_speed_sum             1253 non-null   int64  
 13  sunshine_percent           1253 non-null   int64

In [20]:
# Drop every row that contains a NaN. The 1-day-ago columns are NaN in the
# first row and the 2-days-ago columns in the first two rows, so at least
# those rows are removed. inplace=True mutates avi instead of returning a copy.
avi.dropna(inplace=True)

In [25]:
# Move the prediction target to the end of the frame: pop() removes the
# column and hands it back, and re-assigning it appends it as the last column.
target = avi.pop('next_day_avi_danger')
avi['next_day_avi_danger'] = target

In [26]:
# Final check of row count, column order and dtypes before saving
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1251 entries, 2 to 1252
Data columns (total 73 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   avi_danger                 1251 non-null   float64
 1   avg_wind                   1251 non-null   float64
 2   temp_max_swing             1251 non-null   float64
 3   temp_max_swing_from_avg    1251 non-null   float64
 4   year                       1251 non-null   float64
 5   month                      1251 non-null   float64
 6   day                        1251 non-null   float64
 7   temp_max                   1251 non-null   int64  
 8   temp_min                   1251 non-null   int64  
 9   water_equivalent           1251 non-null   float64
 10  snow_fall                  1251 non-null   float64
 11  snow_depth_6am             1251 non-null   float64
 12  wind_speed_sum             1251 non-null   int64  
 13  sunshine_percent           1251 non-null   int64

In [27]:
# Save the engineered feature table next to the notebook.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column (here the labels left after dropna, starting at 2).
# Pass index=False if downstream readers do not rely on that column — confirm first.
avi.to_csv('SnowWeatherClean1.csv')