In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek

In [3]:
avi = pd.read_csv('SnowWeatherClean1.csv')
avi.head()

Unnamed: 0.1,Unnamed: 0,avi_danger,avg_wind,temp_max_swing,temp_max_swing_from_avg,year,month,day,temp_max,temp_min,...,prevailing_wind_N_2,prevailing_wind_NE_2,prevailing_wind_NW_2,prevailing_wind_S_2,prevailing_wind_SE_2,prevailing_wind_SW_2,prevailing_wind_W_2,three_day_snow_2,five_day_snow_2,next_day_avi_danger
0,2,1.0,20.58,0.0,0.0,2010.0,12.0,20.0,15,5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.2,0.2,3.0
1,3,3.0,35.12,3.0,0.0,2010.0,12.0,21.0,18,10,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.3,0.3,2.0
2,4,2.0,33.78,-3.0,0.0,2010.0,12.0,22.0,15,7,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,3.0
3,5,3.0,31.32,0.0,0.0,2010.0,12.0,23.0,15,6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,4.5,2.0
4,6,2.0,32.44,2.0,1.4,2010.0,12.0,24.0,17,9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3,8.6,2.0


In [4]:
avi = avi.drop(['Unnamed: 0'], axis=1)

In [5]:
avi = avi[avi['avi_danger'].notnull()]
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1251 entries, 0 to 1250
Data columns (total 73 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   avi_danger                 1251 non-null   float64
 1   avg_wind                   1251 non-null   float64
 2   temp_max_swing             1251 non-null   float64
 3   temp_max_swing_from_avg    1251 non-null   float64
 4   year                       1251 non-null   float64
 5   month                      1251 non-null   float64
 6   day                        1251 non-null   float64
 7   temp_max                   1251 non-null   int64  
 8   temp_min                   1251 non-null   int64  
 9   water_equivalent           1251 non-null   float64
 10  snow_fall                  1251 non-null   float64
 11  snow_depth_6am             1251 non-null   float64
 12  wind_speed_sum             1251 non-null   int64  
 13  sunshine_percent           1251 non-null   int64

## Avi Danger Level = 5 Removed, Not Sure if Best Idea (Combining with 4 may be better)

In [6]:
avi = avi.fillna(0)
avi = avi[avi.avi_danger != 5]
avi = avi[avi.next_day_avi_danger != 5]
avi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1247 entries, 0 to 1250
Data columns (total 73 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   avi_danger                 1247 non-null   float64
 1   avg_wind                   1247 non-null   float64
 2   temp_max_swing             1247 non-null   float64
 3   temp_max_swing_from_avg    1247 non-null   float64
 4   year                       1247 non-null   float64
 5   month                      1247 non-null   float64
 6   day                        1247 non-null   float64
 7   temp_max                   1247 non-null   int64  
 8   temp_min                   1247 non-null   int64  
 9   water_equivalent           1247 non-null   float64
 10  snow_fall                  1247 non-null   float64
 11  snow_depth_6am             1247 non-null   float64
 12  wind_speed_sum             1247 non-null   int64  
 13  sunshine_percent           1247 non-null   int64

In [8]:
x = avi.iloc[:, 0:avi.shape[1]-2]
y = avi.iloc[:, avi.shape[1]-1]

print(x.shape)
print(y.shape)

(1247, 71)
(1247,)


## RandomOverSampler

In [17]:
print(x.shape)
print(y.shape)
ovs = RandomOverSampler(random_state=42)
x_res, y_res = ovs.fit_sample(x, y)
print(x_res.shape)
print(y_res.shape)

(1247, 71)
(1247,)
(1892, 71)
(1892,)


In [30]:
aviROS = pd.DataFrame(x_res)
aviROS['next_day_avi_danger'] = y_res 
print(aviROS.shape)
print(aviROS.info)

(1892, 72)
<bound method DataFrame.info of       avi_danger  avg_wind  temp_max_swing  temp_max_swing_from_avg    year  \
0            1.0     20.58             0.0                      0.0  2010.0   
1            3.0     35.12             3.0                      0.0  2010.0   
2            2.0     33.78            -3.0                      0.0  2010.0   
3            3.0     31.32             0.0                      0.0  2010.0   
4            2.0     32.44             2.0                      1.4  2010.0   
...          ...       ...             ...                      ...     ...   
1887         2.0     14.54             3.0                     13.0  2011.0   
1888         2.0     58.38            -9.0                     -3.6  2016.0   
1889         3.0     57.04             3.0                     -3.4  2011.0   
1890         3.0     37.80            18.0                     16.0  2019.0   
1891         1.0     50.11            17.0                     19.8  2017.0   

      mo

In [31]:
aviROS.to_csv('SnowWeatherCleanROS.csv')

## SMOTETomek

In [32]:
print(x.shape)
print(y.shape)
smk = SMOTETomek()
x_res_smk, y_res_smk = smk.fit_sample(x, y)
print(x_res_smk.shape)
print(y_res_smk.shape)

(1247, 71)
(1247,)
(1656, 71)
(1656,)


In [33]:
aviSMK = pd.DataFrame(x_res_smk)
aviSMK['next_day_avi_danger'] = y_res_smk 
print(aviSMK.shape)
print(aviSMK.info)

(1656, 72)
<bound method DataFrame.info of       avi_danger   avg_wind  temp_max_swing  temp_max_swing_from_avg  \
0       1.000000  20.580000        0.000000                 0.000000   
1       3.000000  35.120000        3.000000                 0.000000   
2       2.000000  33.780000       -3.000000                 0.000000   
3       3.000000  31.320000        0.000000                 0.000000   
4       2.000000  32.440000        2.000000                 1.400000   
...          ...        ...             ...                      ...   
1651    3.000000  41.676583       -2.352054                 0.503589   
1652    2.313287  25.662643       -3.686713                -1.217063   
1653    2.440366  30.376790        4.926589                 0.458709   
1654    1.858640  44.825070       17.429320                18.168584   
1655    3.000000  37.505135       -0.261060                11.829443   

             year      month        day  temp_max  temp_min  water_equivalent  \
0     2010.

In [34]:
aviSMK.to_csv('SnowWeatherCleanSMK.csv')