In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek

In [2]:
avi = pd.read_csv('SnowWeatherCleanFE.csv')
avi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224 entries, 0 to 1223
Data columns (total 74 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 1224 non-null   int64  
 1   avi_danger                 1224 non-null   float64
 2   avg_wind                   1224 non-null   float64
 3   temp_max_swing             1224 non-null   float64
 4   temp_max_swing_from_avg    1224 non-null   float64
 5   year                       1224 non-null   float64
 6   month                      1224 non-null   float64
 7   day                        1224 non-null   float64
 8   temp_max                   1224 non-null   int64  
 9   temp_min                   1224 non-null   int64  
 10  water_equivalent           1224 non-null   float64
 11  snow_fall                  1224 non-null   float64
 12  snow_depth_6am             1224 non-null   float64
 13  wind_speed_sum             1224 non-null   int64

In [3]:
avi = avi.drop(['Unnamed: 0'], axis=1)

In [4]:
x = avi.iloc[:, 0:avi.shape[1]-1]
y = avi.iloc[:, avi.shape[1]-1]

print(x.shape)
print(y.shape)

(1224, 72)
(1224,)


## RandomOverSampler

In [5]:
print(x.shape)
print(y.shape)
ovs = RandomOverSampler(random_state=42)
x_res, y_res = ovs.fit_sample(x, y)
print(x_res.shape)
print(y_res.shape)

(1224, 72)
(1224,)
(1876, 72)
(1876,)


In [6]:
aviROS = pd.DataFrame(x_res)
aviROS['next_day_avi_danger'] = y_res 
print(aviROS.shape)
print(aviROS.info)

(1876, 73)
<bound method DataFrame.info of       avi_danger  avg_wind  temp_max_swing  temp_max_swing_from_avg    year  \
0            1.0     20.58             0.0                      0.0  2010.0   
1            3.0     35.12             3.0                      0.0  2010.0   
2            2.0     33.78            -3.0                      0.0  2010.0   
3            3.0     31.32             0.0                      0.0  2010.0   
4            2.0     32.44             2.0                      1.4  2010.0   
...          ...       ...             ...                      ...     ...   
1871         3.0     32.88            -3.0                     -5.2  2016.0   
1872         3.0     57.04             3.0                     -3.4  2011.0   
1873         3.0     47.87            -7.0                     -8.8  2018.0   
1874         3.0     35.79             3.0                     -7.4  2016.0   
1875         3.0     28.86           -14.0                      0.0  2011.0   

      mo

In [7]:
aviROS.to_csv('SnowWeatherCleanROS.csv')

## SMOTETomek

In [8]:
print(x.shape)
print(y.shape)
smk = SMOTETomek()
x_res_smk, y_res_smk = smk.fit_sample(x, y)
print(x_res_smk.shape)
print(y_res_smk.shape)

(1224, 72)
(1224,)
(1640, 72)
(1640,)


In [9]:
aviSMK = pd.DataFrame(x_res_smk)
aviSMK['next_day_avi_danger'] = y_res_smk 
print(aviSMK.shape)
print(aviSMK.info)

(1640, 73)
<bound method DataFrame.info of       avi_danger   avg_wind  temp_max_swing  temp_max_swing_from_avg  \
0       1.000000  20.580000        0.000000                 0.000000   
1       3.000000  35.120000        3.000000                 0.000000   
2       2.000000  33.780000       -3.000000                 0.000000   
3       3.000000  31.320000        0.000000                 0.000000   
4       2.000000  32.440000        2.000000                 1.400000   
...          ...        ...             ...                      ...   
1635    2.922218  19.457401       11.377743                12.863303   
1636    2.366296  45.862156        6.366296                15.879109   
1637    3.000000  29.708633       -0.185227                 4.888864   
1638    3.000000  34.541673        0.038769                 8.832331   
1639    1.205529  25.496942       -3.383413                -1.037140   

             year      month        day  temp_max  temp_min  water_equivalent  \
0     2010.

In [10]:
aviSMK.to_csv('SnowWeatherCleanSMK.csv')