# Pre-processing and training data development

Capstone 2 problem: What is the evolution of the impact of catastrophic events on the commercial aerial traffic in Canada, between 2001 and 2018?

In [1]:
# Importing pertinent packages
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Importing the dataset
df = pd.read_csv('merged_df.csv', sep=",")

In [3]:
# Visualizing the dataset
df.head()

Unnamed: 0,date,prov_ter,VALUE,year,month,Avalanche,Drought,Earthquake,Flood,Heat Event,Hurricane / Typhoon / Tropical Storm,Landslide,Storm - Unspecified / Other,Storm Surge,Storms and Severe Thunderstorms,Tornado,Wildfire,Winter Storm,sum_events
0,2001-02-01,AB,49568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2001-02-01,BC,89058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2001-02-01,MB,19869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2001-02-01,NB,4966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2001-02-01,NFL,7964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Visualization of the dataset shape
df.shape

(2365, 19)

In [5]:
# Determining datatypes in the dataset
df.dtypes

date                                     object
prov_ter                                 object
VALUE                                     int64
year                                    float64
month                                   float64
Avalanche                               float64
Drought                                 float64
Earthquake                              float64
Flood                                   float64
Heat Event                              float64
Hurricane / Typhoon / Tropical Storm    float64
Landslide                               float64
Storm - Unspecified / Other             float64
Storm Surge                             float64
Storms and Severe Thunderstorms         float64
Tornado                                 float64
Wildfire                                float64
Winter Storm                            float64
sum_events                              float64
dtype: object

In [6]:
# Changing the datatype of date to datetime
df['date'] =  pd.to_datetime(df['date'], format='%Y-%d-%m')
df.dtypes

date                                    datetime64[ns]
prov_ter                                        object
VALUE                                            int64
year                                           float64
month                                          float64
Avalanche                                      float64
Drought                                        float64
Earthquake                                     float64
Flood                                          float64
Heat Event                                     float64
Hurricane / Typhoon / Tropical Storm           float64
Landslide                                      float64
Storm - Unspecified / Other                    float64
Storm Surge                                    float64
Storms and Severe Thunderstorms                float64
Tornado                                        float64
Wildfire                                       float64
Winter Storm                                   float64
sum_events

Only prov_ter is a categorical value.

In [7]:
# Getting dummy variables for the categorical values
prov_ter_dummy = pd.get_dummies(df['prov_ter'])

In [8]:
# Concatening the dummy variables with the original dataset.
df = pd.concat([df, prov_ter_dummy], axis=1)
df = df.drop('prov_ter', axis = 1)

In [9]:
# Converting date in year and month (floats)
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

df = df.drop('date', axis = 1)

df['year'] = df['year'].astype(float)
df['month'] = df['month'].astype(float)

In [10]:
# Changing the datatype of VALUE to float.
df['VALUE'] = df['VALUE'].astype(float)

In [11]:
# Visualizing the new version of the dataset
df.head()

Unnamed: 0,VALUE,year,month,Avalanche,Drought,Earthquake,Flood,Heat Event,Hurricane / Typhoon / Tropical Storm,Landslide,...,BC,MB,NB,NFL,NS,NWT,ON,QC,SK,YU
0,49568.0,2001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,89058.0,2001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,19869.0,2001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
3,4966.0,2001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
4,7964.0,2001.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2365 entries, 0 to 2364
Data columns (total 28 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   VALUE                                 2365 non-null   float64
 1   year                                  2365 non-null   float64
 2   month                                 2365 non-null   float64
 3   Avalanche                             2365 non-null   float64
 4   Drought                               2365 non-null   float64
 5   Earthquake                            2365 non-null   float64
 6   Flood                                 2365 non-null   float64
 7   Heat Event                            2365 non-null   float64
 8   Hurricane / Typhoon / Tropical Storm  2365 non-null   float64
 9   Landslide                             2365 non-null   float64
 10  Storm - Unspecified / Other           2365 non-null   float64
 11  Storm Surge      

There is no need to standardize the aerial trafic value since there is only one numeric type of data in the data set.

In [13]:
# Spliting data into training and testing set
X = df.drop('VALUE', axis = 1)
y = df.VALUE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [14]:
# Saving data
df.to_csv('df.csv', index=False)