## Training Prep

The purpose of this notebook is to standardize the cleaned flight features so that scaled data is available for the models we will be using.

---

**INPUT**: flights_cleaned <br>
**OUTPUT**: flights_scaled, flights_scaled_sample and scaler.pkl <br>
**Next Steps**: PCA (where required) and/or Modeling

---

### Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pickle import dump

---

### Import Cleaned Data

In [2]:
# Read the cleaned flights table (this notebook's declared input) into memory.
df = pd.read_csv(filepath_or_buffer="flights_cleaned.csv")

In [3]:
# Peek at the first two rows to sanity-check the load.
df.head(n=2)

Unnamed: 0,branded_code_share,origin,origin_city_name,dest,dest_city_name,crs_dep_time,crs_arr_time,arr_delay,cancelled,crs_elapsed_time,...,fl_month,fl_day_of_week,fl_week_of_month,mkt_op_combo,fl_type,m_hist_dep_delay,med_hist_dep_delay,m_hist_arr_delay,med_hist_arr_delay,delay_type
0,1,144,"Grand Rapids, MI",271,"Philadelphia, PA",16,18,11.0,0.0,118.0,...,11,0,5,7,0,12.382206,-1.0,7.657341,-4.0,0
1,1,271,"Philadelphia, PA",144,"Grand Rapids, MI",14,16,6.0,0.0,130.0,...,11,0,5,7,0,10.304525,-1.0,5.44982,-5.0,0


In [9]:
# Remove the human-readable city-name columns (the preview above shows them
# holding "City, ST" strings) before the numeric feature matrix is built.
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['origin_city_name', 'dest_city_name'], errors='ignore')

In [10]:
# Display the remaining column names to confirm the city-name columns are gone.
df.columns

Index(['branded_code_share', 'origin', 'dest', 'crs_dep_time', 'crs_arr_time',
       'arr_delay', 'cancelled', 'crs_elapsed_time', 'air_time', 'distance',
       'fl_month', 'fl_day_of_week', 'fl_week_of_month', 'mkt_op_combo',
       'fl_type', 'm_hist_dep_delay', 'med_hist_dep_delay', 'm_hist_arr_delay',
       'med_hist_arr_delay', 'delay_type'],
      dtype='object')

### Work with our data separate from our targets

In [11]:
# Targets: keep the three outcome columns together in their own frame.
y = df.loc[:, ['arr_delay', 'cancelled', 'delay_type']]

In [12]:
# Features: every column that is not a target, as a plain NumPy array.
X = (
    df
    .drop(columns=['arr_delay', 'cancelled', 'delay_type'])
    .to_numpy()
)

In [13]:
# Create a StandardScaler with its default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [14]:
# Fit the scaler on the feature array X; the fitted estimator is echoed as the
# cell output.
scaler.fit(X)

StandardScaler()

In [15]:
# Apply the fitted scaler to the same feature array it was fitted on.
X_scaled = scaler.transform(X)

*Export scaler to use on evaluation data*

In [16]:
# Save the fitted scaler so the identical transformation can be applied to
# evaluation data later. The context manager guarantees the file handle is
# flushed and closed even if pickling fails; the previous bare open() call
# never closed it.
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [17]:
#prep train and test files
# Wrap the scaled array back into a labelled DataFrame.
# Column names are derived from `df` using the same rule that built X (all
# columns except the three targets) instead of a hand-typed list: pandas only
# checks that the number of names matches, so a stale or reordered list would
# silently mislabel features.
feature_columns = [
    col for col in df.columns
    if col not in ('arr_delay', 'cancelled', 'delay_type')
]
# Reuse df's index so the later index-based merge with `y` lines up row for row.
trainX = pd.DataFrame(data=X_scaled, columns=feature_columns, index=df.index)

In [18]:
# Re-attach the unscaled targets to the scaled features, matching rows by index.
train = pd.merge(trainX, y, left_index=True, right_index=True)

In [19]:
# Check the (rows, columns) of the combined features-plus-targets frame.
train.shape

(15768083, 20)

In [21]:
# Write the full scaled dataset to disk, omitting the index column.
train.to_csv(path_or_buf="flights_scaled.csv", index=False)

In [22]:
# Draw a 10,000-row random sample for quick experiments. A fixed random_state
# makes the sample (and the file written from it) reproducible across kernel
# restarts; without it every run exported a different subset.
train_samp = train.sample(n=10000, random_state=42)

In [23]:
# Write the sampled subset to its own file, omitting the index column.
train_samp.to_csv(path_or_buf="flights_scaled_sample.csv", index=False)