## Training Prep

This notebook standardizes the cleaned flight features and saves both the scaled data and the fitted scaler for the models we will be using.

---

**INPUT**: flights_cleaned (train or test) <br>
**OUTPUT**: flights_scaled (train or test) and scaler.pkl <br>
**Next Steps**: PCA (where required) and/or Modeling

---

### Import Packages

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pickle import dump

---

*import cleaned data*

In [40]:
# Read the cleaned flights table from the working directory.
df = pd.read_csv("flights_cleaned.csv")

In [41]:
# Show the first two rows as a quick check that the file loaded.
df.head(2)

Unnamed: 0,branded_code_share,crs_dep_time,crs_arr_time,arr_delay,cancelled,crs_elapsed_time,air_time,distance,fl_month,fl_day_of_week,fl_type,state_travel_type,m_hist_dep_delay,med_hist_dep_delay,m_hist_arr_delay,med_hist_arr_delay,origin_cat,dest_cat,mkt_op_combo_cat,delay_type
0,1,16,18,11.0,0.0,118.0,91.0,573.0,11,0,0,1,12.382206,-1.0,7.657341,-4.0,144,271,7,0
1,1,14,16,6.0,0.0,130.0,93.0,573.0,11,0,0,1,10.304525,-1.0,5.44982,-5.0,271,144,7,0


In [42]:
# List the available columns; the three target columns are split off below.
df.columns

Index(['branded_code_share', 'crs_dep_time', 'crs_arr_time', 'arr_delay',
       'cancelled', 'crs_elapsed_time', 'air_time', 'distance', 'fl_month',
       'fl_day_of_week', 'fl_type', 'state_travel_type', 'm_hist_dep_delay',
       'med_hist_dep_delay', 'm_hist_arr_delay', 'med_hist_arr_delay',
       'origin_cat', 'dest_cat', 'mkt_op_combo_cat', 'delay_type'],
      dtype='object')

*work with regression df: split off the target columns, then scale the remaining features*

In [43]:
# Target columns, set aside so they are not passed through the scaler.
y = df.loc[:, ['arr_delay', 'cancelled', 'delay_type']]

In [45]:
# Feature matrix: every column except the three targets, as a NumPy array.
X = df.drop(['arr_delay', 'cancelled', 'delay_type'], axis=1).to_numpy()

In [46]:
# StandardScaler with its default settings; fitted in the next cell.
scaler = StandardScaler()

In [47]:
# Learn the per-column scaling statistics from the feature matrix.
# NOTE(review): the notebook header says the input may be the train OR the
# test file. Fitting here on a test file would replace the saved scaler with
# test-set statistics; confirm that test runs load scaler.pkl instead.
scaler.fit(X)

StandardScaler()

In [48]:
# Apply the fitted scaler to the same feature matrix it was fitted on.
X_scaled = scaler.transform(X)

In [49]:
#save the scaler
# Use a context manager so the file handle is flushed and closed as soon as
# the pickle has been written, instead of being left open indefinitely.
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [51]:
#prep train and test files
# Label the scaled array with the feature names taken from df itself (every
# column except the three targets, in their original order) rather than a
# hand-typed list that could drift from the input file. Reusing df.index keeps
# the rows aligned with the target columns when they are merged back by index.
trainX = pd.DataFrame(data = X_scaled,
                      index = df.index,
                      columns = df.columns.drop(['arr_delay', 'cancelled', 'delay_type']))

In [54]:
# Re-attach the unscaled target columns to the scaled features by row index.
train = pd.merge(trainX, y, left_index=True, right_index=True)

In [57]:
# Row/column count of the merged frame (taken before the NaN filter below).
train.shape

(15768096, 20)

In [67]:
# Keep only rows where crs_elapsed_time is present. notna() expresses this
# directly instead of comparing the isna() mask against False.
train = train[train['crs_elapsed_time'].notna()]

In [69]:
# Write the scaled features together with the target columns;
# index=False leaves the row index out of the file.
train.to_csv("flights_scaled.csv", index=False)