## Training Prep

The purpose of this notebook is to create train/test splits for the models we will be working with and to scale the features (the scaler is fit on the training split only).

Notebook Input: `flights_cleaned.csv` (clean model input) <br>
Notebook Output: <br>
    - regression_train.csv <br>
    - regression_test.csv <br>
    - scaler_r.pkl <br>

---

### Import Packages

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pickle import dump

---

*import cleaned data*

In [2]:
# Read the cleaned flights data; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="flights_cleaned.csv")

In [3]:
# Preview the first five rows as a quick sanity check on the load.
df.head(n=5)

Unnamed: 0,branded_code_share,crs_dep_time,crs_arr_time,arr_delay,cancelled,crs_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,fl_month,fl_day_of_week,fl_type,state_travel_type,origin_cat,dest_cat,mkt_op_combo_cat
0,1,16,18,11.0,0.0,118.0,91.0,573.0,0.0,0.0,0.0,0.0,0.0,11,0,0,1,144,271,7
1,1,14,16,6.0,0.0,130.0,93.0,573.0,0.0,0.0,0.0,0.0,0.0,11,0,0,1,271,144,7
2,1,9,10,-18.0,0.0,79.0,55.0,196.0,0.0,0.0,0.0,0.0,0.0,11,0,0,1,30,271,7
3,1,7,8,2.0,0.0,75.0,35.0,196.0,0.0,0.0,0.0,0.0,0.0,11,0,0,1,271,30,7
4,1,9,10,34.0,0.0,93.0,63.0,406.0,10.0,0.0,0.0,0.0,24.0,11,0,0,1,75,271,7


In [4]:
# List the columns available in the cleaned data.
df.columns

Index(['branded_code_share', 'crs_dep_time', 'crs_arr_time', 'arr_delay',
       'cancelled', 'crs_elapsed_time', 'air_time', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'fl_month', 'fl_day_of_week', 'fl_type',
       'state_travel_type', 'origin_cat', 'dest_cat', 'mkt_op_combo_cat'],
      dtype='object')

*build the regression dataset: select the feature columns and the `arr_delay` target*

In [5]:
# Columns kept for the regression dataset: the arr_delay target plus the
# features used to predict it.
# NOTE(review): air_time looks like a value only known once a flight has
# completed -- confirm it is meant to be a predictor of arr_delay.
regression_columns = [
    'branded_code_share', 'crs_dep_time', 'crs_arr_time', 'arr_delay',
    'crs_elapsed_time', 'air_time', 'distance', 'fl_month',
    'fl_day_of_week', 'fl_type', 'state_travel_type', 'origin_cat',
    'dest_cat', 'mkt_op_combo_cat',
]

# .copy() yields an independent frame, so later changes never touch df.
df_regression = df.loc[:, regression_columns].copy()

In [6]:
# Confirm which columns made it into the regression frame.
df_regression.columns

Index(['branded_code_share', 'crs_dep_time', 'crs_arr_time', 'arr_delay',
       'crs_elapsed_time', 'air_time', 'distance', 'fl_month',
       'fl_day_of_week', 'fl_type', 'state_travel_type', 'origin_cat',
       'dest_cat', 'mkt_op_combo_cat'],
      dtype='object')

In [9]:
# Target vector: the arr_delay column as a NumPy array.
y_regression = df_regression["arr_delay"].to_numpy()

In [13]:
# Feature matrix: every remaining column once the arr_delay target is removed,
# as a NumPy array (column order follows df_regression).
X_regression = df_regression.drop(columns="arr_delay").to_numpy()

In [15]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
regression_split = train_test_split(
    X_regression,
    y_regression,
    test_size=0.25,
    random_state=1,
)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = regression_split

In [22]:
# Standardize the regression features.
# The scaler is fit on the training split only, so no statistics from the
# test split leak into the transformation; the same fitted scaler is then
# applied to both splits.
scaler_r = StandardScaler()
scaler_r.fit(X_train_reg)

X_train_reg_scaled = scaler_r.transform(X_train_reg)
X_test_reg_scaled = scaler_r.transform(X_test_reg)

# Save the fitted scaler. The `with` block guarantees the file handle is
# flushed and closed even if pickling fails.
with open('scaler_r.pkl', 'wb') as scaler_file:
    dump(scaler_r, scaler_file)

In [32]:
# Prep the training file: scaled features plus the arr_delay target.
# Feature names are derived from df_regression (the frame X_regression was
# built from, minus the target) so the labels always follow the array's
# column order instead of relying on a hand-copied list.
feature_columns = [col for col in df_regression.columns if col != 'arr_delay']

trainX = pd.DataFrame(data=X_train_reg_scaled, columns=feature_columns)
trainY = pd.DataFrame(data=y_train_reg, columns=['arr_delay'])

# Both frames are built from plain arrays and so share the default 0..n-1
# index; merging index-on-index pairs each feature row with its target.
train = trainX.merge(trainY, left_index=True, right_index=True)

In [34]:
# Prep the test file: scaled features plus the arr_delay target.
# Feature names are derived from df_regression (the frame X_regression was
# built from, minus the target) so the labels always follow the array's
# column order instead of relying on a hand-copied list.
feature_columns = [col for col in df_regression.columns if col != 'arr_delay']

testX = pd.DataFrame(data=X_test_reg_scaled, columns=feature_columns)
testY = pd.DataFrame(data=y_test_reg, columns=['arr_delay'])

# Both frames are built from plain arrays and so share the default 0..n-1
# index; merging index-on-index pairs each feature row with its target.
test = testX.merge(testY, left_index=True, right_index=True)

In [39]:
# Write both splits to disk; index=False keeps the row index out of the
# files so only the feature and target columns are written.
train.to_csv(path_or_buf="regression_train.csv", index=False)
test.to_csv(path_or_buf="regression_test.csv", index=False)