In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Adjust settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
%matplotlib inline

# to make this notebook's output stable across runs
np.random.seed(42)

In [3]:
# Load preprocessed dataset

data_path = '../data/processed/'
df = pd.read_pickle(os.path.join(data_path, 'final.pkl'))

In [4]:
df.head()

Unnamed: 0,flt_leg,flt_dep_airpt,flt_arr_airpt,flt_offblock,flt_onblock,flt_ac_reg,flt_change_code,flt_dep_delay,flt_ac_type,flt_tt,flt_sched_tt,flt_act_gt,flt_sched_dep,flt_sched_arr,gnd_sched_tat,block_delay,routing,sched_gt,act_gt,sched_dep_d,cp_crew,ca_crew,cp_count,ca_count,cc_cp_ca,cc_count,day_of_week,hour_of_day_dep,hour_of_day_arr,cc_types,cc_roles
0,272024970,New Jessica,East Carmen,2019-06-01 03:50:00,2019-06-01 07:01:00,ECLBAX,other problem,25.0,320,0.0,0.0,94.0,2019-06-01 03:25:00,2019-06-01 06:45:00,,16.0,New Jessica_East Carmen,95.0,94.0,2019-06-01,"[Andrew Patterson, Joshua Ellis]","[Caleb Davidson, Cassandra Lewis, Sean Weeks, ...",2,4,,0,5,3,6,0,[]
1,272022230,East Carmen,South Nathaniel,2019-06-01 08:35:00,2019-06-01 10:41:00,ECLBAX,other problem,15.0,320,60.0,60.0,120.0,2019-06-01 08:20:00,2019-06-01 10:35:00,95.0,6.0,East Carmen_South Nathaniel,75.0,120.0,2019-06-01,"[Mikayla Harris, Rachel Smith]","[Cathy Meyer, Charles Watson, Jessica Holmes, ...",2,4,,0,5,8,10,3,"[cp, cp, ca, ca, ca, ca]"
2,272212848,South Nathaniel,East Carmen,2019-06-01 12:41:00,2019-06-01 14:52:00,ECLBAX,rotational problem,51.0,320,68.0,75.0,86.0,2019-06-01 11:50:00,2019-06-01 14:15:00,,37.0,South Nathaniel_East Carmen,80.0,86.0,2019-06-01,"[Mikayla Harris, Rachel Smith]","[Cathy Meyer, Charles Watson, Jessica Holmes, ...",2,4,,0,5,11,14,1,[]
3,271997824,East Carmen,Joneshaven,2019-06-01 16:18:00,2019-06-01 17:32:00,ECLBAX,rotational problem,43.0,320,64.0,70.0,32.0,2019-06-01 15:35:00,2019-06-01 17:00:00,80.0,32.0,East Carmen_Joneshaven,50.0,32.0,2019-06-01,"[Candace Brooks, Sergio Cummings]","[Amanda Aguilar, Antonio Robbins, Jackie Black...",2,4,both,5,5,15,17,3,"[cp, cp, ca, ca, ca, ca]"
4,271998033,Joneshaven,East Carmen,2019-06-01 18:04:00,2019-06-01 19:14:00,ECLBAX,other problem,14.0,320,42.0,50.0,,2019-06-01 17:50:00,2019-06-01 19:10:00,,4.0,Joneshaven_East Carmen,,,2019-06-01,"[Candace Brooks, Sergio Cummings]","[Amanda Aguilar, Antonio Robbins, Jackie Black...",2,4,,0,5,17,19,1,[]


In [5]:
# Drop columns which will not be used --> Is this the best place to do so?
df = df.drop(['flt_leg', 'flt_offblock', 'flt_onblock', 'flt_sched_dep', 'flt_sched_arr', 'cp_crew', 'ca_crew', 'cc_roles'], axis=1)

In [6]:
# One-hot encode all catgorical variables
df_one_hot = pd.get_dummies(df, drop_first=True)
df_one_hot.dropna(axis=0, how='any', inplace=True)

In [7]:
# Generate train/test splits for both intermediate models

X_train_offblock, X_test_offblock, y_train_offblock, y_test_offblock = train_test_split(df_one_hot.drop(['block_delay'], axis=1), df_one_hot['block_delay'], test_size=0.33, random_state=42)
X_train_onblock, X_test_onblock, y_train_onblock, y_test_onblock = train_test_split(df_one_hot.drop(['flt_dep_delay'], axis=1), df_one_hot['flt_dep_delay'], test_size=0.33, random_state=42)

In [8]:
# Implement second baseline model as random forest regression
rand_for_off = RandomForestRegressor(n_estimators = 10, random_state=42)
rand_for_off.fit(X_train_offblock, y_train_offblock)
rand_for_off.predict(X_test_offblock)

rand_for_on = RandomForestRegressor(n_estimators = 10, random_state=42)
rand_for_on.fit(X_train_onblock, y_train_onblock)
rand_for_on.predict(X_test_onblock)

array([18.1, 18.5, 34.9, ..., 51.1, 17.1, 23.8])

In [9]:
r2_offblock = rand_for_off.score(X_test_offblock, y_test_offblock)
rmse_offblock = np.sqrt(mean_squared_error(y_test_offblock, rand_for_off.predict(X_test_offblock)))
print('The r^2 for offblock time is ' + str(round(r2_offblock, 4)))
print('The RMSE for offblock time is ' + str(round(rmse_offblock, 2)) + 'minutes.')
print('\n')

r2_onblock = rand_for_on.score(X_test_onblock, y_test_onblock)
rmse_onblock = np.sqrt(mean_squared_error(y_test_onblock, rand_for_on.predict(X_test_onblock)))
print('The r^2 for onblock time is ' + str(round(r2_onblock, 4)))
print('The RMSE for onblock time is ' + str(round(rmse_onblock, 2)) + 'minutes.')
print('\n')

The r^2 for offblock time is 0.9357
The RMSE for offblock time is 6.43minutes.


The r^2 for onblock time is 0.963
The RMSE for onblock time is 4.64minutes.




In [11]:
rand_for_on.estimator_params

('criterion',
 'max_depth',
 'min_samples_split',
 'min_samples_leaf',
 'min_weight_fraction_leaf',
 'max_features',
 'max_leaf_nodes',
 'min_impurity_decrease',
 'random_state',
 'ccp_alpha')