# Building regression models for block_delay and ground_delay
Identifying the most suitable model out of different modeling approaches including, linear regression, nonlinear regression, random forest regression, boosted tree regression & support verctor regression. For each model approach different hyperparameters will be analysed supported by k-fold cross-validation.

In [80]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from pprint import pprint


In [38]:
# Adjust settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
%matplotlib inline

# to make this notebook's output stable across runs
np.random.seed(42)

In [39]:
# Load preprocessed dataset

data_path = '../data/processed/'
df = pd.read_pickle(os.path.join(data_path, 'final.pkl'))

In [40]:
df.head()

Unnamed: 0,flt_leg,flt_dep_airpt,flt_arr_airpt,flt_offblock,flt_onblock,flt_ac_reg,flt_change_code,flt_dep_delay,flt_ac_type,flt_tt,flt_sched_tt,flt_act_gt,flt_sched_dep,flt_sched_arr,gnd_sched_tat,block_delay,routing,sched_gt,act_gt,sched_dep_d,cp_crew,ca_crew,cp_count,ca_count,cc_cp_ca,cc_count,day_of_week,hour_of_day_dep,hour_of_day_arr,cc_types,cc_roles
0,272024970,New Jessica,East Carmen,2019-06-01 03:50:00,2019-06-01 07:01:00,ECLBAX,other problem,25.0,320,0.0,0.0,94.0,2019-06-01 03:25:00,2019-06-01 06:45:00,,16.0,New Jessica_East Carmen,95.0,94.0,2019-06-01,"[Andrew Patterson, Joshua Ellis]","[Caleb Davidson, Cassandra Lewis, Sean Weeks, ...",2,4,,0,5,3,6,0,[]
1,272022230,East Carmen,South Nathaniel,2019-06-01 08:35:00,2019-06-01 10:41:00,ECLBAX,other problem,15.0,320,60.0,60.0,120.0,2019-06-01 08:20:00,2019-06-01 10:35:00,95.0,6.0,East Carmen_South Nathaniel,75.0,120.0,2019-06-01,"[Mikayla Harris, Rachel Smith]","[Cathy Meyer, Charles Watson, Jessica Holmes, ...",2,4,,0,5,8,10,3,"[cp, cp, ca, ca, ca, ca]"
2,272212848,South Nathaniel,East Carmen,2019-06-01 12:41:00,2019-06-01 14:52:00,ECLBAX,rotational problem,51.0,320,68.0,75.0,86.0,2019-06-01 11:50:00,2019-06-01 14:15:00,,37.0,South Nathaniel_East Carmen,80.0,86.0,2019-06-01,"[Mikayla Harris, Rachel Smith]","[Cathy Meyer, Charles Watson, Jessica Holmes, ...",2,4,,0,5,11,14,1,[]
3,271997824,East Carmen,Joneshaven,2019-06-01 16:18:00,2019-06-01 17:32:00,ECLBAX,rotational problem,43.0,320,64.0,70.0,32.0,2019-06-01 15:35:00,2019-06-01 17:00:00,80.0,32.0,East Carmen_Joneshaven,50.0,32.0,2019-06-01,"[Candace Brooks, Sergio Cummings]","[Amanda Aguilar, Antonio Robbins, Jackie Black...",2,4,both,5,5,15,17,3,"[cp, cp, ca, ca, ca, ca]"
4,271998033,Joneshaven,East Carmen,2019-06-01 18:04:00,2019-06-01 19:14:00,ECLBAX,other problem,14.0,320,42.0,50.0,,2019-06-01 17:50:00,2019-06-01 19:10:00,,4.0,Joneshaven_East Carmen,,,2019-06-01,"[Candace Brooks, Sergio Cummings]","[Amanda Aguilar, Antonio Robbins, Jackie Black...",2,4,,0,5,17,19,1,[]


In [41]:
# Drop columns which will not be used --> Is this the best place to do so?
df = df.drop(['flt_leg', 'flt_offblock', 'flt_onblock', 'flt_sched_dep', 'flt_sched_arr', 'cp_crew', 'ca_crew', 'cc_roles'], axis=1)

In [42]:
# One-hot encode all catgorical variables
df_one_hot = pd.get_dummies(df, drop_first=True)
df_one_hot.dropna(axis=0, how='any', inplace=True)

In [84]:
# Generate train/test splits for both intermediate models
X_train_block, X_test_block, y_train_block, y_test_block = train_test_split(df_one_hot.drop(['block_delay'], axis=1), df_one_hot['block_delay'], test_size=0.33, random_state=42)
X_train_ground, X_test_ground, y_train_ground, y_test_ground = train_test_split(df_one_hot.drop(['flt_dep_delay'], axis=1), df_one_hot['flt_dep_delay'], test_size=0.33, random_state=42)

# Create dataframe to save model evaluation parameters
eval = pd.DataFrame(columns= ['Group', 'Model', 'Parameters', 'R^2 test', 'RMSE test', 'R^2 train', 'RMSE train'])


## Linear regression model

In [85]:
# Create a linear regression model for block and ground delay
lr_block = LinearRegression(fit_intercept=True)
lr_ground = LinearRegression(fit_intercept=True)

# Fit models to training data
lr_block.fit(X_train_block, y_train_block)
lr_ground.fit(X_train_ground, y_train_ground)

# Predict values for train and test data
lr_pred_block_train = lr_block.predict(X_train_block)
lr_pred_ground_train = lr_ground.predict(X_train_ground)

lr_pred_block_test = lr_block.predict(X_test_block)
lr_pred_ground_test = lr_ground.predict(X_test_ground)

# Save r^2 and RMSE for both models in dataframe for later comparison

eval = eval.append({
    'Group': 'Block',
    'Model': 'Linear',
    'R^2 test': r2_score(y_test_block, lr_pred_block_test),
    'RMSE test': mean_squared_error(y_test_block, lr_pred_block_test, squared=False),
    'R^2 train': r2_score(y_train_block, lr_pred_block_train),
    'RMSE train': mean_squared_error(y_train_block, lr_pred_block_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'Ground',
    'Model': 'Linear',
    'R^2 test': r2_score(y_test_ground, lr_pred_ground_test),
    'RMSE test': mean_squared_error(y_test_ground, lr_pred_ground_test, squared=False),
    'R^2 train': r2_score(y_train_ground, lr_pred_ground_train),
    'RMSE train': mean_squared_error(y_train_ground, lr_pred_ground_train, squared=False)
    }, ignore_index=True)

eval

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-576794100.0,608655.169879,0.944774,6.00093
1,Ground,Linear,,-9507970.0,74374.004184,0.964766,4.590025


## Ridge regression

In [86]:
# Range of regularization parameter alpha
alpha = [int(x) for x in np.linspace(5, 15, 11)]

# Create random grid
param_grid = {'alpha': alpha}

# Create a ridge regression model for block and ground delay
rid_block = Ridge(fit_intercept=True)
rid_ground = Ridge(fit_intercept=True)

# Initiate the grid search models
grid_block = GridSearchCV(estimator=rid_block, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_ground = GridSearchCV(estimator=rid_ground, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search models
grid_block.fit(X_train_block, y_train_block)
grid_ground.fit(X_train_ground, y_train_ground)

# Print best parameters for the models
print(grid_block.best_params_)
print(grid_ground.best_params_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 11}
{'alpha': 7}


In [87]:
# Create ridge regression models with best alpha values
rid_block = Ridge(alpha=11, fit_intercept=True)
rid_ground = Ridge(alpha=7, fit_intercept=True)

# Fit models to training data
rid_block.fit(X_train_block, y_train_block)
rid_ground.fit(X_train_ground, y_train_ground)

# Predict values for train and test data
rid_pred_block_train = rid_block.predict(X_train_block)
rid_pred_ground_train = rid_ground.predict(X_train_ground)

rid_pred_block_test = rid_block.predict(X_test_block)
rid_pred_ground_test = rid_ground.predict(X_test_ground)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'Block',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_block, rid_pred_block_test),
    'RMSE test': mean_squared_error(y_test_block, rid_pred_block_test, squared=False),
    'R^2 train': r2_score(y_train_block, rid_pred_block_train),
    'RMSE train': mean_squared_error(y_train_block, rid_pred_block_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'Ground',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_ground, rid_pred_ground_test),
    'RMSE test': mean_squared_error(y_test_ground, rid_pred_ground_test, squared=False),
    'R^2 train': r2_score(y_train_ground, rid_pred_ground_train),
    'RMSE train': mean_squared_error(y_train_ground, rid_pred_ground_train, squared=False)
    }, ignore_index=True)

eval

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-576794100.0,608655.169879,0.944774,6.00093
1,Ground,Linear,,-9507970.0,74374.004184,0.964766,4.590025
2,Block,Ridge,,0.933855,6.517928,0.942135,6.142618
3,Ground,Ridge,,0.9585878,4.908418,0.963839,4.650016


## Lasso Regression

In [88]:
# Range of regularization parameter alpha
alpha = [int(x) for x in np.linspace(0, 10, 11)]

# Create random grid
param_grid = {'alpha': alpha}

# Create a lasso regression model for block and ground delay
las_block = Lasso(fit_intercept=True)
las_ground = Lasso(fit_intercept=True)

# Initiate the grid search models
grid_block = GridSearchCV(estimator=las_block, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_ground = GridSearchCV(estimator=las_ground, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search models
grid_block.fit(X_train_block, y_train_block)
grid_ground.fit(X_train_ground, y_train_ground)

# Print best parameters for the models
print(grid_block.best_params_)
print(grid_ground.best_params_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 0}
{'alpha': 0}


In [89]:
# Create lasso regression models with best alpha values
las_block = Lasso(alpha=0, fit_intercept=True)
las_ground = Lasso(alpha=0, fit_intercept=True)

# Fit models to training data
las_block.fit(X_train_block, y_train_block)
las_ground.fit(X_train_ground, y_train_ground)

# Predict values for train and test data
las_pred_block_train = las_block.predict(X_train_block)
las_pred_ground_train = las_ground.predict(X_train_ground)

las_pred_block_test = las_block.predict(X_test_block)
las_pred_ground_test = las_ground.predict(X_test_ground)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'Block',
    'Model': 'Lasso',
    'R^2 test': r2_score(y_test_block, las_pred_block_test),
    'RMSE test': mean_squared_error(y_test_block, las_pred_block_test, squared=False),
    'R^2 train': r2_score(y_train_block, las_pred_block_train),
    'RMSE train': mean_squared_error(y_train_block, las_pred_block_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'Ground',
    'Model': 'Lasso',
    'R^2 test': r2_score(y_test_ground, las_pred_ground_test),
    'RMSE test': mean_squared_error(y_test_ground, las_pred_ground_test, squared=False),
    'R^2 train': r2_score(y_train_ground, las_pred_ground_train),
    'RMSE train': mean_squared_error(y_train_ground, las_pred_ground_train, squared=False)
    }, ignore_index=True)

eval

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-576794100.0,608655.169879,0.944774,6.00093
1,Ground,Linear,,-9507970.0,74374.004184,0.964766,4.590025
2,Block,Ridge,,0.933855,6.517928,0.942135,6.142618
3,Ground,Ridge,,0.9585878,4.908418,0.963839,4.650016
4,Block,Lasso,,0.9303925,6.686346,0.944775,6.000883
5,Ground,Lasso,,0.9573824,4.979337,0.964767,4.589917


## Huber Regression

## Random Forest

## Support Vector Regression

In [12]:
# Create the parameter grid to sample from during fitting

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of leafs in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
    }

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [13]:
# Use the random grid to search for the best hyperparameters
# First create the base model to tune
rf_block = RandomForestRegressor()
# Random search of parameters, using 5 fold cross validation, search across 100 different combinations,
# and use all available cores
rf_random = RandomizedSearchCV(estimator=rf_block, param_distributions=random_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train_onblock, y_train_onblock)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [14]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [20]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [90, 100, 110],
    'max_features': [1, 2, 3],
    'min_samples_leaf': [1, 2, 4, 8],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [700, 800, 900]    
}

# Create a based model
rf = RandomForestRegressor()

# Initiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=100)

# Fit the grid search to the data
grid_search.fit(X_train_onblock, y_train_onblock)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [21]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 800}

In [15]:
# Implement second baseline model as random forest regression
rand_for_off = RandomForestRegressor(n_estimators = 800, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_depth=100, bootstrap=True, random_state=42)
rand_for_off.fit(X_train_offblock, y_train_offblock)
rand_for_off.predict(X_test_offblock)

rand_for_on = RandomForestRegressor(n_estimators = 10, random_state=42)
rand_for_on.fit(X_train_onblock, y_train_onblock)
rand_for_on.predict(X_test_onblock)

array([18.1, 18.5, 34.9, ..., 51.1, 17.1, 23.8])

In [22]:
r2_offblock = rand_for_off.score(X_test_offblock, y_test_offblock)
rmse_offblock = np.sqrt(mean_squared_error(y_test_offblock, rand_for_off.predict(X_test_offblock)))
print('The r^2 for offblock time is ' + str(round(r2_offblock, 4)))
print('The RMSE for offblock time is ' + str(round(rmse_offblock, 2)) + 'minutes.')
print('\n')

r2_onblock = rand_for_on.score(X_test_onblock, y_test_onblock)
rmse_onblock = np.sqrt(mean_squared_error(y_test_onblock, rand_for_on.predict(X_test_onblock)))
print('The r^2 for onblock time is ' + str(round(r2_onblock, 4)))
print('The RMSE for onblock time is ' + str(round(rmse_onblock, 2)) + 'minutes.')
print('\n')

The r^2 for offblock time is 0.9411
The RMSE for offblock time is 6.15minutes.


The r^2 for onblock time is 0.963
The RMSE for onblock time is 4.64minutes.




In [25]:
# Implement second baseline model as random forest regression
rand_for_off = RandomForestRegressor(n_estimators = 800, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_depth=90, bootstrap=True, random_state=42)
rand_for_off.fit(X_train_offblock, y_train_offblock)
rand_for_off.predict(X_test_offblock)

array([ 8.97125,  8.9725 , 26.40625, ..., 46.69125, 10.06875, 18.86125])

In [26]:
r2_offblock = rand_for_off.score(X_test_offblock, y_test_offblock)
rmse_offblock = np.sqrt(mean_squared_error(y_test_offblock, rand_for_off.predict(X_test_offblock)))
print('The r^2 for offblock time is ' + str(round(r2_offblock, 4)))
print('The RMSE for offblock time is ' + str(round(rmse_offblock, 2)) + 'minutes.')
print('\n')

The r^2 for offblock time is 0.9411
The RMSE for offblock time is 6.15minutes.


