# Building regression models for arr_delay
Identifying the most suitable model among different modeling approaches, including linear regression, random forest regression, boosted tree regression and support vector regression. For each modeling approach, different hyperparameters are analysed, supported by k-fold cross-validation.

In [91]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import QuantileTransformer
# from pprint import pprint


In [92]:
# Adjust settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline

In [93]:
# Read the preprocessed, one-hot encoded dataset from disk
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
data_path = '../data/processed/'
pickle_file = os.path.join(data_path, 'final_one_hot.pkl')
df = pd.read_pickle(pickle_file)

In [94]:
# Separate the predictors from the target column 'arr_delay'
features = df.drop(['arr_delay'], axis=1)
target = df['arr_delay']

# Hold out 33% of the rows as test data (fixed seed for reproducibility)
X_train_arr, X_test_arr, y_train_arr, y_test_arr = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)

# Empty frame that collects the evaluation metrics of every model
# NOTE(review): the name `eval` shadows the Python builtin; it is kept because later cells use it.
metric_columns = ['Group', 'Model', 'R^2 test', 'RMSE test', 'R^2 train', 'RMSE train']
eval = pd.DataFrame(columns=metric_columns)

## Preprocessing of data (non-linear transformation)
Using QuantileTransformer to preprocess continuous data

In [95]:
# Disabled experiment: map the continuous columns to a normal distribution with
# QuantileTransformer and inspect them in a pairplot. Nothing in this cell is executed.
# NOTE(review): `continuous` includes the target 'arr_delay' and the transformer would be
# fitted on the full `df` (before the train/test split) - confirm this is intended before re-enabling.
# qt = QuantileTransformer(output_distribution='normal', random_state=42)
# continuous = df[['dep_delay', 'gnd_mingt', 'arr_delay', 'sched_gt', 'sched_gt_before',  'sched_trans_time', 'mod_sched_arr', 'density']]
# df_cont = pd.DataFrame(qt.fit_transform(continuous))

# # Pairplot of all numerical values including dependent variable
# sns.pairplot(df_cont, diag_kind='kde')

## 1) Linear regression model

In [96]:
# Create a linear regression model for arr delay
lr_arr = LinearRegression(fit_intercept=True)

# Fit model to training data
lr_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
lr_pred_arr_train = lr_arr.predict(X_train_arr)
lr_pred_arr_test = lr_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# RMSE is computed as sqrt(MSE) because the `squared` argument of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
lr_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'Linear',
    'R^2 test': r2_score(y_test_arr, lr_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, lr_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, lr_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, lr_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, lr_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206


## 2) Ridge regression

In [97]:
# Candidate values for the regularization parameter alpha
# (trailing comment kept from the original notebook, presumably earlier search rounds)
alpha = [2, 3, 4] #[1, 5, 10, 20], [4, 5, 6], [2, 3, 4]

# Search space handed to the grid search
param_grid = dict(alpha=alpha)

# Base ridge regression estimator for arr delay
rid_arr = Ridge(fit_intercept=True)

# Exhaustive search with 3-fold cross-validation on all available cores
grid_arr = GridSearchCV(
    estimator=rid_arr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_arr.fit(X_train_arr, y_train_arr)

# Report the best parameter combination
print(grid_arr.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
{'alpha': 3}


In [98]:
# Create ridge regression model with the best alpha found by the grid search
rid_arr = Ridge(alpha=3, fit_intercept=True)

# Fit model to training data
rid_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
rid_pred_arr_train = rid_arr.predict(X_train_arr)
rid_pred_arr_test = rid_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# RMSE is computed as sqrt(MSE) because the `squared` argument of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
rid_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_arr, rid_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, rid_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, rid_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, rid_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, rid_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262


## 3) Lasso Regression

In [99]:
# Candidate values for the regularization parameter alpha
# (trailing comment kept from the original notebook, presumably an earlier search round)
# NOTE(review): the scikit-learn docs advise against alpha=0 with Lasso for numerical
# reasons (it corresponds to ordinary least squares) - confirm this candidate is intended.
alpha = [0, 0.3, 0.7] #[0, 1, 10, 100, 1000]

# Search space handed to the grid search
param_grid = dict(alpha=alpha)

# Base lasso regression estimator for arr delay
las_arr = Lasso(fit_intercept=True)

# Exhaustive search with 3-fold cross-validation on all available cores
grid_arr = GridSearchCV(
    estimator=las_arr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_arr.fit(X_train_arr, y_train_arr)

# Report the best parameter combination
print(grid_arr.best_params_)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
{'alpha': 0}


In [100]:
# Create lasso regression model with the best alpha found by the grid search
# NOTE(review): the scikit-learn docs advise against alpha=0 with Lasso for numerical
# reasons; the value is kept because it is the grid search result.
las_arr = Lasso(alpha=0, fit_intercept=True)

# Fit model to training data
las_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
las_pred_arr_train = las_arr.predict(X_train_arr)
las_pred_arr_test = las_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# RMSE is computed as sqrt(MSE) because the `squared` argument of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
las_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'Lasso',
    'R^2 test': r2_score(y_test_arr, las_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, las_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, las_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, las_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, las_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262
2,arr,Lasso,0.9,6.615,0.91,6.206


## 4) Huber Regression

In [101]:
# Search space for the Huber regressor
# (trailing comments kept from the original notebook, presumably earlier search rounds)
epsilon = [1, 2, 3] #[1, 5, 10], [4, 5, 6], [2, 3, 4]
alpha = [0.4, 0.5, 0.7] #[0, 1, 10, 100], [0, 0.3, 0.7], [0.6, 0.7, 0.8]
param_grid = dict(epsilon=epsilon, alpha=alpha)

# Base Huber regression estimator for arr delay
hub_arr = HuberRegressor(fit_intercept=True)

# Exhaustive search with 3-fold cross-validation on all available cores
grid_arr = GridSearchCV(
    estimator=hub_arr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_arr.fit(X_train_arr, y_train_arr)

# Report the best parameter combination
print(grid_arr.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
{'alpha': 0.5, 'epsilon': 2}


In [102]:
# Create Huber regression model with the best alpha and epsilon values
hub_arr = HuberRegressor(alpha=0.5, epsilon=2, fit_intercept=True)

# Fit model to training data
hub_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
hub_pred_arr_train = hub_arr.predict(X_train_arr)
hub_pred_arr_test = hub_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# Values are stored unrounded (like the Linear/Ridge/Lasso rows); rounding only
# happens for display. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
hub_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'Huber',
    'R^2 test': r2_score(y_test_arr, hub_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, hub_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, hub_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, hub_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, hub_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262
2,arr,Lasso,0.9,6.615,0.91,6.206
3,arr,Huber,0.879,7.265,0.879,7.198


## 5) ElasticNet

In [None]:
# Search space for the ElasticNet regressor
# (trailing comments kept from the original notebook, presumably earlier search rounds)
alpha = [0] #[0, 1, 2]
l1_ratio = [0] #[0, 0.5, 1]
max_iter = [10, 50, 100] #[100, 300, 500]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter)

# Base ElasticNet regression estimator for arr delay
en_arr = ElasticNet(random_state=42)

# Exhaustive search with 3-fold cross-validation on all available cores
grid_arr = GridSearchCV(
    estimator=en_arr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_arr.fit(X_train_arr, y_train_arr)

# Report the best parameter combination
print(grid_arr.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
{'alpha': 0, 'l1_ratio': 0, 'max_iter': 100}


In [None]:
# Create ElasticNet regression model with the best parameters from the grid search
en_arr = ElasticNet(alpha=0, l1_ratio=0, max_iter=100, random_state=42)

# Fit model to training data
en_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
en_pred_arr_train = en_arr.predict(X_train_arr)
en_pred_arr_test = en_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# Values are stored unrounded (like the Linear/Ridge/Lasso rows); rounding only
# happens for display. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
en_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'ElasticNet',
    'R^2 test': r2_score(y_test_arr, en_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, en_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, en_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, en_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, en_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262
2,arr,Lasso,0.9,6.615,0.91,6.206
3,arr,Huber,0.879,7.265,0.879,7.198
4,arr,GradientBoosting,0.898,6.687,0.919,5.887
5,arr,ElasticNet,0.9,6.619,0.91,6.21


## 6) Decision Tree Regression

In [None]:
# Randomized search and then

In [None]:
# Create the parameter grid to sample from during fitting
# (trailing comments kept from the original notebook, presumably earlier search rounds)

# Maximum depth of the tree
max_depth = [19, 20, 21] # [1, 2, 4, 8, 16, 32, 63, 128], [25, 30, 35],[15, 20, 25]
# Minimum impurity decrease
min_impurity_decrease = [0.05, 0.1, 0.15] # [0.1, 0.4, 0.6, 0.9]
# Minimum number of samples required at each leaf node
min_samples_leaf = [17, 18, 19] # [1, 2, 4, 8, 16], [15, 20, 25], [19, 20, 21]

# Create the random grid
random_grid = {
    'max_depth': max_depth,
    'min_impurity_decrease': min_impurity_decrease,
    'min_samples_leaf': min_samples_leaf,
    }

# Create a decision tree regression model for arr delay
dt_arr = DecisionTreeRegressor(random_state=42)

# Initiate the randomized search (samples the default n_iter=10 of the 27 combinations).
# random_state is fixed so the sampled candidates, and therefore the reported best
# parameters, are reproducible across runs.
dt_arr_random = RandomizedSearchCV(estimator=dt_arr, param_distributions=random_grid, cv=3, n_jobs=-1, random_state=42, verbose=1)

# Fit the randomized search
dt_arr_random.fit(X_train_arr, y_train_arr)

# Print best parameters for the model
print(dt_arr_random.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'min_samples_leaf': 19, 'min_impurity_decrease': 0.1, 'max_depth': 20}


In [None]:
# Create Decision Tree regression model with the best parameters from the search
dt_arr = DecisionTreeRegressor(max_depth=20, min_impurity_decrease=0.1, min_samples_leaf=19, random_state=42)

# Fit model to training data
dt_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
dt_pred_arr_train = dt_arr.predict(X_train_arr)
dt_pred_arr_test = dt_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# Values are stored unrounded (like the Linear/Ridge/Lasso rows); rounding only
# happens for display. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
dt_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'Decision Tree',
    'R^2 test': r2_score(y_test_arr, dt_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, dt_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, dt_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, dt_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, dt_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262
2,arr,Lasso,0.9,6.615,0.91,6.206
3,arr,Huber,0.879,7.265,0.879,7.198
4,arr,GradientBoosting,0.898,6.687,0.919,5.887
5,arr,ElasticNet,0.9,6.619,0.91,6.21
6,arr,Decision Tree,0.874,7.41,0.886,6.984


## 7) Gradient Boosting

In [None]:
# Randomized Search and then cv

In [103]:
# Search space for the gradient boosting regressor
# (trailing comments kept from the original notebook, presumably earlier search rounds)
learning_rate = [0.1, 0.2, 0.3] #[0, 0.25, 0.5, 0.75, 1]
n_estimators = [100, 150, 200] #[1, 2, 4, 8, 16, 32, 64, 128]
max_depth = [3, 4, 5] #[1, 2, 3]
param_grid = dict(
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    max_depth=max_depth,
)

# Base gradient boosting regression estimator for arr delay
gb_arr = GradientBoostingRegressor(random_state=42)

# Exhaustive search with 3-fold cross-validation on all available cores
grid_arr = GridSearchCV(
    estimator=gb_arr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_arr.fit(X_train_arr, y_train_arr)

# Report the best parameter combination
print(grid_arr.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}


In [104]:
# Create GradientBoosting regression model with the best parameters from the grid search
gb_arr = GradientBoostingRegressor(learning_rate=0.2, max_depth=3, n_estimators=150, random_state=42)

# Fit model to training data
gb_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
gb_pred_arr_train = gb_arr.predict(X_train_arr)
gb_pred_arr_test = gb_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# Values are stored unrounded (like the Linear/Ridge/Lasso rows); rounding only
# happens for display. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
gb_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'GradientBoosting',
    'R^2 test': r2_score(y_test_arr, gb_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, gb_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, gb_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, gb_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, gb_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262
2,arr,Lasso,0.9,6.615,0.91,6.206
3,arr,Huber,0.879,7.265,0.879,7.198
4,arr,GradientBoosting,0.898,6.687,0.919,5.887


## 8) Random Forest

In [116]:
# Create the parameter grid to sample from during fitting

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(200, 1000, num=10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the former 'auto' option, which was deprecated in
# scikit-learn 1.1 and removed in 1.3; for regressors both mean "use all features".
max_features = [1.0, 'sqrt']
# Maximum depth of each tree (None = grow until the other stopping rules apply)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
    }

# Create a random forest regression model for arr delay
rf_arr = RandomForestRegressor(random_state=42)

# Initiate the randomized search (100 sampled candidates, 3-fold CV, seeded)
rf_arr_random = RandomizedSearchCV(estimator=rf_arr, param_distributions=random_grid, n_iter=100, cv=3, n_jobs=-1, random_state=42, verbose=1)

# Fit the randomized search
rf_arr_random.fit(X_train_arr, y_train_arr)

# Print best parameters for the model
print(rf_arr_random.best_params_)

UsageError: Line magic function `%%time` not found.


In [125]:
# Create the parameter grid
# (trailing comments kept from the original notebook, presumably earlier search rounds)

# Number of trees in random forest
n_estimators = [760, 770, 780] #[800, 1000, 1200], [700, 800], [690, 700, 710], [710, 750, 790], [740, 750, 760], [760, 770, 780]
# Maximum depth of each tree
max_depth = [50] #[30, 40, 50], [50, 60]
# Minimum number of samples required to split a node
min_samples_split = [25, 27, 30] #[9, 10, 11], [11, 12, 13], [13, 15, 17], [17, 20, 25], [25, 40, 60], [30, 40, 50], [25, 27, 30]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2] #[1, 2, 3]


# Create grid
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    }

# Initiate the grid search with a fresh, seeded base estimator.
# The search no longer relies on whatever `rf_arr` currently holds in the kernel
# (another cell rebinds it to an already tuned model), so the cell is re-runnable.
grid_arr = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)

# Fit the grid search
grid_arr.fit(X_train_arr, y_train_arr)

# Print best parameters for the model
print(grid_arr.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
{'max_depth': 50, 'min_samples_leaf': 2, 'min_samples_split': 30, 'n_estimators': 760}


In [126]:
# Create RandomForest regression model with the best parameters from the grid search
rf_arr = RandomForestRegressor(max_depth=50, min_samples_leaf=2, min_samples_split=30, n_estimators=760, random_state=42)

# Fit model to training data
rf_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
rf_pred_arr_train = rf_arr.predict(X_train_arr)
rf_pred_arr_test = rf_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# Values are stored unrounded (like the Linear/Ridge/Lasso rows); rounding only
# happens for display. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
rf_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'RandomForest',
    'R^2 test': r2_score(y_test_arr, rf_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, rf_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, rf_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, rf_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, rf_scores], ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,R^2 test,RMSE test,R^2 train,RMSE train
0,arr,Linear,-3025800008616.141,36386737.571,0.91,6.206
1,arr,Ridge,0.901,6.567,0.908,6.262
2,arr,Lasso,0.9,6.615,0.91,6.206
3,arr,Huber,0.879,7.265,0.879,7.198
4,arr,GradientBoosting,0.898,6.687,0.919,5.887
5,arr,ElasticNet,0.9,6.619,0.91,6.21
6,arr,Decision Tree,0.874,7.41,0.886,6.984
7,arr,GradientBoosting,0.894,6.817,0.935,5.269


## 9) Support Vector Regression

In [65]:
# Search space for the support vector regressor: only the kernel type is varied
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(kernel=kernel)

# Base support vector regression estimator for arr delay
svr_arr = SVR()

# Exhaustive search with 3-fold cross-validation on all available cores
grid_arr = GridSearchCV(
    estimator=svr_arr,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_arr.fit(X_train_arr, y_train_arr)

# Report the best parameter combination
print(grid_arr.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


KeyboardInterrupt: 

In [66]:
# Create SVR regression model with a linear kernel
svr_arr = SVR(kernel='linear')

# Fit model to training data
svr_arr.fit(X_train_arr, y_train_arr)

# Predict values for train and test data
svr_pred_arr_train = svr_arr.predict(X_train_arr)
svr_pred_arr_test = svr_arr.predict(X_test_arr)

# Save r^2 and RMSE for model in dataframe for later comparison.
# Values are stored unrounded (like the Linear/Ridge/Lasso rows); rounding only
# happens for display. RMSE is computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error is deprecated/removed in recent scikit-learn.
svr_scores = pd.DataFrame([{
    'Group': 'arr',
    'Model': 'SVR',
    'R^2 test': r2_score(y_test_arr, svr_pred_arr_test),
    'RMSE test': np.sqrt(mean_squared_error(y_test_arr, svr_pred_arr_test)),
    'R^2 train': r2_score(y_train_arr, svr_pred_arr_train),
    'RMSE train': np.sqrt(mean_squared_error(y_train_arr, svr_pred_arr_train))
    }])

# DataFrame.append was removed in pandas 2.0, so concatenate instead.
# An empty `eval` is passed as None (pd.concat drops None entries) so that its
# object-dtype columns do not influence the dtypes of the metric columns.
eval = pd.concat([None if eval.empty else eval, svr_scores], ignore_index=True)

eval.round(decimals=3)

Confidence over the day:
RMSE depending on the flights of the day
RMSE depending on the time of day


PCA
Linear regression: drop variables

## Finding the best overall model

In [None]:
# Initializing the different estimators to be tested.
# Estimators with a stochastic component are seeded with random_state=42, the same
# seed used for them elsewhere in this notebook, so the comparison is reproducible.
est1 = LinearRegression()
est2 = Ridge()
est3 = Lasso()
est4 = HuberRegressor()
est5 = GradientBoostingRegressor(random_state=42)
est6 = ElasticNet(random_state=42)
est7 = DecisionTreeRegressor(random_state=42)
est8 = RandomForestRegressor(random_state=42)
est9 = SVR()