# Building regression models for arr_delay and dep_delay
Identifying the most suitable model out of different modeling approaches including, linear regression, nonlinear regression, random forest regression, boosted tree regression & support verctor regression. For each model approach different hyperparameters will be analysed supported by k-fold cross-validation.

In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import QuantileTransformer
from pprint import pprint


In [4]:
# Adjust settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
%matplotlib inline

In [5]:
# Load preprocessed dataset

data_path = '../data/processed/'
df = pd.read_pickle(os.path.join(data_path, 'final.pkl'))

In [6]:
df.head()

Unnamed: 0,flt_ac_reg,ground_delay,flt_ac_type,flt_tt,flt_sched_tt,block_delay,routing,sched_gt,act_gt,cp_count,ca_count,cc_cp_ca,cc_count,day_of_week,hour_of_day_dep,hour_of_day_arr,cc_types
9,ECLBAX,25.0,320,0.0,0.0,16.0,New Jessica_East Carmen,95.0,94.0,2,4,none,0,5,3,6,0
117,ECLBAX,15.0,320,60.0,60.0,6.0,East Carmen_South Nathaniel,75.0,120.0,2,4,both,6,5,8,10,3
194,ECLBAX,51.0,320,68.0,75.0,37.0,South Nathaniel_East Carmen,80.0,86.0,2,4,none,0,5,11,14,1
268,ECLBAX,43.0,320,64.0,70.0,32.0,East Carmen_Joneshaven,50.0,32.0,2,4,both,6,5,15,17,3
416,ECLBAX,20.0,320,73.0,45.0,22.0,East Carmen_Joneshaven,45.0,38.0,2,4,none,0,6,6,8,0


In [7]:
# One-hot encode all catgorical variables
df_one_hot = pd.get_dummies(df, drop_first=True)
df_one_hot.dropna(axis=0, how='any', inplace=True)
df_one_hot.shape

(7975, 389)

In [8]:
# Generate train/test splits for both intermediate models
X_train_arr, X_test_arr, y_train_arr, y_test_arr = train_test_split(df_one_hot.drop(['arr_delay'], axis=1), df_one_hot['arr_delay'], test_size=0.33, random_state=42)
X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(df_one_hot.drop(['dep_delay'], axis=1), df_one_hot['dep_delay'], test_size=0.33, random_state=42)

# Create dataframe to save model evaluation parameters
eval = pd.DataFrame(columns= ['Group', 'Model', 'Parameters', 'R^2 test', 'RMSE test', 'R^2 train', 'RMSE train'])


## Linear regression model

In [9]:
# Create a linear regression model for arr and dep delay
lr_arr = LinearRegression(fit_intercept=True)
lr_dep = LinearRegression(fit_intercept=True)

# Fit models to training data
lr_arr.fit(X_train_arr, y_train_arr)
lr_dep.fit(X_train_dep, y_train_dep)

# Predict values for train and test data
lr_pred_arr_train = lr_arr.predict(X_train_arr)
lr_pred_dep_train = lr_dep.predict(X_train_dep)

lr_pred_arr_test = lr_arr.predict(X_test_arr)
lr_pred_dep_test = lr_dep.predict(X_test_dep)

# Save r^2 and RMSE for both models in dataframe for later comparison

eval = eval.append({
    'Group': 'arr',
    'Model': 'Linear',
    'R^2 test': r2_score(y_test_arr, lr_pred_arr_test),
    'RMSE test': mean_squared_error(y_test_arr, lr_pred_arr_test, squared=False),
    'R^2 train': r2_score(y_train_arr, lr_pred_arr_train),
    'RMSE train': mean_squared_error(y_train_arr, lr_pred_arr_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'dep',
    'Model': 'Linear',
    'R^2 test': r2_score(y_test_dep, lr_pred_dep_test),
    'RMSE test': mean_squared_error(y_test_dep, lr_pred_dep_test, squared=False),
    'R^2 train': r2_score(y_train_dep, lr_pred_dep_train),
    'RMSE train': mean_squared_error(y_train_dep, lr_pred_dep_train, squared=False)
    }, ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-254598900000000.0,307185900.0,0.919,5.574
1,Ground,Linear,,-3.108956e+16,3187293000.0,0.913,5.412


## Ridge regression

In [10]:
# Range of regularization parameter alpha
alpha = [int(x) for x in np.linspace(1, 3, 20)]

# Create random grid
param_grid = {'alpha': alpha}

# Create a ridge regression model for arr and dep delay
rid_arr = Ridge(fit_intercept=True)
rid_dep = Ridge(fit_intercept=True)

# Initiate the grid search models
grid_arr = GridSearchCV(estimator=rid_arr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_dep = GridSearchCV(estimator=rid_dep, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search models
grid_arr.fit(X_train_arr, y_train_arr)
grid_dep.fit(X_train_dep, y_train_dep)

# Print best parameters for the models
print(grid_arr.best_params_)
print(grid_dep.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'alpha': 2}
{'alpha': 2}


In [11]:
# Create ridge regression models with best alpha values
rid_arr = Ridge(alpha=2, fit_intercept=True)
rid_dep = Ridge(alpha=2, fit_intercept=True)

# Fit models to training data
rid_arr.fit(X_train_arr, y_train_arr)
rid_dep.fit(X_train_dep, y_train_dep)

# Predict values for train and test data
rid_pred_arr_train = rid_arr.predict(X_train_arr)
rid_pred_dep_train = rid_dep.predict(X_train_dep)

rid_pred_arr_test = rid_arr.predict(X_test_arr)
rid_pred_dep_test = rid_dep.predict(X_test_dep)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'arr',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_arr, rid_pred_arr_test),
    'RMSE test': mean_squared_error(y_test_arr, rid_pred_arr_test, squared=False),
    'R^2 train': r2_score(y_train_arr, rid_pred_arr_train),
    'RMSE train': mean_squared_error(y_train_arr, rid_pred_arr_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'dep',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_dep, rid_pred_dep_test),
    'RMSE test': mean_squared_error(y_test_dep, rid_pred_dep_test, squared=False),
    'R^2 train': r2_score(y_train_dep, rid_pred_dep_train),
    'RMSE train': mean_squared_error(y_train_dep, rid_pred_dep_train, squared=False)
    }, ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-254598900000000.0,307185900.0,0.919,5.574
1,Ground,Linear,,-3.108956e+16,3187293000.0,0.913,5.412
2,Block,Ridge,,0.908,5.847,0.917,5.626
3,Ground,Ridge,,0.9,5.712,0.911,5.459


## Lasso Regression

In [12]:
# Range of regularization parameter alpha
alpha = [int(x) for x in np.linspace(0, 10, 11)]

# Create random grid
param_grid = {'alpha': alpha}

# Create a lasso regression model for arr and dep delay
las_arr = Lasso(fit_intercept=True)
las_dep = Lasso(fit_intercept=True)

# Initiate the grid search models
grid_arr = GridSearchCV(estimator=las_arr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_dep = GridSearchCV(estimator=las_dep, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search models
grid_arr.fit(X_train_arr, y_train_arr)
grid_dep.fit(X_train_dep, y_train_dep)

# Print best parameters for the models
print(grid_arr.best_params_)
print(grid_dep.best_params_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 0}
{'alpha': 0}


In [13]:
# Create lasso regression models with best alpha values
las_arr = Lasso(alpha=0, fit_intercept=True)
las_dep = Lasso(alpha=0, fit_intercept=True)

# Fit models to training data
las_arr.fit(X_train_arr, y_train_arr)
las_dep.fit(X_train_dep, y_train_dep)

# Predict values for train and test data
las_pred_arr_train = las_arr.predict(X_train_arr)
las_pred_dep_train = las_dep.predict(X_train_dep)

las_pred_arr_test = las_arr.predict(X_test_arr)
las_pred_dep_test = las_dep.predict(X_test_dep)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'arr',
    'Model': 'Lasso',
    'R^2 test': r2_score(y_test_arr, las_pred_arr_test),
    'RMSE test': mean_squared_error(y_test_arr, las_pred_arr_test, squared=False),
    'R^2 train': r2_score(y_train_arr, las_pred_arr_train),
    'RMSE train': mean_squared_error(y_train_arr, las_pred_arr_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'dep',
    'Model': 'Lasso',
    'R^2 test': r2_score(y_test_dep, las_pred_dep_test),
    'RMSE test': mean_squared_error(y_test_dep, las_pred_dep_test, squared=False),
    'R^2 train': r2_score(y_train_dep, las_pred_dep_train),
    'RMSE train': mean_squared_error(y_train_dep, las_pred_dep_train, squared=False)
    }, ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-254598900000000.0,307185900.0,0.919,5.574
1,Ground,Linear,,-3.108956e+16,3187293000.0,0.913,5.412
2,Block,Ridge,,0.908,5.847,0.917,5.626
3,Ground,Ridge,,0.9,5.712,0.911,5.459
4,Block,Lasso,,0.906,5.893,0.918,5.576
5,Ground,Lasso,,0.898,5.784,0.913,5.413


## Huber Regression

In [14]:
# Create the parameter grid

# Range of epsilon
epsilon = [int(x) for x in np.linspace(100, 1000, 10)]

# Range of alpha
alpha = [int(x) for x in np.linspace(0, 1, 10)]

# Create random grid
param_grid = {
    'epsilon': epsilon,
    'alpha': alpha
    }

# Create a Huber regression model for arr and dep delay
hub_arr = HuberRegressor(fit_intercept=True)
hub_dep = HuberRegressor(fit_intercept=True)

# Initiate the grid search models
grid_arr = GridSearchCV(estimator=hub_arr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_dep = GridSearchCV(estimator=hub_dep, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search models
grid_arr.fit(X_train_arr, y_train_arr)
grid_dep.fit(X_train_dep, y_train_dep)

# Print best parameters for the models
print(grid_arr.best_params_)
print(grid_dep.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'alpha': 1, 'epsilon': 200}
{'alpha': 1, 'epsilon': 200}


In [15]:
# Creating Huber regression models with best alpha and apsilon values
hub_arr = HuberRegressor(alpha=1, epsilon=200, fit_intercept=True)
hub_dep = HuberRegressor(alpha=1, epsilon=200, fit_intercept=True)

# Fit models to training data
hub_arr.fit(X_train_arr, y_train_arr)
hub_dep.fit(X_train_dep, y_train_dep)

# Predict values for train and test data
hub_pred_arr_train = hub_arr.predict(X_train_arr)
hub_pred_dep_train = hub_dep.predict(X_train_dep)

hub_pred_arr_test = hub_arr.predict(X_test_arr)
hub_pred_dep_test = hub_dep.predict(X_test_dep)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'arr',
    'Model': 'Huber',
    'R^2 test': round(r2_score(y_test_arr, hub_pred_arr_test), 3),
    'RMSE test': round(mean_squared_error(y_test_arr, hub_pred_arr_test, squared=False), 3),
    'R^2 train': round(r2_score(y_train_arr, hub_pred_arr_train), 3),
    'RMSE train': round(mean_squared_error(y_train_arr, hub_pred_arr_train, squared=False), 3)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'dep',
    'Model': 'Huber',
    'R^2 test': round(r2_score(y_test_dep, hub_pred_dep_test), 3),
    'RMSE test': round(mean_squared_error(y_test_dep, hub_pred_dep_test, squared=False), 3),
    'R^2 train': round(r2_score(y_train_dep, hub_pred_dep_train), 3),
    'RMSE train': round(mean_squared_error(y_train_dep, hub_pred_dep_train, squared=False), 3)
    }, ignore_index=True)

eval.round(decimals=3)

Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Linear,,-254598900000000.0,307185900.0,0.919,5.574
1,Ground,Linear,,-3.108956e+16,3187293000.0,0.913,5.412
2,Block,Ridge,,0.908,5.847,0.917,5.626
3,Ground,Ridge,,0.9,5.712,0.911,5.459
4,Block,Lasso,,0.906,5.893,0.918,5.576
5,Ground,Lasso,,0.898,5.784,0.913,5.413
6,Block,Huber,,0.892,6.323,0.895,6.324
7,Ground,Huber,,0.884,6.163,0.885,6.215


## Decision Tree Regression

In [29]:
# Create the parameter grid to sample from during fitting

# Method of selecting samples for training each tree
ccp_alpha = [float(x) for x in np.linspace(0, 0.2, num=11)]
# Measurement of the quality of a split
criterion = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
# Maximum number of leafs in tree
max_depth = [int(x) for x in np.linspace(80, 100, num=11)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum leaf nodes
max_leaf_nodes = [int(x) for x in np.linspace(140, 180, num=11)]
# Minimum impurity decrease
min_impurity_decrease = [float(x) for x in np.linspace(0.1, 0.3, num=11)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(5, 15, num=11)]
# Minimum weightes fraction of the sum of total of weights required to be at a leaf node
min_weight_fraction_leaf = [float(x) for x in np.linspace(0, 0.1, num=11)]
# Strategy used to split at each node.
splitter = ['best', 'random']

# Create the random grid
random_grid = {
    'ccp_alpha': ccp_alpha,
    'criterion': criterion,
    'max_depth': max_depth,
    'max_features': max_features,
    'max_leaf_nodes': max_leaf_nodes,
    'min_impurity_decrease': min_impurity_decrease,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'min_weight_fraction_leaf': min_weight_fraction_leaf,
    'splitter': splitter
    }

# Create a decision tree regression model for arr and dep delay
dt_arr = DecisionTreeRegressor()
dt_dep = DecisionTreeRegressor()

# Initiate the grid search models
dt_arr_random = RandomizedSearchCV(estimator=dt_arr, param_distributions=random_grid, n_iter=100, cv=5, n_jobs=-1, random_state=42, verbose=1)
dt_dep_random = RandomizedSearchCV(estimator=dt_dep, param_distributions=random_grid, n_iter=100, cv=5, n_jobs=-1, random_state=42, verbose=1)

# Fit the grid search models
dt_arr_random.fit(X_train_arr, y_train_arr)
dt_dep_random.fit(X_train_dep, y_train_dep)

# Print best parameters for the models
print(dt_arr_random.best_params_)
print(dt_dep_random.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'splitter': 'best', 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 5, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.18, 'max_leaf_nodes': 176, 'max_features': 'auto', 'max_depth': 90, 'criterion': 'friedman_mse', 'ccp_alpha': 0.18}
{'splitter': 'random', 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 7, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.12, 'max_leaf_nodes': 160, 'max_features': 'auto', 'max_depth': 92, 'criterion': 'friedman_mse', 'ccp_alpha': 0.14}


In [27]:
# Create the parameter grid to sample from during fitting

# Method of selecting samples for training each tree
ccp_alpha = [0.1]
# Measurement of the quality of a split
criterion = ['friedman_mse']
# Maximum number of leafs in tree
max_depth = [91]
# Number of features to consider at every split
max_features = ['auto']
# Maximum leaf nodes
max_leaf_nodes = [166]
# Minimum impurity decrease
min_impurity_decrease = [float(x) for x in np.linspace(0.2, 0.5, num=11)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Minimum number of samples required to split a node
min_samples_split = [10, 11, 12]
# Minimum weightes fraction of the sum of total of weights required to be at a leaf node
min_weight_fraction_leaf = [0]
# Strategy used to split at each node.
splitter = ['random']

# Create the random grid
param_grid = {
    'ccp_alpha': ccp_alpha,
    'criterion': criterion,
    'max_depth': max_depth,
    'max_features': max_features,
    'max_leaf_nodes': max_leaf_nodes,
    'min_impurity_decrease': min_impurity_decrease,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'min_weight_fraction_leaf': min_weight_fraction_leaf,
    'splitter': splitter
    }

# Create a decision tree regression model for arr and dep delay
dt_arr = DecisionTreeRegressor()
dt_dep = DecisionTreeRegressor()

# Initiate the grid search models
dt_arr_grid = GridSearchCV(estimator=dt_arr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
dt_dep_grid = GridSearchCV(estimator=dt_dep, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search models
dt_arr_grid.fit(X_train_arr, y_train_arr)
dt_dep_grid.fit(X_train_dep, y_train_dep)

# Print best parameters for the models
print(dt_arr_grid.best_params_)
print(dt_dep_grid.best_params_)

Fitting 5 folds for each of 33 candidates, totalling 165 fits
Fitting 5 folds for each of 33 candidates, totalling 165 fits
{'ccp_alpha': 0.1, 'criterion': 'friedman_mse', 'max_depth': 91, 'max_features': 'auto', 'max_leaf_nodes': 166, 'min_impurity_decrease': 0.2, 'min_samples_leaf': 1, 'min_samples_split': 12, 'min_weight_fraction_leaf': 0, 'splitter': 'random'}
{'ccp_alpha': 0.1, 'criterion': 'friedman_mse', 'max_depth': 91, 'max_features': 'auto', 'max_leaf_nodes': 166, 'min_impurity_decrease': 0.35, 'min_samples_leaf': 1, 'min_samples_split': 12, 'min_weight_fraction_leaf': 0, 'splitter': 'random'}


In [None]:
# Creating Decision Tree regression models with best parameters
dt_arr = HuberRegressor(ccp_alpha=0.1, criterion='friedman_mse', max_depth='', max_features='auto',
    max_leaf_nodes='', min_impurity_decrease='', min_samples_leaf=1, min_samples_split)
dt_dep = HuberRegressor(alpha=1, epsilon=200, fit_intercept=True)

# Fit models to training data
dt_arr.fit(X_train_arr, y_train_arr)
dt_dep.fit(X_train_dep, y_train_dep)

# Predict values for train and test data
hub_pred_arr_train = dt_arr.predict(X_train_arr)
hub_pred_dep_train = dt_dep.predict(X_train_dep)

hub_pred_arr_test = dt_arr.predict(X_test_arr)
hub_pred_dep_test = dt_dep.predict(X_test_dep)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'arr',
    'Model': 'Huber',
    'R^2 test': round(r2_score(y_test_arr, hub_pred_arr_test), 3),
    'RMSE test': round(mean_squared_error(y_test_arr, hub_pred_arr_test, squared=False), 3),
    'R^2 train': round(r2_score(y_train_arr, hub_pred_arr_train), 3),
    'RMSE train': round(mean_squared_error(y_train_arr, hub_pred_arr_train, squared=False), 3)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'dep',
    'Model': 'Huber',
    'R^2 test': round(r2_score(y_test_dep, hub_pred_dep_test), 3),
    'RMSE test': round(mean_squared_error(y_test_dep, hub_pred_dep_test, squared=False), 3),
    'R^2 train': round(r2_score(y_train_dep, hub_pred_dep_train), 3),
    'RMSE train': round(mean_squared_error(y_train_dep, hub_pred_dep_train, squared=False), 3)
    }, ignore_index=True)

eval.round(decimals=3)

In [None]:
# Use the random grid to search for the best hyperparameters
# First create the base model to tune
rf_arr = RandomForestRegressor()
# Random search of parameters, using 5 fold cross validation, search across 100 different combinations,
# and use all available cores
rf_random = RandomizedSearchCV(estimator=rf_arr, param_distributions=random_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train_arr, y_train_arr)

## Random Forest

In [29]:
# Create the parameter grid to sample from during fitting

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of leafs in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
    }

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [30]:
# Use the random grid to search for the best hyperparameters
# First create the base model to tune
rf_arr = RandomForestRegressor()
# Random search of parameters, using 5 fold cross validation, search across 100 different combinations,
# and use all available cores
rf_random = RandomizedSearchCV(estimator=rf_arr, param_distributions=random_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train_arr, y_train_arr)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [31]:
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [33]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [90, 100, 110],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [1, 2, 4],
    'n_estimators': [1300, 1400, 1500]    
}

# Create a based model
rf = RandomForestRegressor()

# Initiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=100)

# Fit the grid search to the data
grid_search.fit(X_train_arr, y_train_arr)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 

In [None]:
grid_search.best_params_

## Support Vector Regression

In [None]:
# Implement second baseline model as random forest regression
rand_for_off = RandomForestRegressor(n_estimators = 800, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_depth=100, bootstrap=True, random_state=42)
rand_for_off.fit(X_train_dep, y_train_dep)
rand_for_off.predict(X_test_dep)

rand_for_on = RandomForestRegressor(n_estimators = 10, random_state=42)
rand_for_on.fit(X_train_arr, y_train_arr)
rand_for_on.predict(X_test_arr)

array([18.1, 18.5, 34.9, ..., 51.1, 17.1, 23.8])

In [None]:
r2_offarr = rand_for_off.score(X_test_dep, y_test_dep)
rmse_offarr = np.sqrt(mean_squared_error(y_test_dep, rand_for_off.predict(X_test_dep)))
print('The r^2 for offarr time is ' + str(round(r2_offarr, 4)))
print('The RMSE for offarr time is ' + str(round(rmse_offarr, 2)) + 'minutes.')
print('\n')

r2_arr = rand_for_on.score(X_test_arr, y_test_arr)
rmse_onarr = np.sqrt(mean_squared_error(y_test_arr, rand_for_on.predict(X_test_arr)))
print('The r^2 for onarr time is ' + str(round(r2_onarr, 4)))
print('The RMSE for onarr time is ' + str(round(rmse_onarr, 2)) + 'minutes.')
print('\n')

The r^2 for offblock time is 0.9411
The RMSE for offblock time is 6.15minutes.


The r^2 for onblock time is 0.963
The RMSE for onblock time is 4.64minutes.




In [None]:
# Implement second baseline model as random forest regression
rand_for_off = RandomForestRegressor(n_estimators = 800, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_depth=90, bootstrap=True, random_state=42)
rand_for_off.fit(X_train_dep, y_train_dep)
rand_for_off.predict(X_test_dep)

array([ 8.97125,  8.9725 , 26.40625, ..., 46.69125, 10.06875, 18.86125])

In [None]:
r2_offblock = rand_for_off.score(X_test_dep, y_test_dep)
rmse_offblock = np.sqrt(mean_squared_error(y_test_dep, rand_for_off.predict(X_test_dep)))
print('The r^2 for offblock time is ' + str(round(r2_offblock, 4)))
print('The RMSE for offblock time is ' + str(round(rmse_offblock, 2)) + 'minutes.')
print('\n')

The r^2 for offblock time is 0.9411
The RMSE for offblock time is 6.15minutes.




Confidence over the day:
RMSE in Abhängigkeit der Flüge des Tages
RMSE in Abhängigkeit der Uhrzeit


PCA
Lineare Regression Drop der Variablen