In [22]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import QuantileTransformer
# from pprint import pprint

In [8]:
# Adjust settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
%matplotlib inline

In [9]:
# Load the preprocessed dataset from its pickle file
data_path = '../data/processed/'
pickle_file = os.path.join(data_path, 'final_one_hot.pkl')
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
df = pd.read_pickle(pickle_file)

In [10]:
# Separate the target column from the predictors, then hold out 33% of the rows for testing
target_col = 'arr_delay'
features = df.drop(columns=[target_col])
target = df[target_col]
X_train_arr, X_test_arr, y_train_arr, y_test_arr = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,  # fixed seed so the split is reproducible
)

# Empty table intended to hold model evaluation metrics.
# NOTE(review): the name `eval` shadows Python's built-in eval(); it is kept because
# cells outside this view may rely on it.
eval = pd.DataFrame(
    columns=['Group', 'Model', 'R^2 test', 'RMSE test', 'R^2 train', 'RMSE train']
)

In [39]:
%%time

# Setup different regressors
regr_01 = LinearRegression()
regr_02 = Ridge()
regr_03 = Lasso()
regr_04 = HuberRegressor()
regr_05 = GradientBoostingRegressor()
regr_06 = ElasticNet()
regr_07 = DecisionTreeRegressor()
regr_08 = RandomForestRegressor()
regr_09 = SVR()

# Initialize hyperparameters for regressor
param_01 = {}

param_02 = {}
param_02['regressor__alpha'] = [3] # [1, 5, 10], [4, 5, 6], [2, 3, 4]
param_02['regressor'] = [regr_02]

param_03 = {}
param_03['regressor__alpha'] = [0, 1, 10, 100]
param_03['regressor'] = [regr_03]

param_04 = {}
param_05 = {}
param_06 = {}
param_07 = {}

# Create Pipeline
pipeline = Pipeline([('regressor', regr_03)])
params = [param_02, param_03, param_04, param_05, param_06, param_07]

# Initiate Gridsearch model
gs_arr = GridSearchCV(pipeline, params, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)

# Train GridSearch
gs_arr.fit(X_train_arr, y_train_arr)

# Print best parameters for the models
print(gs_arr.best_estimator_, gs_arr.best_params_)

r2_arr_delay = gs_arr.score(X_test_arr, y_test_arr)
rmse_arr_delay = np.sqrt(mean_squared_error(y_test_arr, gs_arr.predict(X_test_arr)))
print('The r^2 for arrival delay is ' + str(round(r2_arr_delay, 4)))
print('The RMSE for arrival delay is ' + str(round(rmse_arr_delay, 2)) + 'minutes.')
print('\n')


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Pipeline(steps=[('regressor', Ridge(alpha=3))]) {'regressor': Ridge(alpha=3), 'regressor__alpha': 3}
The r^2 for arrival delay is -6.567
The RMSE for arrival delay is 6.57minutes.


CPU times: user 206 ms, sys: 122 ms, total: 328 ms
Wall time: 2.96 s
