# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from datetime import datetime
# Seed used by the CV splitter below and passed to the XGBoost models.
seed = 343
# Shared 4-fold shuffled splitter; cv_rmse scores every model on it.
kfolds = KFold(
    n_splits=4,
    shuffle=True,
    random_state=seed,
)

In [2]:
# Useful Functions
def cv_rmse(model, x, y):
    """Return the per-fold cross-validated RMSE of ``model`` on ``(x, y)``.

    The folds come from the module-level ``kfolds`` splitter.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    x : feature matrix.
    y : target values.

    Returns
    -------
    Array with one RMSE value per fold; callers typically take ``.mean()``.
    """
    # The scorer reports *negative* MSE, so flip the sign before the root.
    neg_mse_per_fold = cross_val_score(
        model, x, y, scoring="neg_mean_squared_error", cv=kfolds
    )
    return np.sqrt(-neg_mse_per_fold)

def get_total_minutes(td):
    """Convert an ``"H:M"`` duration string into a total number of minutes.

    Parameters
    ----------
    td : str
        Duration with exactly two integer fields separated by ``':'``.

    Returns
    -------
    int
        ``hours * 60 + minutes``.

    Raises
    ------
    ValueError
        If ``td`` does not split into exactly two integer fields.
    """
    whole_hours, extra_minutes = (int(field) for field in td.split(':'))
    return 60 * whole_hours + extra_minutes

# Preprocessing

In [3]:
# Load the raw flight records; every transformation below is applied to this frame.
base_df = pd.read_csv("flights.csv")

# Integer-code the categorical columns (the encoder is refit for each column,
# so after the loop `lb_encode` only holds the classes of the last one).
# NOTE(review): integer codes impose an arbitrary ordering on the categories,
# which the linear model will treat as numeric; one-hot encoding is the usual
# alternative there -- confirm this is acceptable.
lb_encode = LabelEncoder()
categorical_variables = ['AirlineName', 'AirportDest', 'AirportOrig']
for column in categorical_variables:
    base_df[column] = lb_encode.fit_transform(base_df[column])

# Convert the "YYYY/MM/DD-HH:MM" strings to float seconds since 1970-01-01.
# The subtraction is done on naive timestamps, so the result does not depend
# on the timezone or DST rules of the machine running the notebook (unlike
# `datetime.timestamp()` on a naive datetime, which uses the local zone).
date_variables = ['ArrTime', 'DepTime']
unix_epoch = pd.Timestamp("1970-01-01")
for column in date_variables:
    parsed = pd.to_datetime(base_df[column], format="%Y/%m/%d-%H:%M")
    base_df[column] = (parsed - unix_epoch) / pd.Timedelta(seconds=1)

# "H:M" duration strings -> total minutes.
base_df['FlightDuration'] = base_df['FlightDuration'].map(get_total_minutes)

# Seed the split so the train/hold-out partition (and every score computed
# from it) is the same on each Restart & Run All.
train_df, test_df = train_test_split(base_df, test_size=0.2, random_state=seed)
train_x, train_y = train_df.drop(columns=['Price']), train_df['Price']
# Keep the hold-out targets before dropping them from the feature frame so a
# final model can still be scored on unseen data.
test_y = test_df['Price']
test_df = test_df.drop(columns=['Price'])

# Linear Regression

In [4]:
# Baseline: ordinary least squares on robust-scaled features.
lr = LinearRegression()
linear_regression_model = make_pipeline(RobustScaler(), lr)
# Fit on the full training split so the pipeline is usable after this cell.
linear_regression_model.fit(train_x, train_y)
# Mean RMSE over the shared CV folds; shown as the cell output.
cv_rmse(linear_regression_model, train_x, train_y).mean()

774.1159301167121

# XGBoost

In [5]:
# Gradient-boosted trees, scored with the shared cv_rmse helper.
# `n_jobs` and `random_state` replace the deprecated `nthread` and `seed`
# aliases of the scikit-learn wrapper; all other settings are unchanged.
xgb = XGBRegressor(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=10,
    min_child_weight=6,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    scale_pos_weight=1,
    reg_alpha=0.00006,
    n_jobs=4,
    random_state=seed,
)
# Mean RMSE over the shared CV folds; shown as the cell output.
cv_rmse(xgb, train_x, train_y).mean()

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


322.32832823610966

# LightGBM

In [6]:
# LightGBM regressor, scored with the shared cv_rmse helper.
# NOTE(review): `bagging_fraction` is set but `bagging_freq` is not; per the
# LightGBM parameter docs bagging only takes effect with a non-zero
# `bagging_freq` -- confirm whether bagging was intended here.
lightgbm_model = LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=8020,
    max_bin=100,
    bagging_fraction=0.8,
    feature_fraction_seed=343,
    bagging_seed=343,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
# Mean RMSE over the shared CV folds; shown as the cell output.
cv_rmse(lightgbm_model, train_x, train_y).mean()

383.95747265396005

# Stacking

In [7]:
# Base learners for the stack, each behind its own RobustScaler.
linear_regression_pipeline = make_pipeline(RobustScaler(), LinearRegression())
# Reuse the `xgb` and `lightgbm_model` configurations from the earlier cells
# instead of repeating their hyperparameters, so tuning them in one place
# keeps the stack in sync.
xgboost_pipeline = make_pipeline(RobustScaler(), xgb)
lightgbm_pipeline = make_pipeline(RobustScaler(), lightgbm_model)

# Stack the three base learners; the XGBoost pipeline also serves as the
# meta-regressor and, with use_features_in_secondary=True, sees the original
# features alongside the base-model predictions.
# random_state pins the stacker's internal shuffled folds so the reported
# score is reproducible across runs.
stack_gen = StackingCVRegressor(regressors=(linear_regression_pipeline,
                                            xgboost_pipeline,
                                            lightgbm_pipeline),
                                meta_regressor=xgboost_pipeline,
                                use_features_in_secondary=True,
                                random_state=seed)
# Mean RMSE over the shared CV folds; shown as the cell output.
cv_rmse(stack_gen, train_x, train_y).mean()

320.4849081763634