# Introduction

Baseline experiment: a ridge (L2-regularised) linear regression that predicts `AOBTtoAIBT` from the selected features.

# Set up Environment

In [34]:
# handle datasets
import pandas as pd
import numpy as np

# divide training and validation datasets
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# model building
from sklearn.linear_model import Ridge

# model evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

# to persist the model and the scaler
import joblib

# display all columns of dataframes in the notebook
pd.pandas.set_option('display.max_columns', None)

# ignore warnings
import warnings
warnings.simplefilter(action='ignore')

# set up random seed for reproducibility
RANDOM_SEED = 42

# Load Data

In [3]:
# folder holding the interim CSV files read by this notebook
file_path = '../data/interim/'

# train/validation data first, test data second
df_train_val, df_test = (
    pd.read_csv(file_path + csv_name)
    for csv_name in ('train_val_data.csv', 'test_data.csv')
)

# Separate Train and Validation Datasets

In [4]:
# test set: X keeps the full frame (target column included), y is the target
X_test = df_test.copy()
y_test = df_test['AOBTtoAIBT']

# no shuffling, so the last 10000 rows (flights) of df_train_val become the
# validation set and all rows before them form the training set; as for the
# test set, X still contains the target column
X_train, X_val, y_train, y_val = train_test_split(
    df_train_val,
    df_train_val['AOBTtoAIBT'],
    test_size=10000,
    shuffle=False,
)

X_train.shape, X_val.shape, X_test.shape

((734965, 40), (10000, 40), (10000, 40))

In [5]:
# preview the first rows of the training frame (features plus the target column)
X_train.head(3)

Unnamed: 0,AOBTtoAIBT,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25,SIBTQuarter,SIBTMonth,SIBTDayOfMonth,SIBTDayOfWeek,SIBTHour
0,218,DL,14869,34614,UT,-2,0,-1,223,1590,7,0,0,0,0,1,0,0,2,0,0,0,0,2,1,1,0,1,0,0,3,1,0,3,1,1,1,1,7,6
1,81,F9,13204,31454,FL,-4,0,-1,88,404,2,0,0,0,0,3,0,1,0,0,0,0,1,1,1,0,1,3,0,3,1,0,3,1,0,1,1,1,7,6
2,88,EV,10980,30980,TN,302,1,12,57,106,1,0,0,0,0,3,0,0,0,0,0,1,0,1,0,1,0,4,0,2,0,3,1,0,0,1,1,1,7,6


# Selected Features

In [25]:
# load selected features: the names are read from the first column of the csv
# NOTE(review): read_csv is called with its default header handling, so the
# first line of the file is consumed as a header and never ends up in
# `features` -- confirm that selected_features.csv really has a header row.
selected_features_df = pd.read_csv(file_path + 'selected_features.csv')
features = selected_features_df.iloc[:, 0].to_list()

print('Number of features: ', len(features))

Number of features:  36


In [26]:
# show the names of the selected features
print(features)

['UniqueCarrierCode', 'OriginAirportID', 'OriginCityMarketID', 'OriginState', 'OBTDelay', 'OBTDel15', 'OBTDelayGroups', 'SOBTtoSIBT', 'Distance', 'DistanceGroup', 'Num_Arr_SIBT-30', 'Num_Arr_SIBT-25', 'Num_Arr_SIBT-20', 'Num_Arr_SIBT-15', 'Num_Arr_SIBT-10', 'Num_Arr_SIBT-0', 'Num_Arr_SIBT+5', 'Num_Arr_SIBT+10', 'Num_Arr_SIBT+20', 'Num_Arr_SIBT+25', 'Num_Dep_SIBT-30', 'Num_Dep_SIBT-25', 'Num_Dep_SIBT-20', 'Num_Dep_SIBT-15', 'Num_Dep_SIBT-10', 'Num_Dep_SIBT-0', 'Num_Dep_SIBT+5', 'Num_Dep_SIBT+10', 'Num_Dep_SIBT+15', 'Num_Dep_SIBT+20', 'Num_Dep_SIBT+25', 'SIBTQuarter', 'SIBTMonth', 'SIBTDayOfMonth', 'SIBTDayOfWeek', 'SIBTHour']


# Categorical Variables

## Group Rare Labels

In [10]:
# names of the columns that are treated as categorical variables
cat_vars = [
    'UniqueCarrierCode',
    'OriginAirportID',
    'OriginCityMarketID',
    'OriginState',
]

Labels that are under-represented in the dataset tend to cause models to over-fit. Any label that accounts for less than 1% of the training rows is therefore replaced by 'Rare'.

In [11]:
rare_perc = 0.01


def find_frequent_labels(df, var, rare_perc, target='AOBTtoAIBT'):
    """Return the labels of ``var`` that are frequent enough to be kept.

    The share of a label is the number of rows carrying that label with a
    non-null ``target`` value, divided by the total number of rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``var`` and ``target`` columns.
    var : str
        Name of the categorical column to inspect.
    rare_perc : float
        Minimum share (a fraction of the rows, e.g. 0.01 for 1%) that a
        label must reach to be kept.
    target : str, default 'AOBTtoAIBT'
        Column whose non-null entries are counted per label.

    Returns
    -------
    pandas.Index
        Labels whose share is greater than or equal to ``rare_perc``.
    """
    label_share = df.groupby(var)[target].count() / len(df)
    return label_share[label_share >= rare_perc].index


for var in cat_vars:

    # labels of `var` that are frequent enough in the training set
    frequent_ls = find_frequent_labels(X_train, var, rare_perc)
    print(var, frequent_ls, '', sep='\n')

    # every other label is collapsed into the string "Rare"; the list learned
    # on the training set is applied unchanged to the test set
    # NOTE(review): X_val is not transformed in this loop.
    for frame in (X_train, X_test):
        keep = frame[var].isin(frequent_ls)
        frame[var] = np.where(keep, frame[var], 'Rare')

UniqueCarrierCode
Index(['9E', 'AA', 'DL', 'EV', 'NK', 'OO', 'UA', 'WN'], dtype='object', name='UniqueCarrierCode')

OriginAirportID
Int64Index([10693, 10721, 10821, 11057, 11066, 11278, 11292, 11298, 11433,
            11618, 11697, 12191, 12266, 12339, 12451, 12889, 12892, 12953,
            13198, 13204, 13232, 13303, 13487, 13495, 13930, 14027, 14100,
            14492, 14524, 15016, 15304],
           dtype='int64', name='OriginAirportID')

OriginCityMarketID
Int64Index([30194, 30325, 30647, 30693, 30721, 30852, 30977, 31057, 31066,
            31123, 31136, 31295, 31453, 31454, 31650, 31703, 32211, 32337,
            32457, 32467, 32575, 33195, 33198, 33495, 33667, 34027, 34100,
            34492, 34524],
           dtype='int64', name='OriginCityMarketID')

OriginState
Index(['AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'GA', 'IL', 'IN', 'KY', 'LA', 'MA',
       'MD', 'MI', 'MN', 'MO', 'MS', 'NC', 'NJ', 'NV', 'NY', 'OH', 'PA', 'SC',
       'TN', 'TX', 'VA', 'WI'],
      dtype='object', n

## Encode Categorical Variables

Next, replace each category with an integer rank, ordered by the mean target value of that category in the training set, so that the encoded variable has a monotonic relationship with the target.

In [12]:
def replace_categories(train, test, var, target):
    """Ordinal-encode ``var`` in place, ordered by the mean of ``target``.

    Categories are ranked by their mean ``target`` value in ``train``: the
    category with the smallest mean becomes 0, the next one 1, and so on.
    The mapping learned from ``train`` is then applied to the ``var`` column
    of both ``train`` and ``test``, overwriting the original values. Values
    of ``var`` that have no entry in the mapping become NaN.

    Returns None; both frames are modified in place.
    """
    mean_by_category = train.groupby([var])[target].mean()
    ranked_categories = mean_by_category.sort_values().index

    # category -> integer rank (0 = smallest mean target)
    category_to_rank = dict(zip(ranked_categories, range(len(ranked_categories))))

    for frame in (train, test):
        frame[var] = frame[var].map(category_to_rank)


# encode every categorical column of X_train and X_test, using the ordering
# of mean 'AOBTtoAIBT' per category computed on X_train
for var in cat_vars:
    replace_categories(X_train, X_test, var, 'AOBTtoAIBT')

In [13]:
# selected features that still hold missing values in the train set
# (an empty list means there are none)
[var for var in features if X_train[var].isnull().any()]

[]

In [14]:
# selected features that still hold missing values in the test set
# (an empty list means there are none)
[var for var in features if X_test[var].isnull().any()]

[]

# Feature Scaling

For use in linear models, features need to be either scaled or normalised.

In [28]:
# set up the scaler and fit it on the selected features of the train set only
scaler = StandardScaler()
scaler.fit(X_train[features])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [30]:
# standardise the selected features of the train and test sets
# NOTE: X_train and X_test are overwritten with the scaler's output, which
# only covers the `features` columns; the column selection X_train[features]
# is made on the previous value, so this cell is not meant to be run twice
# without re-running the cells above.
X_train = scaler.transform(X_train[features])
X_test = scaler.transform(X_test[features])

# Train the Linear Regressor: Ridge

In [32]:
# set up the model: ridge regression with regularisation strength alpha=0.005
ridge = Ridge(alpha=0.005, random_state=RANDOM_SEED)

# train the model on the scaled training features
ridge.fit(X_train, y_train)

Ridge(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

In [33]:
# Forward slashes are accepted on Windows as well as on POSIX systems, and
# match the style of `file_path`; with backslashes the file would be written
# on POSIX into the working directory under the literal name
# '..\models\ridge_regressor.pkl'.
model_path = '../models/'
pkl_filename = 'ridge_regressor.pkl'

# Persist the fitted ridge model under the models folder (relative to the
# notebook's working directory).
# NOTE(review): only the model is saved here; no cell in the visible notebook
# persists the fitted scaler.
with open(model_path + pkl_filename, 'wb') as file:
    joblib.dump(ridge, file)

In [35]:
# evaluate the model:
# ====================

# train set: predictions first, then the three metrics
pred_train = ridge.predict(X_train)
rmse_train = sqrt(mean_squared_error(y_train, pred_train))
mae_train = mean_absolute_error(y_train, pred_train)
r2_train = r2_score(y_train, pred_train)

print('train rmse: {:.3f}'.format(rmse_train))
print('train mae: {:.3f}'.format(mae_train))
print('train r2: {:.3f}'.format(r2_train))
print()

# test set: same procedure
pred_test = ridge.predict(X_test)
rmse_test = sqrt(mean_squared_error(y_test, pred_test))
mae_test = mean_absolute_error(y_test, pred_test)
r2_test = r2_score(y_test, pred_test)

print('test rmse: {:.3f}'.format(rmse_test))
print('test mae: {:.3f}'.format(mae_test))
print('test r2: {:.3f}'.format(r2_test))
print()

# median of the training target, printed as a reference for the error sizes
print('Median OBTtoIBT in training set: ', int(y_train.median()))

train rmse: 12.856
train mae: 8.669
train r2: 0.939

test rmse: 12.794
test mae: 8.826
test r2: 0.941

Median OBTtoIBT in training set:  106
