### Linear Regression Model

In [136]:
%matplotlib inline
# import required modules for prediction tasks
import numpy as np
import pandas as pd
import math
import random
import requests
import zipfile
import StringIO
import re
import json
import os

# sklearn functions used for the linear regression model
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [137]:
# first step is to load the actual data and exclude rows that are unnecessary
print('loading data...')
df = pd.read_csv('cache/BigFlightTable.csv', nrows=2000)

loading data...


In [138]:
print 'columns found: '
print df.columns

columns found: 
Index([u'Unnamed: 0', u'index', u'ORIGIN_CITY_NAME', u'ARR_DEL15', u'FL_NUM',
       u'CANCELLED', u'ARR_DELAY', u'MONTH', u'DIVERTED', u'DAY_OF_MONTH',
       u'DEST_CITY_NAME', u'ORIGIN', u'DEP_TIME', u'DEST', u'ARR_DELAY_NEW',
       u'DAY_OF_WEEK', u'YEAR', u'AIRLINE_ID', u'QUARTER', u'DISTANCE',
       u'ORIGIN_STATE_NM', u'ARR_TIME', u'UNIQUE_CARRIER', u'ORIGIN_WAC',
       u'TAIL_NUM', u'AIRCRAFT_YEAR', u'AIRCRAFT_MFR', u'AIRCRAFT_AGE'],
      dtype='object')


In [139]:
print 'generating additional features'
df['HOUR_OF_ARR'] = df['ARR_TIME'].astype(int) / 10
df['HOUR_OF_DEP'] = df['DEP_TIME'].astype(int) / 10

generating additional features


In [140]:
# split data into numerical and categorical features
print 'splitting into numerical/categorical features'
numericalFeat = df[['DISTANCE', 'AIRCRAFT_AGE']].copy().astype('float') # Numerical features
num_numFeatures = 2
categoricalFeat = df[['MONTH', 'DAY_OF_MONTH', 'ORIGIN', 
                    'DEST', 'HOUR_OF_ARR', 'HOUR_OF_DEP', 
                    'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR']].copy() # Categorical features

splitting into numerical/categorical features


In [141]:
# for the next step, all features need to be encoded as integers --> create lookup Tables!
def transformToID(df, col):
    vals = df[col].unique()
    LookupTable = dict(zip(vals, np.arange(len(vals))))
    for key in LookupTable.keys():
        df.loc[df[col] == key, col] = LookupTable[key]
    return LookupTable

In [142]:
print 'indexing UNIQUE_CARRIER'
carrierTable = transformToID(categoricalFeat, 'UNIQUE_CARRIER')
with open('cache/carrierTable.json', 'wb') as outfile:
    json.dump(carrierTable, outfile)
print 'indexing AIRCRAFT_MFR'
mfrTable = transformToID(categoricalFeat, 'AIRCRAFT_MFR')
with open('cache/manufacturerTable.json', 'wb') as outfile:
    json.dump(mfrTable, outfile)
    

print 'indexing DEST'
destTable = transformToID(categoricalFeat, 'DEST')
with open('cache/destTable.json', 'wb') as outfile:
    json.dump(destTable, outfile)
print 'indexing ORIGIN'
originTable = transformToID(categoricalFeat, 'ORIGIN')
with open('cache/originTable.json', 'wb') as outfile:
    json.dump(originTable, outfile)

indexing UNIQUE_CARRIER
indexing AIRCRAFT_MFR
indexing DEST
indexing ORIGIN


In [143]:
# Encode categorical variables as binary ones
print 'encoding categorical variables'
encoder = OneHotEncoder() 
categoricals_encoded = encoder.fit_transform(categoricalFeat)

# convert numerical features to sparse matrix
numericals_sparse = sparse.csr_matrix(numericalFeat)

# get data matrix & response variable
X_all = sparse.hstack((numericals_sparse, categoricals_encoded))
y_all = df['ARR_DELAY'].values

# construct test/train set (15%)
print 'splitting test/train set'
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.15, random_state = 42)

# before starting the regression, numerical features need to be standardized!
X_train_numericals = X_train[:, 0:num_numFeatures+1].toarray()
X_test_numericals = X_test[:, 0:num_numFeatures+1].toarray()

# use sklearn tools...
print 'normalizing numerical features'
scaler = StandardScaler() 
scaler.fit(X_train_numericals) # get std/mean from train set

# save scaler to cache (for later prediction)
with open('cache/scalerValues.csv', 'wb') as f:
    f.write('mean: ' + str(list(scaler.mean_)) + '\n')
    f.write('std : ' + str(list(scaler.std_)) + '\n')

X_train_numericals = sparse.csr_matrix(scaler.transform(X_train_numericals)) 
X_test_numericals = sparse.csr_matrix(scaler.transform(X_test_numericals))

# update sets
X_train[:, 0:num_numFeatures+1] = X_train_numericals
X_test[:, 0:num_numFeatures+1] = X_test_numericals

encoding categorical variables
splitting test/train set
normalizing numerical features


In [144]:
# stochastic gradient based ridge regression
SGD_params = {'alpha': 10.0 ** -np.arange(1,8)}
SGD_model = GridSearchCV(SGDRegressor(random_state = 42, verbose=1), \
                         SGD_params, scoring = 'mean_absolute_error', cv = 4) # cross validate 4 times

In [145]:
# train the model, this might take some time...
SGD_model.fit(X_train, y_train)

-- Epoch 1
Norm: 12.38, NNZs: 368, Bias: 0.058037, T: 1275, Avg. loss: 1462.066434
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 18.14, NNZs: 368, Bias: 0.089444, T: 2550, Avg. loss: 1440.205982
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 20.58, NNZs: 368, Bias: 0.088980, T: 3825, Avg. loss: 1424.935557
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 23.51, NNZs: 368, Bias: 0.109310, T: 5100, Avg. loss: 1415.373491
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 25.41, NNZs: 368, Bias: 0.119647, T: 6375, Avg. loss: 1407.352229
Total training time: 0.00 seconds.
-- Epoch 1
Norm: 15.89, NNZs: 364, Bias: 0.074917, T: 1275, Avg. loss: 1896.165625
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 21.09, NNZs: 364, Bias: 0.096697, T: 2550, Avg. loss: 1866.711929
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 24.78, NNZs: 364, Bias: 0.110052, T: 3825, Avg. loss: 1846.926397
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 27.75, NNZs: 364, Bias: 0.130703, T: 51

GridSearchCV(cv=4, error_score='raise',
       estimator=SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=42, shuffle=True, verbose=1, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-01,   1.00000e-02,   1.00000e-03,   1.00000e-04,
         1.00000e-05,   1.00000e-06,   1.00000e-07])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='mean_absolute_error', verbose=0)

In [146]:
def rmse(y, y_pred):
    return np.sqrt(((y - y_pred)**2).mean())

print 'computing statistics:'
y_pred = SGD_model.predict(X_test)
print 'RMSE:' + str(rmse(y_test, y_pred))
print 'MAS:' + str(mean_absolute_error(y_test, y_pred))

computing statistics:
RMSE:34.0828162513
MAS:23.4329222247
