In [81]:
%matplotlib inline
# import required modules for prediction tasks
import numpy as np
import pandas as pd
import math
import random
import requests
import zipfile
import StringIO
import re
import json
import os
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [56]:
# need to create a lookup table for the values (i.e. flight numbers, city and so on and then drop all duplicates!)
db = pd.read_csv('cache/BigFlightTable.csv')
# remove all unnecessary columns
db = db[['ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'AIRCRAFT_AGE', 'DEST', 'ARR_TIME', \
         'DEP_TIME', 'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR', 'FL_NUM', 'MONTH', \
         'DAY_OF_MONTH', 'DISTANCE', 'ORIGIN']]
print str(db.count()[0]) + ' entries'
db.head()

4376277 entries


Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN
0,"New York, NY","Los Angeles, CA",27,LAX,1238,914,AA,3,BOEING,1,1,1,2475,JFK
1,"New York, NY","Los Angeles, CA",27,LAX,1226,857,AA,4,BOEING,1,1,2,2475,JFK
2,"New York, NY","Los Angeles, CA",28,LAX,1324,1005,AA,6,BOEING,1,1,4,2475,JFK
3,"New York, NY","Los Angeles, CA",29,LAX,1217,917,AA,1,BOEING,1,1,6,2475,JFK
4,"New York, NY","Los Angeles, CA",26,LAX,1204,859,AA,4,BOEING,1,1,9,2475,JFK


In [57]:
# drop everything except for the 5 days before christmas! i.e. 20.12, 21.12, 22.12, 23.12, 24.12.
db = db[db.MONTH == 12]
db = db[db.DAY_OF_MONTH <= 24]
db = db[db.DAY_OF_MONTH >= 20]

print '5 days have ' + str(db.count()[0]) + ' flights'
# we are only interested in flights NY to Chicago!
city_from = 'Chicago, IL'
city_to = 'New York, NY'

db = db[db.ORIGIN_CITY_NAME == city_from]
db = db[db.DEST_CITY_NAME == city_to]

print 'Found ' + str(db.count()[0]) + ' flights from ' + city_from + ' to ' + city_to + ' for 20.12 - 24.12'

5 days have 58334 flights
Found 85 flights from Chicago, IL to New York, NY for 20.12 - 24.12


In [146]:
mdl = None
# with open('cache/models/2014model.json') as f:
#     mdl = json.load(f)
with open('cache/model.json') as f:
    mdl = json.load(f)

In [147]:
# the structure of the model is given as 
# print 'splitting into numerical/categorical features'
# numericalFeat = df[['DISTANCE', 'AIRCRAFT_AGE']].copy().astype('float') # Numerical features
# num_numFeatures = 2
# categoricalFeat = df[['MONTH', 'DAY_OF_MONTH', 'ORIGIN', 
#                     'DEST', 'HOUR_OF_ARR', 'HOUR_OF_DEP', 
#                     'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR']].copy() # Categorical features

In [148]:
# with open('cache/first_model/carrierTable.json', 'rb') as infile:
#     carrierTable = json.load(infile)

# with open('cache/first_model/manufacturerTable.json', 'rb') as infile:
#     mfrTable = json.load(infile)
    
# with open('cache/first_model/destTable.json', 'rb') as infile:
#     destTable = json.load(infile)

# with open('cache/first_model/originTable.json', 'rb') as infile:
#     originTable = json.load(infile)

In [214]:
# categorical feature encoder, fitted on the keys
encoder = OneHotEncoder(sparse=True, n_values=mdl['encoder']['values']) 

# input is a datarow
# prediction of day in the next year!
def predictDelayTime(row, s_mean, s_std, coeff, intercept):
    distance = row['DISTANCE'] # <-- look this up!
    aircraft_age = row['AIRCRAFT_AGE'] # <-- look this up!
    
    # normalize numerical features according to scaler
    distance = (distance - s_mean[0]) / s_std[0]
    aircraft_age = (aircraft_age + 1 - s_mean[1]) / s_std[1]
    
    month = row['MONTH']
    day_of_month = row['DAY_OF_MONTH'] 
    origin = row['ORIGIN']
    dest = row['DEST']
    
    hour_of_arr = int(row['ARR_TIME']) / 10
    hour_of_dep = int(row['DEP_TIME']) / 10
    carrier = row['UNIQUE_CARRIER']
    day_of_week = datetime(year=2015, month=row.MONTH, day=row.DAY_OF_MONTH).weekday() # <-- get via datetimeobject
    mfr = row['AIRCRAFT_MFR']
    
    # for nonindexed categorical features, do lookup!
    origin = originTable[origin]
    dest = destTable[dest]
    mfr = mfrTable[mfr]
    carrier = carrierTable[carrier]
    
    # write into df
    df['MONTH'] = month
    df['DAY_OF_MONTH'] = day_of_month
    df['ORIGIN'] = origin
    df['DEST'] = dest
    df['HOUR_OF_ARR'] = hour_of_arr
    df['HOUR_OF_DEP'] = hour_of_dep
    df['UNIQUE_CARRIER'] = carrier
    df['DAY_OF_WEEK'] = day_of_week
    df['AIRCRAFT_MFR'] = mfr
    
    categoricalFeat = df[['MONTH', 'DAY_OF_MONTH', 'ORIGIN', 
                    'DEST', 'HOUR_OF_ARR', 'HOUR_OF_DEP', 
                    'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR']].copy() # Categorical features
    
    categoricals_encoded = encoder.fit_transform(categoricalFeat)

    num_features = np.array([distance, aircraft_age])
    cat_features = categoricals_encoded.toarray().T.ravel()
    w = np.hstack([num_features, cat_features])
    #now construct the data vector for the linear model
    # 2 random features (dist, aircraft_age)
    # 31 days of month
    # len(originTable.keys()) origin airports
    # len(destTable.keys()) dest airports
    # 241 hours (arrival hour)
    # 241 hours  (departure hour)
    # len(carrierTable.keys())
    # 7 weekdays
    # len(mfrTable.keys())
    
#     w = np.zeros( 2 + 12 + 31 + len(originTable.keys()) + len(destTable.keys()) \
#     + 2 * 24 + len(carrierTable.keys()) + 7 + len(mfrTable.keys()) )
    
#     # fill with features
#     w[0] = distance
#     w[1] = aircraft_age
#     w[1 + month] = 1
#     w[1 + 12 + day_of_month] = 1
#     w[2 + 12 + 31 + origin] = 1
#     w[2 + 12 + 31 + len(originTable.keys()) + dest] = 1
#     w[2 + 12 + 31 + len(originTable.keys()) + len(destTable.keys()) + hour_of_arr] = 1
#     w[2 + 12 + 31 + len(originTable.keys()) + len(destTable.keys()) + 24 + hour_of_dep] = 1
#     w[2 + 12 + 31 + len(originTable.keys()) + len(destTable.keys()) + 24 + 24 + carrier] = 1
#     w[2 + 12 + 31 + len(originTable.keys()) + len(destTable.keys()) + 24 + 24 + len(carrierTable.keys()) + day_of_week] = 1
#     w[2 + 12 + 31 + len(originTable.keys()) + len(destTable.keys()) + 24 + 24 + len(carrierTable.keys()) + 7 + mfr] = 1
    
    y_pred = np.dot(w, coeff) + intercept
    
    return y_pred[0]

In [215]:
# read out tables
carrierTable = mdl['CARRIER']
mfrTable = mdl['MANUFACTURER']
destTable = mdl['DEST']
originTable = mdl['ORIGIN']

In [216]:
test_row = db.iloc[0]
test_row

ORIGIN_CITY_NAME     Chicago, IL
DEST_CITY_NAME      New York, NY
AIRCRAFT_AGE                   4
DEST                         JFK
ARR_TIME                    1253
DEP_TIME                     951
UNIQUE_CARRIER                B6
DAY_OF_WEEK                    7
AIRCRAFT_MFR             EMBRAER
FL_NUM                       906
MONTH                         12
DAY_OF_MONTH                  21
DISTANCE                     740
ORIGIN                       ORD
Name: 4039115, dtype: object

In [217]:
# make prediction
predictDelayTime(test_row, mdl['scaler_mean'], mdl['scaler_std'], mdl['coeff'], mdl['intercept'])

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a co

4.142780935604816

In [143]:
dest = 'JFK'
dest = destTable[dest]

In [122]:
len(mdl['coeff'])

376

In [145]:
mdl['encoder']['values']

[2, 32, 40, 41, 241, 241, 1, 8, 4]

In [131]:
mfrTable.keys()

[u'MCDONNELL DOUGLAS', u'SMALL', u'CESSNA', u'BOEING']

NameError: name 'distance' is not defined