In [81]:
%matplotlib inline
# import required modules for prediction tasks
import numpy as np
import pandas as pd
import math
import random
import requests
import zipfile
import StringIO
import re
import json
import os
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [257]:
from datetime import timedelta, datetime, tzinfo
from pytz import timezone
import pytz

def convertToUTC(naive, zonestring="America/New_York"):
    local = pytz.timezone (zonestring)
    local_dt = local.localize(naive, is_dst=None)
    return local_dt.astimezone (pytz.utc)
convertToUTC(datetime(year=2015, month=12, day=22, hour=9, minute=45))

datetime.datetime(2015, 12, 22, 15, 45, tzinfo=<UTC>)

In [56]:
# need to create a lookup table for the values (i.e. flight numbers, city and so on and then drop all duplicates!)
db = pd.read_csv('cache/BigFlightTable.csv')
# remove all unnecessary columns
db = db[['ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'AIRCRAFT_AGE', 'DEST', 'ARR_TIME', \
         'DEP_TIME', 'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR', 'FL_NUM', 'MONTH', \
         'DAY_OF_MONTH', 'DISTANCE', 'ORIGIN']]
print str(db.count()[0]) + ' entries'
db.head()

4376277 entries


Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN
0,"New York, NY","Los Angeles, CA",27,LAX,1238,914,AA,3,BOEING,1,1,1,2475,JFK
1,"New York, NY","Los Angeles, CA",27,LAX,1226,857,AA,4,BOEING,1,1,2,2475,JFK
2,"New York, NY","Los Angeles, CA",28,LAX,1324,1005,AA,6,BOEING,1,1,4,2475,JFK
3,"New York, NY","Los Angeles, CA",29,LAX,1217,917,AA,1,BOEING,1,1,6,2475,JFK
4,"New York, NY","Los Angeles, CA",26,LAX,1204,859,AA,4,BOEING,1,1,9,2475,JFK


In [258]:
# drop everything except for the 5 days before christmas! i.e. 20.12, 21.12, 22.12, 23.12, 24.12.
db = db[db.MONTH == 12]
db = db[db.DAY_OF_MONTH <= 24]
db = db[db.DAY_OF_MONTH >= 20]

print '5 days have ' + str(db.count()[0]) + ' flights'
# we are only interested in flights NY to Chicago!
city_from = 'Chicago, IL'
city_to = 'New York, NY'
zone_from = 'America/Chicago'
zone_to = 'America/New_York'

db = db[db.ORIGIN_CITY_NAME == city_from]
db = db[db.DEST_CITY_NAME == city_to]

print 'Found ' + str(db.count()[0]) + ' flights from ' + city_from + ' to ' + city_to + ' for 20.12 - 24.12'

5 days have 85 flights
Found 85 flights from Chicago, IL to New York, NY for 20.12 - 24.12


In [218]:
mdl = None
with open('cache/model.json') as f:
    mdl = json.load(f)
    
# categorical feature encoder, fitted on the keys
encoder = OneHotEncoder(sparse=True, n_values=mdl['encoder']['values']) 

In [220]:
# input is a datarow
# prediction of day in the next year!
def predictDelayTime(row, mdl):
    
    s_mean, s_std, coeff, intercept = mdl['scaler_mean'], mdl['scaler_std'], mdl['coeff'], mdl['intercept']
    
    # read out tables
    carrierTable = mdl['CARRIER']
    mfrTable = mdl['MANUFACTURER']
    destTable = mdl['DEST']
    originTable = mdl['ORIGIN']
    
    distance = row['DISTANCE'] # <-- look this up!
    aircraft_age = row['AIRCRAFT_AGE'] # <-- look this up!
    
    # normalize numerical features according to scaler
    distance = (distance - s_mean[0]) / s_std[0]
    aircraft_age = (aircraft_age + 1 - s_mean[1]) / s_std[1]
    
    month = row['MONTH']
    day_of_month = row['DAY_OF_MONTH'] 
    origin = row['ORIGIN']
    dest = row['DEST']
    
    hour_of_arr = int(row['ARR_TIME']) / 10
    hour_of_dep = int(row['DEP_TIME']) / 10
    carrier = row['UNIQUE_CARRIER']
    day_of_week = datetime(year=2015, month=row.MONTH, day=row.DAY_OF_MONTH).weekday() # <-- get via datetimeobject
    mfr = row['AIRCRAFT_MFR']
    
    # for nonindexed categorical features, do lookup!
    origin = originTable[origin]
    dest = destTable[dest]
    mfr = mfrTable[mfr]
    carrier = carrierTable[carrier]
    
    # write into df
    df['MONTH'] = month
    df['DAY_OF_MONTH'] = day_of_month
    df['ORIGIN'] = origin
    df['DEST'] = dest
    df['HOUR_OF_ARR'] = hour_of_arr
    df['HOUR_OF_DEP'] = hour_of_dep
    df['UNIQUE_CARRIER'] = carrier
    df['DAY_OF_WEEK'] = day_of_week
    df['AIRCRAFT_MFR'] = mfr
    
    # order here is important! make sure it is the same as in the model!
    categoricalFeat = df[['MONTH', 'DAY_OF_MONTH', 'ORIGIN', 
                    'DEST', 'HOUR_OF_ARR', 'HOUR_OF_DEP', 
                    'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR']].copy() # Categorical features
    
    # construct the data vector for the linear model
    categoricals_encoded = encoder.fit_transform(categoricalFeat)
    num_features = np.array([distance, aircraft_age])
    cat_features = categoricals_encoded.toarray().T.ravel()
    w = np.hstack([num_features, cat_features])


    y_pred = np.dot(w, coeff) + intercept
    
    return y_pred[0]

In [221]:
test_row = db.iloc[0]
test_row

ORIGIN_CITY_NAME     Chicago, IL
DEST_CITY_NAME      New York, NY
AIRCRAFT_AGE                   4
DEST                         JFK
ARR_TIME                    1253
DEP_TIME                     951
UNIQUE_CARRIER                B6
DAY_OF_WEEK                    7
AIRCRAFT_MFR             EMBRAER
FL_NUM                       906
MONTH                         12
DAY_OF_MONTH                  21
DISTANCE                     740
ORIGIN                       ORD
Name: 4039115, dtype: object

In [222]:
# make prediction
predictDelayTime(test_row, mdl)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a co

4.142780935604816

In [282]:
# create for each day info
db['PREDICTED_DELAY'] = 0
db['FLIGHT_TIME'] = 0
db['PREDICTED_FLIGHT_TIME'] = 0
for index, row in db.iterrows():
    print 'processing {idx}'.format(idx=index)
    y_pred = predictDelayTime(row, mdl)
    db.set_value(index, 'PREDICTED_DELAY', y_pred)

    arr_time = datetime(year=2015, month=row['MONTH'], day=row['DAY_OF_MONTH'], \
                        hour= int(row['ARR_TIME'] / 100), minute=int(row['ARR_TIME'] % 100))
    dep_time = datetime(year=2015, month=row['MONTH'], day=row['DAY_OF_MONTH'], \
                        hour= int(row['DEP_TIME'] / 100), minute=int(row['DEP_TIME'] % 100))
    
    flight_time_in_min =  (convertToUTC(arr_time) - convertToUTC(dep_time))
    flight_time_in_min = int(flight_time_in_min.total_seconds() / 60)
    
    db.set_value(index, 'FLIGHT_TIME', flight_time_in_min)
    db.set_value(index, 'PREDICTED_FLIGHT_TIME', y_pred + flight_time_in_min)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a co

processing 4039115
processing 4040262
processing 4041136
processing 4042190
processing 4042593
processing 4044554
processing 4050113
processing 4051789
processing 4053089
processing 4109777
processing 4109781
processing 4109786
processing 4151899
processing 4151900
processing 4151901
processing 4151902
processing 4151903
processing 4152761
processing 4152762
processing 4152763
processing 4152764
processing 4156337
processing 4156338
processing 4156339
processing 4156340
processing 4156341
processing 4156342
processing 4160253
processing 4160254
processing 4160255
processing 4160256
processing 4160257
processing 4160258
processing 4178653
processing 4178654
processing 4178655
processing 4178656
processing 4180336
processing 4191323
processing 4233710
processing 4233803
processing 4233808
processing 4233810
processing 4233814
processing 4233816
processing 4233817
processing 4234008
processing 4234238
processing 4235308
processing 4235310
processing 4235313
processing 4235317
processing 4

In [283]:
db.head()

Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN,PREDICTED_DELAY,PREDICTED_DELAT,FLIGHT_TIME,PREDICTED_FLIGHT_TIME
4039115,"Chicago, IL","New York, NY",4,JFK,1253,951,B6,7,EMBRAER,906,12,21,740,ORD,4,4.142781,182,186
4040262,"Chicago, IL","New York, NY",6,JFK,1309,952,B6,3,EMBRAER,906,12,24,740,ORD,9,9.446175,197,206
4041136,"Chicago, IL","New York, NY",3,JFK,1242,948,B6,1,EMBRAER,906,12,22,740,ORD,10,10.290854,174,184
4042190,"Chicago, IL","New York, NY",1,JFK,1301,946,B6,6,EMBRAER,906,12,20,740,ORD,6,6.522068,195,201
4042593,"Chicago, IL","New York, NY",1,JFK,1257,954,B6,2,EMBRAER,906,12,23,740,ORD,11,11.609299,183,194


In [284]:
db2 = db[db.DAY_OF_MONTH == 22]

In [285]:
db2

Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN,PREDICTED_DELAY,PREDICTED_DELAT,FLIGHT_TIME,PREDICTED_FLIGHT_TIME
4041136,"Chicago, IL","New York, NY",3,JFK,1242,948,B6,1,EMBRAER,906,12,22,740,ORD,10,10.290854,174,184
4053089,"Chicago, IL","New York, NY",6,JFK,2355,2040,B6,1,AIRBUS,1106,12,22,740,ORD,41,41.969662,195,236
4156337,"Chicago, IL","New York, NY",68,LGA,1808,1456,WN,1,SMALL,314,12,22,725,MDW,13,13.750126,192,205
4156338,"Chicago, IL","New York, NY",2,LGA,2043,1709,WN,1,BOEING,477,12,22,725,MDW,19,19.215199,214,233
4156339,"Chicago, IL","New York, NY",14,LGA,859,547,WN,1,BOEING,752,12,22,725,MDW,-1,-1.048058,192,190
4156340,"Chicago, IL","New York, NY",15,LGA,1029,726,WN,1,BOEING,1945,12,22,725,MDW,5,5.528131,183,188
4156341,"Chicago, IL","New York, NY",2,LGA,1449,1201,WN,1,BOEING,1986,12,22,725,MDW,14,14.210904,168,182
4156342,"Chicago, IL","New York, NY",14,LGA,1229,934,WN,1,BOEING,2514,12,22,725,MDW,9,9.149758,175,184
4235669,"Chicago, IL","New York, NY",16,LGA,845,552,UA,1,AIRBUS,387,12,22,733,ORD,-3,-3.902609,173,169
4235690,"Chicago, IL","New York, NY",16,LGA,1924,1617,UA,1,AIRBUS,406,12,22,733,ORD,20,20.439727,187,207


In [289]:
dbres = db2.sort('PREDICTED_FLIGHT_TIME').head(6)
dbres.to_csv('data/best_flights.csv')