In [315]:
%matplotlib inline
# import required modules for prediction tasks
import numpy as np
import pandas as pd
import math
import random
import requests
import zipfile
import StringIO
import re
import json
import os
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [316]:
from datetime import timedelta, datetime, tzinfo
from pytz import timezone
import pytz

def convertToUTC(naive, zonestring="America/New_York"):
    local = pytz.timezone (zonestring)
    local_dt = local.localize(naive, is_dst=None)
    return local_dt.astimezone (pytz.utc)
convertToUTC(datetime(year=2015, month=12, day=22, hour=9, minute=45))

datetime.datetime(2015, 12, 22, 14, 45, tzinfo=<UTC>)

In [317]:
# need to create a lookup table for the values (i.e. flight numbers, city and so on and then drop all duplicates!)
db = pd.read_csv('cache/BigFlightTable.csv')
# remove all unnecessary columns
db = db[['ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'AIRCRAFT_AGE', 'DEST', 'ARR_TIME', \
         'DEP_TIME', 'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR', 'FL_NUM', 'MONTH', \
         'DAY_OF_MONTH', 'DISTANCE', 'ORIGIN']]
print str(db.count()[0]) + ' entries'
db.head()

4376277 entries


Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN
0,"New York, NY","Los Angeles, CA",27,LAX,1238,914,AA,3,BOEING,1,1,1,2475,JFK
1,"New York, NY","Los Angeles, CA",27,LAX,1226,857,AA,4,BOEING,1,1,2,2475,JFK
2,"New York, NY","Los Angeles, CA",28,LAX,1324,1005,AA,6,BOEING,1,1,4,2475,JFK
3,"New York, NY","Los Angeles, CA",29,LAX,1217,917,AA,1,BOEING,1,1,6,2475,JFK
4,"New York, NY","Los Angeles, CA",26,LAX,1204,859,AA,4,BOEING,1,1,9,2475,JFK


In [318]:
# drop everything except for the 5 days before christmas! i.e. 20.12, 21.12, 22.12, 23.12, 24.12.
db = db[db.MONTH == 12]
db = db[db.DAY_OF_MONTH <= 24]
db = db[db.DAY_OF_MONTH >= 20]

print '5 days have ' + str(db.count()[0]) + ' flights'
# we are only interested in flights NY to Chicago!
city_to = 'Chicago, IL'
city_from = 'New York, NY'
zone_to = 'America/Chicago'
zone_from = 'America/New_York'

db = db[db.ORIGIN_CITY_NAME == city_from]
db = db[db.DEST_CITY_NAME == city_to]

print 'Found ' + str(db.count()[0]) + ' flights from ' + city_from + ' to ' + city_to + ' for 20.12 - 24.12'

5 days have 58334 flights
Found 87 flights from New York, NY to Chicago, IL for 20.12 - 24.12


In [319]:
mdl = None
with open('cache/models/2014model.json') as f:
    mdl = json.load(f)

# categorical feature encoder, fitted on the keys
encoder = OneHotEncoder(sparse=True, n_values=mdl['encoder']['values']) 

In [323]:
# input is a datarow
# prediction of day in the next year!
def predictDelayTime(row, mdl):
    
    s_mean, s_std, coeff, intercept = mdl['scaler_mean'], mdl['scaler_std'], mdl['coeff'], mdl['intercept']
    
    # read out tables
    carrierTable = mdl['CARRIER']
    mfrTable = mdl['MANUFACTURER']
    destTable = mdl['DEST']
    originTable = mdl['ORIGIN']
    
    distance = row['DISTANCE'] # <-- look this up!
    aircraft_age = row['AIRCRAFT_AGE'] # <-- look this up!
    
    # normalize numerical features according to scaler
    distance = (distance - s_mean[0]) / s_std[0]
    aircraft_age = (aircraft_age + 1 - s_mean[1]) / s_std[1]
    
    month = row['MONTH']
    day_of_month = row['DAY_OF_MONTH'] 
    origin = row['ORIGIN']
    dest = row['DEST']
    
    hour_of_arr = int(row['ARR_TIME']) / 10
    hour_of_dep = int(row['DEP_TIME']) / 10
    carrier = row['UNIQUE_CARRIER']
    day_of_week = datetime(year=2015, month=row.MONTH, day=row.DAY_OF_MONTH).weekday() # <-- get via datetimeobject
    mfr = row['AIRCRAFT_MFR']
    
    # for nonindexed categorical features, do lookup!
    origin = originTable[origin]
    dest = destTable[dest]
    mfr = mfrTable[mfr]
    carrier = carrierTable[carrier]
    
    # write into df
    df['MONTH'] = month
    df['DAY_OF_MONTH'] = day_of_month
    df['ORIGIN'] = origin
    df['DEST'] = dest
    df['HOUR_OF_ARR'] = hour_of_arr
    df['HOUR_OF_DEP'] = hour_of_dep
    df['UNIQUE_CARRIER'] = carrier
    df['DAY_OF_WEEK'] = day_of_week
    df['AIRCRAFT_MFR'] = mfr
    
    # order here is important! make sure it is the same as in the model!
    categoricalFeat = df[['MONTH', 'DAY_OF_MONTH', 'ORIGIN', 
                    'DEST', 'HOUR_OF_ARR', 'HOUR_OF_DEP', 
                    'UNIQUE_CARRIER', 'DAY_OF_WEEK', 'AIRCRAFT_MFR']].copy() # Categorical features
    
    # construct the data vector for the linear model
    categoricals_encoded = encoder.fit_transform(categoricalFeat)
    num_features = np.array([distance, aircraft_age])
    cat_features = categoricals_encoded.toarray().T.ravel()
    w = np.hstack([num_features, cat_features])


    y_pred = np.dot(w, coeff) + intercept
    
    return y_pred[0]

In [324]:
test_row = db.iloc[0]
test_row

ORIGIN_CITY_NAME    New York, NY
DEST_CITY_NAME       Chicago, IL
AIRCRAFT_AGE                   9
DEST                         ORD
ARR_TIME                    2227
DEP_TIME                    2032
UNIQUE_CARRIER                B6
DAY_OF_WEEK                    1
AIRCRAFT_MFR             EMBRAER
FL_NUM                       105
MONTH                         12
DAY_OF_MONTH                  22
DISTANCE                     740
ORIGIN                       JFK
Name: 4034820, dtype: object

In [325]:
# make prediction
predictDelayTime(test_row, mdl)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a co

26.198260425446726

In [326]:
# create for each day info
db['PREDICTED_DELAY'] = 0.
db['FLIGHT_TIME'] = 0
db['PREDICTED_FLIGHT_TIME'] = 0
for index, row in db.iterrows():
    print 'processing {idx}'.format(idx=index)
    y_pred = predictDelayTime(row, mdl)
    db.set_value(index, 'PREDICTED_DELAY', y_pred)

    arr_time = datetime(year=2015, month=row['MONTH'], day=row['DAY_OF_MONTH'], \
                        hour= int(row['ARR_TIME'] / 100), minute=int(row['ARR_TIME'] % 100))
    dep_time = datetime(year=2015, month=row['MONTH'], day=row['DAY_OF_MONTH'], \
                        hour= int(row['DEP_TIME'] / 100), minute=int(row['DEP_TIME'] % 100))
    
    flight_time_in_min =  (convertToUTC(arr_time) - convertToUTC(dep_time))
    flight_time_in_min = int(flight_time_in_min.total_seconds() / 60)
    
    db.set_value(index, 'FLIGHT_TIME', flight_time_in_min)
    db.set_value(index, 'PREDICTED_FLIGHT_TIME', y_pred + flight_time_in_min)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a co

processing 4034820
processing 4036572
processing 4038546
processing 4042096
processing 4042580
processing 4043110
processing 4043446
processing 4044468
processing 4047361
processing 4050926
processing 4109591
processing 4109682
processing 4109776
processing 4109783
processing 4155935
processing 4155936
processing 4155937
processing 4155938
processing 4156107
processing 4156108
processing 4156109
processing 4156110
processing 4156111
processing 4160018
processing 4160019
processing 4160020
processing 4160021
processing 4160022
processing 4160023
processing 4178460
processing 4178461
processing 4178462
processing 4178463
processing 4178464
processing 4180125
processing 4180126
processing 4180127
processing 4180128
processing 4180129
processing 4180130
processing 4203523
processing 4233800
processing 4233809
processing 4233812
processing 4233815
processing 4233818
processing 4233828
processing 4233832
processing 4233851
processing 4234066
processing 4234128
processing 4234172
processing 4

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [327]:
db.head()

Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN,PREDICTED_DELAY,FLIGHT_TIME,PREDICTED_FLIGHT_TIME
4034820,"New York, NY","Chicago, IL",9,ORD,2227,2032,B6,1,EMBRAER,105,12,22,740,JFK,26.19826,115,141
4036572,"New York, NY","Chicago, IL",8,ORD,2100,1928,B6,2,EMBRAER,105,12,23,740,JFK,18.105004,92,110
4038546,"New York, NY","Chicago, IL",5,ORD,2106,1925,B6,7,EMBRAER,105,12,21,740,JFK,10.635419,101,111
4042096,"New York, NY","Chicago, IL",1,ORD,2125,1926,B6,3,EMBRAER,105,12,24,740,JFK,18.002437,119,137
4042580,"New York, NY","Chicago, IL",1,ORD,2049,1925,B6,6,EMBRAER,105,12,20,740,JFK,10.223136,84,94


In [328]:
db2 = db[db.DAY_OF_MONTH == 22]

In [329]:
db2

Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN,PREDICTED_DELAY,FLIGHT_TIME,PREDICTED_FLIGHT_TIME
4034820,"New York, NY","Chicago, IL",9,ORD,2227,2032,B6,1,EMBRAER,105,12,22,740,JFK,26.19826,115,141
4050926,"New York, NY","Chicago, IL",7,ORD,1036,830,B6,1,AIRBUS,905,12,22,740,JFK,8.811953,126,134
4156107,"New York, NY","Chicago, IL",14,MDW,1423,1304,WN,1,BOEING,333,12,22,725,LGA,7.796969,79,86
4156108,"New York, NY","Chicago, IL",2,MDW,738,627,WN,1,BOEING,490,12,22,725,LGA,-5.391739,71,65
4156109,"New York, NY","Chicago, IL",2,MDW,1732,1606,WN,1,BOEING,1010,12,22,725,LGA,13.853864,86,99
4156110,"New York, NY","Chicago, IL",2,MDW,2304,2151,WN,1,BOEING,1272,12,22,725,LGA,21.202949,73,94
4156111,"New York, NY","Chicago, IL",15,MDW,1253,1119,WN,1,BOEING,3211,12,22,725,LGA,8.157781,94,102
4235492,"New York, NY","Chicago, IL",16,ORD,1235,1102,UA,1,AIRBUS,242,12,22,733,LGA,7.641258,93,100
4235745,"New York, NY","Chicago, IL",16,ORD,916,757,UA,1,AIRBUS,463,12,22,733,LGA,0.540536,79,79
4237249,"New York, NY","Chicago, IL",15,ORD,1352,1227,UA,1,BOEING,1764,12,22,733,LGA,9.059469,85,94


In [330]:
dbres = db2.sort('PREDICTED_FLIGHT_TIME').head(6)
dbres.to_csv('data/best_flights.csv')

In [333]:
dbres2 = db2.sort('PREDICTED_FLIGHT_TIME')
dbres2.to_csv('data/some_flights.csv')

In [334]:
db2.sort('PREDICTED_FLIGHT_TIME')

Unnamed: 0,ORIGIN_CITY_NAME,DEST_CITY_NAME,AIRCRAFT_AGE,DEST,ARR_TIME,DEP_TIME,UNIQUE_CARRIER,DAY_OF_WEEK,AIRCRAFT_MFR,FL_NUM,MONTH,DAY_OF_MONTH,DISTANCE,ORIGIN,PREDICTED_DELAY,FLIGHT_TIME,PREDICTED_FLIGHT_TIME
4156108,"New York, NY","Chicago, IL",2,MDW,738,627,WN,1,BOEING,490,12,22,725,LGA,-5.391739,71,65
4235745,"New York, NY","Chicago, IL",16,ORD,916,757,UA,1,AIRBUS,463,12,22,733,LGA,0.540536,79,79
4156107,"New York, NY","Chicago, IL",14,MDW,1423,1304,WN,1,BOEING,333,12,22,725,LGA,7.796969,79,86
4237736,"New York, NY","Chicago, IL",16,ORD,845,714,UA,1,BOEING,1094,12,22,733,LGA,0.609249,91,91
4237531,"New York, NY","Chicago, IL",16,ORD,746,610,UA,1,AIRBUS,683,12,22,733,LGA,-2.842361,96,93
4156110,"New York, NY","Chicago, IL",2,MDW,2304,2151,WN,1,BOEING,1272,12,22,725,LGA,21.202949,73,94
4237249,"New York, NY","Chicago, IL",15,ORD,1352,1227,UA,1,BOEING,1764,12,22,733,LGA,9.059469,85,94
4237552,"New York, NY","Chicago, IL",16,ORD,1030,855,UA,1,AIRBUS,711,12,22,733,LGA,4.410556,95,99
4156109,"New York, NY","Chicago, IL",2,MDW,1732,1606,WN,1,BOEING,1010,12,22,725,LGA,13.853864,86,99
4235492,"New York, NY","Chicago, IL",16,ORD,1235,1102,UA,1,AIRBUS,242,12,22,733,LGA,7.641258,93,100
