# Predictive Delay Analytics

In [2]:
%matplotlib inline
# import required modules for prediction tasks
import numpy as np
import pandas as pd
import math
import random

## 0. Data acquisition and cleaning

In [None]:
%%time
# reads all predefined months for a year and merge into one data frame
rawData2014 = pd.DataFrame.from_csv('cache/predictionData/complete2014Data.csv')

Wall time: 1min 49s


In [3]:
print rawData2014.columns
rawData2014.head(5)

Index([u'index', u'FL_DATE', u'UNIQUE_CARRIER', u'TAIL_NUM', u'FL_NUM',
       u'ORIGIN', u'DEST', u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY',
       u'TAXI_OUT', u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME',
       u'ARR_TIME', u'ARR_DELAY', u'CANCELLED', u'CANCELLATION_CODE',
       u'AIR_TIME', u'DISTANCE', u'CARRIER_DELAY', u'WEATHER_DELAY',
       u'NAS_DELAY', u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY',
       u'AIRCRAFT_YEAR', u'AIRCRAFT_MFR', u'LAT', u'LONG'],
      dtype='object')


Unnamed: 0,index,FL_DATE,UNIQUE_CARRIER,TAIL_NUM,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,AIRCRAFT_YEAR,AIRCRAFT_MFR,LAT,LONG
0,0,2014-01-01,AA,N338AA,1,JFK,LAX,900,914.0,14.0,...,2475,,,,,,1987.0,BOEING,40.633333,-73.783333
1,1,2014-01-02,AA,N338AA,1,JFK,LAX,900,857.0,-3.0,...,2475,,,,,,1987.0,BOEING,40.633333,-73.783333
2,2,2014-01-03,AA,N323AA,1,JFK,LAX,900,,,...,2475,,,,,,,,40.633333,-73.783333
3,3,2014-01-04,AA,N327AA,1,JFK,LAX,900,1005.0,65.0,...,2475,0.0,59.0,0.0,0.0,0.0,1986.0,BOEING,40.633333,-73.783333
4,4,2014-01-05,AA,N323AA,1,JFK,LAX,900,1050.0,110.0,...,2475,0.0,110.0,0.0,0.0,0.0,,,40.633333,-73.783333


### Cleaning the data

When cleaning the data set, we have to remove the following entries:

- flights that have been cancelled or diverted. We focus on predicting the delay. As a result, we also remove the columns associated with diverted flights.
- colmuns that give the answer. This is the case of many colmuns related to the arrival of the plane
- rows where a value is missing

Note that data points have to be cleaned in this order because most flights have empty entries for the 'diverted' columns.

In [4]:
#entries to be dropped in the analysis
columns_dropped = ['index', 'TAIL_NUM', 'FL_NUM', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', \
                   'WHEELS_ON', 'TAXI_IN', 'ARR_TIME', 'CANCELLED', 'CANCELLATION_CODE', 'AIR_TIME', \
                   'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']

In [5]:
def clean(data, list_col):
    ''' 
    Creates a dataset by excluding undesirable columns

    Parameters:
    -----------

    data: pandas.DataFrame
       Flight dataframe  

    list_col: <list 'string'>
        Comumns to exclude from the data set
    '''
    
    data.drop(data[data.CANCELLED == 0].index, inplace=True)
    data.drop(list_col, axis=1, inplace=True)
    data.dropna(axis = 0, inplace = True)
    return

In [None]:
%%time
data2014 = clean(rawData2014.copy(), columns_dropped)
print data2014.columns

In [None]:
%%time
# save the data to avoid computing them again
file_path = "cache/predictionData/predictionData2014.csv"
data2014.to_csv(path_or_buf= file_path)

In [121]:
%%time
# recover data2014 from cache/predictionData folder
file_path = "cache/predictionData/predictionData2014.csv"
data2014 = pd.read_csv(file_path)
data2014.drop('Unnamed: 0', axis= 1, inplace = True)
data2014.columns

Wall time: 10 s


In [122]:
# test that clean did the job
print "size of raw data set: ", len(rawData2014)
print "number of cancelled: ", len(rawData2014[(rawData2014.CANCELLED == 1)])
print "size of data set: ", len(data2014)

size of raw data set: 

NameError: name 'rawData2014' is not defined

### Restricting the dataset

The dataset has more than 4 millions entries, which makes any data manipulation extremely costly - let alone model fitting. We will therefore make some restrictions on the airports and the airlines considered.

In [123]:
data2014.groupby('UNIQUE_CARRIER').size()

UNIQUE_CARRIER
AA    137960
AS    157312
B6    225303
DL    644768
EV    616920
F9     81758
FL     69486
HA     73370
MQ      2995
OO    586067
UA    471093
US    381688
VX     57056
WN    597821
dtype: int64




In [124]:
def restrict_carrier(data, droplist):
    for item in droplist:
        data.drop(data[data.UNIQUE_CARRIER == item].index, inplace= True)
    return 

In [125]:
%%time
drop_airline = [ 'AA', 'AS', 'B6', 'EV', 'F9', 'FL', 'HA', 'MQ', 'OO', 'US', 'VX', 'WN']
restrict_carrier(data2014, drop_airline)
print "number of points", len(data2014)
print "airlines", set(data2014.UNIQUE_CARRIER)

number of points 1115861
airlines set(['DL', 'UA'])
Wall time: 12.5 s


We now focus on the main airports. We look for airports that have on average 50 domestic flight everyday.

In [126]:
def restrict_airport(data, threshold):
    dict_count = data2014.groupby("DEST").agg(['count']).LAT.to_dict()['count']
    
    for key in dict_count:
        if dict_count[key] < threshold:
            data.drop(data[data.DEST == key].index, inplace=True)
            data.drop(data[data.ORIGIN == key].index, inplace=True)
    
    print data2014.groupby("DEST").agg(['count']).LAT.to_dict()['count']
    
    return

** WARNING ** \
RUN THIS CELL ONLY ONCE!

In [127]:
%%time
restrict_airport(data2014, 60*365)
dataRes = data2014.copy()
print "number of airports: ", len(set(data2014))
print "dataset size: ", len(data2014)


KeyboardInterrupt


In [15]:
%%time
# save the restricted data to avoid computing them again
file_path = "cache/predictionData/restrictedPredictionData2014.csv"
dataRes.to_csv(path_or_buf= file_path)

Wall time: 2.25 s


In [129]:
dataRes.head(3)

Unnamed: 0,FL_DATE,UNIQUE_CARRIER,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,DISTANCE,AIRCRAFT_YEAR,AIRCRAFT_MFR,LAT,LONG
0,2014-01-01,DL,SLC,ATL,940,1521,-3,1590,1992,BOEING,40.788333,-111.966667
1,2014-01-01,DL,ATL,SLC,1645,1900,-2,1590,1988,BOEING,33.636667,-84.428056
2,2014-01-01,DL,MCO,ATL,1330,1500,-8,404,2003,BOEING,28.429444,-81.308889


In [130]:
%%time
# recover file
file_path = "cache/predictionData/dataRes.csv"
dataRes = pd.read_csv(file_path)
dataRes.drop('Unnamed: 0', axis= 1, inplace = True)
print dataRes.columns

Index([u'FL_DATE', u'UNIQUE_CARRIER', u'ORIGIN', u'DEST', u'CRS_DEP_TIME',
       u'CRS_ARR_TIME', u'ARR_DELAY', u'DISTANCE', u'AIRCRAFT_YEAR',
       u'AIRCRAFT_MFR', u'LAT', u'LONG'],
      dtype='object')
Wall time: 809 ms


### Encoding categorical variables

In [174]:
from time import strptime
days = {0:"Mon", 1:"Tues", 2:"Wed", 3:"Thurs", 4:"Fri", 5:"Sat", 6:"Sun"}
months = {1:"Jan", 2:"Feb", 3:"Mar", 4:"Apr", 5:"May", 6:"June", 7:"July", 8:"Aug", 9:"Sep", \
          10:"Oct", 11:"Nov", 12:"Dec"}

In [175]:
def adjust_time(data):
    monlist = np.empty(len(data), dtype = str)
    daylist = np.empty(len(data), dtype = str)
    
    for i in xrange(len(data)):
        date= strptime(data.FL_DATE.iloc[i], "%Y-%M-%d")
        monlist[i] = months[date.tm_min]
        daylist[i] = days[date.tm_wday]

    return monlist, daylist

In [176]:
dataRes.columns

Index([u'FL_DATE', u'UNIQUE_CARRIER', u'ORIGIN', u'DEST', u'CRS_DEP_TIME',
       u'CRS_ARR_TIME', u'ARR_DELAY', u'DISTANCE', u'AIRCRAFT_YEAR',
       u'AIRCRAFT_MFR', u'LAT', u'LONG'],
      dtype='object')

In [177]:
%%time
monlist, daylist = adjust_time(dataRes)
print "OK"
dataRes['MONTH'] = pd.Series(monlist, index=dataRes.index)
dataRes['DAY'] = pd.Series(daylist, index=dataRes.index)
if 'FL_DATE' in dataRes.columns:
    dataRes.drop('FL_DATE', axis = 1, inplace= True)
print dataRes.columns

OK
Index([u'UNIQUE_CARRIER', u'ORIGIN', u'DEST', u'CRS_DEP_TIME', u'CRS_ARR_TIME',
       u'ARR_DELAY', u'DISTANCE', u'AIRCRAFT_YEAR', u'AIRCRAFT_MFR', u'LAT',
       u'LONG', u'MONTH', u'DAY'],
      dtype='object')
Wall time: 15.6 s


### Adjusting numerical data

Let's change the script to put time in minutes.

In [182]:
%%time
ti = lambda x: x/100*60+x%100
print "before change: ", dataRes.CRS_ARR_TIME[:2]
dataRes['CRS_ARR_TIME_COR'] = dataRes.CRS_ARR_TIME.map(ti)
dataRes['CRS_DEP_TIME_COR'] = dataRes.CRS_DEP_TIME.map(ti)
dataRes.drop(['CRS_DEP_TIME', 'CRS_ARR_TIME'], axis = 1, inplace = True)
print
print "after change: ", dataRes.CRS_ARR_TIME_COR[:2]

before change:  0    1521
1    1900
Name: CRS_ARR_TIME, dtype: int64

after change:  0     921
1    1140
Name: CRS_ARR_TIME_COR, dtype: int64
Wall time: 1.23 s


We need to center and normalize all continuous data

In [183]:
# # change the age of the aircraft from a string type to an integer type
dataRes.drop(dataRes[dataRes.AIRCRAFT_YEAR =='    '].index, inplace = True)
dataRes['AIRCRAFT_YEAR_COR'] = dataRes.AIRCRAFT_YEAR.map(lambda x: int(x))
dataRes.drop('AIRCRAFT_YEAR', axis = 1, inplace = True)

In [184]:
def normalize(array):
    mean = np.mean(array)
    std = np.std(array)
    return [(x - mean)/std for x in array]

In [185]:
def normalize_data(data, feature_list):
    ''' 
    Normalize data.

    Parameters:
    -----------

    data: pandas.DataFrame
       dataframe  

    feature_list: <list 'string'>
        List of features to be normalized
    '''           

    for feature in feature_list:
        if feature in data.columns:
            data[feature + '_NOR'] = normalize(data[feature].values)
            data.drop(feature, axis =1, inplace=True)
    return

In [187]:
%%time
normalize_feature = ['CRS_DEP_TIME_COR', 'CRS_ARR_TIME_COR', 'DISTANCE', 'LONG', 'LAT', 'AIRCRAFT_YEAR_COR']
normalize_data(dataRes, normalize_feature)

Wall time: 3.55 s


We are only interested in whetehr a flight will be more than 15 minutes late. So we adjust the ARR_DELAY colum to an indicator.

In [188]:
dataRes['ARR_DELAY_COR'] = dataRes.ARR_DELAY.map(lambda x: (x >= 15))
dataRes.drop('ARR_DELAY', axis = 1, inplace = True)

In [189]:
print dataRes.columns
dataRes.head(3)

Index([u'UNIQUE_CARRIER', u'ORIGIN', u'DEST', u'AIRCRAFT_MFR', u'MONTH',
       u'DAY', u'CRS_DEP_TIME_COR_NOR', u'CRS_ARR_TIME_COR_NOR',
       u'DISTANCE_NOR', u'LONG_NOR', u'LAT_NOR', u'AIRCRAFT_YEAR_COR_NOR',
       u'ARR_DELAY_COR'],
      dtype='object')


Unnamed: 0,UNIQUE_CARRIER,ORIGIN,DEST,AIRCRAFT_MFR,MONTH,DAY,CRS_DEP_TIME_COR_NOR,CRS_ARR_TIME_COR_NOR,DISTANCE_NOR,LONG_NOR,LAT_NOR,AIRCRAFT_YEAR_COR_NOR,ARR_DELAY_COR
0,DL,SLC,ATL,BOEING,J,W,-0.754789,-0.018805,0.558671,-1.102922,0.69753,-0.996933,False
1,DL,ATL,SLC,BOEING,J,W,0.733454,0.703785,0.558671,0.568072,-0.774714,-1.583051,False
2,DL,MCO,ATL,BOEING,J,W,0.050613,-0.088095,-1.256618,0.757338,-1.846675,0.61489,False


### Encode categorical variables

In [226]:
encoded_list = ['UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'AIRCRAFT_MFR', 'MONTH','DAY']

In [241]:
%%time
finalData = pd.get_dummies(dataRes, columns=encoded_list)

Wall time: 695 ms


In [245]:
finalData.head(3)

Unnamed: 0,CRS_DEP_TIME_COR_NOR,CRS_ARR_TIME_COR_NOR,DISTANCE_NOR,LONG_NOR,LAT_NOR,AIRCRAFT_YEAR_COR_NOR,ARR_DELAY_COR,UNIQUE_CARRIER_DL,UNIQUE_CARRIER_UA,ORIGIN_ATL,...,MONTH_J,MONTH_M,MONTH_N,MONTH_O,MONTH_S,DAY_F,DAY_M,DAY_S,DAY_T,DAY_W
0,-0.754789,-0.018805,0.558671,-1.102922,0.69753,-0.996933,False,1,0,0,...,1,0,0,0,0,0,0,0,0,1
1,0.733454,0.703785,0.558671,0.568072,-0.774714,-1.583051,False,1,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0.050613,-0.088095,-1.256618,0.757338,-1.846675,0.61489,False,1,0,0,...,1,0,0,0,0,0,0,0,0,1


In [246]:
%%time
# save the restricted data to avoid computing them again
file_path = "cache/predictionData/finalData.csv"
finalData.to_csv(path_or_buf= file_path)

Wall time: 9.67 s


In [247]:
%%time
# recover data2014 from cache/predictionData folder
file_path = "cache/predictionData/finalData.csv"
finalData = pd.read_csv(file_path)
finalData.drop('Unnamed: 0', axis= 1, inplace = True)

Wall time: 4.21 s


## 1. Baseline classifiers

We will make prediction on the variable 'ARR_DEL15'. This variable takes the value 1 is the plane is more than 15 minutes late and 0 if not. Let's look at the baseline classifier, that is the classifiers that assign repectively 1 or 0 to 'ARR_DEL15' for every flight.

In [248]:
from __future__ import division

def baseline(data, target):
    ''' 
    Compute the baseline classifiers along a target variable for a data set data

    Parameters:
    -----------

    data: pandas.DataFrame
       dataframe  

    target: string
        Column of data along wich we compute the baseline classifiers
    '''    
    
    
    score_baseline_1 = np.size(data[data[target] == 1][target].values) / np.size(data[target].values)
    score_baseline_0 = np.size(data[data[target] == 0][target].values) / np.size(data[target].values)
    
    print "baseline classifier everyone to 0: ", int(score_baseline_0*100) , "%"
    print "baseline classifier everyone to 1: ", int(score_baseline_1*100) , "%"
   
    return score_baseline_0, score_baseline_1

In [250]:
score_baseline_0, score_baseline_1 = baseline(finalData, 'ARR_DELAY_COR')

baseline classifier everyone to 0:  78 %
baseline classifier everyone to 1:  21 %


### Split data into training/test sets

First, let's split the data set into a training set and a test set. 

In [252]:
from sklearn.cross_validation import train_test_split

In [253]:
def split(data, list_drop, target, test_size):
    ''' 
    Splits the data into a training and a test set
    Separates the training and test sets according to a feature set and a target set
    Balance the features sets by retaining only fraction of its points

    Parameters:
    -----------

    data: pandas.DataFrame
       Flight dataframe  

    list_drop: <list 'string'>
        List of columns to exclude from the features set
        
    target: string
        target column along whch we make the target set
        
    test_size: float
        size of the test set
    
    '''    
    
    #split the dataset into a training set and a test set
    dtrain, dtest = train_test_split(data, test_size = 0.3)
    
    Xtrain = dtrain.drop(list_drop, axis=1).values
    ytrain = dtrain[target].values
    Xtest = dtest.drop(list_drop, axis=1).values
    ytest = dtest[target].values
    
    return Xtrain, ytrain, Xtest, ytest

In [255]:
Xtrain, ytrain, Xtest, ytest = split(finalData, ['ARR_DELAY_COR'], 'ARR_DELAY_COR', 0.4)

## 2. Random Forest

In [256]:
from sklearn.ensemble import RandomForestClassifier

In [257]:
def score_random_forest(Xtrain, ytrain, Xtest, ytest, n_trees=10, criterion='gini', max_features='auto'):
    ''' 
    Fits a random forest with (Xtrain ,ytrain)
    Computes the score on (Xtest, ytest)

    Parameters:
    -----------

    Xtrain: numpy 2D array
       Feature training set

    ytrain: numpy 1D array
        Target training set
    
    Xtest: numpy 2D array
       Feature test set

    ytest: numpy 1D array
        Target test set
    
    n_trees: int
        number of trees in the forest
    
    criterion: string
        loss function
    
    max_features: string or int
        number of features used for every tree
        
    Outputs:
    --------
    
    score_train: float
        score on the train set
    
    score_test: float
        score on the test set
    
    clf.feature_importances_
        weights of each feature as used by the classifier
    
    ''' 

    clf= RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_features= max_features)
    clf.fit(Xtrain, ytrain)
    
    score_train = clf.score(Xtrain, ytrain)
    score_test = clf.score(Xtest, ytest)
    
    return  score_train, score_test, clf.feature_importances_

In [258]:
def best_parameters(Xtrain, ytrain, Xtest, ytest, criterions, nb_trees, nb_features):
    ''' 
    Fits sequentially random forest classifiers
    Adds each test score in a pandas.DataFrame with the number of trees, the loss function, the train score,
    and the importance of each features
    Returns a DataFrame with all scores

    Parameters:
    -----------

    Xtrain: numpy 2D array
       Feature training set

    ytrain: numpy 1D array
        Target training set
    
    Xtest: numpy 2D array
       Feature test set

    ytest: numpy 1D array
        Target test set
    
    n_trees: <list int>
        list of numbers of trees in the forest
    
    criterions: <list 'string'>
        list of loss functions
    
    nb_features: <list int>
        list of number of features in the forest
        
    Outputs:
    --------
    
    score_tab: pandas.DataFrame
        DataFrame of scores with associated parameters
    
    '''
    
    score_tab = pd.DataFrame(columns=['loss', 'nb_trees', 'nb_features', 'test_score', 'train_score', 'features_importance'])
    
    # counter will increment the index in score_tab
    counter = 0 

    for loss in criterions:
        for n_estimators in nb_trees:
            for max_features in nb_features:
                
                score_train, score_test, features_weights = \
                score_random_forest(Xtrain, ytrain, Xtest, ytest, n_trees=n_estimators, criterion=loss, max_features=max_features) 
                score_tab.loc[counter] = [loss, n_estimators, max_features, score_test, score_train, features_weights]
                counter += 1

    return score_tab

In [259]:
def classify_random_forest(data, list_drop, target, test_size=0.4, criterions = ['gini'], nb_trees=[10], nb_features = ['auto']):
    ''' 
    Combines all above functions

    Parameters:
    -----------

    See above.
        
    Outputs:
    --------
    
    score_tab: pandas.DataFrame
        DataFrame of scores with associated parameters
    
    '''
    
    Xtrain, ytrain, Xtest, ytest = split(data, list_drop, target, test_size)
    scores =  best_parameters(Xtrain, ytrain, Xtest, ytest, criterions, nb_trees, nb_features)
    return scores

In [262]:
criterions = ['gini']
nb_trees = [25, 50, 100]
nb_features = ['auto']
test_size = 0.4

In [263]:
%%time
randomForest2014 =  classify_random_forest(finalData, ['ARR_DELAY_COR'], 'ARR_DELAY_COR', test_size=test_size, criterions=criterions, nb_trees=nb_trees, nb_features=nb_features)
print randomForest2014.head(3)

   loss  nb_trees nb_features  test_score  train_score  \
0  gini        25        auto    0.726037     0.948465   
1  gini        50        auto    0.725860     0.951069   
2  gini       100        auto    0.728171     0.951831   

                                 features_importance  
0  [0.163223186479, 0.195587614632, 0.02539998298...  
1  [0.165080172005, 0.199351953592, 0.02509314710...  
2  [0.16313213511, 0.196903639617, 0.025514979356...  
Wall time: 2min 52s


### Important coefficients

Let's now look at the importance coefficients, that the average usage of each coefficients in the random forest.

In [282]:
coeffs = pd.Series(randomForest2014.ix[0, 'features_importance'], index=finalData.drop('ARR_DELAY_COR', axis =1).columns)
coeffs.sort(ascending=False, inplace=True)
print coeffs

AIRCRAFT_YEAR_COR_NOR                          0.282699
CRS_ARR_TIME_COR_NOR                           0.195588
CRS_DEP_TIME_COR_NOR                           0.163223
DAY_T                                          0.030229
DAY_S                                          0.028838
DISTANCE_NOR                                   0.025400
DAY_F                                          0.024574
DAY_W                                          0.023403
DAY_M                                          0.023141
MONTH_A                                        0.013087
MONTH_M                                        0.012611
MONTH_J                                        0.010943
AIRCRAFT_MFR_AIRBUS INDUSTRIE                  0.010772
AIRCRAFT_MFR_BOEING                            0.010439
MONTH_O                                        0.009955
LAT_NOR                                        0.009353
LONG_NOR                                       0.009280
MONTH_D                                        0

In [None]:
# save file to /data/ folder
file_path = "cache/predictionData/randomForest2014.csv"
randomForest2014.to_csv(path_or_buf= file_path)

## 3. Neural Network

*Describe Process*