# Predictive Delay Analytics

In [1]:
%matplotlib inline
# import required modules for prediction tasks
import numpy as np
import pandas as pd
import math
import random
import requests
import zipfile
import StringIO
import re
import json
import os

## 0. Data acquisition and cleaning

In [2]:
%%time
# reads all predefined months for a year and merge into one data frame
rawData2014 = pd.DataFrame.from_csv('cache/predictionData/complete2014Data.csv')

Wall time: 1min 55s


In [7]:
print rawData2014.columns
rawData2014.head(5)

Index([u'index', u'FL_DATE', u'UNIQUE_CARRIER', u'TAIL_NUM', u'FL_NUM',
       u'ORIGIN', u'DEST', u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY',
       u'TAXI_OUT', u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME',
       u'ARR_TIME', u'ARR_DELAY', u'CANCELLED', u'CANCELLATION_CODE',
       u'AIR_TIME', u'DISTANCE', u'CARRIER_DELAY', u'WEATHER_DELAY',
       u'NAS_DELAY', u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY',
       u'AIRCRAFT_YEAR', u'AIRCRAFT_MFR', u'LAT', u'LONG'],
      dtype='object')


Unnamed: 0,index,FL_DATE,UNIQUE_CARRIER,TAIL_NUM,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,AIRCRAFT_YEAR,AIRCRAFT_MFR,LAT,LONG
0,0,2014-01-01,AA,N338AA,1,JFK,LAX,900,914.0,14.0,...,2475,,,,,,1987.0,BOEING,40.633333,-73.783333
1,1,2014-01-02,AA,N338AA,1,JFK,LAX,900,857.0,-3.0,...,2475,,,,,,1987.0,BOEING,40.633333,-73.783333
2,2,2014-01-03,AA,N323AA,1,JFK,LAX,900,,,...,2475,,,,,,,,40.633333,-73.783333
3,3,2014-01-04,AA,N327AA,1,JFK,LAX,900,1005.0,65.0,...,2475,0.0,59.0,0.0,0.0,0.0,1986.0,BOEING,40.633333,-73.783333
4,4,2014-01-05,AA,N323AA,1,JFK,LAX,900,1050.0,110.0,...,2475,0.0,110.0,0.0,0.0,0.0,,,40.633333,-73.783333


In [8]:
set(rawData2014.AIRCRAFT_MFR)

{nan,
 'AGUSTA SPA                    ',
 'AIRBUS                        ',
 'AIRBUS INDUSTRIE              ',
 'AVIAT AIRCRAFT INC            ',
 'BARKER JACK L                 ',
 'BEECH                         ',
 'BELL                          ',
 'BENHAM JOHN                   ',
 'BOEING                        ',
 'BOMBARDIER INC                ',
 'CANADAIR                      ',
 'CANADAIR LTD                  ',
 'CESSNA                        ',
 'CIRRUS DESIGN CORP            ',
 'DOUGLAS                       ',
 'EMBRAER                       ',
 'EMBRAER S A                   ',
 'FRIEDEMANN JON                ',
 'GROSS ROBERT                  ',
 'GULFSTREAM AEROSPACE          ',
 'KILDALL GARY                  ',
 'LAMBERT RICHARD               ',
 'LEARJET INC                   ',
 'LEBLANC GLENN T               ',
 'MARZ BARRY                    ',
 'MCDONNELL DOUGLAS             ',
 'MCDONNELL DOUGLAS AIRCRAFT CO ',
 'MCDONNELL DOUGLAS CORPORATION ',
 'PAIR MIKE E 

### Cleaning the data

When cleaning the data set, we have to remove the following entries:

- flights that have been cancelled or diverted. We focus on predicting the delay. As a result, we also remove the columns associated with diverted flights.
- colmuns that give the answer. This is the case of many colmuns related to the arrival of the plane
- rows where a value is missing

Note that data points have to be cleaned in this order because most flights have empty entries for the 'diverted' columns.

In [107]:
#entries to be dropped in the analysis
flight_data_dropped = [

In [108]:
def clean(data, list_col):
    ''' 
    Creates a dataset by excluding undesirable columns

    Parameters:
    -----------

    data: pandas.DataFrame
       Flight dataframe  

    list_col: <list 'string'>
        Comumns to exclude from the data set
    '''

    
    res = data[(data.CANCELLED == 0) & (data.DIVERTED == 0)]
    res.drop(list_col, axis=1, inplace=True)
    res.dropna(axis = 0, inplace = True)
    return res

In [174]:
%%time
data2014 = clean(rawData2014, columns_dropped)
print data2014.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Index([u'MONTH', u'DAY_OF_WEEK', u'UNIQUE_CARRIER', u'ORIGIN',
       u'ORIGIN_CITY_NAME', u'ORIGIN_STATE_ABR', u'DEST', u'DEST_CITY_NAME',
       u'DEST_STATE_ABR', u'CRS_DEP_TIME', u'CRS_ARR_TIME', u'ARR_DEL15',
       u'DISTANCE', u'DISTANCE_GROUP'],
      dtype='object')
Wall time: 4.89 s


In [161]:
%%time
# save the data to avoid computing them again
file_path = "cache/predictionData/predictionData2014.csv"
data2014.to_csv(path_or_buf= file_path)

KeyboardInterrupt: 

In [91]:
# recover data2014 from cache/predictionData folder
#file_path = "cache/predictionData/predictionData2014.csv"
#data2014 = pd.read_json(file_path)
#data2014.columns

In [110]:
# test that clean did the job
print "size of raw data set: ", len(rawData2014)
print "number of cancelled or diverted flights: ", len(rawData2014[(rawData2014.CANCELLED == 1) | (rawData2014.DIVERTED == 1)])
print "size of data set: ", len(data2014)

size of raw data set:  471949
number of cancelled or diverted flights:  32329
size of data set:  439620


### Encoding categorical variables

In [111]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [175]:
encoded_list = ['MONTH', 'DAY_OF_WEEK', 'UNIQUE_CARRIER', 'ORIGIN', 'ORIGIN_CITY_NAME', \
               'ORIGIN_STATE_ABR', 'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DISTANCE_GROUP']

In [180]:
def encoder(data, feature_list):
    ''' 
    Creates a dataset by excluding undesirable columns

    Parameters:
    -----------

    data: pandas.DataFrame
       Flight dataframe  

    feature_list: <list 'string'>
        List of features to turn into categorical variables
    '''
    
    res = []
    
    for feature in feature_list:
        if feature in data.columns:
            print data[feature].head(1)
            res.append(pd.get_dummies(data[feature]))
            print feature, " done!"
            
    return res

In [184]:
print math.float(NaN) in set(data2014['DEST_CITY_NAME'].values)

AttributeError: 'module' object has no attribute 'float'

In [181]:
encoded2014 = encoder(data2014, encoded_list)

YEAR
2014-01-01    1
Name: MONTH, dtype: int64
MONTH  done!
YEAR
2014-01-01    3
Name: DAY_OF_WEEK, dtype: int64
DAY_OF_WEEK  done!
YEAR
2014-01-01    AA
Name: UNIQUE_CARRIER, dtype: object
UNIQUE_CARRIER  done!
YEAR
2014-01-01    JFK
Name: ORIGIN, dtype: object
ORIGIN  done!
YEAR
2014-01-01    New York, NY
Name: ORIGIN_CITY_NAME, dtype: object
ORIGIN_CITY_NAME  done!
YEAR
2014-01-01    NY
Name: ORIGIN_STATE_ABR, dtype: object
ORIGIN_STATE_ABR  done!
YEAR
2014-01-01    LAX
Name: DEST, dtype: object
DEST  done!
YEAR
2014-01-01    Los Angeles, CA
Name: DEST_CITY_NAME, dtype: object


MemoryError: 

In [166]:
data2014.head(1)

Unnamed: 0_level_0,DAY_OF_WEEK,UNIQUE_CARRIER,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,DEST,DEST_CITY_NAME,DEST_STATE_ABR,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DEL15,DISTANCE,DISTANCE_GROUP,MONTH_CAT
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2014-01-01,3,AA,JFK,"New York, NY",NY,LAX,"Los Angeles, CA",CA,900,1225,0,2475,10,"(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ..."


In [173]:
pd.get_dummies(data2014['DEST_STATE_ABR'])

Unnamed: 0_level_0,AK,AL,AR,AZ,CA,CO,CT,DE,FL,GA,...,TT,TX,UT,VA,VI,VT,WA,WI,WV,WY
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-01,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Adjusting numerical data

We need to center and normalize all continuous data

In [134]:
def normalize(array):
    mean = np.mean(array)
    std = np.std(array)
    return [(x - mean)/std for x in array]

In [138]:
def normalize_data(data, feature_list):
    ''' 
    Normalize data.

    Parameters:
    -----------

    data: pandas.DataFrame
       dataframe  

    feature_list: <list 'string'>
        List of features to be normalized
    '''           

    for feature in feature_list:
        if feature in data.columns:
            data[feature + '_NOR'] = normalize(data[feature].values)
            data.drop(feature, axis =1, inplace=True)
    return

In [146]:
normalize_feature = ['CRS_DEP_TIME', 'CRS_ARR_TIME', 'DISTANCE']
normalize_data(data2014, normalize_feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [147]:
data2014.head(1)

Unnamed: 0_level_0,ARR_DEL15,MONTH_CAT,DAY_OF_WEEK_CAT,UNIQUE_CARRIER_CAT,ORIGIN_CAT,ORIGIN_CITY_NAME_CAT,ORIGIN_STATE_ABR_CAT,DEST_CAT,DEST_CITY_NAME_CAT,DEST_STATE_ABR_CAT,CRS_DEP_TIME_NOR,DISTANCE_GROUP_CAT,CRS_ARR_TIME_NOR,DISTANCE_NOR
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2014-01-01,0,"(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...","(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...",-0.924792,"(0, 439619)\t1.0\n (0, 439618)\t1.0\n (0, ...",-0.611188,2.810306


## 1. Baseline classifiers

We will make prediction on the variable 'ARR_DEL15'. This variable takes the value 1 is the plane is more than 15 minutes late and 0 if not. Let's look at the baseline classifier, that is the classifiers that assign repectively 1 or 0 to 'ARR_DEL15' for every flight.

In [148]:
from __future__ import division

def baseline(data, target):
    ''' 
    Compute the baseline classifiers along a target variable for a data set data

    Parameters:
    -----------

    data: pandas.DataFrame
       dataframe  

    target: string
        Column of data along wich we compute the baseline classifiers
    '''    
    
    
    score_baseline_1 = np.size(data[data[target] == 1][target].values) / np.size(data[target].values)
    score_baseline_0 = np.size(data[data[target] == 0][target].values) / np.size(data[target].values)
    
    print "baseline classifier everyone to 0: ", int(score_baseline_0*100) , "%"
    print "baseline classifier everyone to 1: ", int(score_baseline_1*100) , "%"
   
    return score_baseline_0, score_baseline_1

In [149]:
score_baseline_0, score_baseline_1 = baseline(data2014, 'ARR_DEL15')

baseline classifier everyone to 0:  72 %
baseline classifier everyone to 1:  27 %


### Split data into training/test sets

First, let's split the data set into a training set and a test set. 

In [53]:
from sklearn.cross_validation import train_test_split

In [55]:
def split(data, list_drop, target, test_size):
    ''' 
    Splits the data into a training and a test set
    Separates the training and test sets according to a feature set and a target set
    Balance the features sets by retaining only fraction of its points

    Parameters:
    -----------

    data: pandas.DataFrame
       Flight dataframe  

    list_drop: <list 'string'>
        List of columns to exclude from the features set
        
    target: string
        target column along whch we make the target set
        
    test_size: float
        size of the test set
    
    '''    
    
    #split the dataset into a training set and a test set
    dtrain, dtest = train_test_split(data, test_size = 0.3)
    
    Xtrain = dtrain.drop(list_drop, axis=1).values
    ytrain = dtrain[target].values
    Xtest = dtest.drop(list_drop, axis=1).values
    ytest = dtest[target].values
    
    return Xtrain, ytrain, Xtest, ytest

In [64]:
Xtrain, ytrain, Xtest, ytest = split(data2014, ['ARR_DEL15'], 'ARR_DEL15', 0.4)

## 2. Random Forest

In [150]:
from sklearn.ensemble import RandomForestClassifier

In [151]:
def score_random_forest(Xtrain, ytrain, Xtest, ytest, n_trees=10, criterion='gini', max_features='auto'):
    ''' 
    Fits a random forest with (Xtrain ,ytrain)
    Computes the score on (Xtest, ytest)

    Parameters:
    -----------

    Xtrain: numpy 2D array
       Feature training set

    ytrain: numpy 1D array
        Target training set
    
    Xtest: numpy 2D array
       Feature test set

    ytest: numpy 1D array
        Target test set
    
    n_trees: int
        number of trees in the forest
    
    criterion: string
        loss function
    
    max_features: string or int
        number of features used for every tree
        
    Outputs:
    --------
    
    score_train: float
        score on the train set
    
    score_test: float
        score on the test set
    
    clf.feature_importances_
        weights of each feature as used by the classifier
    
    ''' 

    clf= RandomForestClassifier(n_estimators=n_trees, criterion=criterion, max_features= max_features)
    clf.fit(Xtrain, ytrain)
    
    score_train = clf.score(Xtrain, ytrain)
    score_test = clf.score(Xtest, ytest)
    
    return  score_train, score_test, clf.feature_importances_

In [152]:
def best_parameters(Xtrain, ytrain, Xtest, ytest, criterions, nb_trees, nb_features):
    ''' 
    Fits sequentially random forest classifiers
    Adds each test score in a pandas.DataFrame with the number of trees, the loss function, the train score,
    and the importance of each features
    Returns a DataFrame with all scores

    Parameters:
    -----------

    Xtrain: numpy 2D array
       Feature training set

    ytrain: numpy 1D array
        Target training set
    
    Xtest: numpy 2D array
       Feature test set

    ytest: numpy 1D array
        Target test set
    
    n_trees: <list int>
        list of numbers of trees in the forest
    
    criterions: <list 'string'>
        list of loss functions
    
    nb_features: <list int>
        list of number of features in the forest
        
    Outputs:
    --------
    
    score_tab: pandas.DataFrame
        DataFrame of scores with associated parameters
    
    '''
    
    score_tab = pd.DataFrame(columns=['loss', 'nb_trees', 'nb_features', 'test_score', 'train_score', 'features_importance'])
    
    # counter will increment the index in score_tab
    counter = 0 

    for loss in criterions:
        for n_estimators in nb_trees:
            for max_features in nb_features:
                
                score_train, score_test, features_weights = \
                score_random_forest(Xtrain, ytrain, Xtest, ytest, n_trees=n_estimators, criterion=loss, max_features=max_features) 
                score_tab.loc[counter] = [loss, n_estimators, max_features, score_test, score_train, features_weights]
                counter += 1

    return score_tab

In [153]:
def classify_random_forest(data, list_drop, target, test_size=0.4, criterions = ['gini'], nb_trees=[10], nb_features = ['auto']):
    ''' 
    Combines all above functions

    Parameters:
    -----------

    See above.
        
    Outputs:
    --------
    
    score_tab: pandas.DataFrame
        DataFrame of scores with associated parameters
    
    '''
    
    Xtrain, ytrain, Xtest, ytest = split(data, list_drop, target, test_size)
    scores =  best_parameters(Xtrain, ytrain, Xtest, ytest, criterions, nb_trees, nb_features)
    return scores

In [154]:
criterions = ['gini']
nb_trees = [25]
nb_features = [4]
test_size = 0.4

In [155]:
%%time
randomForest2014 =  classify_random_forest(data2014, ['ARR_DEL15'], 'ARR_DEL15', test_size=test_size, criterions=criterions, nb_trees=nb_trees, nb_features=nb_features)
print randomForest2014.head()

ValueError: setting an array element with a sequence.

In [None]:
# save file to /data/ folder
file_path = "cache/predictionData/randomForest2014.csv"
randomForest2014.to_csv(path_or_buf= file_path)

## 3. Neural Network

*Describe Process*

###3. Prediction Method xyz

*Describe Process*

###4. Prediction Method xyz

*Describe Process*