# Data Preparation

In [2]:
# import required modules for data preparation tasks
import requests, zipfile, StringIO
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import re
import json
import os

###1. Get the Data for 2014 from zip Files

First, we want to open and combine the zipped data files for each month that have been downloaded according to the process outlined in `01_Data Aquisition.ipynb`.

In [2]:
# reads all predefined months for a year and merge into one data frame
rawData = pd.DataFrame()
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
for m in months:
    z = zipfile.ZipFile('cache/{y}{mo}.zip'.format(y=str(2014), mo = m))
    rawData = rawData.append(pd.read_csv(z.open(z.namelist()[0])))
    print "Downloaded", m
# reset index of complete dataset for delays to prepare merging in next step
rawData.reset_index(inplace=True)

Downloaded 01
Downloaded 02
Downloaded

  data = self._reader.read(nrows)
  data = self._reader.read(nrows)


 03
Downloaded 04
Downloaded 05
Downloaded

  data = self._reader.read(nrows)
  data = self._reader.read(nrows)


 06
Downloaded 07
Downloaded 08
Downloaded 09
Downloaded 10
Downloaded 11
Downloaded 12


  data = self._reader.read(nrows)


### 2. Combine Data with External Aircraft Data

We also have two tables containing infos about the aircraft and its manufacturer available. Both files will be loaded.

In [3]:
z = zipfile.ZipFile('externalData/AircraftInformation.zip')
df_master  = pd.DataFrame.from_csv(z.open('MASTER.txt'))
df_aircrafts  = pd.DataFrame.from_csv(z.open('ACFTREF.txt'))

We can now join these two tables based on their common ID that is saved in the column `MFR MDL CODE` of the master table and in the index of the aircraft table respectively.

In [4]:
master = df_master[['MFR MDL CODE', 'YEAR MFR']].reset_index()
aircrafts = df_aircrafts['MFR'].reset_index()
master.columns = ['TAIL_NUM', 'CODE', 'YEAR']
aircrafts.columns = ['CODE', 'MFR']
joined = pd.merge(master, aircrafts, how='left', on='CODE')

We now join this aircraft information with our delay data and extend the original dataset with the two new features: The year in which the aircraft was built (to determine the age) and the manufacturer.

In [5]:
delayFinal = rawData[['TAIL_NUM','AIRLINE_ID']]
delayFinal.TAIL_NUM = delayFinal.TAIL_NUM.str.strip('N')
delaymfr = pd.merge(delayFinal, joined, how='left', on=['TAIL_NUM'])
rawData['AIRCRAFT_YEAR'] = delaymfr.YEAR
rawData['AIRCRAFT_MFR'] = delaymfr.MFR

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### 3. Combine Data with External Airport Location Data

Now we load an external dataset that contains the geolocations for each commercial airport in the world. We filter this to get only the airports in the US and then assign the respective geocode of the origin airport to our original delay dataset by merging both tables.

In [6]:
airportLocation = pd.DataFrame.from_csv('externalData/airport_codes_with_geo_name_ids_and_nl_names-2008-04-14.csv', header=None)
usAirports = airportLocation[airportLocation[4]=='US'].reset_index()
usAirports = usAirports[[0, 5, 6]]
usAirports.columns = ['ORIGIN', 'LAT', 'LONG']
rawData = pd.merge(rawData, usAirports, how='left', on='ORIGIN')

In [7]:
1.0*sum(rawData.LAT.isnull())/rawData.shape[0]

0.007050916258277116

Just 0.7% of alll flight origins could not be located, so the merge was quite successful.

### 4. Combine Data with External Weather Data

As outlined in `01_Data Aquisition.ipynb`, we scraped historical weather data for major US airports from the web. This data can be used as additional features for each flight to get information about the current weather conditions at the airport of the departure.

In [8]:
weatherFile = os.path.join('cache', 'weather_data.json')
with open(weatherFile) as infile:
    weatherDict = json.load(infile)

In [9]:
dates = []
frames = []
for datapoint in weatherDict['JFK']:
    date = datapoint['date']
    frames.append(pd.DataFrame(datapoint['data'], index=['%s-%s-%s' % (date[0:4], date[4:6], date[6:8])]))
weather_df = pd.concat(frames).reset_index()

In [10]:
# weather_df.head()
jfk_delays = rawData[rawData.ORIGIN=='JFK']

In [11]:
jfk_dalayWeather = pd.merge(jfk_delays, weather_df, how='left', left_on='FL_DATE', right_on = 'index')

In [12]:
jfk_dalayWeather.to_csv('cache/jfk_weather_2014.csv', encoding='UTF-8')

### 5. Creation of the Final Dataset

The columns we now have in the dataset are:

In [13]:
rawData.columns

Index([u'index', u'YEAR', u'QUARTER', u'MONTH', u'DAY_OF_MONTH',
       u'DAY_OF_WEEK', u'FL_DATE', u'UNIQUE_CARRIER', u'AIRLINE_ID',
       u'CARRIER', u'TAIL_NUM', u'FL_NUM', u'ORIGIN', u'ORIGIN_CITY_NAME',
       u'ORIGIN_STATE_ABR', u'ORIGIN_STATE_FIPS', u'ORIGIN_STATE_NM',
       u'ORIGIN_WAC', u'DEST', u'DEST_CITY_NAME', u'DEST_STATE_ABR',
       u'DEST_STATE_FIPS', u'DEST_STATE_NM', u'DEST_WAC', u'CRS_DEP_TIME',
       u'DEP_TIME', u'DEP_DELAY', u'DEP_DELAY_NEW', u'DEP_DEL15',
       u'DEP_DELAY_GROUP', u'DEP_TIME_BLK', u'TAXI_OUT', u'WHEELS_OFF',
       u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME', u'ARR_DELAY',
       u'ARR_DELAY_NEW', u'ARR_DEL15', u'ARR_DELAY_GROUP', u'ARR_TIME_BLK',
       u'CANCELLED', u'CANCELLATION_CODE', u'DIVERTED', u'CRS_ELAPSED_TIME',
       u'ACTUAL_ELAPSED_TIME', u'AIR_TIME', u'FLIGHTS', u'DISTANCE',
       u'DISTANCE_GROUP', u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY',
       u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY', u'FIRST_DEP_TI

However, we just need a subset of these columns for our analysis:

In [14]:
selectedColumns = [u'index', u'FL_DATE', u'UNIQUE_CARRIER', u'TAIL_NUM', u'FL_NUM', 
                   u'ORIGIN', u'DEST', u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY', u'TAXI_OUT', 
                   u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME', u'ARR_DELAY', 
                   u'CANCELLED', u'CANCELLATION_CODE', u'AIR_TIME', u'DISTANCE', 
                   u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY', u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY', 
                   u'AIRCRAFT_YEAR', u'AIRCRAFT_MFR',u'LAT', u'LONG']

In [15]:
complete2014Data = rawData[selectedColumns]

The resulting dataframe `complete2014Data` will be locally stored as csv file.

In [16]:
complete2014Data.to_csv('cache/complete2014Data.csv')

### 6. Creation of the Prediction Datasets

### REMOVE LATER: Exploratory Analysis

In [None]:
mask = (completeData.ARR_TIME>2000) & (completeData.ARR_TIME<2100)
sum(completeData.ARR_DELAY>60)/float(len(completeData))

In [None]:
print sum(completeData.ARR_DELAY.fillna(0))

In [None]:
completeData = pd.DataFrame.from_csv('cache/complete2014Data.csv')

In [None]:
subset = completeData[[u'MONTH', u'DAY_OF_MONTH',
       u'UNIQUE_CARRIER', u'AIRLINE_ID',
       u'CARRIER', u'ORIGIN', u'DEST', u'CRS_DEP_TIME',
       u'DEP_TIME', u'WEATHER_DELAY', u'NAS_DELAY']]

In [None]:
subset[(subset.WEATHER_DELAY > 0) & (subset.ORIGIN == 'JFK')].sort(columns='DAY_OF_MONTH').DAY_OF_MONTH.hist(bins = np.arange(0,30,1))
#subset[(subset.DEP_TIME == 1144) & (subset.CRS_DEP_TIME == 1129)]
#z = zipfile.ZipFile('cache/201411.zip')
#test = pd.read_csv(z.open(z.namelist()[0]))

In [None]:
# percentage of missing aircraft information
sum(completeData.AIRCRAFT_YEAR.isnull())/(1.0*len(completeData.AIRCRAFT_YEAR.isnull()))

In [None]:
years = [int(year) for year in delaymfr.YEAR.str.strip().values if (not year != year and (len(year)==4))]
plt.xlim(1980,2020)
plt.hist(years, bins=np.arange(1950, 2020, 1))

In [3]:
completeData = pd.DataFrame.from_csv('cache/complete2014Data.csv')

In [26]:
mask = (completeData.ORIGIN=='ORD') & (completeData.DEST=='JFK')
completeData[mask].groupby(['UNIQUE_CARRIER', 'FL_NUM'])['ARR_DELAY'].mean()

UNIQUE_CARRIER  FL_NUM
AA              149       17.875000
                198        3.472067
B6              106       -1.574713
                606       21.972222
                906       21.308571
                1106      18.997118
                1306      -0.647059
Name: ARR_DELAY, dtype: float64

In [28]:
completeData[mask].sort('FL_DATE').head(10)

Unnamed: 0,index,FL_DATE,UNIQUE_CARRIER,TAIL_NUM,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,...,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,AIRCRAFT_YEAR,AIRCRAFT_MFR,LAT,LONG
6551,6551,2014-01-01,AA,N3HPAA,198,ORD,JFK,1300,1322,22,...,740,0.0,16.0,0.0,0.0,0.0,,,41.978611,-87.904722
57396,57396,2014-01-01,B6,N190JB,906,ORD,JFK,1030,1146,76,...,740,4.0,0.0,29.0,0.0,72.0,2005.0,EMBRAER,41.978611,-87.904722
53957,53957,2014-01-01,B6,N652JB,1106,ORD,JFK,1955,2025,30,...,740,23.0,0.0,32.0,0.0,7.0,2007.0,AIRBUS,41.978611,-87.904722
6552,6552,2014-01-02,AA,N3FDAA,198,ORD,JFK,1300,1323,23,...,740,0.0,23.0,31.0,0.0,0.0,,,41.978611,-87.904722
57776,57776,2014-01-02,B6,N712JB,1106,ORD,JFK,1955,1945,-10,...,740,,,,,,2008.0,AIRBUS,41.978611,-87.904722
56022,56022,2014-01-02,B6,N292JB,906,ORD,JFK,1030,1221,111,...,740,0.0,4.0,96.0,0.0,107.0,2008.0,EMBRAER,41.978611,-87.904722
58471,58471,2014-01-03,B6,N641JB,1106,ORD,JFK,1955,2008,13,...,740,13.0,0.0,33.0,0.0,0.0,2006.0,AIRBUS,41.978611,-87.904722
58256,58256,2014-01-03,B6,N328JB,906,ORD,JFK,1030,1055,25,...,740,25.0,0.0,10.0,0.0,0.0,2011.0,EMBRAER,41.978611,-87.904722
6553,6553,2014-01-03,AA,N3GYAA,198,ORD,JFK,1300,1317,17,...,740,0.0,17.0,1.0,0.0,0.0,,,41.978611,-87.904722
55293,55293,2014-01-04,B6,N594JB,1106,ORD,JFK,1955,105,310,...,740,10.0,0.0,55.0,0.0,300.0,2004.0,AIRBUS,41.978611,-87.904722


In [25]:
completeData.groupby('UNIQUE_CARRIER').size()

UNIQUE_CARRIER
AA     537697
AS     160257
B6     249693
DL     800375
EV     686021
F9      85474
FL      79495
HA      74732
MQ     392701
OO     613030
UA     493528
US     414665
VX      57510
WN    1174633
dtype: int64

In [31]:
completeData[completeData.ARR_DELAY > 15].ARR_DELAY.sum(), completeData.shape

(69614616.0, (5819811, 30))

In [None]:
selectedCols = [u'UNIQUE_CARRIER',
       u'CARRIER', u'TAIL_NUM', u'FL_NUM', u'ORIGIN',
        u'DEST', u'CRS_DEP_TIME',
       u'DEP_TIME', u'DEP_DELAY', u'DEP_DELAY_NEW',  u'DEP_TIME_BLK', u'TAXI_OUT', u'WHEELS_OFF',
       u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME', u'ARR_DELAY',
       u'ARR_DELAY_NEW', u'ARR_DEL15', u'ARR_DELAY_GROUP', u'ARR_TIME_BLK', u'AIR_TIME', u'FLIGHTS', u'DISTANCE',
       u'DISTANCE_GROUP', u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY',
       u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY']
reducedData = completeData[selectedCols].ix[random.sample(completeData.index, 1000)]
reducedData.to_csv('cache/reduced2014Data.csv')