# Data Preparation

In [1]:
# import required modules for data preparation tasks
import requests, zipfile, StringIO
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import re
import json
import os

###1. Get the Data for 2014 from zip Files

First, we want to open and combine the zipped data files for each month that have been downloaded according to the process outlined in `01_Data Aquisition.ipynb`.

In [2]:
# reads all predefined months for a year and merge into one data frame
rawData = pd.DataFrame()
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
for m in months:
    z = zipfile.ZipFile('cache/{y}{mo}.zip'.format(y=str(2014), mo = m))
    rawData = rawData.append(pd.read_csv(z.open(z.namelist()[0])))
    print "Downloaded", m
# reset index of complete dataset for delays to prepare merging in next step
rawData.reset_index(inplace=True)

Downloaded 01
Downloaded 02
Downloaded

  data = self._reader.read(nrows)
  data = self._reader.read(nrows)


 03
Downloaded 04
Downloaded 05
Downloaded

  data = self._reader.read(nrows)
  data = self._reader.read(nrows)


 06
Downloaded 07
Downloaded 08
Downloaded 09
Downloaded 10
Downloaded 11
Downloaded 12


  data = self._reader.read(nrows)


The columns we now have in the dataset are:

In [3]:
rawData.columns

Index([u'index', u'YEAR', u'QUARTER', u'MONTH', u'DAY_OF_MONTH',
       u'DAY_OF_WEEK', u'FL_DATE', u'UNIQUE_CARRIER', u'AIRLINE_ID',
       u'CARRIER', u'TAIL_NUM', u'FL_NUM', u'ORIGIN', u'ORIGIN_CITY_NAME',
       u'ORIGIN_STATE_ABR', u'ORIGIN_STATE_FIPS', u'ORIGIN_STATE_NM',
       u'ORIGIN_WAC', u'DEST', u'DEST_CITY_NAME', u'DEST_STATE_ABR',
       u'DEST_STATE_FIPS', u'DEST_STATE_NM', u'DEST_WAC', u'CRS_DEP_TIME',
       u'DEP_TIME', u'DEP_DELAY', u'DEP_DELAY_NEW', u'DEP_DEL15',
       u'DEP_DELAY_GROUP', u'DEP_TIME_BLK', u'TAXI_OUT', u'WHEELS_OFF',
       u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME', u'ARR_DELAY',
       u'ARR_DELAY_NEW', u'ARR_DEL15', u'ARR_DELAY_GROUP', u'ARR_TIME_BLK',
       u'CANCELLED', u'CANCELLATION_CODE', u'DIVERTED', u'CRS_ELAPSED_TIME',
       u'ACTUAL_ELAPSED_TIME', u'AIR_TIME', u'FLIGHTS', u'DISTANCE',
       u'DISTANCE_GROUP', u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY',
       u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY', u'FIRST_DEP_TI

However, we just need a subset of these columns for our analysis:

In [4]:
selectedColumns = [u'index', u'FL_DATE', u'UNIQUE_CARRIER', u'TAIL_NUM', u'FL_NUM', 
                   u'ORIGIN', u'DEST', u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY', u'TAXI_OUT', 
                   u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME', u'ARR_DELAY', 
                   u'CANCELLED', u'CANCELLATION_CODE', u'AIR_TIME', u'DISTANCE', 
                   u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY', u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY',
                   u'ORIGIN_CITY_NAME', u'DEST_CITY_NAME']
rawData = rawData[selectedColumns]

### 2. Combine Data with External Aircraft Data

We also have two tables containing infos about the aircraft and its manufacturer available. Both files will be loaded.

In [5]:
z = zipfile.ZipFile('externalData/AircraftInformation.zip')
df_master  = pd.DataFrame.from_csv(z.open('MASTER.txt'))
df_aircrafts  = pd.DataFrame.from_csv(z.open('ACFTREF.txt'))

We can now join these two tables based on their common ID that is saved in the column `MFR MDL CODE` of the master table and in the index of the aircraft table respectively.

In [6]:
master = df_master[['MFR MDL CODE', 'YEAR MFR']].reset_index()
aircrafts = df_aircrafts['MFR'].reset_index()
master.columns = ['TAIL_NUM', 'CODE', 'YEAR']
aircrafts.columns = ['CODE', 'MFR']
joined = pd.merge(master, aircrafts, how='left', on='CODE')

We now join this aircraft information with our delay data and extend the original dataset with the two new features: The year in which the aircraft was built (to determine the age) and the manufacturer.

In [7]:
delayFinal = rawData[['TAIL_NUM','UNIQUE_CARRIER']]
delayFinal.TAIL_NUM = delayFinal.TAIL_NUM.str.strip('N')
delaymfr = pd.merge(delayFinal, joined, how='left', on=['TAIL_NUM'])
rawData['AIRCRAFT_YEAR'] = delaymfr.YEAR
rawData['AIRCRAFT_MFR'] = delaymfr.MFR

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### 3. Combine Data with External Airport Location Data

Now we load an external dataset that contains the geolocations for each commercial airport in the world. We filter this to get only the airports in the US and then assign the respective geocode of the origin airport to our original delay dataset by merging both tables.

In [8]:
airportLocation = pd.DataFrame.from_csv('externalData/airport_codes_with_geo_name_ids_and_nl_names-2008-04-14.csv', header=None)
usAirports = airportLocation[airportLocation[4]=='US'].reset_index()
usAirports = usAirports[[0, 5, 6]]
usAirports.columns = ['ORIGIN', 'LAT', 'LONG']
complete2014Data = pd.merge(rawData, usAirports, how='left', on='ORIGIN')

In [9]:
1.0*sum(complete2014Data.LAT.isnull())/complete2014Data.shape[0]

0.007050916258277116

Just 0.7% of alll flight origins could not be located, so the merge was quite successful.

### 4. Save the Main Final Dataset

The resulting dataframe `complete2014Data` will be locally stored as csv file.

In [10]:
complete2014Data.to_csv('cache/complete2014Data.csv')

### 5. Create a Subset with External Weather Data for Selected Airports

As outlined in `01_Data Aquisition.ipynb`, we scraped historical weather data for major US airports from the web. This data can be used as additional features for each flight to get information about the current weather conditions at the airport of the departure. The script assumes that there is the `weather_data.json` file in the cache folder and that this file contains the respective weather information for the JFK airport in new york for each day in 2014.

In [3]:
# load the weather file
weatherFile = os.path.join('cache', 'weather_data.json')
with open(weatherFile) as infile:
    weatherDict = json.load(infile)

In [6]:
# extract the weather data for new york and boston out of the json file and save it in weather_df
dates = []
frames = []

# create df for weather in new york
for datapoint in weatherDict['JFK']:
    date = datapoint['date']
    frames.append(pd.DataFrame(datapoint['data'], index=['%s-%s-%s' % (date[0:4], date[4:6], date[6:8])]))
weather_jfk = pd.concat(frames).reset_index()

# create df for weather in boston
for datapoint in weatherDict['BOS']:
    date = datapoint['date']
    frames.append(pd.DataFrame(datapoint['data'], index=['%s-%s-%s' % (date[0:4], date[4:6], date[6:8])]))
weather_bos = pd.concat(frames).reset_index()

# get just the departures for the John F. Kennedy airport in New York City and Logan airport in Boston
jfk_delays = complete2014Data[complete2014Data.ORIGIN=='JFK']
bos_delays = complete2014Data[complete2014Data.ORIGIN=='BOS']

# merge delays with weather_df created above
jfk_dalayWeather = pd.merge(jfk_delays, weather_jfk, how='left', left_on='FL_DATE', right_on = 'index')
bos_dalayWeather = pd.merge(bos_delays, weather_bos, how='left', left_on='FL_DATE', right_on = 'index')

jfk_bos_comparison = pd.concat([jfk_dalayWeather, bos_dalayWeather]).reset_index()

In [11]:
# save everything in a csv
jfk_bos_comparison.to_csv('cache/jfk_bos_weather_2014.csv', encoding='UTF-8')