# Data Preparation

In [25]:
import requests, zipfile, StringIO
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import re

###1. Get data for 2014 from zips

In [20]:
# reads all predefined months for a year and merge into one data frame
completeData = pd.DataFrame()
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
for m in months:
    z = zipfile.ZipFile('cache/2014%s.zip' % m)
    completeData = completeData.append(pd.read_csv(z.open(z.namelist()[0])))
    print "Downloaded", m

Downloaded 01
Downloaded 02
Downloaded 03
Downloaded 04
Downloaded 05
Downloaded 06
Downloaded 07
Downloaded 08
Downloaded 09
Downloaded 10
Downloaded 11
Downloaded 12


In [24]:
completeData.to_csv('cache/complete2014Data.csv')

### 2. Combine Data with external aircraft data

In [51]:
# load info about aircrafts and their manufacturers
z = zipfile.ZipFile('aircraftData/AircraftInformation.zip')
df_master  = pd.DataFrame.from_csv(z.open('MASTER.txt'))
df_aircrafts  = pd.DataFrame.from_csv(z.open('ACFTREF.txt'))

In [27]:
# merge both tables on the manufacturer code
master = df_master[['MFR MDL CODE', 'YEAR MFR']].reset_index()
aircrafts = df_aircrafts['MFR'].reset_index()
master.columns = ['TAIL_NUM', 'CODE', 'YEAR']
aircrafts.columns = ['CODE', 'MFR']
joined = pd.merge(master, aircrafts, how='left', on='CODE')

In [30]:
# reset index of complete dataset for delays
completeData.reset_index(inplace=True)

In [31]:
delayFinal = completeData[['TAIL_NUM','AIRLINE_ID']]
delayFinal.TAIL_NUM = delayFinal.TAIL_NUM.str.strip('N')
delaymfr = pd.merge(delayFinal, joined, how='left', on=['TAIL_NUM'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [40]:
completeData['AIRCRAFT_YEAR'] = delaymfr.YEAR
completeData['AIRCRAFT_MFR'] = delaymfr.MFR

In [48]:
completeData.to_csv('cache/complete2014Data.csv')

#### Exploratory Analysis

In [47]:
# percantage of missing aircraft information
sum(completeData.AIRCRAFT_YEAR.isnull())/(1.0*len(completeData.AIRCRAFT_YEAR.isnull()))

0.29173914978101445

In [34]:
years = [int(year) for year in delaymfr.YEAR.str.strip().values if (not year != year and (len(year)==4))]
plt.xlim(1980,2020)
plt.hist(years, bins=np.arange(1950, 2020, 1))