# Reference Dataframes
Dataframes with aggregated date that can be merged to the Flights dataframe as a new feature

In [90]:
import pickle
from datetime import datetime

In [91]:

filename = 'flight_2019'

infile = open(filename,'rb')
flight_2019 = pickle.load(infile)
infile.close()

#### New dataframe with a smaller sample of 1,000,000 records

In [92]:
flight_sample = flight_2019.sample(n=1000000, random_state=1)

In [93]:
flight_sample.shape

(1000000, 38)

In [94]:
def month_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    return flights_df_cp[flights_df_cp['fl_date'].notna()].apply(
        lambda x: datetime.strptime(x['fl_date'], '%Y-%m-%d').date().month, axis=1)

def is_delayed_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    flights_df_cp['dep_delay'] = flights_df_cp['dep_delay'].fillna(0)
    flights_df_cp['arr_delay'] = flights_df_cp['arr_delay'].fillna(0)
    flights_df_cp['carrier_delay'] = flights_df_cp['carrier_delay'].fillna(0)
    flights_df_cp['weather_delay'] = flights_df_cp['weather_delay'].fillna(0)
    flights_df_cp['nas_delay'] = flights_df_cp['nas_delay'].fillna(0)
    flights_df_cp['security_delay'] = flights_df_cp['security_delay'].fillna(0)
    flights_df_cp['late_aircraft_delay'] = flights_df_cp['late_aircraft_delay'].fillna(0)

    flights_df_cp['total_delay'] = flights_df_cp.apply(
        lambda x: ((x['dep_delay'] + x['arr_delay'] +
                 x['carrier_delay'] +
                 x['weather_delay'] + x['nas_delay'] + x['security_delay'] +
                 x['late_aircraft_delay']
        )),  axis=1)
    
    return flights_df_cp.apply(
        lambda x: (0 if x['total_delay'] <= 0 else 1),  axis=1)
    
def month_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    return flights_df_cp[flights_df_cp['fl_date'].notna()].apply(
        lambda x: datetime.strptime(x['fl_date'], '%Y-%m-%d').date().month, axis=1)

    
def arrival_hour_of_day_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    return flights_df_cp[flights_df_cp['crs_arr_time'].notna()].apply(
        lambda x: 0 if x['crs_arr_time'] == 2400.0 else datetime.strptime(str(int(x['crs_arr_time'])).zfill(4), '%H%M').time().hour, axis=1)
   


## 1. Percent of Delay Count by Month

In [95]:
def percent_delay_by_month_dataframe(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['is_delayed'] = is_delayed_feature(flights_df_cp)
    flights_df_cp['arr_month'] = month_feature(flights_df_cp)
    
    total_delay_df = flights_df_cp[flights_df_cp['is_delayed'] == 1].groupby(
        'arr_month').arr_month.agg(total_delays=('arr_month', 'count')).reset_index()
    
    
    total_delays = total_delay_df['total_delays'].sum()
    total_delay_df['percent_delay'] = total_delay_df.apply(
        lambda x: (x['total_delays'] / total_delays),  axis=1
    )
    total_delay_df = total_delay_df.drop(columns=['total_delays'])
    outfile = open(filename,'wb')

    pickle.dump(total_delay_df,outfile)
    outfile.close()    

In [96]:
percent_delay_by_month_dataframe(flight_sample, './reference/percent_delay_by_month_2019')

In [97]:

filename = './reference/percent_delay_by_month_2019'

infile = open(filename,'rb')
delay_df = pickle.load(infile)
infile.close()

In [98]:
delay_df.head(12)

Unnamed: 0,arr_month,percent_delay
0,1,0.07559
1,2,0.081756
2,3,0.081971
3,4,0.077643
4,5,0.08914
5,6,0.103827
6,7,0.093273
7,8,0.091991
8,9,0.064276
9,10,0.079566


## 2. Percentage of Flights by Hour of Day

In [99]:
def percent_flights_by_hour_dataframe(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_hour_of_day'] = arrival_hour_of_day_feature(flights_df_cp)
       
    num_flights_by_hour = flights_df_cp.groupby(
        'arr_hour_of_day').agg(num_flights=('flights', 'count')).reset_index()
    num_flights_by_hour.rename(columns = {'arr_hour_of_day':'hour_of_day'}, inplace = True)
    
    total_flights = num_flights_by_hour['num_flights'].sum()
    num_flights_by_hour['percent_flights'] = num_flights_by_hour.apply(
        lambda x: (x['num_flights'] / total_flights),  axis=1
    )
    num_flights_by_hour = num_flights_by_hour.drop(columns=['num_flights'])
    outfile = open(filename,'wb')

    pickle.dump(num_flights_by_hour,outfile)
    outfile.close()   
    
    

In [100]:
percent_flights_by_hour_dataframe(flight_sample, './reference/percent_flights_by_month_2019')

In [101]:
filename = './reference/percent_flights_by_month_2019'

infile = open(filename,'rb')
flight_counts_df = pickle.load(infile)
infile.close()

flight_counts_df.head(5)

Unnamed: 0,hour_of_day,percent_flights
0,0,0.014972
1,1,0.002648
2,2,0.000412
3,3,0.000201
4,4,0.00138


## 3. Average Flights per Month by Airport

In [102]:
def average_flights_per_month_by_airport(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_month'] = month_feature(flights_df_cp)
    
    # count flights by month and airport
    num_flights_by_aiport_and_month = flights_df_cp.groupby(        
        ['origin_airport_id','arr_month']).agg(num_flights=('flights', 'count')).reset_index()
    
    # average flight count
    average_flights_by_aiport = num_flights_by_aiport_and_month.groupby(        
        'origin_airport_id').agg(avg_flights=('num_flights', 'median')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(average_flights_by_aiport,outfile)
    outfile.close()   
    

In [103]:
filename = './reference/average_flights_by_airport_2019'
average_flights_per_month_by_airport(flight_sample, filename)

In [104]:
infile = open(filename,'rb')
flight_counts_df = pickle.load(infile)
infile.close()

flight_counts_df.sort_values(by="avg_flights", ascending=False).head(15)

Unnamed: 0,origin_airport_id,avg_flights
257,13930,4115.5
22,10397,4070.0
95,11298,3225.0
94,11292,2920.5
74,11057,2690.0
196,12892,2578.5
320,14747,1998.5
103,11433,1834.5
169,12266,1833.5
272,14107,1815.0


## 4. Average Passengers per Month by Airport

In [105]:
def average_passengers_per_month_by_airport(passengers_df, filename):
    passengers_df_cp = passengers_df.copy()
    
    # count passengers by month and airport
    num_passengers_by_aiport_and_month = passengers_df_cp.groupby(        
        ['origin_airport_id','month']).agg(num_passengers=('passengers', 'count')).reset_index()
    
    # average flight count
    average_passengers_by_aiport = num_passengers_by_aiport_and_month.groupby(        
        'origin_airport_id').agg(avg_passengers=('num_passengers', 'median')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(average_passengers_by_aiport,outfile)
    outfile.close()  


In [106]:
filename = './passenger_2019'
infile = open(filename,'rb')
passenger_df = pickle.load(infile)
infile.close()

In [107]:
filename = './reference/average_passengers_by_airport_2019'
average_passengers_per_month_by_airport(passenger_df, filename)

In [108]:
infile = open(filename,'rb')
average_passengers_df = pickle.load(infile)
infile.close()

average_passengers_df.sort_values(by="avg_passengers", ascending=False).head(15)

Unnamed: 0,origin_airport_id,avg_passengers
1118,13930,1445.0
369,11292,1090.0
104,10397,940.5
833,12892,929.0
677,12266,842.5
371,11298,795.5
470,11618,687.0
953,13303,675.5
408,11433,633.0
1001,13487,622.5


## 5. Total Distance per Month by Carrier

In [109]:
def total_distance_per_month_by_carrier(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_month'] = month_feature(flights_df_cp)
    
    flights_completed = flights_df_cp[(
        (flights_df_cp['cancelled'] == 0) & (flights_df_cp['diverted'] == 0)
    )]
    
    total_distances_per_month_by_carrier = flights_completed.groupby([
        'month','op_unique_carrier']).agg(total_distance=('distance', 'sum')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(total_distances_per_month_by_carrier,outfile)
    outfile.close() 

In [110]:
filename = './reference/total_distance_per_month_by_carrier_2019'
total_distance_per_month_by_carrier(flight_sample, filename)

In [111]:
infile = open(filename,'rb')
average_passengers_df = pickle.load(infile)
infile.close()

average_passengers_df.sort_values(by="total_distance", ascending=False).head(15)

Unnamed: 0,month,op_unique_carrier,total_distance
178,7.0,WN,10728813.0
306,12.0,WN,10726462.0
74,3.0,WN,10710042.0
256,10.0,WN,10578011.0
152,6.0,WN,10420390.0
126,5.0,WN,10373618.0
189,8.0,DL,10369681.0
204,8.0,WN,10308537.0
163,7.0,DL,10168693.0
281,11.0,WN,10146273.0


## 6. Total Passengers by Carrier

In [79]:
def total_passengers_by_carrier(passengers_df, filename):
    passengers_df_cp = passengers_df.copy()
        
    num_passengers_by_carrier = passengers_df_cp.groupby(        
        ['unique_carrier']).agg(total_passengers=('passengers', 'count')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(num_passengers_by_carrier,outfile)
    outfile.close()  


In [80]:
filename = './passenger_2019'
infile = open(filename,'rb')
passenger_df = pickle.load(infile)
infile.close()

In [81]:
filename = './reference/total_passengers_by_carrier_2019'
total_passengers_by_carrier(passenger_df, filename)

In [82]:
infile = open(filename,'rb')
total_passengers_df = pickle.load(infile)
infile.close()

total_passengers_df.sort_values(by="total_passengers", ascending=False).head(15)

Unnamed: 0,unique_carrier,total_passengers
286,UA,41479
133,DL,38101
304,WN,36246
235,OO,30521
86,AA,30147
150,FX,18489
146,F9,15582
317,YX,13741
229,NK,13604
98,AS,13498


## 7. Average Fuel Consumption by Carrier

In [87]:
def average_fuel_by_carrier(fuel_df, filename):
    fuel_df_cp = fuel_df.copy()
    
    
    total_fuel_by_carrier_and_month = fuel_df_cp.groupby(        
        ['unique_carrier','month']).agg(total_gallons=('total_gallons', 'sum')).reset_index()
    
    # average fuel gallons
    average_fuel_by_carrier = total_fuel_by_carrier_and_month.groupby(        
        'unique_carrier').agg(avg_fuel=('total_gallons', 'median')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(average_fuel_by_carrier,outfile)
    outfile.close()  

In [85]:
filename = './fuel_2019'
infile = open(filename,'rb')
fuel_df = pickle.load(infile)
infile.close()

In [88]:
filename = './reference/average_fuel_by_carrier_2019'
average_fuel_by_carrier(fuel_df, filename)

In [89]:
infile = open(filename,'rb')
average_fuel_df = pickle.load(infile)
infile.close()

average_fuel_df.sort_values(by="avg_fuel", ascending=False).head(15)

Unnamed: 0,unique_carrier,avg_fuel
17,DL,303891710.0
12,AA,303854847.0
43,UA,297848082.0
46,WN,176971032.0
20,FX,97834857.0
15,B6,74781183.0
7,5X,72821000.0
14,AS,60609906.0
8,5Y,49527226.0
35,NK,40374673.0
