# Reference Dataframes
Dataframes with aggregated date that can be merged to the Flights dataframe as a new feature

In [90]:
import pickle
from datetime import datetime

In [91]:

filename = 'flight_2019'

infile = open(filename,'rb')
flight_2019 = pickle.load(infile)
infile.close()

#### New dataframe with a smaller sample of 1,000,000 records

In [92]:
flight_sample = flight_2019.sample(n=1000000, random_state=1)

In [93]:
flight_sample.shape

(1000000, 38)

In [94]:
def is_delayed_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    flights_df_cp['dep_delay'] = flights_df_cp['dep_delay'].fillna(0)
    flights_df_cp['arr_delay'] = flights_df_cp['arr_delay'].fillna(0)
    flights_df_cp['carrier_delay'] = flights_df_cp['carrier_delay'].fillna(0)
    flights_df_cp['weather_delay'] = flights_df_cp['weather_delay'].fillna(0)
    flights_df_cp['nas_delay'] = flights_df_cp['nas_delay'].fillna(0)
    flights_df_cp['security_delay'] = flights_df_cp['security_delay'].fillna(0)
    flights_df_cp['late_aircraft_delay'] = flights_df_cp['late_aircraft_delay'].fillna(0)

    flights_df_cp['total_delay'] = flights_df_cp.apply(
        lambda x: ((x['dep_delay'] + x['arr_delay'] +
                 x['carrier_delay'] +
                 x['weather_delay'] + x['nas_delay'] + x['security_delay'] +
                 x['late_aircraft_delay']
        )),  axis=1)
    
    return flights_df_cp.apply(
        lambda x: (0 if x['total_delay'] <= 0 else 1),  axis=1)
    
def month_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    return flights_df_cp[flights_df_cp['fl_date'].notna()].apply(
        lambda x: datetime.strptime(x['fl_date'], '%Y-%m-%d').date().month, axis=1)

    
def arrival_hour_of_day_feature(flights_df):
    flights_df_cp = flights_df.copy()
    
    return flights_df_cp[flights_df_cp['crs_arr_time'].notna()].apply(
        lambda x: 0 if x['crs_arr_time'] == 2400.0 else datetime.strptime(str(int(x['crs_arr_time'])).zfill(4), '%H%M').time().hour, axis=1)
   


## 1. Percent of Delay Count by Month

In [122]:
def percent_delay_by_month_dataframe(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['is_delayed'] = is_delayed_feature(flights_df_cp)
    flights_df_cp['arr_month'] = month_feature(flights_df_cp)
    
    total_delay_df = flights_df_cp[flights_df_cp['is_delayed'] == 1].groupby(
        ['origin_airport_id','arr_month']).agg(total_delays=('is_delayed', 'count')).reset_index()
    
    
    total_delays = total_delay_df['total_delays'].sum()
    total_delay_df['percent_delay'] = total_delay_df.apply(
        lambda x: (x['total_delays'] / total_delays),  axis=1
    )
    total_delay_df = total_delay_df.drop(columns=['total_delays'])
    outfile = open(filename,'wb')

    pickle.dump(total_delay_df,outfile)
    outfile.close()    

In [123]:
percent_delay_by_month_dataframe(flight_sample, './reference/percent_delay_by_month_2019')

In [124]:

filename = './reference/percent_delay_by_month_2019'

infile = open(filename,'rb')
delay_df = pickle.load(infile)
infile.close()

In [125]:
delay_df.head(12)

Unnamed: 0,origin_airport_id,arr_month,percent_delay
0,10135,1,2.6e-05
1,10135,2,5.8e-05
2,10135,3,7.8e-05
3,10135,4,8.4e-05
4,10135,5,4.4e-05
5,10135,6,9e-05
6,10135,7,8.1e-05
7,10135,8,7.3e-05
8,10135,9,4.1e-05
9,10135,10,4.9e-05


## 2. Percentage of Flights by Hour of Day

In [126]:
def percent_flights_by_hour_dataframe(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_hour_of_day'] = arrival_hour_of_day_feature(flights_df_cp)
       
    num_flights_by_hour = flights_df_cp.groupby(
        ['origin_airport_id','arr_hour_of_day']).agg(num_flights=('flights', 'count')).reset_index()
    num_flights_by_hour.rename(columns = {'arr_hour_of_day':'hour_of_day'}, inplace = True)
    
    total_flights = num_flights_by_hour['num_flights'].sum()
    num_flights_by_hour['percent_flights'] = num_flights_by_hour.apply(
        lambda x: (x['num_flights'] / total_flights),  axis=1
    )
    num_flights_by_hour = num_flights_by_hour.drop(columns=['num_flights'])
    outfile = open(filename,'wb')

    pickle.dump(num_flights_by_hour,outfile)
    outfile.close()   
    
    

In [140]:
percent_flights_by_hour_dataframe(flight_sample, './reference/percent_flights_by_hour_2019')

In [141]:
filename = './reference/percent_flights_by_hour_2019'

infile = open(filename,'rb')
flight_counts_df = pickle.load(infile)
infile.close()

flight_counts_df.head(5)

Unnamed: 0,origin_airport_id,hour_of_day,percent_flights
0,10135,6,2.9e-05
1,10135,7,8.1e-05
2,10135,8,0.000134
3,10135,9,3e-05
4,10135,10,2.2e-05


## 3. Average Flights per Month by Airport

In [102]:
def average_flights_per_month_by_airport(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_month'] = month_feature(flights_df_cp)
    
    # count flights by month and airport
    num_flights_by_aiport_and_month = flights_df_cp.groupby(        
        ['origin_airport_id','arr_month']).agg(num_flights=('flights', 'count')).reset_index()
    
    # average flight count
    average_flights_by_aiport = num_flights_by_aiport_and_month.groupby(        
        'origin_airport_id').agg(avg_flights=('num_flights', 'median')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(average_flights_by_aiport,outfile)
    outfile.close()   
    

In [103]:
filename = './reference/average_flights_by_airport_2019'
average_flights_per_month_by_airport(flight_sample, filename)

In [104]:
infile = open(filename,'rb')
flight_counts_df = pickle.load(infile)
infile.close()

flight_counts_df.sort_values(by="avg_flights", ascending=False).head(15)

Unnamed: 0,origin_airport_id,avg_flights
257,13930,4115.5
22,10397,4070.0
95,11298,3225.0
94,11292,2920.5
74,11057,2690.0
196,12892,2578.5
320,14747,1998.5
103,11433,1834.5
169,12266,1833.5
272,14107,1815.0


## 4. Average Passengers per Month by Airport

In [105]:
def average_passengers_per_month_by_airport(passengers_df, filename):
    passengers_df_cp = passengers_df.copy()
    
    num_passengers_by_aiport_and_month = passengers_df_cp.groupby(        
        ['origin_airport_id','month']).agg(num_passengers=('passengers', 'count')).reset_index()
    
    # average flight count
    average_passengers_by_aiport = num_passengers_by_aiport_and_month.groupby(        
        'origin_airport_id').agg(avg_passengers=('num_passengers', 'median')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(average_passengers_by_aiport,outfile)
    outfile.close()  


In [106]:
filename = './passenger_2019'
infile = open(filename,'rb')
passenger_df = pickle.load(infile)
infile.close()

In [107]:
filename = './'
average_passengers_per_month_by_airport(passenger_df, filename)

In [108]:
infile = open(filename,'rb')
average_passengers_df = pickle.load(infile)
infile.close()

average_passengers_df.sort_values(by="avg_passengers", ascending=False).head(15)

Unnamed: 0,origin_airport_id,avg_passengers
1118,13930,1445.0
369,11292,1090.0
104,10397,940.5
833,12892,929.0
677,12266,842.5
371,11298,795.5
470,11618,687.0
953,13303,675.5
408,11433,633.0
1001,13487,622.5


## 5. Total Distance per Month by Airport

In [132]:
def total_distance_per_month_by_airport(flights_df, filename):
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_month'] = month_feature(flights_df_cp)
    
    flights_completed = flights_df_cp[(
        (flights_df_cp['cancelled'] == 0) & (flights_df_cp['diverted'] == 0)
    )]
    
    total_distances_per_month_by_airport = flights_completed.groupby([
        'arr_month','origin_airport_id']).agg(total_distance=('distance', 'sum')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(total_distances_per_month_by_airport,outfile)
    outfile.close() 

In [133]:
filename = './reference/total_distance_per_month_by_airport_2019'
total_distance_per_month_by_airport(flight_sample, filename)

In [134]:
infile = open(filename,'rb')
distance_df = pickle.load(infile)
infile.close()

distance_df.sort_values(by="total_distance", ascending=False).head(15)

Unnamed: 0,arr_month,origin_airport_id,total_distance
2739,8,12892,3423628.0
2371,7,12892,3358308.0
904,3,12892,3210002.0
2001,6,12892,3207671.0
4196,12,12892,3106481.0
1631,5,12892,3084445.0
1267,4,12892,3077887.0
2431,7,13930,3057965.0
3470,10,12892,3012450.0
3104,9,12892,2981898.0


## 6. Total Passengers by Airport

In [136]:
def total_passengers_by_airport(passengers_df, filename):
    passengers_df_cp = passengers_df.copy()
        
    num_passengers_by_airport = passengers_df_cp.groupby(        
        ['month','origin_airport_id']).agg(total_passengers=('passengers', 'count')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(num_passengers_by_airport,outfile)
    outfile.close()  


In [137]:
filename = './passenger_2019'
infile = open(filename,'rb')
passenger_df = pickle.load(infile)
infile.close()

In [138]:
filename = './reference/total_passengers_by_airport_2019'
total_passengers_by_airport(passenger_df, filename)

In [139]:
infile = open(filename,'rb')
total_passengers_df = pickle.load(infile)
infile.close()

total_passengers_df.sort_values(by="total_passengers", ascending=False).head(15)

Unnamed: 0,month,origin_airport_id,total_passengers
14122,12,13930,1680
11732,10,13930,1606
9226,8,13930,1547
6736,6,13930,1536
12950,11,13930,1500
3071,3,13930,1461
10480,9,13930,1429
1886,2,13930,1417
7990,7,13930,1397
741,1,13930,1382


## 7. Average Fuel Consumption by Carrier

In [116]:
def average_fuel_by_carrier(fuel_df, filename):
    fuel_df_cp = fuel_df.copy()
    
    
    total_fuel_by_carrier_and_month = fuel_df_cp.groupby(        
        ['unique_carrier','month']).agg(total_gallons=('total_gallons', 'sum')).reset_index()
    
    # average fuel gallons
    average_fuel_by_carrier = total_fuel_by_carrier_and_month.groupby(        
        'unique_carrier').agg(avg_fuel=('total_gallons', 'median')).reset_index()
    
    outfile = open(filename,'wb')

    pickle.dump(average_fuel_by_carrier,outfile)
    outfile.close()  

In [117]:
filename = './fuel_2019'
infile = open(filename,'rb')
fuel_df = pickle.load(infile)
infile.close()

In [118]:
filename = './reference/average_fuel_by_carrier_2019'
average_fuel_by_carrier(fuel_df, filename)

In [119]:
infile = open(filename,'rb')
average_fuel_df = pickle.load(infile)
infile.close()

average_fuel_df.sort_values(by="avg_fuel", ascending=False).head(15)

Unnamed: 0,unique_carrier,avg_fuel
17,DL,303891710.0
12,AA,303854847.0
43,UA,297848082.0
46,WN,176971032.0
20,FX,97834857.0
15,B6,74781183.0
7,5X,72821000.0
14,AS,60609906.0
8,5Y,49527226.0
35,NK,40374673.0
