# Reference Dataframes
Dataframes with aggregated data that can be merged into the Flights dataframe as new features

In [13]:
import pickle
from datetime import datetime

In [2]:

filename = 'flight_2019'

# Load the pickled 2019 flights dataframe. The context manager closes the
# file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as infile:
    flight_2019 = pickle.load(infile)

#### New dataframe with a smaller sample of 30,000 records

In [23]:
# Draw a random sample of 30,000 rows; random_state=1 fixes the seed so the
# sample is reproducible.
flight_sample = flight_2019.sample(n=30000, random_state=1)

In [24]:
# Show (rows, columns) of the sample.
flight_sample.shape

(30000, 38)

In [50]:
def month_feature(flights_df):
    """Return the calendar month (1-12) parsed from each row's ``fl_date``.

    NOTE(review): ``month_feature`` is defined a second time further down in
    this same cell; that later definition is the one in effect at call time.

    Parameters
    ----------
    flights_df : DataFrame
        Must contain an ``fl_date`` column of ``'%Y-%m-%d'`` strings.
        Missing values are allowed.

    Returns
    -------
    Series
        Unnamed Series of month numbers, indexed like the rows whose
        ``fl_date`` is present. Rows with a missing date are omitted, so
        assigning the result to a column of the original frame leaves NaN
        for them.

    Raises
    ------
    ValueError
        If a non-missing ``fl_date`` does not match ``'%Y-%m-%d'``.
    """
    dates = flights_df['fl_date'].dropna()
    # Map over the column instead of DataFrame.apply(axis=1): no per-row
    # Series is built, and an empty input still yields a Series (row-wise
    # apply hands back a DataFrame when there are no rows).
    months = dates.map(lambda d: datetime.strptime(d, '%Y-%m-%d').month)
    months.name = None  # row-wise apply produced an unnamed Series; keep that
    return months

def is_delayed_feature(flights_df):
    """Flag each flight as delayed (1) or not (0).

    The flag is 1 when the sum of the seven delay columns is strictly
    positive, with missing values counted as 0. Negative values (early
    departure/arrival) therefore offset positive ones.

    NOTE(review): summing dep/arr delay together with the per-cause delay
    columns may count the same minutes more than once; only the sign of the
    total is used here, but confirm this definition of "delayed" is intended.

    Parameters
    ----------
    flights_df : DataFrame
        Must contain ``dep_delay``, ``arr_delay``, ``carrier_delay``,
        ``weather_delay``, ``nas_delay``, ``security_delay`` and
        ``late_aircraft_delay``. The frame is not modified.

    Returns
    -------
    Series
        Integer 0/1 Series with the same index as ``flights_df``.
    """
    delay_columns = [
        'dep_delay',
        'arr_delay',
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
    ]
    # Column-wise sum instead of two row-wise applies: same result, far less
    # per-row overhead, and an empty frame still yields an empty Series.
    total_delay = flights_df[delay_columns].fillna(0).sum(axis=1)
    return (total_delay > 0).astype('int64')
    
def month_feature(flights_df):
    """Return the calendar month (1-12) parsed from each row's ``fl_date``.

    NOTE(review): this cell already defines ``month_feature`` once above;
    this later definition is the one that takes effect.

    Parameters
    ----------
    flights_df : DataFrame
        Must contain an ``fl_date`` column of ``'%Y-%m-%d'`` strings.
        Missing values are allowed.

    Returns
    -------
    Series
        Unnamed Series of month numbers, indexed like the rows whose
        ``fl_date`` is present. Rows with a missing date are omitted, so
        assigning the result to a column of the original frame leaves NaN
        for them.

    Raises
    ------
    ValueError
        If a non-missing ``fl_date`` does not match ``'%Y-%m-%d'``.
    """
    dates = flights_df['fl_date'].dropna()
    # Map over the column instead of DataFrame.apply(axis=1): no per-row
    # Series is built, and an empty input still yields a Series (row-wise
    # apply hands back a DataFrame when there are no rows).
    months = dates.map(lambda d: datetime.strptime(d, '%Y-%m-%d').month)
    months.name = None  # row-wise apply produced an unnamed Series; keep that
    return months

    
def arrival_hour_of_day_feature(flights_df):
    """Return the hour of day (0-23) of each flight's ``crs_arr_time``.

    Parameters
    ----------
    flights_df : DataFrame
        Must contain a ``crs_arr_time`` column of numeric HHMM values
        (e.g. ``1330.0``). Missing values are allowed.

    Returns
    -------
    Series
        Unnamed Series of hours, indexed like the rows whose
        ``crs_arr_time`` is present. Rows with a missing time are omitted.

    Raises
    ------
    ValueError
        If a non-missing value other than 2400 is not a valid HHMM time.
    """
    def _hour(hhmm):
        # 2400 cannot be parsed with %H%M; treat it as midnight, hour 0.
        if hhmm == 2400.0:
            return 0
        # Zero-pad so that e.g. 5.0 becomes '0005' (00:05).
        return datetime.strptime(str(int(hhmm)).zfill(4), '%H%M').hour

    arrival_times = flights_df['crs_arr_time'].dropna()
    # Map over the column instead of DataFrame.apply(axis=1) so an empty
    # input still yields a Series and no per-row Series is built.
    hours = arrival_times.map(_hour)
    hours.name = None  # row-wise apply produced an unnamed Series; keep that
    return hours
   


## 1. Percent of Delay Count by Month

In [None]:
def percent_delay_by_month_dataframe(flights_df, filename):
    """Pickle each month's share of all delayed flights.

    Flights are flagged with ``is_delayed_feature`` and assigned a month with
    ``month_feature``. The delayed flights are counted per month and each
    count is divided by the total number of delayed flights.

    Despite the name, ``percent_delay`` is a fraction (the values sum to 1),
    not a 0-100 percentage, and it is a share of all delays rather than a
    per-month delay rate.

    Parameters
    ----------
    flights_df : DataFrame
        Flights data with the delay columns and ``fl_date``. Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``arr_month`` and ``percent_delay``, is
        written to ``filename``.
    """
    flights = flights_df.copy()  # keep helper columns off the caller's frame
    flights['is_delayed'] = is_delayed_feature(flights)
    flights['arr_month'] = month_feature(flights)

    delayed = flights[flights['is_delayed'] == 1]
    # size() counts rows per month. groupby already drops rows whose month is
    # missing, so this equals counting non-null arr_month per group.
    share_by_month = (
        delayed.groupby('arr_month').size().reset_index(name='total_delays')
    )
    share_by_month['percent_delay'] = (
        share_by_month['total_delays'] / share_by_month['total_delays'].sum()
    )
    share_by_month = share_by_month.drop(columns=['total_delays'])

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(share_by_month, outfile)

In [35]:
# Build the per-month delay-share table from the sample and pickle it under
# ./reference/.
percent_delay_by_month_dataframe(flight_sample, './reference/percent_delay_by_month_2019')

In [36]:

filename = './reference/percent_delay_by_month_2019'

# Reload the pickled table to check it; the context manager closes the file
# even if unpickling raises.
with open(filename, 'rb') as infile:
    delay_df = pickle.load(infile)

In [37]:
# Show up to 12 rows (one per month present in the sample).
delay_df.head(12)

Unnamed: 0,arr_month,percent_delay
0,1,0.073171
1,2,0.080085
2,3,0.079796
3,4,0.077108
4,5,0.088727
5,6,0.101402
6,7,0.093816
7,8,0.091511
8,9,0.064721
9,10,0.085366


## 2. Percentage of Flights by Hour of Day

In [41]:
def percent_flights_by_hour_dataframe(flights_df, filename):
    """Pickle the share of flights falling in each scheduled arrival hour.

    The hour comes from ``arrival_hour_of_day_feature``. Non-null values of
    the ``flights`` column are counted per hour and each count is divided by
    the total count.

    Despite the name, ``percent_flights`` is a fraction (the values sum to
    1), not a 0-100 percentage.

    Parameters
    ----------
    flights_df : DataFrame
        Flights data with ``crs_arr_time`` and ``flights`` columns.
        Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``hour_of_day`` and ``percent_flights``, is
        written to ``filename``.
    """
    flights = flights_df.copy()  # keep the helper column off the caller's frame
    flights['arr_hour_of_day'] = arrival_hour_of_day_feature(flights)

    flights_by_hour = (
        flights.groupby('arr_hour_of_day')
        .agg(num_flights=('flights', 'count'))
        .reset_index()
        .rename(columns={'arr_hour_of_day': 'hour_of_day'})
    )
    flights_by_hour['percent_flights'] = (
        flights_by_hour['num_flights'] / flights_by_hour['num_flights'].sum()
    )
    flights_by_hour = flights_by_hour.drop(columns=['num_flights'])

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(flights_by_hour, outfile)
    
    

In [42]:
# NOTE(review): the output path says "by_month" although this table is keyed
# by hour of day. The reload cell below reads the same path, so rename both
# together if the file is ever renamed.
percent_flights_by_hour_dataframe(flight_sample, './reference/percent_flights_by_month_2019')

In [45]:
filename = './reference/percent_flights_by_month_2019'

# Reload the pickled table to check it; the context manager closes the file
# even if unpickling raises.
with open(filename, 'rb') as infile:
    flight_counts_df = pickle.load(infile)

flight_counts_df.head(5)

Unnamed: 0,hour_of_day,percent_flights
0,0,0.014567
1,1,0.002733
2,2,0.000367
3,3,0.000233
4,4,0.001333


## 3. Average Flights per Month by Airport

In [55]:
def average_flights_per_month_by_airport(flights_df, filename):
    """Pickle the typical monthly flight count of each origin airport.

    Non-null values of the ``flights`` column are counted per
    (``origin_airport_id``, month) pair, with the month taken from
    ``month_feature``. The per-month counts of each airport are then reduced
    to a single value.

    NOTE(review): the output column is called ``avg_flights`` but the
    statistic is the *median* of the monthly counts, not the mean. Months in
    which an airport has no rows are presumably absent from the grouping
    rather than counted as zero; confirm that is intended.

    Parameters
    ----------
    flights_df : DataFrame
        Flights data with ``fl_date``, ``origin_airport_id`` and ``flights``
        columns. Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``origin_airport_id`` and ``avg_flights``,
        is written to ``filename``.
    """
    flights = flights_df.copy()  # keep the helper column off the caller's frame
    flights['arr_month'] = month_feature(flights)

    # Flights per airport per month.
    monthly_counts = (
        flights.groupby(['origin_airport_id', 'arr_month'])
        .agg(num_flights=('flights', 'count'))
        .reset_index()
    )

    # Median of the monthly counts for each airport.
    typical_monthly_flights = (
        monthly_counts.groupby('origin_airport_id')
        .agg(avg_flights=('num_flights', 'median'))
        .reset_index()
    )

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(typical_monthly_flights, outfile)
    

In [56]:
# Build the per-airport monthly flight-count table from the sample and pickle
# it; `filename` is reused by the reload cell that follows.
filename = './reference/average_flights_by_airport_2019'
average_flights_per_month_by_airport(flight_sample, filename)

In [59]:
# Reload the table written to `filename` by the previous cell; the context
# manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    flight_counts_df = pickle.load(infile)

flight_counts_df.sort_values(by="avg_flights", ascending=False).head(15)

Unnamed: 0,origin_airport_id,avg_flights
250,13930,123.5
20,10397,120.5
91,11292,94.5
92,11298,94.0
71,11057,78.5
189,12892,74.0
309,14747,58.0
162,12266,56.5
263,14107,53.0
99,11433,52.5


## 4. Average Passengers per Month by Airport

In [65]:
def average_passengers_per_month_by_airport(passengers_df, filename):
    """Pickle the typical monthly passenger-record count of each airport.

    Non-null ``passengers`` values are counted per
    (``origin_airport_id``, ``month``) pair, and the median of those monthly
    counts is taken for each airport.

    NOTE(review): ``'count'`` counts *records* with a non-null ``passengers``
    value; it does not add up the passenger numbers. If a passenger volume is
    intended, the first aggregation should probably be ``'sum'``. Left
    unchanged because it alters the generated reference data.
    NOTE(review): the output column is called ``avg_passengers`` but the
    statistic is the median, not the mean.

    Parameters
    ----------
    passengers_df : DataFrame
        Must contain ``origin_airport_id``, ``month`` and ``passengers``
        columns. Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``origin_airport_id`` and
        ``avg_passengers``, is written to ``filename``.
    """
    # Passenger records per airport per month (groupby does not mutate its
    # input, so no defensive copy is needed).
    monthly_counts = (
        passengers_df.groupby(['origin_airport_id', 'month'])
        .agg(num_passengers=('passengers', 'count'))
        .reset_index()
    )

    # Median of the monthly counts for each airport.
    typical_monthly_passengers = (
        monthly_counts.groupby('origin_airport_id')
        .agg(avg_passengers=('num_passengers', 'median'))
        .reset_index()
    )

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(typical_monthly_passengers, outfile)


In [66]:
filename = './passenger_2019'
# Load the pickled passenger dataframe; the context manager closes the file
# even if unpickling raises.
with open(filename, 'rb') as infile:
    passenger_df = pickle.load(infile)

In [67]:
# Build the per-airport monthly passenger table and pickle it; `filename` is
# reused by the reload cell that follows.
filename = './reference/average_passengers_by_airport_2019'
average_passengers_per_month_by_airport(passenger_df, filename)

In [68]:
# Reload the table written to `filename` by the previous cell; the context
# manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    average_passengers_df = pickle.load(infile)

average_passengers_df.sort_values(by="avg_passengers", ascending=False).head(15)

Unnamed: 0,origin_airport_id,avg_passengers
1118,13930,1445.0
369,11292,1090.0
104,10397,940.5
833,12892,929.0
677,12266,842.5
371,11298,795.5
470,11618,687.0
953,13303,675.5
408,11433,633.0
1001,13487,622.5


## 5. Total Distance per Month by Carrier

In [69]:
def total_distance_per_month_by_carrier(flights_df, filename):
    """Pickle the total distance flown per month and carrier.

    Only completed flights (``cancelled == 0`` and ``diverted == 0``) are
    included. ``distance`` is summed per (``month``, ``op_unique_carrier``).

    The grouping uses the frame's existing ``month`` column. The previous
    version also derived an ``arr_month`` column from ``fl_date`` but never
    used it; that dead step is removed, so ``fl_date`` is no longer required.

    Parameters
    ----------
    flights_df : DataFrame
        Must contain ``month``, ``op_unique_carrier``, ``distance``,
        ``cancelled`` and ``diverted`` columns. Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``month``, ``op_unique_carrier`` and
        ``total_distance``, is written to ``filename``.
    """
    completed = flights_df[
        (flights_df['cancelled'] == 0) & (flights_df['diverted'] == 0)
    ]

    distance_by_month_and_carrier = (
        completed.groupby(['month', 'op_unique_carrier'])
        .agg(total_distance=('distance', 'sum'))
        .reset_index()
    )

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(distance_by_month_and_carrier, outfile)

In [71]:
# Build the per-month, per-carrier distance table from the sample and pickle
# it; `filename` is reused by the reload cell that follows.
filename = './reference/total_distance_per_month_by_carrier_2019'
total_distance_per_month_by_carrier(flight_sample, filename)

In [72]:
# Reload the table written to `filename` by the previous cell; the context
# manager closes the file even if unpickling raises.
# NOTE(review): the variable is named average_passengers_df but holds the
# total-distance table here. The name is kept in case other cells rely on it.
with open(filename, 'rb') as infile:
    average_passengers_df = pickle.load(infile)

average_passengers_df.sort_values(by="total_distance", ascending=False).head(15)

Unnamed: 0,month,op_unique_carrier,total_distance
178,8.0,AA,342032.0
148,6.0,WN,332172.0
299,12.0,WN,330395.0
249,10.0,WN,327563.0
72,3.0,WN,322477.0
159,7.0,DL,319233.0
279,12.0,AA,312402.0
173,7.0,WN,312308.0
198,8.0,WN,307810.0
122,5.0,WN,307212.0


## 6. Total Passengers by Carrier

In [79]:
def total_passengers_by_carrier(passengers_df, filename):
    """Pickle the number of passenger records per carrier.

    Non-null ``passengers`` values are counted per ``unique_carrier``.

    NOTE(review): ``'count'`` counts *records* with a non-null ``passengers``
    value; it does not add up the passenger numbers, although the output
    column is called ``total_passengers``. If a passenger volume is intended,
    the aggregation should probably be ``'sum'``. Left unchanged because it
    alters the generated reference data.

    Parameters
    ----------
    passengers_df : DataFrame
        Must contain ``unique_carrier`` and ``passengers`` columns.
        Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``unique_carrier`` and
        ``total_passengers``, is written to ``filename``.
    """
    # groupby does not mutate its input, so no defensive copy is needed.
    records_by_carrier = (
        passengers_df.groupby(['unique_carrier'])
        .agg(total_passengers=('passengers', 'count'))
        .reset_index()
    )

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(records_by_carrier, outfile)


In [80]:
filename = './passenger_2019'
# Load the pickled passenger dataframe; the context manager closes the file
# even if unpickling raises.
with open(filename, 'rb') as infile:
    passenger_df = pickle.load(infile)

In [81]:
# Build the per-carrier passenger table and pickle it; `filename` is reused
# by the reload cell that follows.
filename = './reference/total_passengers_by_carrier_2019'
total_passengers_by_carrier(passenger_df, filename)

In [82]:
# Reload the table written to `filename` by the previous cell; the context
# manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    total_passengers_df = pickle.load(infile)

total_passengers_df.sort_values(by="total_passengers", ascending=False).head(15)

Unnamed: 0,unique_carrier,total_passengers
286,UA,41479
133,DL,38101
304,WN,36246
235,OO,30521
86,AA,30147
150,FX,18489
146,F9,15582
317,YX,13741
229,NK,13604
98,AS,13498


## 7. Average Fuel Consumption by Carrier

In [83]:
def average_fuel_per_month_by_airport(fuel_df, filename):
    """Pickle the typical monthly fuel consumption of each carrier.

    ``total_gallons`` is summed per (``unique_carrier``, ``month``), and the
    median of those monthly totals is taken for each carrier.

    NOTE(review): despite "by_airport" in the function name, the grouping is
    by carrier. The name is kept so existing callers keep working.
    NOTE(review): the output column is called ``avg_fuel`` but the statistic
    is the median of the monthly totals, not the mean.

    Parameters
    ----------
    fuel_df : DataFrame
        Must contain ``unique_carrier``, ``month`` and ``total_gallons``
        columns. Not modified.
    filename : str
        Path of the pickle file to write (overwritten if present).

    Returns
    -------
    None
        The result, with columns ``unique_carrier`` and ``avg_fuel``, is
        written to ``filename``.
    """
    # Gallons per carrier per month (groupby does not mutate its input, so no
    # defensive copy is needed).
    monthly_gallons = (
        fuel_df.groupby(['unique_carrier', 'month'])
        .agg(total_gallons=('total_gallons', 'sum'))
        .reset_index()
    )

    # Median of the monthly totals for each carrier.
    typical_monthly_fuel = (
        monthly_gallons.groupby('unique_carrier')
        .agg(avg_fuel=('total_gallons', 'median'))
        .reset_index()
    )

    # Context manager closes the file even if pickling raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(typical_monthly_fuel, outfile)

In [84]:
filename = './fuel_2019'
# Load the pickled fuel dataframe; the context manager closes the file even
# if unpickling raises.
with open(filename, 'rb') as infile:
    fuel_df = pickle.load(infile)