# Reference Dataframes
Dataframes with aggregated data that can be merged into the Flights dataframe as new features.

In [13]:
import pickle
from datetime import datetime

In [2]:

filename = 'flight_2019'

# Load the pickled flights table.  The context manager closes the file even
# if unpickling fails.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as infile:
    flight_2019 = pickle.load(infile)

#### New dataframe with a smaller sample of 30,000 records

In [23]:
# Draw a random sample of 30,000 rows; random_state=1 is passed so the draw
# can be repeated.
flight_sample = flight_2019.sample(n=30000, random_state=1)

In [24]:
# Display the sample's (rows, columns) dimensions.
flight_sample.shape

(30000, 38)

In [38]:
def is_delayed_feature(flights_df):
    """Flag each flight as delayed (1) or not delayed (0).

    Missing values in the seven delay columns are treated as 0, the columns
    are summed per row, and a flight is flagged as delayed when that sum is
    strictly positive.  Negative values (early departures/arrivals) therefore
    offset positive ones in the sum.

    Args:
        flights_df: DataFrame containing the columns ``dep_delay``,
            ``arr_delay``, ``carrier_delay``, ``weather_delay``,
            ``nas_delay``, ``security_delay`` and ``late_aircraft_delay``.
            It is not modified.

    Returns:
        An int64 Series of 0/1 flags that shares ``flights_df``'s index.

    Raises:
        KeyError: if any of the delay columns is missing.
    """
    delay_columns = [
        'dep_delay',
        'arr_delay',
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
    ]

    # Column-wise arithmetic replaces the former row-wise ``apply`` calls:
    # it avoids one Python call per row and always yields a Series, even
    # when the frame has no rows.
    filled = flights_df[delay_columns].fillna(0)

    # Add the columns left to right, in the same order as before.
    total_delay = filled[delay_columns[0]]
    for column in delay_columns[1:]:
        total_delay = total_delay + filled[column]

    return (total_delay > 0).astype('int64')
    
def month_feature(flights_df):
    """Extract the calendar month (1-12) from each flight's ``fl_date``.

    Args:
        flights_df: DataFrame with an ``fl_date`` column holding
            ``'YYYY-MM-DD'`` strings.  It is not modified.

    Returns:
        A Series of integer months indexed like the rows whose ``fl_date``
        is not null; rows with a missing date are left out.

    Raises:
        KeyError: if ``fl_date`` is missing.
        ValueError: if a non-null date does not match ``'%Y-%m-%d'``.
    """
    dates = flights_df['fl_date']

    # Map over the single date column rather than copying the whole frame
    # and building a row object per flight; this also returns a Series when
    # there are no rows.
    return dates[dates.notna()].map(
        lambda value: datetime.strptime(value, '%Y-%m-%d').month)

    
def arrival_hour_of_day_feature(flights_df):
    """Extract the scheduled arrival hour (0-23) from ``crs_arr_time``.

    ``crs_arr_time`` is read as a number in HHMM form (e.g. ``1230.0`` for
    12:30, ``5.0`` for 00:05).

    Args:
        flights_df: DataFrame with a numeric ``crs_arr_time`` column.  It is
            not modified.

    Returns:
        A Series of integer hours indexed like the rows whose
        ``crs_arr_time`` is not null; rows with a missing time are left out.

    Raises:
        KeyError: if ``crs_arr_time`` is missing.
        ValueError: if a value is not a valid HHMM time (other than 2400).
    """
    def to_hour(hhmm):
        # 2400 cannot be parsed with %H (valid hours are 00-23), so it is
        # mapped to hour 0 explicitly.
        if hhmm == 2400.0:
            return 0
        # Left-pad to four digits so e.g. 5 becomes '0005' (00:05).
        return datetime.strptime(str(int(hhmm)).zfill(4), '%H%M').hour

    times = flights_df['crs_arr_time']

    # Map over the single time column rather than copying the whole frame
    # and building a row object per flight; this also returns a Series when
    # there are no rows.
    return times[times.notna()].map(to_hour)
   


## 1. Percent of Delay Count by Month

In [None]:
def percent_delay_by_month_dataframe(flights_df, filename):
    """Compute each month's share of all delayed flights and pickle it.

    Flights are flagged with ``is_delayed_feature`` and assigned a month with
    ``month_feature``.  The delayed flights are counted per month and each
    count is divided by the total number of delayed flights, so the
    ``percent_delay`` values are fractions that sum to 1 (not the delay rate
    within a month).

    Args:
        flights_df: flights DataFrame accepted by ``is_delayed_feature`` and
            ``month_feature``.  It is not modified.
        filename: path of the pickle file to write.  The resulting DataFrame
            has the columns ``arr_month`` and ``percent_delay``.

    Returns:
        None.  The result is written to ``filename``.
    """
    flights_df_cp = flights_df.copy()

    flights_df_cp['is_delayed'] = is_delayed_feature(flights_df_cp)
    flights_df_cp['arr_month'] = month_feature(flights_df_cp)

    delayed_flights = flights_df_cp[flights_df_cp['is_delayed'] == 1]

    # Count delayed flights per month.  ``size`` replaces the former
    # ``SeriesGroupBy.agg(total_delays=('arr_month', 'count'))`` call, whose
    # tuple argument is rejected with a TypeError by current pandas; rows
    # with a missing month are dropped by ``groupby`` either way.
    total_delay_df = delayed_flights.groupby(
        'arr_month').size().reset_index(name='total_delays')

    total_delays = total_delay_df['total_delays'].sum()
    total_delay_df['percent_delay'] = (
        total_delay_df['total_delays'] / total_delays)
    total_delay_df = total_delay_df.drop(columns=['total_delays'])

    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as outfile:
        pickle.dump(total_delay_df, outfile)

In [35]:
# Build the per-month delay shares from the sample and pickle them to the
# given path under ./reference/.
percent_delay_by_month_dataframe(flight_sample, './reference/percent_delay_by_month_2019')

In [36]:

filename = './reference/percent_delay_by_month_2019'

# Read the pickled per-month table back.  The context manager closes the
# file even if unpickling fails.
with open(filename, 'rb') as infile:
    delay_df = pickle.load(infile)

In [37]:
# Display up to the first 12 rows of the loaded table.
delay_df.head(12)

Unnamed: 0,arr_month,percent_delay
0,1,0.073171
1,2,0.080085
2,3,0.079796
3,4,0.077108
4,5,0.088727
5,6,0.101402
6,7,0.093816
7,8,0.091511
8,9,0.064721
9,10,0.085366


## 2. Percentage of Flights per Hour

In [41]:
def percent_flights_by_hour_dataframe(flights_df, filename):
    """Compute each arrival hour's share of all flights and pickle it.

    Flights are assigned an hour with ``arrival_hour_of_day_feature``; the
    non-null entries of the ``flights`` column are counted per hour and each
    count is divided by the total across all hours.

    Args:
        flights_df: flights DataFrame accepted by
            ``arrival_hour_of_day_feature`` that also has a ``flights``
            column.  It is not modified.
        filename: path of the pickle file to write.  The resulting DataFrame
            has the columns ``hour_of_day`` and ``percent_flights``.

    Returns:
        None.  The result is written to ``filename``.
    """
    flights_df_cp = flights_df.copy()

    flights_df_cp['arr_hour_of_day'] = arrival_hour_of_day_feature(flights_df_cp)

    num_flights_by_hour = (
        flights_df_cp.groupby('arr_hour_of_day')
        .agg(num_flights=('flights', 'count'))
        .reset_index()
        .rename(columns={'arr_hour_of_day': 'hour_of_day'})
    )

    # Column-wise division instead of a row-wise ``apply``.
    total_flights = num_flights_by_hour['num_flights'].sum()
    num_flights_by_hour['percent_flights'] = (
        num_flights_by_hour['num_flights'] / total_flights)
    num_flights_by_hour = num_flights_by_hour.drop(columns=['num_flights'])

    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as outfile:
        pickle.dump(num_flights_by_hour, outfile)
    
    

In [42]:
# Build the per-hour flight shares from the sample and pickle them.
# NOTE(review): the file name says "by_month" although the table is keyed by
# hour of day; the cell that loads it uses the same name, so rename both
# together if this is corrected.
percent_flights_by_hour_dataframe(flight_sample, './reference/percent_flights_by_month_2019')

In [45]:

filename = './reference/percent_flights_by_month_2019'

# Read the pickled per-hour table back.  The context manager closes the
# file even if unpickling fails.
with open(filename, 'rb') as infile:
    flight_counts_df = pickle.load(infile)

# Display the first five rows.
flight_counts_df.head(5)

Unnamed: 0,hour_of_day,percent_flights
0,0,0.014567
1,1,0.002733
2,2,0.000367
3,3,0.000233
4,4,0.001333
