In [1]:
import pandas as pd
import os
import requests as re
import datetime
import time
from IPython.display import JSON
import json

# Import Dataframes

### Flights

In [2]:
# Show every column when a wide DataFrame is displayed.
# The fully-qualified option name is used on purpose: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
#pd.reset_option('display.max_columns')

In [3]:
# Load the three flight tables that carry actual delay outcomes.
# cancellation_code is forced to 'object' so the column never ends up with mixed datatypes.
# Statistics from the last week of December 2018 are used to help predict the first week of January 2019.
#
# Date ranges covered by each frame (df_flights_test is loaded in a later cell):
#   df_flights_Dec_2018:   Dec 24 - 31, 2018
#   df_flights_Dec_2019:   Dec 24 - 31, 2019
#   df_flights:            Jan 1 - 8, 2019
#   df_flights_test:       Jan 1 - 8, 2020

df_flights_Dec_2018, df_flights_Dec_2019, df_flights = (
    pd.read_csv(csv_path, dtype={'cancellation_code': 'object'})
    for csv_path in ('flights_Dec_2018.csv', 'flights_Dec_2019.csv', 'flights_data.csv')
)

In [4]:
# # Check flight delay bins
# print(pd.cut(df_flights_Dec_2018['dep_delay'], bins=10).value_counts())
# print(pd.cut(df_flights_Dec_2019['dep_delay'], bins=10).value_counts())
# print(pd.cut(df_flights['dep_delay'], bins=10).value_counts())

In [5]:
# Filter out extreme delays which account for less than 1% of flights, for more accurate stats/modelling.
# NOTE(review): the cut-offs look like bin edges taken from the pd.cut check in the previous cell -
# confirm them before reusing this filter on different data.
# Rows whose dep_delay is NaN fail the '<' comparison, so they are removed here as well.
# .copy() turns each filtered result into an independent frame, so columns can be added to it
# afterwards without pandas flagging the write as "set on a copy of a slice" (SettingWithCopyWarning).
df_flights_Dec_2018 = df_flights_Dec_2018[df_flights_Dec_2018['dep_delay'] < 325.8].copy()
df_flights_Dec_2019 = df_flights_Dec_2019[df_flights_Dec_2019['dep_delay'] < 305.8].copy()
df_flights = df_flights[df_flights['dep_delay'] < 300.4].copy()

In [6]:
df_flights_Dec_2018.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-12-25,AA,AA_CODESHARE,AA,5045,OH,N519AE,5045,13931,ORF,"Norfolk, VA",11057,CLT,"Charlotte, NC",903,855.0,-8.0,23.0,918.0,1021.0,6.0,1044,1027.0,-17.0,0.0,,0.0,N,101.0,92.0,63.0,1.0,290.0,,,,,,,,,


### Flights_test

In [7]:
# Load the flight data from January 2020
df_flights_test = pd.read_csv('flights_test.csv')

# Overwrite the header with the notebook's column names (assigned by position)
df_flights_test.columns = [
    'fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num',
    'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin',
    'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
    'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
]

# Clean up fl_date: keep only its first 10 characters
df_flights_test['fl_date'] = df_flights_test['fl_date'].str.slice(0, 10)

In [31]:
df_flights['origin'].sort_values()
#df_flights['origin'].nunique()

140480    ABE
139228    ABE
65019     ABE
25703     ABE
29384     ABE
         ... 
126407    YUM
126395    YUM
74379     YUM
343       YUM
74273     YUM
Name: origin, Length: 144605, dtype: object

In [32]:
df_flights_test['origin'].sort_values()
#df_flights_test['origin'].nunique()

82583     ABE
27143     ABE
38809     ABE
96917     ABE
50816     ABE
         ... 
44346     YUM
87391     YUM
44362     YUM
3012      YUM
148391    YUM
Name: origin, Length: 150623, dtype: object

In [10]:
df_flights_test.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363


# Feature Engineering

In [11]:
# Feature engineering: hour of day, delay clean-up, date parts and daily delay statistics.
#
# The same steps are applied to several frames, so they live in small helpers instead of being
# copy-pasted per frame. Two implementation notes:
#   * columns are selected BEFORE aggregating, because groupby(...).mean()/.std() over the whole
#     frame raises TypeError on pandas >= 2.0 when string columns are present (and does needless
#     work on older versions);
#   * the per-row DataFrame.apply lookups are replaced by one vectorised Series.map per column,
#     which yields the same values with far fewer Python-level calls.

_delay_cols = ['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']


def _add_date_features(frame):
    """Parse fl_date and add the 'state' and 'fl_day' columns to `frame` in place; returns `frame`."""
    # Format fl_date column as a date
    frame['fl_date'] = pd.to_datetime(frame['fl_date'])
    # Create 'state' column: the last two characters of origin_city_name (e.g. "Norfolk, VA" -> "VA")
    frame['state'] = frame['origin_city_name'].str[-2:]
    # Create fl_day column (day of the month)
    frame['fl_day'] = frame['fl_date'].dt.day
    return frame


def _prepare_history(frame):
    """Return a cleaned, independent copy of a frame that contains actual departure/arrival data.

    Adds hour_of_day_dep / hour_of_day_arr, replaces missing delay values with 0, removes
    diverted and cancelled flights, then adds the date features.
    """
    frame = frame.copy()  # independent copy, so the assignments below never write into a slice
    # Get hour of departure and arrival times (HHMM // 100 -> HH)
    frame['hour_of_day_dep'] = frame['dep_time'] // 100
    frame['hour_of_day_arr'] = frame['arr_time'] // 100
    # Set delay nulls to zero (zero minutes of delay instead of null)
    frame[_delay_cols] = frame[_delay_cols].fillna(value=0)
    # Exclude diverted and cancelled flights
    keep = (frame['diverted'] != 1) & (frame['cancelled'] != 1)
    return _add_date_features(frame[keep].copy())


def _daily_delay_stats(history, how):
    """Aggregate the delay columns per fl_day with `how` ('mean' or 'std').

    The result has a 0..n-1 index in ascending fl_day order, followed by 'fl_day' and the delay columns.
    """
    return history.groupby('fl_day')[_delay_cols].agg(how).reset_index()


def _apply_daily_stats(target, daily_stats, suffix):
    """Add one 'daily_<delay column>_<suffix>' column per delay column to `target` (in place).

    The lookup is positional: a flight with fl_day d reads row d-1 of `daily_stats`, i.e. the d-th
    December day present in the history (January 1st -> first December day, and so on).
    NOTE(review): this only lines up as intended when no December day is missing - verify if the
    input files change.

    Raises KeyError when an fl_day has no matching row, as the original per-row lookup did.
    """
    row_pos = target['fl_day'] - 1
    unknown = ~row_pos.isin(daily_stats.index)
    if unknown.any():
        raise KeyError('fl_day values without a matching daily statistics row: '
                       + str(list(target.loc[unknown, 'fl_day'].unique())))
    for col in _delay_cols:
        target['daily_' + col + '_' + suffix] = row_pos.map(daily_stats[col])


df_flights_Dec_2018 = _prepare_history(df_flights_Dec_2018)
df_flights_Dec_2019 = _prepare_history(df_flights_Dec_2019)
df_flights = _prepare_history(df_flights)

# The 2020 frame has no actual times, so its hours are approximated from the scheduled (crs) times.
df_flights_test['hour_of_day_dep'] = df_flights_test['crs_dep_time'].astype(int) // 100
df_flights_test['hour_of_day_arr'] = df_flights_test['crs_arr_time'].astype(int) // 100
df_flights_test = _add_date_features(df_flights_test)

# NOTE: statistics will be gathered from Dec 2018/2019, and applied to Jan 2019/2020 respectively to simulate "1 week in advance" predictions.
# Get and apply daily mean flight delay values
df_daily_delay_means_Dec_2018 = _daily_delay_stats(df_flights_Dec_2018, 'mean')
df_daily_delay_means_Dec_2019 = _daily_delay_stats(df_flights_Dec_2019, 'mean')
_apply_daily_stats(df_flights, df_daily_delay_means_Dec_2018, 'mean')
_apply_daily_stats(df_flights_test, df_daily_delay_means_Dec_2019, 'mean')

# Get and apply daily standard deviation of flight delay values
df_daily_delay_std_Dec_2018 = _daily_delay_stats(df_flights_Dec_2018, 'std')
df_daily_delay_std_Dec_2019 = _daily_delay_stats(df_flights_Dec_2019, 'std')
_apply_daily_stats(df_flights, df_daily_delay_std_Dec_2018, 'std')
_apply_daily_stats(df_flights_test, df_daily_delay_std_Dec_2019, 'std')

In [12]:
# Hourly delay statistics: computed on the December frames, applied to the January frames.
#
# The delay column is selected BEFORE aggregating: groupby(...).mean()/.std() over the whole frame
# raises TypeError on pandas >= 2.0 when string columns are present, and aggregates every column
# for nothing on older versions. The per-row DataFrame.apply lookups are replaced by Series.map.


def _hourly_stat_lookup(hours, hourly_stat):
    """Return hourly_stat[hour] for every value of `hours`, aligned with `hours`.

    Raises KeyError when an hour (including NaN) has no entry in `hourly_stat`, which is what the
    original per-row lookup did.
    """
    unseen = ~hours.isin(hourly_stat.index)
    if unseen.any():
        raise KeyError('hours without hourly statistics: ' + str(list(hours[unseen].unique())))
    return hours.map(hourly_stat)


# Get and apply hourly mean flight delay values
dep_hourly_delay_mean_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_dep')['dep_delay'].mean()
arr_hourly_delay_mean_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_arr')['arr_delay'].mean()
dep_hourly_delay_mean_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_dep')['dep_delay'].mean()
arr_hourly_delay_mean_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_arr')['arr_delay'].mean()

df_flights['dep_mean_hourly_delay'] = _hourly_stat_lookup(df_flights['hour_of_day_dep'], dep_hourly_delay_mean_Dec_2018)
df_flights['arr_mean_hourly_delay'] = _hourly_stat_lookup(df_flights['hour_of_day_arr'], arr_hourly_delay_mean_Dec_2018)
df_flights_test['dep_mean_hourly_delay'] = _hourly_stat_lookup(df_flights_test['hour_of_day_dep'], dep_hourly_delay_mean_Dec_2019)
df_flights_test['arr_mean_hourly_delay'] = _hourly_stat_lookup(df_flights_test['hour_of_day_arr'], arr_hourly_delay_mean_Dec_2019)


# Get and apply hourly standard deviation of flight delay values
dep_hourly_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_dep')['dep_delay'].std()
arr_hourly_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_arr')['arr_delay'].std()
dep_hourly_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_dep')['dep_delay'].std()
arr_hourly_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_arr')['arr_delay'].std()

df_flights['dep_std_hourly_delay'] = _hourly_stat_lookup(df_flights['hour_of_day_dep'], dep_hourly_delay_std_Dec_2018)
df_flights['arr_std_hourly_delay'] = _hourly_stat_lookup(df_flights['hour_of_day_arr'], arr_hourly_delay_std_Dec_2018)
df_flights_test['dep_std_hourly_delay'] = _hourly_stat_lookup(df_flights_test['hour_of_day_dep'], dep_hourly_delay_std_Dec_2019)
df_flights_test['arr_std_hourly_delay'] = _hourly_stat_lookup(df_flights_test['hour_of_day_arr'], arr_hourly_delay_std_Dec_2019)

In [13]:
# Carrier delay statistics: computed on the December frames, applied to the January frames.
#
# carrier_delay is selected BEFORE aggregating: groupby(...).mean()/.std() over the whole frame
# raises TypeError on pandas >= 2.0 when string columns are present. The per-row DataFrame.apply
# lookups are replaced by Series.map.


def _carrier_stat_lookup(carriers, carrier_stat, fill_unknown=False):
    """Return carrier_stat[carrier] for every value of `carriers`, aligned with `carriers`.

    fill_unknown=False: raise KeyError if any carrier has no entry in `carrier_stat`.
    fill_unknown=True:  carriers without an entry receive carrier_stat.mean() instead; carriers
                        that do have an entry keep it unchanged, even when that entry is NaN.
    """
    known = carriers.isin(carrier_stat.index)
    if fill_unknown:
        return carriers.map(carrier_stat).where(known, carrier_stat.mean())
    if not known.all():
        raise KeyError('carriers without statistics: ' + str(list(carriers[~known].unique())))
    return carriers.map(carrier_stat)


# Calculate and apply mean marketing (mkt) and operating (op) carrier delays
df_mean_mkt_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('mkt_unique_carrier')['carrier_delay'].mean()
df_mean_op_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('op_unique_carrier')['carrier_delay'].mean()
df_mean_mkt_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('mkt_unique_carrier')['carrier_delay'].mean()
df_mean_op_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('op_unique_carrier')['carrier_delay'].mean()

df_flights['mean_mkt_carrier_delay'] = _carrier_stat_lookup(df_flights['mkt_unique_carrier'], df_mean_mkt_carrier_delays_Dec_2018)
df_flights['mean_op_carrier_delay'] = _carrier_stat_lookup(df_flights['op_unique_carrier'], df_mean_op_carrier_delays_Dec_2018)
df_flights_test['mean_mkt_carrier_delay'] = _carrier_stat_lookup(df_flights_test['mkt_unique_carrier'], df_mean_mkt_carrier_delays_Dec_2019)
# Only this lookup tolerates operating carriers that are absent from the December 2019 data.
df_flights_test['mean_op_carrier_delay'] = _carrier_stat_lookup(df_flights_test['op_unique_carrier'], df_mean_op_carrier_delays_Dec_2019, fill_unknown=True)


# Calculate and apply standard deviation of marketing (mkt) and operating (op) carrier delays
df_std_mkt_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('mkt_unique_carrier')['carrier_delay'].std()
df_std_op_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('op_unique_carrier')['carrier_delay'].std()
df_std_mkt_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('mkt_unique_carrier')['carrier_delay'].std()
df_std_op_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('op_unique_carrier')['carrier_delay'].std()

df_flights['std_mkt_carrier_delay'] = _carrier_stat_lookup(df_flights['mkt_unique_carrier'], df_std_mkt_carrier_delays_Dec_2018)
df_flights['std_op_carrier_delay'] = _carrier_stat_lookup(df_flights['op_unique_carrier'], df_std_op_carrier_delays_Dec_2018)
df_flights_test['std_mkt_carrier_delay'] = _carrier_stat_lookup(df_flights_test['mkt_unique_carrier'], df_std_mkt_carrier_delays_Dec_2019)
df_flights_test['std_op_carrier_delay'] = _carrier_stat_lookup(df_flights_test['op_unique_carrier'], df_std_op_carrier_delays_Dec_2019, fill_unknown=True)

In [14]:
# Add relationship between tail number and delay time.
#
# arr_delay is selected BEFORE aggregating: groupby(...).mean()/.std() over the whole frame raises
# TypeError on pandas >= 2.0 when string columns are present. The per-row DataFrame.apply lookups
# are replaced by Series.map.


def _tail_stat_lookup(tail_nums, tail_stat):
    """Return tail_stat[tail_num] per row; tail numbers without an entry (including NaN) get tail_stat.mean()."""
    known = tail_nums.isin(tail_stat.index)
    return tail_nums.map(tail_stat).where(known, tail_stat.mean())


tail_num_delay_means_Dec_2018 = df_flights_Dec_2018.groupby('tail_num')['arr_delay'].mean()
tail_num_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('tail_num')['arr_delay'].std()
tail_num_delay_means_Dec_2019 = df_flights_Dec_2019.groupby('tail_num')['arr_delay'].mean()
tail_num_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('tail_num')['arr_delay'].std()

df_flights['mean_tail_num_arr_delay'] = _tail_stat_lookup(df_flights['tail_num'], tail_num_delay_means_Dec_2018)
# Fix null values: a tail number can have an entry whose std is NaN, so those are replaced by the
# mean std as well. The result is assigned back instead of calling fillna(inplace=True) on a column
# selection, which is a chained in-place write (warns on pandas 2.x, has no effect under Copy-on-Write).
df_flights['std_tail_num_arr_delay'] = _tail_stat_lookup(df_flights['tail_num'], tail_num_delay_std_Dec_2018).fillna(tail_num_delay_std_Dec_2018.mean())

df_flights_test['mean_tail_num_arr_delay'] = _tail_stat_lookup(df_flights_test['tail_num'], tail_num_delay_means_Dec_2019)
df_flights_test['std_tail_num_arr_delay'] = _tail_stat_lookup(df_flights_test['tail_num'], tail_num_delay_std_Dec_2019).fillna(tail_num_delay_std_Dec_2019.mean())

In [15]:
df_flights;

In [16]:
# One hot encoding?

# Dropping Columns, Construct Final DFs

In [17]:
# Build the final modelling frames by dropping unsuitable columns (this should be the final step,
# after other feature engineering is complete).
# Except for the arr_delay column, df_flights_final should have the same columns as
# df_flights_test in order to use both as training/testing X values.

# Columns dropped from df_flights only
_outcome_only_drop = [
    'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'no_name',
    'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
    'arr_time', 'cancelled', 'cancellation_code', 'diverted', 'actual_elapsed_time', 'air_time',
    'first_dep_time', 'total_add_gtime', 'longest_add_gtime',
]

# Columns dropped from both frames
_shared_drop = [
    'branded_code_share', 'op_carrier_fl_num', 'origin_city_name', 'dest_city_name', 'dup', 'flights',
    'mkt_unique_carrier', 'op_unique_carrier', 'tail_num', 'origin_airport_id', 'dest_airport_id',
    'crs_dep_time', 'crs_arr_time', 'hour_of_day_dep', 'hour_of_day_arr', 'state', 'fl_day',
]

df_flights_final = df_flights.drop(columns=_outcome_only_drop + _shared_drop)

# Original columns of df_flights_test:
# ['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance']

# Drop the shared columns, then every row that still contains a NaN.
# Original note: "Drop rows with missing tail numbers (235 rows, or 0.14%)".
# NOTE(review): tail_num itself is already removed at this point, so confirm which remaining
# column actually carries those NaN values.
df_flights_test_final = df_flights_test.drop(columns=_shared_drop).dropna()

In [18]:
df_flights_final

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,arr_delay,crs_elapsed_time,distance,daily_arr_delay_mean,daily_carrier_delay_mean,daily_weather_delay_mean,daily_nas_delay_mean,daily_security_delay_mean,daily_late_aircraft_delay_mean,daily_arr_delay_std,daily_carrier_delay_std,daily_weather_delay_std,daily_nas_delay_std,daily_security_delay_std,daily_late_aircraft_delay_std,dep_mean_hourly_delay,arr_mean_hourly_delay,dep_std_hourly_delay,arr_std_hourly_delay,mean_mkt_carrier_delay,mean_op_carrier_delay,std_mkt_carrier_delay,std_op_carrier_delay,mean_tail_num_arr_delay,std_tail_num_arr_delay
0,2019-01-01,AA,5677,DSM,CLT,-7.0,156.0,815.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,-0.449436,-3.107996,10.583675,22.461241,4.441386,3.721582,19.546037,18.803913,3.590909,22.829314
1,2019-01-01,AA,5679,CLT,MYR,-6.0,51.0,157.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,28.263723,18.320480,52.018430,48.367420,4.441386,3.721582,19.546037,18.803913,2.230769,27.862880
2,2019-01-01,AA,5680,BHM,DCA,3.0,117.0,653.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,17.749125,12.800197,40.055377,44.322185,4.441386,3.721582,19.546037,18.803913,0.909091,56.212106
3,2019-01-01,AA,5680,DCA,BHM,48.0,132.0,653.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,15.892553,10.074311,38.728841,38.745566,4.441386,3.721582,19.546037,18.803913,0.909091,56.212106
4,2019-01-01,AA,5682,STL,CLT,-11.0,118.0,575.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,6.922844,2.887204,29.396028,34.947741,4.441386,3.721582,19.546037,18.803913,11.625000,63.577003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146480,2019-01-01,AA,5672,IAD,CLT,-4.0,97.0,322.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,14.956673,11.761319,38.538667,42.701346,4.441386,3.721582,19.546037,18.803913,10.000000,44.406598
146481,2019-01-01,AA,5673,DCA,CVG,-8.0,97.0,411.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,19.956551,13.642727,42.106195,44.725278,4.441386,3.721582,19.546037,18.803913,6.075000,56.374839
146482,2019-01-01,AA,5675,CLT,XNA,-5.0,150.0,754.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,21.671688,13.642727,42.989322,44.725278,4.441386,3.721582,19.546037,18.803913,22.634615,46.791163
146483,2019-01-01,AA,5676,CLT,HSV,-7.0,86.0,333.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,6.922844,-3.107996,29.396028,22.461241,4.441386,3.721582,19.546037,18.803913,6.075000,56.374839


In [19]:
df_flights_test_final

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,crs_elapsed_time,distance,daily_arr_delay_mean,daily_carrier_delay_mean,daily_weather_delay_mean,daily_nas_delay_mean,daily_security_delay_mean,daily_late_aircraft_delay_mean,daily_arr_delay_std,daily_carrier_delay_std,daily_weather_delay_std,daily_nas_delay_std,daily_security_delay_std,daily_late_aircraft_delay_std,dep_mean_hourly_delay,arr_mean_hourly_delay,dep_std_hourly_delay,arr_std_hourly_delay,mean_mkt_carrier_delay,mean_op_carrier_delay,std_mkt_carrier_delay,std_op_carrier_delay,mean_tail_num_arr_delay,std_tail_num_arr_delay
0,2020-01-01,WN,5888,ONT,SFO,95,363,-7.601474,2.293873,0.208744,1.078996,0.002138,1.984977,28.767319,14.868113,5.377393,7.795824,0.202660,13.822821,14.600959,6.842410,38.198772,40.244274,4.035753,4.035753,15.809649,15.809649,-5.266667,14.273943
1,2020-01-01,WN,6276,ONT,SFO,90,363,-7.601474,2.293873,0.208744,1.078996,0.002138,1.984977,28.767319,14.868113,5.377393,7.795824,0.202660,13.822821,7.755154,-1.430349,30.239301,32.276206,4.035753,4.035753,15.809649,15.809649,-4.543478,15.118358
2,2020-01-01,WN,4598,ONT,SJC,70,333,-7.601474,2.293873,0.208744,1.078996,0.002138,1.984977,28.767319,14.868113,5.377393,7.795824,0.202660,13.822821,18.174040,7.547861,41.402626,42.265464,4.035753,4.035753,15.809649,15.809649,15.080000,67.948461
3,2020-01-01,WN,4761,ONT,SJC,75,333,-7.601474,2.293873,0.208744,1.078996,0.002138,1.984977,28.767319,14.868113,5.377393,7.795824,0.202660,13.822821,10.626506,1.073590,33.046285,34.084743,4.035753,4.035753,15.809649,15.809649,9.760870,59.222437
4,2020-01-01,WN,5162,ONT,SJC,80,333,-7.601474,2.293873,0.208744,1.078996,0.002138,1.984977,28.767319,14.868113,5.377393,7.795824,0.202660,13.822821,5.256890,-4.560211,24.774575,24.846163,4.035753,4.035753,15.809649,15.809649,15.690476,42.163727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150618,2020-01-07,DL,4813,DTW,JFK,117,509,-0.756638,3.038223,0.368415,1.554443,0.026017,3.991649,33.458493,16.372505,6.052948,7.359100,1.023451,18.881022,11.203728,6.842410,34.426052,40.244274,2.878207,2.543391,16.381576,16.671133,-13.600000,29.515533
150619,2020-01-07,DL,4814,GSP,LGA,119,610,-0.756638,3.038223,0.368415,1.554443,0.026017,3.991649,33.458493,16.372505,6.052948,7.359100,1.023451,18.881022,-0.578312,-8.259563,9.906493,18.974555,2.878207,2.543391,16.381576,16.671133,-16.750000,14.903703
150620,2020-01-07,DL,4815,ATL,XNA,121,589,-0.756638,3.038223,0.368415,1.554443,0.026017,3.991649,33.458493,16.372505,6.052948,7.359100,1.023451,18.881022,11.203728,4.429141,34.426052,39.506032,2.878207,2.543391,16.381576,16.671133,-15.928571,17.749134
150621,2020-01-07,DL,4815,XNA,ATL,114,589,-0.756638,3.038223,0.368415,1.554443,0.026017,3.991649,33.458493,16.372505,6.052948,7.359100,1.023451,18.881022,14.600959,7.547861,38.198772,42.265464,2.878207,2.543391,16.381576,16.671133,-15.928571,17.749134


In [20]:
# Export DFs to csv (to_csv defaults are kept, so the index is written as the first column)
for _frame, _csv_name in (
    (df_flights_final, 'df_flights_final.csv'),
    (df_flights_test_final, 'df_flights_test_final.csv'),
):
    _frame.to_csv(_csv_name)

In [21]:
df_flights_final['origin'].nunique()

360

In [22]:
df_flights_test_final['origin'].nunique()

363

# Hypothesis

I think these will be the most important feature(s):
* mkt and op carrier delay stats (mean/std)
* daily and hourly delay stats
* weather delay stats

I think these will be the least important feature(s):
* tail number stats