In [1]:
import pandas as pd
import os
import requests as re
import datetime
import time
from IPython.display import JSON
import json

# Import Dataframes

### Flights

In [2]:
pd.set_option('max_columns', None)
#pd.reset_option('max_columns')

In [3]:
# Import df_flights dataframes. Specify datatype of cancellation_code so there are not mixed datatypes present.
# We will use statistics from the last week of December 2018 to help predict the first week of January 2019.

# df_flights_Dec_2018:   Dec 24 - 31, 2018
# df_flights_Dec_2019:   Dec 24 - 31, 2019
# df_flights:            Jan 1 - 8, 2019 (after filtering)
# df_flights_test:       Jan 1 - 8, 2020

df_flights_Dec_2018 = pd.read_csv('flights_Dec_2018.csv', dtype={'cancellation_code':'object'})
df_flights_Dec_2019 = pd.read_csv('flights_Dec_2019.csv', dtype={'cancellation_code':'object'})
df_flights = pd.read_csv('flights_data.csv', dtype={'cancellation_code':'object'})

# Filter df_flights to exclude January 2018
df_flights = df_flights[df_flights['fl_date'] >= '2019-01-01']

In [4]:
# # Check flight delay bins
# print(pd.cut(df_flights_Dec_2018['dep_delay'], bins=10).value_counts())
# print(pd.cut(df_flights_Dec_2019['dep_delay'], bins=10).value_counts())
# print(pd.cut(df_flights['dep_delay'], bins=10).value_counts())

In [5]:
# Filter out extreme delays which account for less than 1% of flights, for more accurate stats/modelling
df_flights_Dec_2018 = df_flights_Dec_2018[df_flights_Dec_2018['dep_delay'] < 325.8]
df_flights_Dec_2019 = df_flights_Dec_2019[df_flights_Dec_2019['dep_delay'] < 305.8]
df_flights = df_flights[df_flights['dep_delay'] < 300.4]

In [6]:
df_flights_Dec_2018.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-12-31,F9,F9,F9,1753,F9,N211FR,1753,13204,MCO,"Orlando, FL",11109,COS,"Colorado Springs, CO",2115,2105.0,-10.0,15.0,2120.0,2331.0,33.0,2321,4.0,43.0,0.0,,0.0,N,246.0,299.0,251.0,1.0,1520.0,0.0,0.0,43.0,0.0,0.0,,,,


### Flights_test

In [7]:
# Look through flight data from January 2020
df_flights_test = pd.read_csv('flights_test.csv')

# Move column names to first row of data
df_flights_test = (df_flights_test.T.reset_index().T.reset_index(drop=True))

# Define new column names
df_flights_test.columns = ['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance']

# Clean up fl_date
df_flights_test['fl_date'] = df_flights_test['fl_date'].str[0:10]

In [8]:
df_flights_test.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN.1,WN.2,5888,WN.3,N951WN,5888.1,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363


# Feature Engineering

In [11]:
# Get hour of departure and arrival times (approximate using crs for 2020)
df_flights_Dec_2018['hour_of_day_dep'] = df_flights_Dec_2018['dep_time'] // 100
df_flights_Dec_2018['hour_of_day_arr'] = df_flights_Dec_2018['arr_time'] // 100
df_flights_Dec_2019['hour_of_day_dep'] = df_flights_Dec_2019['dep_time'] // 100
df_flights_Dec_2019['hour_of_day_arr'] = df_flights_Dec_2019['arr_time'] // 100
df_flights['hour_of_day_dep'] = df_flights['dep_time'] // 100
df_flights['hour_of_day_arr'] = df_flights['arr_time'] // 100
df_flights_test['hour_of_day_dep'] = df_flights_test['crs_dep_time'].astype(int) // 100
df_flights_test['hour_of_day_arr'] = df_flights_test['crs_arr_time'].astype(int) // 100

# Set delay nulls to zero (zero minutes of delay instead of null)
df_flights_Dec_2018[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']] = df_flights_Dec_2018[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].fillna(value=0)
df_flights_Dec_2019[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']] = df_flights_Dec_2019[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].fillna(value=0)
df_flights[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']] = df_flights[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].fillna(value=0)

# Exclude diverted and cancelled flights
df_flights_Dec_2018 = df_flights_Dec_2018[df_flights_Dec_2018['diverted'] != 1]
df_flights_Dec_2018 = df_flights_Dec_2018[df_flights_Dec_2018['cancelled'] != 1]
df_flights_Dec_2019 = df_flights_Dec_2019[df_flights_Dec_2019['diverted'] != 1]
df_flights_Dec_2019 = df_flights_Dec_2019[df_flights_Dec_2019['cancelled'] != 1]
df_flights = df_flights[df_flights['diverted'] != 1]
df_flights = df_flights[df_flights['cancelled'] != 1]

# Format fl_date column as a date
df_flights_Dec_2018['fl_date'] = pd.to_datetime(df_flights_Dec_2018['fl_date'])
df_flights_Dec_2019['fl_date'] = pd.to_datetime(df_flights_Dec_2019['fl_date'])
df_flights['fl_date'] = pd.to_datetime(df_flights['fl_date'])
df_flights_test['fl_date'] = pd.to_datetime(df_flights_test['fl_date'])

# Create 'state' column
df_flights_Dec_2018['state'] = df_flights_Dec_2018['origin_city_name'].str[-2:]
df_flights_Dec_2019['state'] = df_flights_Dec_2019['origin_city_name'].str[-2:]
df_flights['state'] = df_flights['origin_city_name'].str[-2:]
df_flights_test['state'] = df_flights_test['origin_city_name'].str[-2:]

# Create fl_day column
df_flights_Dec_2018['fl_day'] = pd.DatetimeIndex(df_flights_Dec_2018['fl_date']).day
df_flights_Dec_2019['fl_day'] = pd.DatetimeIndex(df_flights_Dec_2019['fl_date']).day
df_flights['fl_day'] = pd.DatetimeIndex(df_flights['fl_date']).day
df_flights_test['fl_day'] = pd.DatetimeIndex(df_flights_test['fl_date']).day

# NOTE: statistics will be gathered from Dec 2018/2019, and applied to Jan 2019/2020 respectively to simulate "1 week in advance" predictions.
# Get and apply daily mean flight delay values
df_daily_delay_means_Dec_2018 = df_flights_Dec_2018.groupby('fl_day').mean()[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].reset_index()
df_flights['daily_arr_delay_mean'] = df_flights.apply(lambda row: df_daily_delay_means_Dec_2018['arr_delay'][row['fl_day']-1], axis=1)
df_flights['daily_carrier_delay_mean'] = df_flights.apply(lambda row: df_daily_delay_means_Dec_2018['carrier_delay'][row['fl_day']-1], axis=1)
df_flights['daily_weather_delay_mean'] = df_flights.apply(lambda row: df_daily_delay_means_Dec_2018['weather_delay'][row['fl_day']-1], axis=1)
df_flights['daily_nas_delay_mean'] = df_flights.apply(lambda row: df_daily_delay_means_Dec_2018['nas_delay'][row['fl_day']-1], axis=1)
df_flights['daily_security_delay_mean'] = df_flights.apply(lambda row: df_daily_delay_means_Dec_2018['security_delay'][row['fl_day']-1], axis=1)
df_flights['daily_late_aircraft_delay_mean'] = df_flights.apply(lambda row: df_daily_delay_means_Dec_2018['late_aircraft_delay'][row['fl_day']-1], axis=1)

df_daily_delay_means_Dec_2019 = df_flights_Dec_2019.groupby('fl_day').mean()[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].reset_index()
df_flights_test['daily_arr_delay_mean'] = df_flights_test.apply(lambda row: df_daily_delay_means_Dec_2019['arr_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_carrier_delay_mean'] = df_flights_test.apply(lambda row: df_daily_delay_means_Dec_2019['carrier_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_weather_delay_mean'] = df_flights_test.apply(lambda row: df_daily_delay_means_Dec_2019['weather_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_nas_delay_mean'] = df_flights_test.apply(lambda row: df_daily_delay_means_Dec_2019['nas_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_security_delay_mean'] = df_flights_test.apply(lambda row: df_daily_delay_means_Dec_2019['security_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_late_aircraft_delay_mean'] = df_flights_test.apply(lambda row: df_daily_delay_means_Dec_2019['late_aircraft_delay'][row['fl_day']-1], axis=1)

# Get and apply daily standard deviation of flight delay values
df_daily_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('fl_day').std()[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].reset_index()
df_flights['daily_arr_delay_std'] = df_flights.apply(lambda row: df_daily_delay_std_Dec_2018['arr_delay'][row['fl_day']-1], axis=1)
df_flights['daily_carrier_delay_std'] = df_flights.apply(lambda row: df_daily_delay_std_Dec_2018['carrier_delay'][row['fl_day']-1], axis=1)
df_flights['daily_weather_delay_std'] = df_flights.apply(lambda row: df_daily_delay_std_Dec_2018['weather_delay'][row['fl_day']-1], axis=1)
df_flights['daily_nas_delay_std'] = df_flights.apply(lambda row: df_daily_delay_std_Dec_2018['nas_delay'][row['fl_day']-1], axis=1)
df_flights['daily_security_delay_std'] = df_flights.apply(lambda row: df_daily_delay_std_Dec_2018['security_delay'][row['fl_day']-1], axis=1)
df_flights['daily_late_aircraft_delay_std'] = df_flights.apply(lambda row: df_daily_delay_std_Dec_2018['late_aircraft_delay'][row['fl_day']-1], axis=1)

df_daily_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('fl_day').std()[['arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']].reset_index()
df_flights_test['daily_arr_delay_std'] = df_flights_test.apply(lambda row: df_daily_delay_std_Dec_2019['arr_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_carrier_delay_std'] = df_flights_test.apply(lambda row: df_daily_delay_std_Dec_2019['carrier_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_weather_delay_std'] = df_flights_test.apply(lambda row: df_daily_delay_std_Dec_2019['weather_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_nas_delay_std'] = df_flights_test.apply(lambda row: df_daily_delay_std_Dec_2019['nas_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_security_delay_std'] = df_flights_test.apply(lambda row: df_daily_delay_std_Dec_2019['security_delay'][row['fl_day']-1], axis=1)
df_flights_test['daily_late_aircraft_delay_std'] = df_flights_test.apply(lambda row: df_daily_delay_std_Dec_2019['late_aircraft_delay'][row['fl_day']-1], axis=1)

In [12]:
# Get and apply hourly mean flight delay values
dep_hourly_delay_mean_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_dep').mean()['dep_delay']
arr_hourly_delay_mean_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_arr').mean()['arr_delay']
dep_hourly_delay_mean_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_dep').mean()['dep_delay']
arr_hourly_delay_mean_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_arr').mean()['arr_delay']

df_flights['dep_mean_hourly_delay'] = df_flights.apply(lambda row: dep_hourly_delay_mean_Dec_2018[row['hour_of_day_dep']], axis=1)
df_flights['arr_mean_hourly_delay'] = df_flights.apply(lambda row: arr_hourly_delay_mean_Dec_2018[row['hour_of_day_arr']], axis=1)
df_flights_test['dep_mean_hourly_delay'] = df_flights_test.apply(lambda row: dep_hourly_delay_mean_Dec_2019[row['hour_of_day_dep']], axis=1)
df_flights_test['arr_mean_hourly_delay'] = df_flights_test.apply(lambda row: arr_hourly_delay_mean_Dec_2019[row['hour_of_day_arr']], axis=1)


# Get and apply hourly standard deviation of flight delay values
dep_hourly_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_dep').std()['dep_delay']
arr_hourly_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('hour_of_day_arr').std()['arr_delay']
dep_hourly_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_dep').std()['dep_delay']
arr_hourly_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('hour_of_day_arr').std()['arr_delay']

df_flights['dep_std_hourly_delay'] = df_flights.apply(lambda row: dep_hourly_delay_std_Dec_2018[row['hour_of_day_dep']], axis=1)
df_flights['arr_std_hourly_delay'] = df_flights.apply(lambda row: arr_hourly_delay_std_Dec_2018[row['hour_of_day_arr']], axis=1)
df_flights_test['dep_std_hourly_delay'] = df_flights_test.apply(lambda row: dep_hourly_delay_std_Dec_2019[row['hour_of_day_dep']], axis=1)
df_flights_test['arr_std_hourly_delay'] = df_flights_test.apply(lambda row: arr_hourly_delay_std_Dec_2019[row['hour_of_day_arr']], axis=1)

In [13]:
# Calculate and apply mean marketing (mkt) and operating (op) carrier delays
df_mean_mkt_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('mkt_unique_carrier').mean()['carrier_delay']
df_mean_op_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('op_unique_carrier').mean()['carrier_delay']
df_mean_mkt_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('mkt_unique_carrier').mean()['carrier_delay']
df_mean_op_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('op_unique_carrier').mean()['carrier_delay']

df_flights['mean_mkt_carrier_delay'] = df_flights.apply(lambda row: df_mean_mkt_carrier_delays_Dec_2018[row['mkt_unique_carrier']], axis=1)
df_flights['mean_op_carrier_delay'] = df_flights.apply(lambda row: df_mean_op_carrier_delays_Dec_2018[row['op_unique_carrier']], axis=1)
df_flights_test['mean_mkt_carrier_delay'] = df_flights_test.apply(lambda row: df_mean_mkt_carrier_delays_Dec_2019[row['mkt_unique_carrier']], axis=1)
df_flights_test['mean_op_carrier_delay'] = df_flights_test.apply(lambda row: df_mean_op_carrier_delays_Dec_2019[row['op_unique_carrier']] if row['op_unique_carrier'] in df_mean_op_carrier_delays_Dec_2019.index else df_mean_op_carrier_delays_Dec_2019.mean(), axis=1)


# Calculate and apply standard deviation of marketing (mkt) and operating (op) carrier delays
df_std_mkt_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('mkt_unique_carrier').std()['carrier_delay']
df_std_op_carrier_delays_Dec_2018 = df_flights_Dec_2018.groupby('op_unique_carrier').std()['carrier_delay']
df_std_mkt_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('mkt_unique_carrier').std()['carrier_delay']
df_std_op_carrier_delays_Dec_2019 = df_flights_Dec_2019.groupby('op_unique_carrier').std()['carrier_delay']

df_flights['std_mkt_carrier_delay'] = df_flights.apply(lambda row: df_std_mkt_carrier_delays_Dec_2018[row['mkt_unique_carrier']], axis=1)
df_flights['std_op_carrier_delay'] = df_flights.apply(lambda row: df_std_op_carrier_delays_Dec_2018[row['op_unique_carrier']], axis=1)
df_flights_test['std_mkt_carrier_delay'] = df_flights_test.apply(lambda row: df_std_mkt_carrier_delays_Dec_2019[row['mkt_unique_carrier']], axis=1)
df_flights_test['std_op_carrier_delay'] = df_flights_test.apply(lambda row: df_std_op_carrier_delays_Dec_2019[row['op_unique_carrier']] if row['op_unique_carrier'] in df_std_op_carrier_delays_Dec_2019.index else df_std_op_carrier_delays_Dec_2019.mean(), axis=1)

In [26]:
df_mean_op_carrier_delays_Dec_2018

op_unique_carrier
9E    4.038208
AA    4.345640
AS    1.960087
AX    3.403130
B6    6.949810
C5    3.091423
CP    4.377983
DL    2.427403
EM    1.320000
EV    4.492783
F9    6.111444
G4    2.806542
G7    4.139578
HA    2.690518
KS    4.304348
MQ    2.936698
NK    2.482390
OH    3.554984
OO    4.461860
PT    3.243643
QX    3.406237
UA    3.152323
WN    5.081818
YV    5.452431
YX    2.022060
ZW    2.372856
Name: carrier_delay, dtype: float64

In [14]:
# Add relationship between tail number and delay time
tail_num_delay_means_Dec_2018 = df_flights_Dec_2018.groupby('tail_num').mean()['arr_delay']
tail_num_delay_std_Dec_2018 = df_flights_Dec_2018.groupby('tail_num').std()['arr_delay']
tail_num_delay_means_Dec_2019 = df_flights_Dec_2019.groupby('tail_num').mean()['arr_delay']
tail_num_delay_std_Dec_2019 = df_flights_Dec_2019.groupby('tail_num').std()['arr_delay']

df_flights['mean_tail_num_arr_delay'] = df_flights.apply(lambda row: tail_num_delay_means_Dec_2018[row['tail_num']] if row['tail_num'] in tail_num_delay_means_Dec_2018.index else tail_num_delay_means_Dec_2018.mean(), axis=1)
df_flights['std_tail_num_arr_delay'] = df_flights.apply(lambda row: tail_num_delay_std_Dec_2018[row['tail_num']] if row['tail_num'] in tail_num_delay_std_Dec_2018.index else tail_num_delay_std_Dec_2018.mean(), axis=1)
df_flights['std_tail_num_arr_delay'].fillna(value=tail_num_delay_std_Dec_2018.mean(),inplace=True)  # Fix null values

df_flights_test['mean_tail_num_arr_delay'] = df_flights_test.apply(lambda row: tail_num_delay_means_Dec_2019[row['tail_num']] if row['tail_num'] in tail_num_delay_means_Dec_2019.index else tail_num_delay_means_Dec_2019.mean(), axis=1)
df_flights_test['std_tail_num_arr_delay'] = df_flights_test.apply(lambda row: tail_num_delay_std_Dec_2019[row['tail_num']] if row['tail_num'] in tail_num_delay_std_Dec_2019.index else tail_num_delay_std_Dec_2019.mean(), axis=1)
df_flights_test['std_tail_num_arr_delay'].fillna(value=tail_num_delay_std_Dec_2019.mean(),inplace=True)  # Fix null values

In [26]:
df_flights;

In [None]:
# One hot encoding?

# Dropping Columns, Construct Final DFs

In [15]:
# Drop inappropriate columns (this should be the final step, after other feature engineering is complete).
# Except for the arr_delay column, df_flights_final should have the same columns as df_flights_test in order to use both as training/testing X values.

df_flights_final = df_flights.drop(columns=['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'no_name', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 
                                      'arr_time', 'cancelled', 'cancellation_code', 'diverted', 'actual_elapsed_time', 'air_time', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime',
                                           'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_carrier_fl_num', 'origin_city_name', 'dest_city_name', 'dup', 'flights'])

# Orignal columns of df_flights_test:
# ['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance']

df_flights_test_final = df_flights_test.drop(columns=['branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_carrier_fl_num', 'origin_city_name', 'dest_city_name', 'dup', 'flights'])
df_flights_test_final = df_flights_test_final.dropna()   # Drop rows with missing tail numbers (235 rows, or 0.14%)

In [22]:
df_flights_final

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,origin_airport_id,origin,dest_airport_id,dest,crs_dep_time,crs_arr_time,arr_delay,crs_elapsed_time,distance,state,fl_day,daily_arr_delay_mean,daily_carrier_delay_mean,daily_weather_delay_mean,daily_nas_delay_mean,daily_security_delay_mean,daily_late_aircraft_delay_mean,daily_arr_delay_std,daily_carrier_delay_std,daily_weather_delay_std,daily_nas_delay_std,daily_security_delay_std,daily_late_aircraft_delay_std,hour_of_day_dep,hour_of_day_arr,dep_mean_hourly_delay,arr_mean_hourly_delay,dep_std_hourly_delay,arr_std_hourly_delay,mean_mkt_carrier_delay,mean_op_carrier_delay,std_mkt_carrier_delay,std_op_carrier_delay,mean_tail_num_arr_delay,std_tail_num_arr_delay
154049,2019-01-01,AA,OH,N575NN,13577,MYR,11057,CLT,2157,2311,-3.0,74.0,157.0,SC,1,-6.032670,2.099320,0.072074,0.965896,0.019515,2.067024,27.003492,14.301496,2.141705,6.505192,1.090462,13.931125,21.0,23.0,27.562254,16.242184,50.056620,46.043611,4.098024,3.554984,18.659873,18.181722,9.375000,34.433650
154050,2019-01-01,AA,OH,N254PS,10529,BDL,11278,DCA,1151,1325,6.0,94.0,313.0,CT,1,-6.032670,2.099320,0.072074,0.965896,0.019515,2.067024,27.003492,14.301496,2.141705,6.505192,1.090462,13.931125,12.0,13.0,9.213188,2.091566,30.555721,33.496974,4.098024,3.554984,18.659873,18.181722,8.696970,65.443910
154051,2019-01-01,AA,OH,N254PS,11278,DCA,10529,BDL,1009,1126,14.0,77.0,313.0,DC,1,-6.032670,2.099320,0.072074,0.965896,0.019515,2.067024,27.003492,14.301496,2.141705,6.505192,1.090462,13.931125,10.0,11.0,7.447353,0.032751,30.340531,29.797562,4.098024,3.554984,18.659873,18.181722,8.696970,65.443910
154052,2019-01-01,AA,OH,N710PS,11278,DCA,11996,GSP,2000,2134,-12.0,94.0,396.0,DC,1,-6.032670,2.099320,0.072074,0.965896,0.019515,2.067024,27.003492,14.301496,2.141705,6.505192,1.090462,13.931125,19.0,21.0,19.120787,12.272941,42.202577,44.169176,4.098024,3.554984,18.659873,18.181722,22.076923,54.191854
154053,2019-01-01,AA,OH,N515AE,13931,ORF,11057,CLT,1215,1347,-14.0,92.0,290.0,VA,1,-6.032670,2.099320,0.072074,0.965896,0.019515,2.067024,27.003492,14.301496,2.141705,6.505192,1.090462,13.931125,12.0,13.0,9.213188,2.091566,30.555721,33.496974,4.098024,3.554984,18.659873,18.181722,-7.000000,10.904276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321093,2019-01-08,UA,EV,N14907,14685,SAV,11618,EWR,1845,2059,87.0,134.0,708.0,GA,8,4.825704,3.604662,0.422636,2.636248,0.047228,5.107682,36.347172,17.466081,6.505810,8.681913,1.531397,22.033801,19.0,22.0,19.120787,16.589411,42.202577,47.978874,3.414305,4.492783,17.667574,20.617308,-8.090909,17.698289
321094,2019-01-08,UA,EV,N881AS,13930,ORD,11775,FSD,1955,2150,-9.0,115.0,463.0,IL,8,4.825704,3.604662,0.422636,2.636248,0.047228,5.107682,36.347172,17.466081,6.505810,8.681913,1.531397,22.033801,19.0,21.0,19.120787,12.272941,42.202577,44.169176,3.414305,4.492783,17.667574,20.617308,0.100000,22.161399
321095,2019-01-08,UA,EV,N13913,13930,ORD,11042,CLE,1005,1230,-31.0,85.0,315.0,IL,8,4.825704,3.604662,0.422636,2.636248,0.047228,5.107682,36.347172,17.466081,6.505810,8.681913,1.531397,22.033801,9.0,11.0,6.206752,0.032751,28.636964,29.797562,3.414305,4.492783,17.667574,20.617308,18.826087,57.196354
321096,2019-01-08,UA,EV,N14570,12266,IAH,12206,HRL,1420,1540,-13.0,80.0,295.0,TX,8,4.825704,3.604662,0.422636,2.636248,0.047228,5.107682,36.347172,17.466081,6.505810,8.681913,1.531397,22.033801,14.0,15.0,12.756996,4.892655,34.773392,35.442232,3.414305,4.492783,17.667574,20.617308,-1.821429,19.191151


In [24]:
df_flights_test_final

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,origin_airport_id,origin,dest_airport_id,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,state,fl_day,daily_arr_delay_mean,daily_carrier_delay_mean,daily_weather_delay_mean,daily_nas_delay_mean,daily_security_delay_mean,daily_late_aircraft_delay_mean,daily_arr_delay_std,daily_carrier_delay_std,daily_weather_delay_std,daily_nas_delay_std,daily_security_delay_std,daily_late_aircraft_delay_std,hour_of_day_dep,hour_of_day_arr,dep_mean_hourly_delay,arr_mean_hourly_delay,dep_std_hourly_delay,arr_std_hourly_delay,mean_mkt_carrier_delay,mean_op_carrier_delay,std_mkt_carrier_delay,std_op_carrier_delay,mean_tail_num_arr_delay,std_tail_num_arr_delay
0,2020-01-01,WN,WN.3,N951WN,13891,ONT,14771,SFO,1810,1945,95,363,CA,1,-0.866558,2.682493,0.685071,2.025531,0.011319,4.846812,40.131499,15.472635,11.257484,15.798276,0.722741,23.896525,18,19,14.307777,6.072259,38.402956,39.672890,3.857863,3.457568,15.459722,16.802249,-6.166667,13.546348
1,2020-01-01,WN,WN,N467WN,13891,ONT,14771,SFO,1150,1320,90,363,CA,1,-0.866558,2.682493,0.685071,2.025531,0.011319,4.846812,40.131499,15.472635,11.257484,15.798276,0.722741,23.896525,11,13,8.328564,-1.132045,32.194015,34.470865,3.857863,3.857863,15.459722,15.459722,-5.354167,14.846003
2,2020-01-01,WN,WN,N7885A,13891,ONT,14831,SJC,2020,2130,70,333,CA,1,-0.866558,2.682493,0.685071,2.025531,0.011319,4.846812,40.131499,15.472635,11.257484,15.798276,0.722741,23.896525,20,21,17.420892,6.645410,41.089700,42.125636,3.857863,3.857863,15.459722,15.459722,16.666667,68.935172
3,2020-01-01,WN,WN,N551WN,13891,ONT,14831,SJC,1340,1455,75,333,CA,1,-0.866558,2.682493,0.685071,2.025531,0.011319,4.846812,40.131499,15.472635,11.257484,15.798276,0.722741,23.896525,13,14,11.185063,1.249471,34.838580,35.436593,3.857863,3.857863,15.459722,15.459722,2.833333,41.927157
4,2020-01-01,WN,WN,N968WN,13891,ONT,14831,SJC,915,1035,80,333,CA,1,-0.866558,2.682493,0.685071,2.025531,0.011319,4.846812,40.131499,15.472635,11.257484,15.798276,0.722741,23.896525,9,10,5.940884,-4.711270,27.717365,25.929012,3.857863,3.857863,15.459722,15.459722,14.975000,41.473802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172036,2020-01-08,DL,9E,N301PQ,12953,LGA,14321,PWM,2147,2306,79,269,NY,8,-0.685563,3.036365,0.374985,1.561211,0.026716,4.025208,33.518791,16.484985,6.187920,7.407990,1.038467,18.921518,21,23,22.649809,11.359668,46.971053,44.962441,2.813129,2.410097,16.419739,16.267815,-3.961538,21.127197
172037,2020-01-08,DL,9E,N300PQ,12953,LGA,15016,STL,1220,1423,183,888,NY,8,-0.685563,3.036365,0.374985,1.561211,0.026716,4.025208,33.518791,16.484985,6.187920,7.407990,1.038467,18.921518,12,14,9.196483,1.249471,34.120512,35.436593,2.813129,2.410097,16.419739,16.267815,-10.800000,14.235002
172038,2020-01-08,DL,9E,N300PQ,15016,STL,12953,LGA,1459,1823,144,888,MO,8,-0.685563,3.036365,0.374985,1.561211,0.026716,4.025208,33.518791,16.484985,6.187920,7.407990,1.038467,18.921518,14,18,10.510318,4.190206,34.731142,40.022930,2.813129,2.410097,16.419739,16.267815,-10.800000,14.235002
172039,2020-01-08,DL,9E,N297PQ,10397,ATL,10785,BTV,2009,2245,156,961,GA,8,-0.685563,3.036365,0.374985,1.561211,0.026716,4.025208,33.518791,16.484985,6.187920,7.407990,1.038467,18.921518,20,22,17.420892,10.736339,41.089700,44.728542,2.813129,2.410097,16.419739,16.267815,-16.393939,10.422150


In [16]:
# Export DFs to csv
df_flights_final.to_csv('df_flights_final.csv')
df_flights_test_final.to_csv('df_flights_test_final.csv')

# Hypothesis

I think these will be the most important feature(s):
* mkt and op carrier delay stats (mean/std)
* daily and hourly delay stats
* weather delay stats

I think these will be the least important feature(s):
* tail number stats