# Introduction

Baseline experiment

# Set up Environment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from math import sqrt
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             explained_variance_score, max_error)

# Notebook-wide configuration.

# never truncate wide dataframes when they are displayed
pd.set_option('display.max_columns', None)

# silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# fixed seed so that stochastic steps can be reproduced
RANDOM_SEED = 42

# Load Data

In [2]:
# location of the raw ATL training data (a zipped CSV handed straight to read_csv)
file_path = '../data/raw/training_data/'
file_name = 'training_data_ATL.zip'

raw_data_file = file_path + file_name
df = pd.read_csv(raw_data_file)

# Pre-process Data

## Remove Irrelevant Columns

In [3]:
# preview the first three raw rows to see which columns are available
df.head(3)

Unnamed: 0.1,Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,ArrivalDate,ARR_Flight_SLDT,Num_Arr_SLDT-30,Num_Arr_SLDT-25,Num_Arr_SLDT-20,Num_Arr_SLDT-15,Num_Arr_SLDT-10,Num_Arr_SLDT-5,Num_Arr_SLDT-0,Num_Arr_SLDT+5,Num_Arr_SLDT+10,Num_Arr_SLDT+15,Num_Arr_SLDT+20,Num_Arr_SLDT+25,Num_Dep_SLDT-30,Num_Dep_SLDT-25,Num_Dep_SLDT-20,Num_Dep_SLDT-15,Num_Dep_SLDT-10,Num_Dep_SLDT-5,Num_Dep_SLDT-0,Num_Dep_SLDT+5,Num_Dep_SLDT+10,Num_Dep_SLDT+15,Num_Dep_SLDT+20,Num_Dep_SLDT+25
0,4569,2017,1,1,1,7,2017-01-01 00:00:00,AA,19805,AA,N869AA,232,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,10397,1039705,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,14:25:00,1430.0,5.0,5.0,0.0,0.0,1400-1459,12.0,1442.0,1717.0,10.0,17:26:00,1727.0,1.0,1.0,0.0,0.0,1700-1759,121.0,117.0,95.0,1.0,731.0,3,,,,,,2017-01-01 00:00:00,2017-01-01 17:26:00,9,9,12,3,7,0,2,3,2,5,1,3,3,2,0,3,7,0,3,2,1,9,3,5
1,4570,2017,1,1,2,1,2017-01-02 00:00:00,AA,19805,AA,N866AA,232,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,10397,1039705,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,14:25:00,1420.0,-5.0,0.0,0.0,-1.0,1400-1459,13.0,1433.0,1723.0,11.0,17:26:00,1734.0,8.0,8.0,0.0,0.0,1700-1759,121.0,134.0,110.0,1.0,731.0,3,,,,,,2017-01-02 00:00:00,2017-01-02 17:26:00,6,8,9,4,10,0,2,1,5,4,2,2,6,3,1,4,8,0,3,2,4,11,6,3
2,4571,2017,1,1,3,2,2017-01-03 00:00:00,AA,19805,AA,N897AA,232,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,10397,1039705,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,14:25:00,1426.0,1.0,1.0,0.0,0.0,1400-1459,14.0,1440.0,1718.0,5.0,17:26:00,1723.0,-3.0,0.0,0.0,-1.0,1700-1759,121.0,117.0,98.0,1.0,731.0,3,,,,,,2017-01-03 00:00:00,2017-01-03 17:26:00,6,10,9,3,8,0,2,1,4,4,4,2,5,2,0,4,5,0,3,2,4,14,7,4


In [4]:
# keep only the columns of interest from the raw data
columns_of_interest = [
    'ActualElapsedTime', 'FlightDate', 'ArrivalDate', 'Reporting_Airline',
    'OriginAirportID', 'OriginCityMarketID', 'OriginState', 'CRSDepTime',
    'DepTime', 'DepDelay', 'DepDel15', 'DepartureDelayGroups', 'CRSArrTime',
    'CRSElapsedTime', 'Distance', 'DistanceGroup',
]
segment = df[columns_of_interest]

In [5]:
# map the raw column names onto the SOBT / SIBT / AOBT naming used in this notebook
new_column_names = {
    'ActualElapsedTime': 'AOBTtoAIBT',
    'FlightDate': 'SOBTDate',
    'ArrivalDate': 'SIBTDate',
    'Reporting_Airline': 'UniqueCarrierCode',
    'CRSDepTime': 'SOBTTime',
    'DepTime': 'AOBTTime',
    'DepDelay': 'OBTDelay',
    'DepDel15': 'OBTDel15',
    'DepartureDelayGroups': 'OBTDelayGroups',
    'CRSArrTime': 'SIBTTime',
    'CRSElapsedTime': 'SOBTtoSIBT',
}
segment = segment.rename(columns=new_column_names)

In [6]:
# the last 24 columns of the raw data are the Num_Arr_* / Num_Dep_* traffic counts;
# relabel them from 'SLDT' to 'SIBT' to match the naming of the other columns
traffic_columns = df.columns[-24:]
traffic = df[traffic_columns].rename(
    columns=lambda label: label.replace('SLDT', 'SIBT'))
traffic.head()

Unnamed: 0,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25
0,9,9,12,3,7,0,2,3,2,5,1,3,3,2,0,3,7,0,3,2,1,9,3,5
1,6,8,9,4,10,0,2,1,5,4,2,2,6,3,1,4,8,0,3,2,4,11,6,3
2,6,10,9,3,8,0,2,1,4,4,4,2,5,2,0,4,5,0,3,2,4,14,7,4
3,7,10,11,4,9,0,3,1,5,5,3,1,5,2,0,5,5,0,3,2,4,14,7,4
4,5,8,10,4,7,0,2,1,4,5,3,2,5,2,0,5,6,0,3,3,3,14,5,4


In [7]:
# append the traffic counts to the selected flight columns (row-aligned on the index)
frames_to_join = (segment, traffic)
segment = pd.concat(frames_to_join, axis='columns')
segment.head()

Unnamed: 0,AOBTtoAIBT,SOBTDate,SIBTDate,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,SOBTTime,AOBTTime,OBTDelay,OBTDel15,OBTDelayGroups,SIBTTime,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25
0,117.0,2017-01-01 00:00:00,2017-01-01 00:00:00,AA,11298,30194,TX,14:25:00,1430.0,5.0,0.0,0.0,17:26:00,121.0,731.0,3,9,9,12,3,7,0,2,3,2,5,1,3,3,2,0,3,7,0,3,2,1,9,3,5
1,134.0,2017-01-02 00:00:00,2017-01-02 00:00:00,AA,11298,30194,TX,14:25:00,1420.0,-5.0,0.0,-1.0,17:26:00,121.0,731.0,3,6,8,9,4,10,0,2,1,5,4,2,2,6,3,1,4,8,0,3,2,4,11,6,3
2,117.0,2017-01-03 00:00:00,2017-01-03 00:00:00,AA,11298,30194,TX,14:25:00,1426.0,1.0,0.0,0.0,17:26:00,121.0,731.0,3,6,10,9,3,8,0,2,1,4,4,4,2,5,2,0,4,5,0,3,2,4,14,7,4
3,103.0,2017-01-04 00:00:00,2017-01-04 00:00:00,AA,11298,30194,TX,14:25:00,1421.0,-4.0,0.0,-1.0,17:26:00,121.0,731.0,3,7,10,11,4,9,0,3,1,5,5,3,1,5,2,0,5,5,0,3,2,4,14,7,4
4,105.0,2017-01-05 00:00:00,2017-01-05 00:00:00,AA,11298,30194,TX,14:25:00,1424.0,-1.0,0.0,-1.0,17:26:00,121.0,731.0,3,5,8,10,4,7,0,2,1,4,5,3,2,5,2,0,5,6,0,3,3,3,14,5,4


## Format Datetime Objects

In [8]:
# scheduled dates: parse the timestamp strings and keep only the date part
for date_column in ('SOBTDate', 'SIBTDate'):
    segment[date_column] = pd.to_datetime(segment[date_column]).dt.date

# scheduled times: parse 'HH:MM:SS' strings and keep only the time part
for time_column in ('SOBTTime', 'SIBTTime'):
    parsed = pd.to_datetime(segment[time_column], format='%H:%M:%S')
    segment[time_column] = parsed.dt.time

In [9]:
# AOBTTime arrives as a float such as 1430.0; turn it into a zero-padded 'HHMM' string
segment['AOBTTime'] = (
    segment['AOBTTime'].astype(int).astype(str).str.zfill(4))

# actual times use '2400' when the clock has rolled over to the next day,
# e.g. 2400 on 1 Jan is really 0000 on 2 Jan -- count how often that happens
is_2400 = segment['AOBTTime'] == '2400'
print('Number of AOBTTime being "2400":', int(is_2400.sum()))

Number of AOBTTime being "2400": 60


In [10]:
one_day = pd.Timedelta('1 days')

# rows whose actual off-block time was recorded as '2400'
rolled_over = segment['AOBTTime'] == '2400'

# start from the scheduled date ...
# NOTE(review): every AOBTDate is seeded from SOBTDate, so a departure that slips
# past midnight (other than the exact '2400' case) keeps the scheduled date --
# confirm this is acceptable for the later AOBT - SOBT computation.
segment['AOBTDate'] = segment['SOBTDate']

# ... and move the '2400' rows to 00:00 of the following day
segment.loc[rolled_over, 'AOBTDate'] = (
    segment.loc[rolled_over, 'AOBTDate'] + one_day)
segment.loc[rolled_over, 'AOBTTime'] = '0000'

# with '2400' gone, every value is a valid 'HHMM' string and can become a time object
aobt_parsed = pd.to_datetime(segment['AOBTTime'], format='%H%M')
segment['AOBTTime'] = aobt_parsed.dt.time

In [11]:
# combine each date/time pair into a single datetime column and drop the two parts
list_1 = ['SOBTDate', 'SIBTDate', 'AOBTDate']   # date columns
list_2 = ['SOBTTime', 'SIBTTime', 'AOBTTime']   # matching time columns
list_3 = ['SOBT', 'SIBT', 'AOBT']               # resulting datetime columns

for date_col, time_col, stamp_col in zip(list_1, list_2, list_3):
    date_text = segment[date_col].astype(str)
    time_text = segment[time_col].astype(str)
    segment[stamp_col] = pd.to_datetime(date_text + ' ' + time_text)
    segment = segment.drop(columns=[date_col, time_col])

In [12]:
# order flights chronologically by SIBT and renumber the rows
segment = segment.sort_values(by='SIBT').reset_index(drop=True)

In [13]:
# EDA found 14 flights whose scheduled arrival at ATL falls in 2019;
# only arrivals scheduled in 2017 and 2018 are of concern, so drop them
scheduled_in_2019 = segment['SIBT'].dt.year == 2019
segment = segment.loc[~scheduled_in_2019].reset_index(drop=True)

In [14]:
# check the result: the separate date/time columns are now merged into SOBT, SIBT and AOBT
segment.head(3)

Unnamed: 0,AOBTtoAIBT,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25,SOBT,SIBT,AOBT
0,218.0,DL,14869,34614,UT,-2.0,0.0,-1.0,223.0,1590.0,7,0,0,0,0,1,0,0,2,0,0,0,0,2,1,1,0,1,0,0,3,1,0,3,1,2017-01-01 00:55:00,2017-01-01 06:38:00,2017-01-01 00:53:00
1,81.0,F9,13204,31454,FL,-4.0,0.0,-1.0,88.0,404.0,2,0,0,0,0,3,0,1,0,0,0,0,1,1,1,0,1,3,0,3,1,0,3,1,0,2017-01-01 05:15:00,2017-01-01 06:43:00,2017-01-01 05:11:00
2,88.0,EV,10980,30980,TN,302.0,1.0,12.0,57.0,106.0,1,0,0,0,0,3,0,0,0,0,0,1,0,1,0,1,0,4,0,2,0,3,1,0,0,2017-01-01 05:49:00,2017-01-01 06:46:00,2017-01-01 10:51:00


## Impute Missing Values

In [17]:
# columns that contain at least one missing value
null_col = segment.columns[segment.isna().any()].tolist()

# boolean mask of the missing cells, restricted to those columns
null_mask = segment[null_col].isna()

# number of missing values per column
missing_count = null_mask.sum().rename('Count')

# share of missing values per column, in percent (3 decimals);
# Series.round is used so the cell does not depend on numpy being imported
missing_percentage = (null_mask.mean() * 100).round(3).rename('Percentage')

missing_var_summary = pd.concat([missing_count, missing_percentage], axis=1)
missing_var_summary

Unnamed: 0,Count,Percentage
OBTDelay,460,0.062
OBTDel15,460,0.062
OBTDelayGroups,460,0.062


In [18]:
# handle missing OBTDelay: fill it with the gap between actual and scheduled
# off-block time, expressed in minutes.
# total_seconds() / 60 is used instead of astype('timedelta64[m]'), whose
# float-minutes result only exists in legacy pandas versions.
missing_delay = segment['OBTDelay'].isnull()
segment.loc[missing_delay, 'OBTDelay'] = (
    (segment.loc[missing_delay, 'AOBT'] - segment.loc[missing_delay, 'SOBT'])
    .dt.total_seconds() / 60)


def delayed_more_than_15(x):
    """Return 1 if the delay ``x`` is 15 or more, otherwise 0.

    Despite the name, the threshold is inclusive (``x >= 15``). A NaN input
    compares false and therefore yields 0.
    """
    return 1 if x >= 15 else 0


# recompute the 15-minute delay flag for every flight from OBTDelay,
# which also fills the rows where the flag was missing
segment['OBTDel15'] = segment['OBTDelay'].map(delayed_more_than_15)

# for all 460 entries, the OBTDelay is 0. Proved by:
# segment[segment['OBTDelayGroups'].isnull()]['OBTDelay'].unique()
# Hence, update missing OBTDelayGroups to be 0
segment['OBTDelayGroups'] = segment['OBTDelayGroups'].fillna(0)

## Format Int

In [20]:
# columns currently stored as float that should be integers
to_int = ['AOBTtoAIBT', 'OBTDelay', 'OBTDelayGroups',
          'SOBTtoSIBT', 'Distance', 'DistanceGroup']

# cast all of them in one call; the other columns are left untouched
segment = segment.astype({column: int for column in to_int})

In [23]:
# preview the frame after the integer conversion
# NOTE(review): the saved output below already lists SIBTQuarter ... SIBTHour, which are
# only created in the "Update Temporal Features" cell further down -- this cell was
# evidently executed after that one, so the notebook order does not match execution order
segment.head(3)

Unnamed: 0,AOBTtoAIBT,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25,SOBT,SIBT,AOBT,SIBTQuarter,SIBTMonth,SIBTDayOfMonth,SIBTDayOfWeek,SIBTHour
0,218,DL,14869,34614,UT,-2,0,-1,223,1590,7,0,0,0,0,1,0,0,2,0,0,0,0,2,1,1,0,1,0,0,3,1,0,3,1,2017-01-01 00:55:00,2017-01-01 06:38:00,2017-01-01 00:53:00,1,1,1,7,6
1,81,F9,13204,31454,FL,-4,0,-1,88,404,2,0,0,0,0,3,0,1,0,0,0,0,1,1,1,0,1,3,0,3,1,0,3,1,0,2017-01-01 05:15:00,2017-01-01 06:43:00,2017-01-01 05:11:00,1,1,1,7,6
2,88,EV,10980,30980,TN,302,1,12,57,106,1,0,0,0,0,3,0,0,0,0,0,1,0,1,0,1,0,4,0,2,0,3,1,0,0,2017-01-01 05:49:00,2017-01-01 06:46:00,2017-01-01 10:51:00,1,1,1,7,6


## Update Temporal Features

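The temporal features in the raw dataset (quarter, month, day of month, day of week) were derived from the scheduled departure (SOBT). Because this project is concerned with arrivals, we recompute them from the scheduled arrival time (SIBT) and additionally extract the hour.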

In [21]:
# derive the calendar features from SIBT (the raw ones were based on SOBT)
sibt = segment['SIBT'].dt
temporal_features = [
    ('SIBTQuarter', sibt.quarter),
    ('SIBTMonth', sibt.month),
    ('SIBTDayOfMonth', sibt.day),
    ('SIBTDayOfWeek', sibt.weekday + 1),   # shift Monday=0..Sunday=6 to 1..7
    ('SIBTHour', sibt.hour),
]
for feature_name, feature_values in temporal_features:
    segment[feature_name] = feature_values

In [22]:
# list every column with its non-null count and dtype
segment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744965 entries, 0 to 744964
Data columns (total 43 columns):
AOBTtoAIBT            744965 non-null int32
UniqueCarrierCode     744965 non-null object
OriginAirportID       744965 non-null int64
OriginCityMarketID    744965 non-null int64
OriginState           744965 non-null object
OBTDelay              744965 non-null int32
OBTDel15              744965 non-null int64
OBTDelayGroups        744965 non-null int32
SOBTtoSIBT            744965 non-null int32
Distance              744965 non-null int32
DistanceGroup         744965 non-null int32
Num_Arr_SIBT-30       744965 non-null int64
Num_Arr_SIBT-25       744965 non-null int64
Num_Arr_SIBT-20       744965 non-null int64
Num_Arr_SIBT-15       744965 non-null int64
Num_Arr_SIBT-10       744965 non-null int64
Num_Arr_SIBT-5        744965 non-null int64
Num_Arr_SIBT-0        744965 non-null int64
Num_Arr_SIBT+5        744965 non-null int64
Num_Arr_SIBT+10       744965 non-null int64
Num_Arr

## Remove Datetime Data

In [24]:
# the raw datetime columns are no longer needed once the SIBT features exist
datetime_columns = ['SOBT', 'SIBT', 'AOBT']
segment = segment.drop(columns=datetime_columns)

# Save the Transformed Data

In [35]:
# write the pre-processed flights to the interim data folder.
# Forward slashes are used (as for the raw-data path) so the path works on
# every operating system, not only on Windows.
dataset_path = '../data/interim/'
dataset_name = 'train_val_data.csv'

# mode='w' creates the file or overwrites an existing one, so no empty
# placeholder file has to be created beforehand
segment.to_csv(dataset_path + dataset_name, mode='w', header=True, index=False)