# Introduction

Baseline experiment

# Set up Environment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from math import sqrt
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             explained_variance_score, max_error)

# Notebook-wide configuration.

# Never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Single seed shared by all stochastic steps so results are reproducible.
RANDOM_SEED = 42

# Load Data

In [2]:
# Location of the raw flight records (a zipped CSV, read directly by pandas).
file_path = '../data/raw/test_data/'
file_name = 'test_data_ATL.zip'
df = pd.read_csv(filepath_or_buffer=file_path + file_name)

# Pre-process Data

## Remove Irrelevant Columns

In [3]:
# Preview the first three rows of the raw frame to see which columns it holds.
df.head(3)

Unnamed: 0.1,Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,ArrivalDate,ARR_Flight_SLDT,Num_Arr_SLDT-30,Num_Arr_SLDT-25,Num_Arr_SLDT-20,Num_Arr_SLDT-15,Num_Arr_SLDT-10,Num_Arr_SLDT-5,Num_Arr_SLDT-0,Num_Arr_SLDT+5,Num_Arr_SLDT+10,Num_Arr_SLDT+15,Num_Arr_SLDT+20,Num_Arr_SLDT+25,Num_Dep_SLDT-30,Num_Dep_SLDT-25,Num_Dep_SLDT-20,Num_Dep_SLDT-15,Num_Dep_SLDT-10,Num_Dep_SLDT-5,Num_Dep_SLDT-0,Num_Dep_SLDT+5,Num_Dep_SLDT+10,Num_Dep_SLDT+15,Num_Dep_SLDT+20,Num_Dep_SLDT+25
0,794,2019,1,1,5,6,2019-01-05 00:00:00,OO,20304,OO,N426SW,3500,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,12:40:00,1240.0,0.0,0.0,0.0,0.0,1200-1259,11.0,1251.0,1351.0,5.0,14:16:00,1356.0,-20.0,0.0,0.0,-2.0,1400-1459,96.0,76.0,60.0,1.0,331.0,2,,,,,,2019-01-05 00:00:00,2019-01-05 14:16:00,6,3,3,7,29,0,14,3,0,2,3,3,8,5,7,5,7,0,2,2,4,4,1,3
1,797,2019,1,1,5,6,2019-01-05 00:00:00,OO,20304,OO,N906SW,3502,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,09:58:00,951.0,-7.0,0.0,0.0,-1.0,0900-0959,10.0,1001.0,1058.0,6.0,11:30:00,1104.0,-26.0,0.0,0.0,-2.0,1100-1159,92.0,73.0,57.0,1.0,331.0,2,,,,,,2019-01-05 00:00:00,2019-01-05 11:30:00,7,1,3,2,13,0,1,3,1,1,1,1,7,0,7,3,11,0,4,9,3,4,3,1
2,799,2019,1,1,5,6,2019-01-05 00:00:00,OO,20304,OO,N779CA,3503,11308,1130802,31308,DHN,"Dothan, AL",AL,1,Alabama,51,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,15:00:00,1450.0,-10.0,0.0,0.0,-1.0,1500-1559,11.0,1501.0,1642.0,25.0,17:10:00,1707.0,-3.0,0.0,0.0,-1.0,1700-1759,70.0,77.0,41.0,1.0,170.0,1,,,,,,2019-01-05 00:00:00,2019-01-05 17:10:00,3,7,1,12,23,0,6,1,2,2,5,3,1,1,3,6,14,0,5,2,2,3,1,3


In [4]:
# Keep only the columns of interest from the raw frame.
segment = df.loc[:, [
    'ActualElapsedTime',
    'FlightDate',
    'ArrivalDate',
    'Reporting_Airline',
    'OriginAirportID',
    'OriginCityMarketID',
    'OriginState',
    'CRSDepTime',
    'DepTime',
    'DepDelay',
    'DepDel15',
    'DepartureDelayGroups',
    'CRSArrTime',
    'CRSElapsedTime',
    'Distance',
    'DistanceGroup',
]]

In [5]:
# Map the raw column names onto the SOBT / SIBT / AOBT naming used from here on.
segment = segment.rename(columns=dict(
    ActualElapsedTime='AOBTtoAIBT',
    FlightDate='SOBTDate',
    ArrivalDate='SIBTDate',
    Reporting_Airline='UniqueCarrierCode',
    CRSDepTime='SOBTTime',
    DepTime='AOBTTime',
    DepDelay='OBTDelay',
    DepDel15='OBTDel15',
    DepartureDelayGroups='OBTDelayGroups',
    CRSArrTime='SIBTTime',
    CRSElapsedTime='SOBTtoSIBT',
))

In [6]:
# Take the last 24 columns of the raw frame (the Num_Arr_SLDT* / Num_Dep_SLDT*
# counts shown in the preview of df) and relabel them from SLDT to SIBT.
traffic = df.iloc[:, -24:].rename(
    columns=lambda name: name.replace('SLDT', 'SIBT'))
traffic.head()

Unnamed: 0,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25
0,6,3,3,7,29,0,14,3,0,2,3,3,8,5,7,5,7,0,2,2,4,4,1,3
1,7,1,3,2,13,0,1,3,1,1,1,1,7,0,7,3,11,0,4,9,3,4,3,1
2,3,7,1,12,23,0,6,1,2,2,5,3,1,1,3,6,14,0,5,2,2,3,1,3
3,3,1,4,4,21,0,9,13,1,2,2,1,5,3,2,1,10,0,6,2,5,1,1,1
4,1,7,12,12,39,0,11,18,20,1,1,1,4,6,4,2,4,0,2,9,3,6,5,5


In [7]:
# Append the traffic-count columns to the right of the selected flight columns
# (rows are matched on the shared index).
segment = pd.concat((segment, traffic), axis='columns')
segment.head()

Unnamed: 0,AOBTtoAIBT,SOBTDate,SIBTDate,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,SOBTTime,AOBTTime,OBTDelay,OBTDel15,OBTDelayGroups,SIBTTime,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25
0,76.0,2019-01-05 00:00:00,2019-01-05 00:00:00,OO,11641,31641,NC,12:40:00,1240.0,0.0,0.0,0.0,14:16:00,96.0,331.0,2,6,3,3,7,29,0,14,3,0,2,3,3,8,5,7,5,7,0,2,2,4,4,1,3
1,73.0,2019-01-05 00:00:00,2019-01-05 00:00:00,OO,11641,31641,NC,09:58:00,951.0,-7.0,0.0,-1.0,11:30:00,92.0,331.0,2,7,1,3,2,13,0,1,3,1,1,1,1,7,0,7,3,11,0,4,9,3,4,3,1
2,77.0,2019-01-05 00:00:00,2019-01-05 00:00:00,OO,11308,31308,AL,15:00:00,1450.0,-10.0,0.0,-1.0,17:10:00,70.0,170.0,1,3,7,1,12,23,0,6,1,2,2,5,3,1,1,3,6,14,0,5,2,2,3,1,3
3,54.0,2019-01-05 00:00:00,2019-01-05 00:00:00,OO,11308,31308,AL,10:45:00,1038.0,-7.0,0.0,-1.0,12:58:00,73.0,170.0,1,3,1,4,4,21,0,9,13,1,2,2,1,5,3,2,1,10,0,6,2,5,1,1,1
4,88.0,2019-01-05 00:00:00,2019-01-05 00:00:00,OO,11308,31308,AL,06:45:00,635.0,-10.0,0.0,-1.0,08:51:00,66.0,170.0,1,1,7,12,12,39,0,11,18,20,1,1,1,4,6,4,2,4,0,2,9,3,6,5,5


## Format Datetime Objects

In [8]:
# Scheduled dates: parse, then keep only the calendar date.
for date_col in ('SOBTDate', 'SIBTDate'):
    segment[date_col] = pd.to_datetime(segment[date_col]).dt.date

# Scheduled times: parse the 'HH:MM:SS' text, then keep only the time of day.
for time_col in ('SOBTTime', 'SIBTTime'):
    segment[time_col] = pd.to_datetime(
        segment[time_col], format='%H:%M:%S').dt.time

In [9]:
# Turn the numeric AOBTTime (e.g. 951.0) into zero-padded 'HHMM' text ('0951').
segment['AOBTTime'] = segment['AOBTTime'].map(
    lambda value: str(int(value)).rjust(4, '0'))

# Actual times use '2400' for midnight at the end of the day, i.e. 2400 on
# 1 Jan is the same instant as 0000 on 2 Jan. Count how many rows are affected.
n_midnight = (segment['AOBTTime'] == '2400').sum()
print('Number of AOBTTime being "2400":', n_midnight)

Number of AOBTTime being "2400": 33


In [10]:
one_day = pd.Timedelta('1 days')

# AOBTDate starts out equal to the scheduled date.
segment['AOBTDate'] = segment['SOBTDate']

# Rows whose actual time is the '2400' end-of-day marker.
is_2400 = segment['AOBTTime'] == '2400'

# '2400' belongs to the following day: move the date forward by one day ...
segment.loc[is_2400, 'AOBTDate'] = segment['AOBTDate'] + one_day

# ... and rewrite the time as '0000' of that day.
segment.loc[is_2400, 'AOBTTime'] = '0000'

# Every value is now a valid 'HHMM' string, so it can be parsed into a time.
segment['AOBTTime'] = pd.to_datetime(
    segment['AOBTTime'], format='%H%M').dt.time

In [11]:
# Combine each (date, time) column pair into one datetime column and drop the
# two source columns afterwards.
list_1 = ['SOBTDate', 'SIBTDate', 'AOBTDate']
list_2 = ['SOBTTime', 'SIBTTime', 'AOBTTime']
list_3 = ['SOBT', 'SIBT', 'AOBT']

for date_col, time_col, merged_col in zip(list_1, list_2, list_3):
    date_text = segment[date_col].map(str)
    time_text = segment[time_col].map(str)
    segment[merged_col] = pd.to_datetime(date_text + ' ' + time_text)
    segment = segment.drop(columns=[date_col, time_col])

In [12]:
# Order the flights chronologically by SIBT and renumber the rows from 0.
segment = segment.sort_values(by='SIBT').reset_index(drop=True)

In [13]:
# Discard flights whose SIBT does not fall in calendar year 2019.
sibt_in_2019 = segment['SIBT'].dt.year.eq(2019)
segment = segment.loc[sibt_in_2019].reset_index(drop=True)

In [14]:
# Preview the frame after the date/time columns were merged into SOBT, SIBT and AOBT.
segment.head(3)

Unnamed: 0,AOBTtoAIBT,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25,SOBT,SIBT,AOBT
0,206.0,DL,14869,34614,UT,-5.0,0.0,-1.0,218.0,1590.0,7,0,0,0,0,1,0,0,1,0,0,2,0,0,2,0,1,4,0,2,3,2,0,4,0,2019-01-01 00:59:00,2019-01-01 06:37:00,2019-01-01 00:54:00
1,65.0,DL,10994,30994,SC,2.0,0.0,0.0,74.0,259.0,2,0,0,0,0,2,0,0,0,2,0,0,2,2,0,1,0,7,0,3,2,0,4,0,0,2019-01-01 05:30:00,2019-01-01 06:44:00,2019-01-01 05:32:00
2,216.0,F9,12889,32211,NV,23.0,1.0,1.0,232.0,1747.0,7,0,0,1,1,2,0,1,0,2,2,3,6,0,2,2,3,6,0,4,0,0,0,2,3,2019-01-01 00:05:00,2019-01-01 06:57:00,2019-01-01 00:28:00


## Impute Missing Values

In [15]:
# Summarise missing values: one row per column that contains at least one
# null, with the absolute count and the percentage of rows affected.

# columns that contain null values
null_col = segment.columns[segment.isna().any()].tolist()

# how many values are missing in each of those columns
missing_count = pd.Series(segment[null_col].isna().sum(), name='Count')

# percentage of missing values, rounded to 3 decimals.
# Series.round is used instead of np.round because numpy is not imported in
# this notebook. The float cast keeps the rounding well-defined when no column
# has missing values (the reduction over zero columns may not be numeric).
missing_percentage = pd.Series(
    (segment[null_col].isna().mean().astype(float) * 100).round(3),
    name='Percentage')

missing_var_summary = pd.concat([missing_count, missing_percentage], axis=1)
missing_var_summary

Unnamed: 0,Count,Percentage


In [16]:
# handle missing OBTDelay: where it is null, use the gap between actual and
# scheduled off-block time, expressed in minutes.
# total_seconds() / 60 replaces .astype('timedelta64[m]'), which pandas >= 2.0
# rejects (only s/ms/us/ns resolutions are supported). SOBT and AOBT are built
# from minute-resolution inputs, so the result is a whole number of minutes.
# The right-hand side covers every row; .loc aligns on the index, so only the
# masked rows are written.
missing_obt_delay = segment['OBTDelay'].isnull()
segment.loc[missing_obt_delay, 'OBTDelay'] = (
    (segment['AOBT'] - segment['SOBT']).dt.total_seconds() / 60)


def delayed_more_than_15(x):
    """Return 1 if the delay ``x`` is 15 or more, otherwise 0.

    Used to derive the OBTDel15 flag from OBTDelay. A NaN delay compares
    false and therefore yields 0.
    """
    return 1 if x >= 15 else 0


# Recompute the 15-minute delay flag for every row from OBTDelay.
segment['OBTDel15'] = segment['OBTDelay'].apply(delayed_more_than_15)

# For all 460 entries with a missing OBTDelayGroups, OBTDelay is 0. Checked with:
# segment[segment['OBTDelayGroups'].isnull()]['OBTDelay'].unique()
# Hence the missing OBTDelayGroups values are set to 0.
segment['OBTDelayGroups'] = segment['OBTDelayGroups'].fillna(0)

## Format Int

In [17]:
# Columns stored as float (or mixed) that hold whole numbers: cast them to int.
to_int = ['AOBTtoAIBT', 'OBTDelay', 'OBTDelayGroups',
          'SOBTtoSIBT', 'Distance', 'DistanceGroup']

segment = segment.astype({column: int for column in to_int})

In [18]:
# Preview the frame after the integer conversion.
segment.head(3)

Unnamed: 0,AOBTtoAIBT,UniqueCarrierCode,OriginAirportID,OriginCityMarketID,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,SOBTtoSIBT,Distance,DistanceGroup,Num_Arr_SIBT-30,Num_Arr_SIBT-25,Num_Arr_SIBT-20,Num_Arr_SIBT-15,Num_Arr_SIBT-10,Num_Arr_SIBT-5,Num_Arr_SIBT-0,Num_Arr_SIBT+5,Num_Arr_SIBT+10,Num_Arr_SIBT+15,Num_Arr_SIBT+20,Num_Arr_SIBT+25,Num_Dep_SIBT-30,Num_Dep_SIBT-25,Num_Dep_SIBT-20,Num_Dep_SIBT-15,Num_Dep_SIBT-10,Num_Dep_SIBT-5,Num_Dep_SIBT-0,Num_Dep_SIBT+5,Num_Dep_SIBT+10,Num_Dep_SIBT+15,Num_Dep_SIBT+20,Num_Dep_SIBT+25,SOBT,SIBT,AOBT
0,206,DL,14869,34614,UT,-5,0,-1,218,1590,7,0,0,0,0,1,0,0,1,0,0,2,0,0,2,0,1,4,0,2,3,2,0,4,0,2019-01-01 00:59:00,2019-01-01 06:37:00,2019-01-01 00:54:00
1,65,DL,10994,30994,SC,2,0,0,74,259,2,0,0,0,0,2,0,0,0,2,0,0,2,2,0,1,0,7,0,3,2,0,4,0,0,2019-01-01 05:30:00,2019-01-01 06:44:00,2019-01-01 05:32:00
2,216,F9,12889,32211,NV,23,1,1,232,1747,7,0,0,1,1,2,0,1,0,2,2,3,6,0,2,2,3,6,0,4,0,0,0,2,3,2019-01-01 00:05:00,2019-01-01 06:57:00,2019-01-01 00:28:00


## Update Temporal Features

The temporal features of the dataset were originally derived from SOBT (the scheduled departure time). For this project, they need to be recomputed from SIBT (the scheduled arrival time).

In [19]:
# Derive calendar features from SIBT (they were previously based on SOBT).
sibt_parts = segment['SIBT'].dt

segment['SIBTQuarter'] = sibt_parts.quarter
segment['SIBTMonth'] = sibt_parts.month
segment['SIBTDayOfMonth'] = sibt_parts.day
# pandas numbers weekdays 0 (Monday) to 6 (Sunday); shift to 1..7.
segment['SIBTDayOfWeek'] = sibt_parts.dayofweek + 1
segment['SIBTHour'] = sibt_parts.hour

In [20]:
# Print the column names, non-null counts and dtypes of the processed frame.
segment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391713 entries, 0 to 391712
Data columns (total 43 columns):
AOBTtoAIBT            391713 non-null int32
UniqueCarrierCode     391713 non-null object
OriginAirportID       391713 non-null int64
OriginCityMarketID    391713 non-null int64
OriginState           391713 non-null object
OBTDelay              391713 non-null int32
OBTDel15              391713 non-null int64
OBTDelayGroups        391713 non-null int32
SOBTtoSIBT            391713 non-null int32
Distance              391713 non-null int32
DistanceGroup         391713 non-null int32
Num_Arr_SIBT-30       391713 non-null int64
Num_Arr_SIBT-25       391713 non-null int64
Num_Arr_SIBT-20       391713 non-null int64
Num_Arr_SIBT-15       391713 non-null int64
Num_Arr_SIBT-10       391713 non-null int64
Num_Arr_SIBT-5        391713 non-null int64
Num_Arr_SIBT-0        391713 non-null int64
Num_Arr_SIBT+5        391713 non-null int64
Num_Arr_SIBT+10       391713 non-null int64
Num_Arr

## Remove Datetime Data

In [21]:
# The raw timestamps are no longer needed once the SIBT-based calendar
# features exist, so remove them.
segment = segment.drop(columns=['SOBT', 'SIBT', 'AOBT'])

# Sample 10,000 test samples

In [22]:
# Draw 10,000 rows at random; the fixed RANDOM_SEED makes the draw repeatable.
sample = segment.sample(n=10000, random_state=RANDOM_SEED)

# Save the Transformed Data

In [23]:
# Write the sampled rows to the interim data folder.
# Forward slashes are used (as for the input path) so the path resolves on
# Windows and on POSIX systems alike; a backslash-separated path would be
# treated as a single file name outside Windows.
dataset_path = '../data/interim/'
dataset_name = 'test_data.csv'

# to_csv(mode='w') creates or truncates the target file itself, so no empty
# placeholder file needs to be created first.
sample.to_csv(dataset_path + dataset_name, mode='w', header=True, index=False)