# Introduction

Baseline experiment:
- feature engineering for training set

# Set up Environment

In [148]:
import pandas as pd
import warnings

# show every column when a dataframe is displayed (no truncation)
pd.set_option('display.max_columns', None)

# silence all warnings for the rest of the session
warnings.filterwarnings(action='ignore')

# fixed seed so that random sampling is reproducible
RANDOM_SEED = 42

# Load Data

In [158]:
# Toggle between the small pre-processed sample (fast iteration on feature
# engineering) and the full raw ATL training data.
SMALL_DATA_MODE = True

# Timestamp columns of the pre-processed sample. CSV stores them as text, so
# they are parsed on load; otherwise they would be plain strings here while the
# pre-processing cells produce datetime columns in full-data mode.
TIMESTAMP_COLUMNS = ['SOBT', 'SIBT', 'AOBT', 'ATOT', 'ALDT', 'AIBT']

if SMALL_DATA_MODE:
    # already pre-processed: loaded straight into `flights`
    DATA_FILE = '../data/interim/small_train_data.csv'
    flights = pd.read_csv(DATA_FILE, parse_dates=TIMESTAMP_COLUMNS)
else:
    # raw data: loaded into `df`, which the pre-processing cells consume
    # NOTE(review): `df` is only defined in this branch, so the pre-processing
    # cells cannot run when SMALL_DATA_MODE is True
    DATA_FILE = '../data/raw/training_data/training_data_ATL.zip'
    df = pd.read_csv(DATA_FILE)

# Pre-process

## Remove Irrelevant Columns

In [4]:
# drop, in place, every column whose label contains "unnamed" (any letter case)
is_unnamed = df.columns.str.contains('unnamed', case=False)
df.drop(df.columns[is_unnamed], axis=1, inplace=True)

In [6]:
# preview the first three rows of df
df.head(3)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,ArrivalDate,ARR_Flight_SLDT,Num_Arr_SLDT-30,Num_Arr_SLDT-25,Num_Arr_SLDT-20,Num_Arr_SLDT-15,Num_Arr_SLDT-10,Num_Arr_SLDT-5,Num_Arr_SLDT-0,Num_Arr_SLDT+5,Num_Arr_SLDT+10,Num_Arr_SLDT+15,Num_Arr_SLDT+20,Num_Arr_SLDT+25,Num_Dep_SLDT-30,Num_Dep_SLDT-25,Num_Dep_SLDT-20,Num_Dep_SLDT-15,Num_Dep_SLDT-10,Num_Dep_SLDT-5,Num_Dep_SLDT-0,Num_Dep_SLDT+5,Num_Dep_SLDT+10,Num_Dep_SLDT+15,Num_Dep_SLDT+20,Num_Dep_SLDT+25
0,2017,1,1,1,7,2017-01-01 00:00:00,AA,19805,AA,N869AA,232,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,10397,1039705,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,14:25:00,1430.0,5.0,5.0,0.0,0.0,1400-1459,12.0,1442.0,1717.0,10.0,17:26:00,1727.0,1.0,1.0,0.0,0.0,1700-1759,121.0,117.0,95.0,1.0,731.0,3,,,,,,2017-01-01 00:00:00,2017-01-01 17:26:00,9,9,12,3,7,0,2,3,2,5,1,3,3,2,0,3,7,0,3,2,1,9,3,5
1,2017,1,1,2,1,2017-01-02 00:00:00,AA,19805,AA,N866AA,232,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,10397,1039705,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,14:25:00,1420.0,-5.0,0.0,0.0,-1.0,1400-1459,13.0,1433.0,1723.0,11.0,17:26:00,1734.0,8.0,8.0,0.0,0.0,1700-1759,121.0,134.0,110.0,1.0,731.0,3,,,,,,2017-01-02 00:00:00,2017-01-02 17:26:00,6,8,9,4,10,0,2,1,5,4,2,2,6,3,1,4,8,0,3,2,4,11,6,3
2,2017,1,1,3,2,2017-01-03 00:00:00,AA,19805,AA,N897AA,232,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,10397,1039705,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,14:25:00,1426.0,1.0,1.0,0.0,0.0,1400-1459,14.0,1440.0,1718.0,5.0,17:26:00,1723.0,-3.0,0.0,0.0,-1.0,1700-1759,121.0,117.0,98.0,1.0,731.0,3,,,,,,2017-01-03 00:00:00,2017-01-03 17:26:00,6,10,9,3,8,0,2,1,4,4,4,2,5,2,0,4,5,0,3,2,4,14,7,4


In [5]:
# (number of rows, number of columns) of df
df.shape

(744979, 84)

Airport traffic data are stored in the last 24 columns of df. We want to select only the first 60 columns for EDA.

In [7]:
# keep the 60 leading columns of df as an independent frame
leading_columns = df.columns[:60]
arrivals = df[leading_columns].copy()

In [6]:
# summary of arrivals: column names, non-null counts and dtypes
arrivals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744979 entries, 0 to 744978
Data columns (total 60 columns):
Year                               744979 non-null int64
Quarter                            744979 non-null int64
Month                              744979 non-null int64
DayofMonth                         744979 non-null int64
DayOfWeek                          744979 non-null int64
FlightDate                         744979 non-null object
Reporting_Airline                  744979 non-null object
DOT_ID_Reporting_Airline           744979 non-null int64
IATA_CODE_Reporting_Airline        744979 non-null object
Tail_Number                        744979 non-null object
Flight_Number_Reporting_Airline    744979 non-null int64
OriginAirportID                    744979 non-null int64
OriginAirportSeqID                 744979 non-null int64
OriginCityMarketID                 744979 non-null int64
Origin                             744979 non-null object
OriginCityName               

In [114]:
# obtain a segment of arrivals (columns of interest only)
columns_of_interest = [
    # dates, carrier and origin
    'FlightDate', 'ArrivalDate', 'Reporting_Airline', 'OriginAirportID', 'Origin', 'OriginState',
    # departure side
    'CRSDepTime', 'DepTime', 'DepDelay', 'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk',
    'TaxiOut', 'WheelsOff',
    # arrival side
    'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDel15', 'ArrivalDelayGroups',
    'ArrTimeBlk',
    # durations and distance
    'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Distance', 'DistanceGroup',
    # delay causes
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
]
segment = arrivals.loc[:, columns_of_interest]

In [115]:
# rename some columns; every column not listed here keeps its name
new_column_names = {
    # dates and carrier
    'FlightDate': 'SOBTDate',
    'ArrivalDate': 'SIBTDate',
    'Reporting_Airline': 'UniqueCarrierCode',
    # departure side
    'CRSDepTime': 'SOBTTime',
    'DepTime': 'AOBTTime',
    'DepDelay': 'OBTDelay',
    'DepDel15': 'OBTDel15',
    'DepartureDelayGroups': 'OBTDelayGroups',
    'DepTimeBlk': 'OBTTimeBlk',
    'TaxiOut': 'AXOT',
    'WheelsOff': 'ATOTTime',
    # arrival side
    'WheelsOn': 'ALDTTime',
    'TaxiIn': 'AXIT',
    'CRSArrTime': 'SIBTTime',
    'ArrTime': 'AIBTTime',
    'ArrDelay': 'IBTDelay',
    'ArrDel15': 'IBTDel15',
    'ArrivalDelayGroups': 'IBTDelayGroups',
    'ArrTimeBlk': 'IBTTimeBlk',
    # durations
    'CRSElapsedTime': 'SOBTtoSIBT',
    'ActualElapsedTime': 'AOBTtoAIBT',
    'AirTime': 'AirborneTime',
}
segment = segment.rename(columns=new_column_names)

## Format Datetime Objects

In [116]:
# scheduled dates -> datetime.date objects
for scheduled_date in ('SOBTDate', 'SIBTDate'):
    segment[scheduled_date] = pd.to_datetime(segment[scheduled_date]).dt.date

# scheduled times ('HH:MM:SS' text) -> datetime.time objects
for scheduled_time in ('SOBTTime', 'SIBTTime'):
    parsed = pd.to_datetime(segment[scheduled_time], format='%H:%M:%S')
    segment[scheduled_time] = parsed.dt.time

In [117]:
timing_list = ['AOBTTime', 'ATOTTime', 'ALDTTime', 'AIBTTime']

# turn each actual timing into a zero-padded 4-character string,
# e.g. 1430.0 -> '1430' and 5.0 -> '0005'
for timing in timing_list:
    whole_numbers = segment[timing].astype('int64')
    segment[timing] = whole_numbers.map('{:04d}'.format)

# note on these actual timings: the value '2400' appears when the event falls
# exactly on the change of day,
# e.g. 2400 on 1 Jan is actually equivalent to 0000 on 2 Jan

In [118]:
# report how many rows carry the '2400' marker in each actual-time column
for actual_time in ('AOBTTime', 'ATOTTime', 'ALDTTime', 'AIBTTime'):
    rows_at_2400 = segment[segment[actual_time] == '2400']
    print('Number of ' + actual_time + ' being "2400":', len(rows_at_2400))

Number of AOBTTime being "2400": 60
Number of ATOTTime being "2400": 84
Number of ALDTTime being "2400": 96
Number of AIBTTime being "2400": 132


In [119]:
one_day = pd.Timedelta('1 days')

# Each actual event gets its own date column, seeded from a scheduled date:
# AOBT and ATOT start from SOBTDate, ALDT and AIBT start from SIBTDate.
# NOTE(review): only the exact '2400' marker is rolled over to the next day;
# an actual time that lands on a different calendar day than the scheduled one
# for any other reason keeps the scheduled date -- confirm this is intended.
seed_date_of = [('AOBT', 'SOBTDate'),
                ('ATOT', 'SOBTDate'),
                ('ALDT', 'SIBTDate'),
                ('AIBT', 'SIBTDate')]

for event, seed_date in seed_date_of:
    date_col = event + 'Date'
    time_col = event + 'Time'

    # initialise the actual date from the scheduled one
    segment[date_col] = segment[seed_date]

    # rows where the time is '2400': move the date one day forward ...
    at_2400 = segment[time_col] == '2400'
    segment.loc[at_2400, date_col] = segment[date_col] + one_day

    # ... and rewrite the time from '2400' to '0000'
    segment.loc[at_2400, time_col] = '0000'

In [120]:
# with '2400' out of the way, every 'HHMM' string is a valid time:
# convert the actual timings to datetime.time objects
for actual_time in timing_list:
    parsed = pd.to_datetime(segment[actual_time], format='%H%M')
    segment[actual_time] = parsed.dt.time

In [121]:
# merge each date/time pair into a single datetime column and keep only that
list_1 = ['SOBTDate', 'SIBTDate', 'AOBTDate',
          'ATOTDate', 'ALDTDate', 'AIBTDate']
list_2 = ['SOBTTime', 'SIBTTime', 'AOBTTime',
          'ATOTTime', 'ALDTTime', 'AIBTTime']
list_3 = ['SOBT', 'SIBT', 'AOBT', 'ATOT', 'ALDT', 'AIBT']

for date_col, time_col, stamp_col in zip(list_1, list_2, list_3):
    # '<date> <time>' text is parsed back into a timestamp
    date_text = segment[date_col].astype(str)
    time_text = segment[time_col].astype(str)
    segment[stamp_col] = pd.to_datetime(date_text + ' ' + time_text)

# the separate date and time columns are no longer needed
segment = segment.drop(labels=list_1 + list_2, axis=1)

In [78]:
# preview the first three rows of segment after the date/time merge
segment.head(3)

Unnamed: 0,UniqueCarrierCode,OriginAirportID,Origin,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,OBTTimeBlk,AXOT,AXIT,IBTDelay,IBTDel15,IBTDelayGroups,IBTTimeBlk,SOBTtoSIBT,AOBTtoAIBT,AirborneTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,SOBT,SIBT,AOBT,ATOT,ALDT,AIBT
0,AA,11298,DFW,TX,5.0,0.0,0.0,1400-1459,12.0,10.0,1.0,0.0,0.0,1700-1759,121.0,117.0,95.0,731.0,3,,,,,,2017-01-01 14:25:00,2017-01-01 17:26:00,2017-01-01 14:30:00,2017-01-01 14:42:00,2017-01-01 17:17:00,2017-01-01 17:27:00
1,AA,11298,DFW,TX,-5.0,0.0,-1.0,1400-1459,13.0,11.0,8.0,0.0,0.0,1700-1759,121.0,134.0,110.0,731.0,3,,,,,,2017-01-02 14:25:00,2017-01-02 17:26:00,2017-01-02 14:20:00,2017-01-02 14:33:00,2017-01-02 17:23:00,2017-01-02 17:34:00
2,AA,11298,DFW,TX,1.0,0.0,0.0,1400-1459,14.0,5.0,-3.0,0.0,-1.0,1700-1759,121.0,117.0,98.0,731.0,3,,,,,,2017-01-03 14:25:00,2017-01-03 17:26:00,2017-01-03 14:26:00,2017-01-03 14:40:00,2017-01-03 17:18:00,2017-01-03 17:23:00


## Create Time-related Features

In [122]:
# calendar features are derived from SIBT (previously, SOBT)
calendar_features = [('Year', 'year'),
                     ('Quarter', 'quarter'),
                     ('Month', 'month'),
                     ('DayOfMonth', 'day')]
for feature, attribute in calendar_features:
    segment[feature] = getattr(segment['SIBT'].dt, attribute)

# .dt.dayofweek counts from Monday=0, so shift to Monday=1 ... Sunday=7
segment['DayOfWeek'] = segment['SIBT'].dt.dayofweek + 1

# OBTTimeBlk / IBTTimeBlk become the hour of SOBT / SIBT respectively
for block, stamp in (('OBTTimeBlk', 'SOBT'), ('IBTTimeBlk', 'SIBT')):
    segment[block] = segment[stamp].dt.hour

In [123]:
# there are 14 flights (from EDA) scheduled to arrive at ATL in 2019
# remove them as only arrival flights scheduled in 2017 and 2018 are of concern
# .copy() makes the filtered frame independent of the unfiltered one, so the
# .loc / column assignments done on segment afterwards are not chained
# assignments on a slice (SettingWithCopyWarning)
segment = segment[segment['Year'] != 2019].copy()

# obtain a copy of segment for experiments
flights = segment.copy()

## Impute Missing Values

In [125]:
# list the columns of segment that hold at least one missing value
null_col = [name for name in segment.columns if segment[name].isna().any()]
print(null_col)

['OBTDelay', 'OBTDel15', 'OBTDelayGroups', 'IBTDelay', 'IBTDel15', 'IBTDelayGroups', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']


In [128]:
# number of missing values in each of the columns listed in null_col
missing_per_column = segment.isna().sum()
missing_per_column.loc[null_col]

OBTDelay                460
OBTDel15                460
OBTDelayGroups          460
IBTDelay                326
IBTDel15                326
IBTDelayGroups          326
CarrierDelay         634252
WeatherDelay         634252
NASDelay             634252
SecurityDelay        634252
LateAircraftDelay    634252
dtype: int64

In [130]:
# handle missing OBTDelay: use the gap between AOBT and SOBT, in minutes.
# .dt.total_seconds() / 60 yields float minutes on every pandas version;
# .astype('timedelta64[m]') raises on pandas >= 2.0, which only supports the
# 's', 'ms', 'us' and 'ns' timedelta resolutions.
segment.loc[segment.OBTDelay.isnull(), "OBTDelay"] = (segment.AOBT - segment.SOBT).dt.total_seconds() / 60

# handle missing OBTDel15
def delayed_more_than_15(x):
    """Flag a delay of 15 or more.

    Returns 1 when ``x >= 15`` holds and 0 otherwise; a value for which the
    comparison is false (NaN included) therefore gives 0.
    """
    return 1 if x >= 15 else 0
# recompute the flag for every row from OBTDelay
segment['OBTDel15'] = segment['OBTDelay'].apply(delayed_more_than_15)

# for all 460 entries, the OBTDelay is 0. Proved by:
# segment[segment['OBTDelayGroups'].isnull()]['OBTDelay'].unique()
# Hence, update missing OBTDelayGroups to be 0
group_is_missing = segment['OBTDelayGroups'].isnull()
segment.loc[group_is_missing, 'OBTDelayGroups'] = 0

In [135]:
# handle missing IBTDelay: use the gap between AIBT and SIBT, in minutes.
# .dt.total_seconds() / 60 yields float minutes on every pandas version;
# .astype('timedelta64[m]') raises on pandas >= 2.0, which only supports the
# 's', 'ms', 'us' and 'ns' timedelta resolutions.
segment.loc[segment.IBTDelay.isnull(), "IBTDelay"] = (segment.AIBT - segment.SIBT).dt.total_seconds() / 60

# handle missing IBTDel15 (the flag is recomputed for every row from IBTDelay)
segment['IBTDel15'] = segment['IBTDelay'].apply(lambda x: delayed_more_than_15(x))

# for all 326 entries, the IBTDelay is 0. Proved by:
# segment[segment['IBTDelayGroups'].isnull()]['IBTDelay'].unique()
# Hence, update missing IBTDelayGroups to be 0
segment.loc[segment.IBTDelayGroups.isnull(), "IBTDelayGroups"] = 0

In [141]:
# columns to store as integers
to_int = ['OBTDelay', 'OBTDel15', 'OBTDelayGroups', 'IBTDelay', 'IBTDel15', 'IBTDelayGroups', 'AXOT', 'AXIT',
          'SOBTtoSIBT', 'AOBTtoAIBT', 'AirborneTime', 'Distance', 'DistanceGroup',
          'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

# missing values are replaced by 0 first, then the column is cast to int64
for column in to_int:
    without_gaps = segment[column].fillna(0)
    segment[column] = without_gaps.astype('int64')

In [142]:
# summary of segment after imputation and the integer cast
segment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 744965 entries, 0 to 744978
Data columns (total 35 columns):
UniqueCarrierCode    744965 non-null object
OriginAirportID      744965 non-null int64
Origin               744965 non-null object
OriginState          744965 non-null object
OBTDelay             744965 non-null int64
OBTDel15             744965 non-null int64
OBTDelayGroups       744965 non-null int64
OBTTimeBlk           744965 non-null int64
AXOT                 744965 non-null int64
AXIT                 744965 non-null int64
IBTDelay             744965 non-null int64
IBTDel15             744965 non-null int64
IBTDelayGroups       744965 non-null int64
IBTTimeBlk           744965 non-null int64
SOBTtoSIBT           744965 non-null int64
AOBTtoAIBT           744965 non-null int64
AirborneTime         744965 non-null int64
Distance             744965 non-null int64
DistanceGroup        744965 non-null int64
CarrierDelay         744965 non-null int64
WeatherDelay         74496

In [145]:
# preview the first three rows of the pre-processed segment
segment.head(3)

Unnamed: 0,UniqueCarrierCode,OriginAirportID,Origin,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,OBTTimeBlk,AXOT,AXIT,IBTDelay,IBTDel15,IBTDelayGroups,IBTTimeBlk,SOBTtoSIBT,AOBTtoAIBT,AirborneTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,SOBT,SIBT,AOBT,ATOT,ALDT,AIBT,Year,Quarter,Month,DayOfMonth,DayOfWeek
0,AA,11298,DFW,TX,5,0,0,14,12,10,1,0,0,17,121,117,95,731,3,0,0,0,0,0,2017-01-01 14:25:00,2017-01-01 17:26:00,2017-01-01 14:30:00,2017-01-01 14:42:00,2017-01-01 17:17:00,2017-01-01 17:27:00,2017,1,1,1,7
1,AA,11298,DFW,TX,-5,0,-1,14,13,11,8,0,0,17,121,134,110,731,3,0,0,0,0,0,2017-01-02 14:25:00,2017-01-02 17:26:00,2017-01-02 14:20:00,2017-01-02 14:33:00,2017-01-02 17:23:00,2017-01-02 17:34:00,2017,1,1,2,1
2,AA,11298,DFW,TX,1,0,0,14,14,5,-3,0,-1,17,121,117,98,731,3,0,0,0,0,0,2017-01-03 14:25:00,2017-01-03 17:26:00,2017-01-03 14:26:00,2017-01-03 14:40:00,2017-01-03 17:18:00,2017-01-03 17:23:00,2017,1,1,3,2


Code that generated a small training dataset for faster feature engineering:

    small = segment.sample(n=10000, random_state=RANDOM_SEED)

    # create an empty csv file to be written
    small_dataset_path = 'C:\\Users\\lis\\repos\\flight_duration_prediction\\data\\interim\\'
    small_dataset_name = 'small_train_data.csv'

    with open(small_dataset_path + small_dataset_name, "w") as my_empty_csv:
        pass

    # append the df to csv file
    small.to_csv(small_dataset_path + small_dataset_name, mode='a', header=True)

In [159]:
# in full-data mode, experiments run on a copy of the pre-processed segment
# (in small-data mode flights was already read from the sample file)
if not SMALL_DATA_MODE:
    flights = segment.copy()

# Feature Engineering
Trials done on the small training dataset.

In [165]:
# (number of rows, number of columns) of flights
flights.shape

(10000, 35)

In [163]:
# drop, in place, every column whose label contains "unnamed" (any letter case)
is_unnamed = flights.columns.str.contains('unnamed', case=False)
flights.drop(flights.columns[is_unnamed], axis=1, inplace=True)

In [164]:
# preview the first three rows of flights
flights.head(3)

Unnamed: 0,UniqueCarrierCode,OriginAirportID,Origin,OriginState,OBTDelay,OBTDel15,OBTDelayGroups,OBTTimeBlk,AXOT,AXIT,IBTDelay,IBTDel15,IBTDelayGroups,IBTTimeBlk,SOBTtoSIBT,AOBTtoAIBT,AirborneTime,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,SOBT,SIBT,AOBT,ATOT,ALDT,AIBT,Year,Quarter,Month,DayOfMonth,DayOfWeek
0,WN,12191,HOU,TX,-1,0,-1,8,8,8,-13,0,-1,11,125,113,97,696,3,0,0,0,0,0,2018-09-12 08:05:00,2018-09-12 11:10:00,2018-09-12 08:04:00,2018-09-12 08:12:00,2018-09-12 10:49:00,2018-09-12 10:57:00,2018,3,9,12,3
1,DL,15304,TPA,FL,-7,0,-1,17,8,20,2,0,0,19,89,98,70,406,2,0,0,0,0,0,2018-03-30 17:55:00,2018-03-30 19:24:00,2018-03-30 17:48:00,2018-03-30 17:56:00,2018-03-30 19:06:00,2018-03-30 19:26:00,2018,1,3,30,5
2,DL,14635,RSW,FL,17,1,1,13,13,14,13,0,0,15,107,103,76,515,3,0,0,0,0,0,2018-11-15 13:25:00,2018-11-15 15:12:00,2018-11-15 13:42:00,2018-11-15 13:55:00,2018-11-15 15:11:00,2018-11-15 15:25:00,2018,4,11,15,4
