# Data Preprocessing

In [21]:
# import libraries
import numpy as np
import pandas as pd
from modules import missing_describe

## Read Data

In [22]:
# Load the input files.
# Header-only read of the test file (nrows=0): collects the feature names that
# are present in the test data without loading any rows.
test_features = (
    pd.read_csv('data/flights_test_raw.csv', nrows=0, index_col=0)
    .columns
    .to_list()
)
# Optional (left disabled): append 'arr_delay' to test_features and select
# df[test_features] below to restrict the raw frame to those columns.

# Raw flight sample; the first CSV column becomes the index.
df = pd.read_csv('data/flights_raw_15k.csv', index_col=0)
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-01-01,UA,UA_CODESHARE,UA,6246,YV,N89308,6246,12992,LIT,...,374.0,,,,,,,,,
1,2018-01-01,UA,UA_CODESHARE,UA,6183,YV,N88331,6183,13158,MAF,...,429.0,,,,,,,,,
2,2018-01-01,UA,UA_CODESHARE,UA,6034,YV,N519LR,6034,13198,MCI,...,926.0,,,,,,,,,
3,2018-01-01,UA,UA_CODESHARE,UA,6153,YV,N511MJ,6153,13198,MCI,...,926.0,,,,,,,,,
4,2018-01-01,UA,UA_CODESHARE,UA,6016,YV,N86336,6016,13198,MCI,...,643.0,,,,,,,,,


In [23]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(15477, 42)

## Data Cleaning

In [24]:
# Remove exact duplicate rows, if any.
df.drop_duplicates(inplace=True)

# Build the per-column missing-value summary with the project helper and
# display only the columns that actually contain missing entries.
missing = missing_describe.md(df)
has_missing = missing['missing_count'].gt(0)
missing.loc[has_missing]

Unnamed: 0,dtype,missing_count,missing_percent
no_name,float64,15477,1.0
first_dep_time,float64,15367,0.992893
longest_add_gtime,float64,15367,0.992893
total_add_gtime,float64,15367,0.992893
cancellation_code,object,15192,0.981586
late_aircraft_delay,float64,12599,0.814047
security_delay,float64,12599,0.814047
nas_delay,float64,12599,0.814047
weather_delay,float64,12599,0.814047
carrier_delay,float64,12599,0.814047


In [25]:
# Show the row labels (column names of df) of the missing-value summary,
# in the order the helper returns them.
missing_describe.md(df).index

Index(['no_name', 'first_dep_time', 'longest_add_gtime', 'total_add_gtime',
       'cancellation_code', 'late_aircraft_delay', 'security_delay',
       'nas_delay', 'weather_delay', 'carrier_delay', 'actual_elapsed_time',
       'arr_delay', 'air_time', 'wheels_on', 'taxi_in', 'arr_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'dep_time', 'tail_num',
       'diverted', 'flights', 'crs_elapsed_time', 'distance', 'dup', 'fl_date',
       'cancelled', 'mkt_unique_carrier', 'crs_dep_time', 'dest_city_name',
       'dest', 'dest_airport_id', 'origin_city_name', 'origin',
       'origin_airport_id', 'op_carrier_fl_num', 'op_unique_carrier',
       'mkt_carrier_fl_num', 'mkt_carrier', 'branded_code_share',
       'crs_arr_time'],
      dtype='object')

In [26]:
# Remove the column that is entirely NaN.
df.drop(columns=['no_name'], inplace=True)
# Discard rows where 'arr_delay' is missing.
df.dropna(subset=['arr_delay'], inplace=True)

# Replace the remaining NaNs: the delay and ground-time columns listed below
# get 0, while 'first_dep_time' and 'cancellation_code' get a text placeholder.
# NOTE(review): 'first_dep_time' is float64 in the missing-value summary, so the
# text placeholder makes that column mixed-type — confirm this is intended.
zero_filled = [
    'longest_add_gtime',
    'total_add_gtime',
    'late_aircraft_delay',
    'security_delay',
    'nas_delay',
    'weather_delay',
    'carrier_delay',
    'dep_delay',
]
fill_values = {
    'first_dep_time': 'no first departure',
    'cancellation_code': 'not cancelled',
    **dict.fromkeys(zero_filled, 0),
}
df.fillna(fill_values, inplace=True)

In [27]:
# Total number of null cells left in the whole frame (sum over columns, then over the column sums).
df.isnull().sum().sum()

0

## Feature Engineering and Exploratory Data Analysis

Feature engineering will play a crucial role in this problem. We have very few attributes, so we need to create features that carry some predictive power.

- weather: we can use some weather API to look for the weather in time of the scheduled departure and scheduled arrival.
- statistics (avg, mean, median, std, min, max...): we can take a look at previous delays and compute descriptive statistics
- airports encoding: we need to think about what to do with the airports and other categorical variables
- time of the day: the delay probably depends on the airport traffic which varies during the day.
- airport traffic
- unsupervised learning as feature engineering?
- **what are the additional options?**: Think about what we could do more to improve the model.

In [28]:
# Preview the first rows of the frame after the cleaning steps above.
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime
0,2018-01-01,UA,UA_CODESHARE,UA,6246,YV,N89308,6246,12992,LIT,...,1.0,374.0,0.0,0.0,0.0,0.0,0.0,no first departure,0.0,0.0
1,2018-01-01,UA,UA_CODESHARE,UA,6183,YV,N88331,6183,13158,MAF,...,1.0,429.0,0.0,0.0,0.0,0.0,0.0,no first departure,0.0,0.0
2,2018-01-01,UA,UA_CODESHARE,UA,6034,YV,N519LR,6034,13198,MCI,...,1.0,926.0,0.0,0.0,0.0,0.0,0.0,no first departure,0.0,0.0
3,2018-01-01,UA,UA_CODESHARE,UA,6153,YV,N511MJ,6153,13198,MCI,...,1.0,926.0,0.0,0.0,0.0,0.0,0.0,no first departure,0.0,0.0
4,2018-01-01,UA,UA_CODESHARE,UA,6016,YV,N86336,6016,13198,MCI,...,1.0,643.0,0.0,0.0,0.0,0.0,0.0,no first departure,0.0,0.0


### Feature Engineering First Attempt

#### Columns about Carrier/flight_num/flights

In [29]:
# Compare pairs of carrier / flight-number columns row by row and report how
# many rows disagree, to spot columns that are the same or similar.
column_pairs = [
    ('mkt_carrier_fl_num', 'op_carrier_fl_num'),
    ('mkt_unique_carrier', 'mkt_carrier'),
    ('mkt_unique_carrier', 'branded_code_share'),
    ('op_unique_carrier', 'branded_code_share'),
]
for left, right in column_pairs:
    n_different = df[left].ne(df[right]).sum()
    print(f'Number of rows that have different "{left}" and "{right}": ', n_different)

Number of rows that have different "mkt_carrier_fl_num" and "op_carrier_fl_num":  0
Number of rows that have different "mkt_unique_carrier" and "mkt_carrier":  0
Number of rows that have different "mkt_unique_carrier" and "branded_code_share":  5528
Number of rows that have different "op_unique_carrier" and "branded_code_share":  5528


In [30]:
# Inspect the rows whose 'mkt_carrier' differs from 'op_unique_carrier',
# showing only the three carrier-related columns.
carrier_mismatch = df['mkt_carrier'].ne(df['op_unique_carrier'])
df.loc[carrier_mismatch, ['mkt_carrier', 'op_unique_carrier', 'branded_code_share']]

Unnamed: 0,mkt_carrier,op_unique_carrier,branded_code_share
0,UA,YV,UA_CODESHARE
1,UA,YV,UA_CODESHARE
2,UA,YV,UA_CODESHARE
3,UA,YV,UA_CODESHARE
4,UA,YV,UA_CODESHARE
...,...,...,...
15472,AA,YV,AA_CODESHARE
15473,AA,YV,AA_CODESHARE
15474,AA,YV,AA_CODESHARE
15475,AA,YV,AA_CODESHARE


In [31]:
# Frequency of each distinct value in the 'dup' column.
df['dup'].value_counts()

N    15156
Name: dup, dtype: int64

In [32]:
# Frequency of each distinct value in the 'flights' column, to see whether it
# carries any information.
df['flights'].value_counts()

1.0    15156
Name: flights, dtype: int64

In [33]:
# Flag code-share flights (marketing carrier differs from operating carrier)
# before the marketing-carrier column is removed below.
df['share_code'] = df['mkt_unique_carrier'].ne(df['op_unique_carrier']).astype('int')

# Remove columns that repeat information kept elsewhere.
redundant_columns = [
    'mkt_carrier',          # identical to 'mkt_unique_carrier' in every row
    'branded_code_share',   # wherever marketing and operating carrier differ it is a shared code
    'mkt_carrier_fl_num',   # identical to 'op_carrier_fl_num', which is kept
    'origin',               # 'origin_airport_id' is kept instead
    'dest',                 # 'dest_airport_id' is kept instead
    'dup',                  # holds a single value ('N')
    'flights',              # holds a single value (1.0)
    'mkt_unique_carrier',   # summarised by 'share_code'; 'op_unique_carrier' is kept
]
df.drop(columns=redundant_columns, inplace=True)

#### Columns about origin and dest

In [34]:
# Split '<city>, <state>' in 'origin_city_name' and 'dest_city_name' into
# separate city and state columns, then drop the two combined columns.
for endpoint in ('origin', 'dest'):
    # Split on the LAST comma only: this always yields exactly two parts, so a
    # city name that itself contains a comma cannot produce a third column and
    # break the two-column assignment (which split(',', n=2) could).
    parts = df[f'{endpoint}_city_name'].str.rsplit(',', n=1, expand=True)
    # Strip surrounding whitespace; splitting on ',' alone would leave the
    # state with a leading space (' TX' instead of 'TX').
    df[f'{endpoint}_city'] = parts[0].str.strip()
    df[f'{endpoint}_state'] = parts[1].str.strip()
df.drop(columns=['origin_city_name', 'dest_city_name'], inplace=True)

#### Columns about time

In [35]:
# Parse the flight date into a pandas datetime.
df['fl_date'] = pd.to_datetime(df['fl_date'])

# Scheduled departure / arrival times: render each value as text, left-pad
# with zeros to four characters, parse as HHMM and keep only the time of day.
# Values that do not parse become NaT (errors='coerce').
for time_col in ('crs_dep_time', 'crs_arr_time'):
    hhmm_text = df[time_col].astype(str).str.zfill(4)
    df[time_col] = pd.to_datetime(hhmm_text, format='%H%M', errors='coerce').dt.time

# Total null cells after parsing (shows whether any value failed to parse).
df.isnull().sum().sum()

0

In [36]:
# Scheduled arrival date: when the scheduled departure clock time is later
# than the scheduled arrival clock time, the arrival is placed on the day
# after the flight date; otherwise on the flight date itself.
arrives_next_day = df['crs_dep_time'] > df['crs_arr_time']
flight_day = df['fl_date'].dt.date
following_day = (df['fl_date'] + pd.to_timedelta(1, unit="D")).dt.date
df['arr_date'] = pd.to_datetime(np.where(arrives_next_day, following_day, flight_day))

# Merge date and clock time into a single timestamp, for departure and arrival.
for prefix, date_col in (('dep', 'fl_date'), ('arr', 'arr_date')):
    stamp_text = df[date_col].astype(str) + ' ' + df[f'crs_{prefix}_time'].astype(str)
    df[f'{prefix}_datetime'] = pd.to_datetime(stamp_text)

# Calendar features taken from the flight date.
df['fl_month'] = df['fl_date'].dt.month
df['fl_weekday'] = df['fl_date'].dt.dayofweek
# Months 12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4.
df['season'] = df['fl_month'] % 12 // 3 + 1

# Number of flights in this data per (flight date, origin airport), attached
# to every row as 'day_num_of_flights'.
group_keys = ['fl_date', 'origin_airport_id']
num_of_flights = (
    df.groupby(group_keys, as_index=False)[['op_carrier_fl_num']]
    .count()
    .rename(columns={'op_carrier_fl_num': 'day_num_of_flights'})
)
df = pd.merge(df, num_of_flights, on=group_keys, how='left')
df.head()

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,...,origin_state,dest_city,dest_state,arr_date,dep_datetime,arr_datetime,fl_month,fl_weekday,season,day_num_of_flights
0,2018-01-01,YV,N89308,6246,12992,12266,18:55:00,1849.0,-6.0,12.0,...,AR,Houston,TX,2018-01-01,2018-01-01 18:55:00,2018-01-01 20:30:00,1,0,1,1
1,2018-01-01,YV,N88331,6183,13158,12266,09:45:00,1013.0,28.0,9.0,...,TX,Houston,TX,2018-01-01,2018-01-01 09:45:00,2018-01-01 11:15:00,1,0,1,1
2,2018-01-01,YV,N519LR,6034,13198,12264,12:20:00,1217.0,-3.0,11.0,...,MO,Washington,DC,2018-01-01,2018-01-01 12:20:00,2018-01-01 15:36:00,1,0,1,6
3,2018-01-01,YV,N511MJ,6153,13198,12264,08:20:00,813.0,-7.0,21.0,...,MO,Washington,DC,2018-01-01,2018-01-01 08:20:00,2018-01-01 11:35:00,1,0,1,6
4,2018-01-01,YV,N86336,6016,13198,12266,11:05:00,1056.0,-9.0,12.0,...,MO,Houston,TX,2018-01-01,2018-01-01 11:05:00,2018-01-01 13:21:00,1,0,1,6


In [37]:
# Scheduled departure / arrival expressed as minutes since midnight.
df['dep_min_of_day'] = df['dep_datetime'].dt.hour * 60 + df['dep_datetime'].dt.minute
df['arr_min_of_day'] = df['arr_datetime'].dt.hour * 60 + df['arr_datetime'].dt.minute

# Scheduled departure / arrival hour of the day.
df['dep_hr'] = df['dep_datetime'].dt.hour
df['arr_hr'] = df['arr_datetime'].dt.hour

# Circular (sin/cos) encoding: each entry is (column prefix, values, period).
# NOTE(review): 'fl_weekday' comes from dt.dayofweek, which is already
# zero-based, so the "- 1" only shifts the phase of the encoding — confirm
# this offset is intended.
cyclic_specs = [
    ('dep_min', df['dep_min_of_day'], 1440),
    ('dep_hr', df['dep_hr'], 24),
    ('arr_hr', df['arr_hr'], 24),
    ('fl_mnth', df['fl_month'] - 1, 12),
    ('fl_wkday', df['fl_weekday'] - 1, 7),
]
for prefix, values, period in cyclic_specs:
    angle = values * (2. * np.pi / period)
    df[f'{prefix}_sin'] = np.sin(angle)
    df[f'{prefix}_cos'] = np.cos(angle)

In [38]:
# Airport traffic around each flight's scheduled departure.
# For every flight, count the flights in this data that are scheduled to depart
# from, or arrive at, its origin airport within 3 hours before or after its own
# scheduled departure (a 6-hour window, both ends inclusive). The flight itself
# is included in the departure count.

# Built once instead of once per row; pd.Timedelta(hours=...) also avoids the
# deprecated "H" unit alias.
HALF_WINDOW = pd.Timedelta(hours=3)

# Group the scheduled times by airport once, so each row only scans the flights
# of its own airport instead of filtering the whole frame.
dep_times_by_airport = dict(iter(df.groupby('origin_airport_id')['dep_datetime']))
arr_times_by_airport = dict(iter(df.groupby('dest_airport_id')['arr_datetime']))


def _count_in_window(times_by_airport, airport_id, center):
    """Count the times recorded for `airport_id` that lie within HALF_WINDOW of `center`.

    Returns 0 when the airport has no entry in `times_by_airport`.
    """
    times = times_by_airport.get(airport_id)
    if times is None:
        return 0
    return int(times.between(center - HALF_WINDOW, center + HALF_WINDOW).sum())


num_dep = df.apply(lambda x: _count_in_window(dep_times_by_airport, x['origin_airport_id'], x['dep_datetime']), axis=1)
num_arr = df.apply(lambda x: _count_in_window(arr_times_by_airport, x['origin_airport_id'], x['dep_datetime']), axis=1)
df['num_flights_6hr'] = num_dep + num_arr

df.head()

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,...,dep_min_cos,dep_hr_sin,dep_hr_cos,arr_hr_sin,arr_hr_cos,fl_mnth_sin,fl_mnth_cos,fl_wkday_sin,fl_wkday_cos,num_flights_6hr
0,2018-01-01,YV,N89308,6246,12992,12266,18:55:00,1849.0,-6.0,12.0,...,0.237686,-1.0,-1.83697e-16,-0.866025,0.5,0.0,1.0,-0.781831,0.62349,1
1,2018-01-01,YV,N88331,6183,13158,12266,09:45:00,1013.0,28.0,9.0,...,-0.83147,0.7071068,-0.7071068,0.258819,-0.965926,0.0,1.0,-0.781831,0.62349,1
2,2018-01-01,YV,N519LR,6034,13198,12264,12:20:00,1217.0,-3.0,11.0,...,-0.996195,1.224647e-16,-1.0,-0.707107,-0.707107,0.0,1.0,-0.781831,0.62349,3
3,2018-01-01,YV,N511MJ,6153,13198,12264,08:20:00,813.0,-7.0,21.0,...,-0.573576,0.8660254,-0.5,0.258819,-0.965926,0.0,1.0,-0.781831,0.62349,4
4,2018-01-01,YV,N86336,6016,13198,12266,11:05:00,1056.0,-9.0,12.0,...,-0.971342,0.258819,-0.9659258,-0.258819,-0.965926,0.0,1.0,-0.781831,0.62349,4


In [39]:
# Inbound aircraft: for every flight, count the flights in this data flown by
# the same aircraft ('tail_num') whose scheduled arrival falls within the
# 2 hours up to and including this flight's scheduled departure.

# Built once instead of once per row; pd.Timedelta(hours=...) also avoids the
# deprecated 'H' unit alias.
INBOUND_WINDOW = pd.Timedelta(hours=2)

# Group scheduled arrivals by aircraft once, so each row only scans the flights
# of its own aircraft instead of filtering the whole frame.
arrivals_by_tail = dict(iter(df.groupby('tail_num')['arr_datetime']))


def _count_inbound(row):
    """Count same-aircraft scheduled arrivals in the window before `row`'s departure.

    Returns 0 when the row's tail number has no entry in `arrivals_by_tail`.
    """
    arrivals = arrivals_by_tail.get(row['tail_num'])
    if arrivals is None:
        return 0
    window_start = row['dep_datetime'] - INBOUND_WINDOW
    return int(arrivals.between(window_start, row['dep_datetime']).sum())


df['inbound_fl_num'] = df.apply(_count_inbound, axis=1)

# Binary flag: 1 when at least one such inbound flight exists, else 0.
df['inbound_fl'] = (df['inbound_fl_num'] > 0).astype(int)
df['inbound_fl_num'].value_counts()

0    12532
1     2593
2       31
Name: inbound_fl_num, dtype: int64

In [40]:
# Dimensions of the frame after feature engineering, as (rows, columns).
df.shape

(15156, 60)

#### Write in csv files

In [41]:
# Write the preprocessed frame to CSV; no index argument is passed, so the
# pandas default of also writing the DataFrame index applies.
df.to_csv('data/flights_preprocessed.csv')