## Preprocess Flight Delay Data
This notebook only needs to be run once.
It takes our large dataset of ~17 million rows and applies the following steps:
1. Drop diverted and cancelled flights.
2. Balance the data by undersampling the majority class: keep all delayed flights, then randomly sample on-time flights to get a 60/40 on-time/delayed split.
3. Add column `Aircraft_Daily_Flight_Count` (running count of each aircraft's flights within a day).
4. Add column `CRSDepTime` (derived from `DepTime` minus `DepDelay`).
5. Drop columns that are not used in our training set.


In [1]:
import pandas as pd

## Helper functions

In [13]:
def calculate_crs_deptime(deptime, depdelay):
    """Derive the scheduled departure time from the actual departure time.

    Parameters
    ----------
    deptime : int or float
        Actual departure time encoded as HHMM (e.g. 1435 means 14:35).
    depdelay : int or float
        Minutes to subtract from ``deptime``. Negative values move the
        result later than ``deptime``.

    Returns
    -------
    int or float
        ``deptime`` shifted back by ``depdelay`` minutes, encoded as HHMM in
        the range 0-2359. NaN inputs propagate to a NaN result.
    """
    minutes_per_day = 24 * 60
    hours = deptime // 100
    minutes = deptime % 100
    # Wrap around midnight: subtracting the delay can land on the previous day
    # (negative minutes) or, for early departures, on the next day (>= 1440).
    # Python's % always returns a non-negative result for a positive divisor,
    # so a single modulo handles both directions.
    total_minutes = ((hours * 60 + minutes) - depdelay) % minutes_per_day
    crs_hours = total_minutes // 60
    crs_minutes = total_minutes % 60
    return crs_hours * 100 + crs_minutes

## Balancing dataset

In [2]:
# Load the raw flight table; the cells below filter and sample from `df`.
raw_csv_path = '~/Downloads/flight_data.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# 1. Keep only flights that were neither cancelled nor diverted
not_cancelled = df['Cancelled'] == 0
not_diverted = df['Diverted'] == 0
df = df[not_cancelled & not_diverted]

In [9]:
# 2. Balance the data by undersampling the majority class: keep every delayed
#    flight (ArrDel15 == 1), then randomly sample on-time flights (ArrDel15 == 0)
#    so that delayed flights make up DELAYED_SHARE of the result (60/40 split).
DELAYED_SHARE = 0.4  # target fraction of delayed flights in the balanced set

df_delayed = df[df.ArrDel15 == 1]
df_on_time = df[df.ArrDel15 == 0]

print(f"count of delayed flights: {len(df_delayed)}. count of on time flights: {len(df_on_time)}. Delayed percentage: {len(df_delayed)/len(df):.2f}")
on_time_to_sample = int((len(df_delayed) / DELAYED_SHARE) - len(df_delayed))

# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of available rows, so cap the request at the on-time population.
if on_time_to_sample > len(df_on_time):
    print(f"only {len(df_on_time)} on time flights available; using all of them")
    on_time_to_sample = len(df_on_time)

print("count of on time flights to sample: ", on_time_to_sample)
# Fixed random_state keeps the sample reproducible across runs.
df_on_time_sample = df_on_time.sample(n=on_time_to_sample, random_state=1)

flight_df = pd.concat([df_delayed, df_on_time_sample])
print("new balanced dataset: ", len(flight_df))

count of delayed flights: 2819051. count of on time flights: 13990755. Delayed percentage: 0.16
count of on time flights to sample:  4228576
new balanced dataset:  7047627


In [11]:
# 3. Add Column: Aircraft_Daily_Flight_Count
# NOTE(review): this runs on the undersampled frame, so the running count only
# numbers flights that survived sampling, not every flight the aircraft made
# that day -- confirm this is the intended meaning of the feature.

# Order rows by date and departure time so the running count follows departure order.
flight_df = flight_df.sort_values(by=['FlightDate', 'DepTime'])

# Group by 'FlightDate' and 'Tail_Number'; cumcount() numbers rows within each
# group starting at 0, so add 1 to make the count 1-based.
flight_df['Aircraft_Daily_Flight_Count'] = flight_df.groupby(['FlightDate', 'Tail_Number']).cumcount() + 1

In [14]:
# 4. Add Column: CRSDepTime
# Iterate over just the two input columns rather than using a row-wise
# DataFrame.apply(axis=1), which builds a full row Series for every record.
# The list is assigned positionally, so row order is preserved.
flight_df['CRSDepTime'] = [
    calculate_crs_deptime(dep_time, dep_delay)
    for dep_time, dep_delay in zip(flight_df['DepTime'], flight_df['DepDelay'])
]

In [15]:
# 5. Drop columns that are not used in our training set
unused_columns = [
    'Cancelled',
    'Diverted',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
    'Flights',
    'Tail_Number',
    'Flight_Number_Reporting_Airline',
    'Reporting_Airline',
]
flight_df = flight_df.drop(columns=unused_columns)

In [16]:
# Display the column labels currently in flight_df as a sanity check.
flight_df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Origin', 'Dest', 'DepTime', 'DepDelay', 'TaxiOut', 'WheelsOff',
       'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDel15',
       'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Distance', 'Carrier',
       'Full-time', 'Part-time', 'Grand Total', 'Aircraft_Daily_Flight_Count',
       'CRSDepTime'],
      dtype='object')

In [18]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
dest_filepath, output_file = "../data/", "flight_data_large_balanced.csv"
output_path = dest_filepath + output_file

print(f"writing to {output_file}")
flight_df.to_csv(output_path, index=False)

writing to flight_data_large_balanced.csv
