In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import category_encoders as ce

## Helper Functions

In [2]:
def binary_encode(df, columns):
    encoder = ce.BinaryEncoder(cols=columns)

    # Fit and transform to produce binary encoded data
    df_encoded = encoder.fit_transform(df[columns])

    # Merge the encoded data back with the original DataFrame
    df = df.drop(columns, axis=1)
    df = pd.concat([df, df_encoded], axis=1)
    return df

In [3]:
def calculate_crs_deptime(deptime, depdelay):
    hours = deptime // 100
    minutes = deptime % 100
    total_minutes = (hours * 60 + minutes) - depdelay
    crs_hours = abs(total_minutes) // 60
    crs_minutes = total_minutes % 60
    if total_minutes < 0:
        return total_minutes
    return crs_hours * 100 + crs_minutes

In [4]:
def calculate_model_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] 

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    f1 = f1_score(y_test, y_pred)
    print("F1 Score:", f1)
    
    auc = roc_auc_score(y_test, y_pred_proba)
    print("AUC Score:", auc)

In [5]:
def split_X_y_and_2023(df: pd.DataFrame):
    flight_df_2023 = df[df['Year'] == 2023]
    flight_df = df[df['Year'] != 2023]

    X = flight_df.drop('ArrDel15', axis=1)
    y = flight_df['ArrDel15']

    X_test_2023 = flight_df_2023.drop('ArrDel15', axis=1)
    y_test_2023 = flight_df_2023['ArrDel15']

    return X, y, X_test_2023, y_test_2023

In [6]:
def filter_df_for_feature_and_encode(df, features, encoded_columns):
    flight_df = df[features]
    flight_df = flight_df.dropna()
    
    # Encoding combined categorical variables
    flight_df = binary_encode(flight_df, encoded_columns)
    flight_df = pd.get_dummies(flight_df, columns=['Carrier'])
    return flight_df

In [26]:
file_name = 'flight_data_large_balanced.csv'
file_path = f"../data/{file_name}"
df = pd.read_csv(file_path)

In [27]:
file_name = 'flight_test_data.csv'
file_path = f"../data/{file_name}"
df_2023 = pd.read_csv(file_path)

# Preprocess
# Add Column: aircraft_daily_flight_count
df_2023['Aircraft_Daily_Flight_Count'] = None 

df_2023.sort_values(by=['FlightDate', 'DepTime'], inplace=True)

# Group by 'flightdate' and 'Tail_Number' and use cumcount() to get a count within each group
df_2023['Aircraft_Daily_Flight_Count'] = df_2023.groupby(['FlightDate', 'Tail_Number']).cumcount() + 1

In [28]:
# Combine to get same encodings for training
df = pd.concat([df, df_2023])

## Model All Data feature set 1

In [10]:
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
        'Dest', 'Origin', 'Distance', 'Carrier',
        'Full-time', 'Part-time', 'Grand Total', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime', 'Aircraft_Daily_Flight_Count', 'ArrDel15']

In [11]:
flight_df = filter_df_for_feature_and_encode(df, features, ['Dest', 'Origin'])
X, y, X_test_2023, y_test_2023 = split_X_y_and_2023(flight_df)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [95]:
model_1 = RandomForestClassifier(random_state=42)
model_1.fit(X_train, y_train)

: 

: 

In [None]:
calculate_model_metrics(model_1, X_test, y_test)

Accuracy: 0.6915576117022059
F1 Score: 0.5830619205703418
AUC Score: 0.7371557314714821


In [None]:
calculate_model_metrics(model_1, X_test_2023, y_test_2023)

Accuracy: 0.5549857142857143
F1 Score: 0.3117778317831341
AUC Score: 0.634357784489796


Random forest using feature set 1 features performed at 86.1094% accuracy

## Model All Data feature set 1 & 2

In [29]:
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
'Dest', 'Origin', 'Distance', 'Carrier',
'Full-time', 'Part-time', 'Grand Total', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
'Aircraft_Daily_Flight_Count', 'DepDelay', 'DepTime', 'TaxiOut', 'WheelsOff', 'ArrDel15']

In [30]:
flight_df = filter_df_for_feature_and_encode(df, features, ['Dest', 'Origin'])
X, y, X_test_2023, y_test_2023 = split_X_y_and_2023(flight_df)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
model_2 = RandomForestClassifier(random_state=42)
model_2.fit(X_train, y_train)

In [33]:
calculate_model_metrics(model_2, X_test, y_test)

Accuracy: 0.9207956756428659
F1 Score: 0.897861666295683
AUC Score: 0.9692681798210157


In [34]:
calculate_model_metrics(model_2, X_test_2023, y_test_2023)

Accuracy: 0.9033428571428571
F1 Score: 0.9000236420592234
AUC Score: 0.9606729942857143


# Training with smaller dataset and weather data
can't test on 2023 data, as we didn't enrich it with weather features

In [96]:
flight_df = pd.read_csv('../data/flight_data_weather.csv')

In [97]:
flight_df['CRSDepTime'] = flight_df.apply(lambda row: calculate_crs_deptime(row['DepTime'], row['DepDelay']), axis=1)

In [98]:
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
'Dest', 'Origin', 'Distance', 'Carrier',
'Full-time', 'Part-time', 'Grand Total', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip', 'dest_ianaTimeZone',
'origin_ianaTimeZone', 'Aircraft_Daily_Flight_Count', 'ArrDel15']

In [99]:
flight_df = filter_df_for_feature_and_encode(flight_df, features, ['Dest', 'Origin', 'dest_ianaTimeZone', 'origin_ianaTimeZone'])
X = flight_df.drop('ArrDel15', axis=1)
y = flight_df['ArrDel15']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [103]:
model_3 = RandomForestClassifier(n_estimators=400, min_samples_split=20, min_samples_leaf=2, max_features='sqrt', max_depth=30, bootstrap=False, random_state=42)
model_3.fit(X_train, y_train)

In [104]:
calculate_model_metrics(model_3, X_test, y_test)

Accuracy: 0.627613274267506
F1 Score: 0.6186932993792774
AUC Score: 0.6763009010535174


# Small Dataset, Feature set 1 | no weather data

In [105]:
flight_df = pd.read_csv('../data/flight_data_weather.csv')

flight_df['CRSDepTime'] = flight_df.apply(lambda row: calculate_crs_deptime(row['DepTime'], row['DepDelay']), axis=1)

In [107]:
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
'Dest', 'Origin', 'Distance', 'Carrier',
'Full-time', 'Part-time', 'Grand Total', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
'Aircraft_Daily_Flight_Count', 'ArrDel15']

In [108]:
df = pd.concat([flight_df, df_2023])

In [113]:
flight_df = filter_df_for_feature_and_encode(df, features, ['Dest', 'Origin'])
X, y, X_test_2023, y_test_2023 = split_X_y_and_2023(flight_df)

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [118]:
model_4 = RandomForestClassifier(n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=None, bootstrap=False, random_state=42)
model_4.fit(X_train, y_train)

In [119]:
calculate_model_metrics(model_4, X_test, y_test)

Accuracy: 0.6079117121318101
F1 Score: 0.6028419043120584
AUC Score: 0.6469622175961672


In [120]:
calculate_model_metrics(model_4, X_test_2023, y_test_2023)

Accuracy: 0.5989285714285715
F1 Score: 0.5842773162750063
AUC Score: 0.6368971289795918


## Smaller dataset, feature set 1 & 2 | weather

In [11]:
flight_df = pd.read_csv('../data/flight_data_weather.csv')

flight_df['CRSDepTime'] = flight_df.apply(lambda row: calculate_crs_deptime(row['DepTime'], row['DepDelay']), axis=1)

In [12]:
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
'Dest', 'Origin', 'Distance', 'Carrier',
'Full-time', 'Part-time', 'Grand Total', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip', 'dest_ianaTimeZone',
'origin_ianaTimeZone', 'Aircraft_Daily_Flight_Count', 'DepDelay', 'DepTime', 'TaxiOut', 'WheelsOff', 'ArrDel15']

In [13]:
flight_df = filter_df_for_feature_and_encode(flight_df, features, ['Dest', 'Origin', 'dest_ianaTimeZone', 'origin_ianaTimeZone'])
X = flight_df.drop('ArrDel15', axis=1)
y = flight_df['ArrDel15']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
model_3 = RandomForestClassifier(n_estimators=400, min_samples_split=20, min_samples_leaf=2, max_features='sqrt', max_depth=30, bootstrap=False, random_state=42)
model_3.fit(X_train, y_train)

In [15]:
calculate_model_metrics(model_3, X_test, y_test)

Accuracy: 0.9056501126913811
F1 Score: 0.9027321528723661
AUC Score: 0.9633407974937651


## Smaller dataset, feature set 1 & 2 | no weather

In [16]:
flight_df = pd.read_csv('../data/flight_data_weather.csv')

flight_df['CRSDepTime'] = flight_df.apply(lambda row: calculate_crs_deptime(row['DepTime'], row['DepDelay']), axis=1)

In [17]:
features = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 
'Dest', 'Origin', 'Distance', 'Carrier',
'Full-time', 'Part-time', 'Grand Total', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
'Aircraft_Daily_Flight_Count', 'DepDelay', 'DepTime', 'TaxiOut', 'WheelsOff', 'ArrDel15']

In [20]:
df = pd.concat([flight_df, df_2023])

In [21]:
flight_df = filter_df_for_feature_and_encode(df, features, ['Dest', 'Origin'])
X, y, X_test_2023, y_test_2023 = split_X_y_and_2023(flight_df)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
model_4 = RandomForestClassifier(n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=None, bootstrap=False, random_state=42)
model_4.fit(X_train, y_train)

In [24]:
calculate_model_metrics(model_4, X_test, y_test)

# Accuracy: 0.6107581409808036
# F1 Score: 0.6021823525323431
# AUC Score: 0.6534185545951675

Accuracy: 0.9056598274656097
F1 Score: 0.9028812593133382
AUC Score: 0.9609838249462554


In [25]:
calculate_model_metrics(model_4, X_test_2023, y_test_2023)

# Accuracy: 0.5997428571428571
# F1 Score: 0.5829165178039777
# AUC Score: 0.636713576734694

Accuracy: 0.9009428571428572
F1 Score: 0.8992326919723304
AUC Score: 0.9578662991836734
