In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import category_encoders as ce

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import LearningRateScheduler
from feature_engine.creation import CyclicalFeatures

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np


## Helper Functions

In [None]:
def calculate_crs_deptime(deptime, depdelay):
    """Derive the scheduled departure time from the actual time and the delay.

    Args:
        deptime: Actual departure time encoded as HHMM (e.g. 1030 for 10:30).
        depdelay: Departure delay in minutes; negative when the flight left early.

    Returns:
        The scheduled departure time encoded as HHMM, always within 0..2359.
        A schedule that falls on the previous or the next calendar day wraps
        around midnight (e.g. departing 00:10 after a 30-minute delay gives 2340).
    """
    minutes_per_day = 24 * 60

    # Convert HHMM to minutes since midnight before doing arithmetic.
    actual_minutes = (deptime // 100) * 60 + deptime % 100

    # The modulo keeps the result inside one day; without it a delay that crosses
    # midnight produced a negative minute count instead of a valid HHMM value.
    scheduled_minutes = (actual_minutes - depdelay) % minutes_per_day

    return (scheduled_minutes // 60) * 100 + scheduled_minutes % 60

In [None]:
def binary_encode(df, columns):
    """Replace the given categorical columns with their binary-encoded form.

    Args:
        df: Input DataFrame containing every column named in ``columns``.
        columns: List of column names to encode.

    Returns:
        A new DataFrame: the untouched columns of ``df`` first, followed by the
        columns produced by ``category_encoders.BinaryEncoder``.
    """
    # The encoder is fitted on exactly the data it transforms.
    encoded_part = ce.BinaryEncoder(cols=columns).fit_transform(df[columns])

    # Swap the raw categorical columns for their encoded replacement.
    remaining_part = df.drop(columns=columns)
    return pd.concat([remaining_part, encoded_part], axis=1)

In [None]:
def scale_columns_by_standard_scaler(x_train_df, x_validation_df, x_test_df, x_test_2023_df, columns):
    """Standardise ``columns`` in every frame using statistics from the training frame.

    The scaler is fitted on ``x_train_df`` only and then applied to the other
    frames. The frames are modified in place and also returned.

    Args:
        x_train_df: Training features; used to fit the scaler.
        x_validation_df: Validation features.
        x_test_df: Test features.
        x_test_2023_df: Optional extra evaluation frame; pass ``None`` to skip it.
        columns: Names of the columns to scale.

    Returns:
        Tuple ``(x_train_df, x_validation_df, x_test_df, x_test_2023_df)``.
    """
    scaler = StandardScaler()
    x_train_df[columns] = scaler.fit_transform(x_train_df[columns])

    # Every other frame reuses the statistics learnt from the training data.
    held_out_frames = [x_validation_df, x_test_df]
    if x_test_2023_df is not None:
        held_out_frames.append(x_test_2023_df)

    for frame in held_out_frames:
        frame[columns] = scaler.transform(frame[columns])

    return x_train_df, x_validation_df, x_test_df, x_test_2023_df

In [None]:
def scale_columns_by_cyclical_scaler(x_train_df, x_validation_df, x_test_df, x_test_2023_df, columns):
    """Replace ``columns`` in every frame with feature_engine ``CyclicalFeatures`` output.

    The transformer is fitted on the training frame and reused for the others.
    Because ``drop_original=True``, the original columns do not appear in the
    result; the generated columns are appended after the remaining ones.

    Args:
        x_train_df: Training features; used to fit the transformer.
        x_validation_df: Validation features.
        x_test_df: Test features.
        x_test_2023_df: Optional extra evaluation frame; ``None`` is passed through.
        columns: Names of the periodic columns to transform.

    Returns:
        Tuple ``(x_train_df, x_validation_df, x_test_df, x_test_2023_df)`` of new frames.
    """
    scaler = CyclicalFeatures(drop_original=True)

    def _swap_in_cyclical(frame, transform):
        # Transform only the selected columns, then attach the result to the rest.
        encoded = transform(frame[columns])
        return pd.concat([frame.drop(columns=columns), encoded], axis=1)

    x_train_df = _swap_in_cyclical(x_train_df, scaler.fit_transform)
    x_validation_df = _swap_in_cyclical(x_validation_df, scaler.transform)
    x_test_df = _swap_in_cyclical(x_test_df, scaler.transform)

    if x_test_2023_df is not None:
        x_test_2023_df = _swap_in_cyclical(x_test_2023_df, scaler.transform)

    return x_train_df, x_validation_df, x_test_df, x_test_2023_df

In [None]:
def step_decay(epoch):
    """Return the learning rate for ``epoch`` under a three-step schedule.

    Args:
        epoch: Epoch index supplied by the learning-rate scheduler callback.

    Returns:
        0.001 while ``epoch <= 25``, 0.0001 for ``25 < epoch <= 40`` and
        0.00001 once ``epoch > 40``.
    """
    if epoch > 40:
        return 0.00001  # final, smallest rate
    if epoch > 25:
        return 0.0001  # intermediate rate
    return 0.001  # starting rate

In [None]:
def filter_df_for_feature_and_encode(df, features, encoded_columns):
    """Select the modelling columns, drop incomplete rows and encode categoricals.

    Args:
        df: Source DataFrame.
        features: Column names to keep; must include ``'Carrier'`` and every
            name in ``encoded_columns``.
        encoded_columns: Categorical columns to binary-encode via ``binary_encode``.

    Returns:
        A DataFrame in which ``encoded_columns`` are binary-encoded and
        ``'Carrier'`` is expanded into one 0/1 indicator column per value.
    """
    flight_df = df[features].dropna()

    # Encoding combined categorical variables
    flight_df = binary_encode(flight_df, encoded_columns)

    # dtype is pinned because pandas >= 2.0 produces bool dummies by default; bool
    # columns mixed with numeric ones make the frame convert to an object array,
    # which Keras cannot turn into a tensor. uint8 matches the pre-2.0 default.
    return pd.get_dummies(flight_df, columns=['Carrier'], dtype='uint8')

In [None]:
def split_X_y_and_2023(df: pd.DataFrame):
    """Separate features from the target and set the 2023 rows aside.

    Args:
        df: DataFrame with a ``'Year'`` column and the ``'ArrDel15'`` target.

    Returns:
        Tuple ``(X, y, X_test_2023, y_test_2023)``: features and target for the
        rows whose ``Year`` is not 2023, then the same pair for ``Year == 2023``.
    """
    target = 'ArrDel15'

    rows_2023 = df[df['Year'].eq(2023)]
    rows_other_years = df[df['Year'].ne(2023)]

    return (
        rows_other_years.drop(columns=target),
        rows_other_years[target],
        rows_2023.drop(columns=target),
        rows_2023[target],
    )

In [None]:
def calculate_model_metrics(model, X_test, y_test):
    """Print accuracy, F1 and ROC AUC of ``model`` on the given evaluation data.

    Args:
        model: Object whose ``predict(X_test)`` returns a 2-D array of scores.
        X_test: Feature data passed to ``model.predict``.
        y_test: True binary labels.

    Returns:
        None. The three metrics are written to standard output.
    """
    scores = model.predict(X_test)

    # With two output columns, only the second one (positive class) is kept.
    has_two_class_columns = scores.shape[1] == 2
    if has_two_class_columns:
        scores = scores[:, 1]

    # Hard labels: 1 where the score exceeds 0.5, otherwise 0.
    labels = np.where(scores > 0.5, 1, 0)

    # Accuracy and F1 use the hard labels; AUC uses the raw scores.
    print("Accuracy:", accuracy_score(y_test, labels))
    print("F1 Score:", f1_score(y_test, labels))
    print("AUC Score:", roc_auc_score(y_test, scores))

## Preprocessing and loading

In [None]:
# Load the separate test file (kept in df_2023) and put its rows in chronological order,
# so that the running count below follows each aircraft's departures through the day.
file_name = 'flight_test_data.csv'
file_path = f"../data/{file_name}"
df_2023 = pd.read_csv(file_path).sort_values(by=['FlightDate', 'DepTime'])

# Aircraft_Daily_Flight_Count: 1-based position of a row among the rows sharing its
# FlightDate and Tail_Number (cumcount() starts at 0, hence the +1).
df_2023['Aircraft_Daily_Flight_Count'] = (
    df_2023.groupby(['FlightDate', 'Tail_Number']).cumcount().add(1)
)

In [None]:
# Load the main training dataset; the names file_name and file_path are reused
# (and overwritten) by each loading cell in this notebook.
file_name = 'flight_data_large_balanced.csv'
file_path = f"../data/{file_name}"
df = pd.read_csv(file_path)

In [None]:
# Combine to get same encodings for training: the categorical encoders are later
# fitted on this combined frame, so both datasets share one set of encoded columns.
# ignore_index=True gives the result a fresh 0..n-1 index; both inputs were read with
# their own default row numbers, so without it the frame would carry duplicate labels.
# NOTE(review): df is rebuilt from itself here, so running this cell twice appends
# the df_2023 rows twice — re-run the loading cell above first.
df = pd.concat([df, df_2023], ignore_index=True)

## Model run feature set 1 -- Large Balanced

In [None]:
# Feature set 1 plus the ArrDel15 target (kept last so it is easy to split off).
features = [
    # Calendar fields
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # Route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # NOTE(review): presumably staffing figures — confirm against the dataset
    'Full-time', 'Part-time', 'Grand Total',
    # Scheduled (CRS) times
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    'Aircraft_Daily_Flight_Count',
    # Target
    'ArrDel15',
]

In [None]:
# Keep the selected columns, drop incomplete rows and encode the categoricals
# (binary codes for Dest/Origin; the helper also expands Carrier into indicator columns).
flight_df = filter_df_for_feature_and_encode(df, features, ['Dest', 'Origin'])
# Rows with Year == 2023 become a separate evaluation set; all other rows feed the split below.
X, y, X_test_2023, y_test_2023 = split_X_y_and_2023(flight_df)

In [None]:
# Hold out 30 % of the rows, then halve that hold-out into validation and test
# (70 / 15 / 15 overall). The fixed random_state makes both splits repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Standardise the continuous columns. The helper fits the scaler on X_train only
# and applies the same transform to the validation, test and 2023 frames.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total', 'Aircraft_Daily_Flight_Count']
X_train, X_validation, X_test, X_test_2023 = scale_columns_by_standard_scaler(X_train, X_validation, X_test, X_test_2023, standard_columns)

# Cyclical encoding is disabled for this run: the call below is commented out, so
# time_columns is defined but unused and the calendar columns reach the model unchanged.
# NOTE(review): the other runs in this notebook keep this step enabled — confirm this is intentional.
time_columns =['Quarter', 'Month', 'DayofMonth', 'DayOfWeek'] 
# X_train, X_validation, X_test, X_test_2023 = scale_columns_by_cyclical_scaler(X_train, X_validation, X_test, X_test_2023, time_columns)

# (rows, feature count); the feature count is used as the model's input width.
print(X_train.shape)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers and one sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): step_decay only lowers the rate once epoch > 25, so with epochs=20
# this scheduler keeps the rate at 0.001 throughout — confirm that is intended.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Print accuracy, F1 and AUC on the held-out test split.
calculate_model_metrics(model, X_test, y_test)

In [None]:
# Same metrics on the Year == 2023 rows, which were excluded from the train/validation/test split.
calculate_model_metrics(model, X_test_2023, y_test_2023)

## Model run feature set 1 & 2 -- Large Balanced


In [None]:
# Feature sets 1 and 2 plus the ArrDel15 target (kept last so it is easy to split off).
features = [
    # Calendar fields
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # Route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # NOTE(review): presumably staffing figures — confirm against the dataset
    'Full-time', 'Part-time', 'Grand Total',
    # Scheduled (CRS) times
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    'Aircraft_Daily_Flight_Count',
    # Columns added on top of feature set 1: departure measurements
    'DepTime', 'DepDelay', 'TaxiOut', 'WheelsOff',
    # Target
    'ArrDel15',
]

In [None]:
# Rebuild the modelling frame for the wider feature list: select columns, drop
# incomplete rows, binary-encode Dest/Origin (the helper also expands Carrier).
flight_df = filter_df_for_feature_and_encode(df, features, ['Dest', 'Origin'])
# Set the Year == 2023 rows aside; the remaining rows are split in the next cell.
X, y, X_test_2023, y_test_2023 = split_X_y_and_2023(flight_df)

In [None]:
# 70 % train; the remaining 30 % is divided equally into validation and test.
# random_state is fixed so the split is the same on every run.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Standardise the continuous columns; the scaler inside the helper is fitted on X_train only.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total', 'Aircraft_Daily_Flight_Count']
X_train, X_validation, X_test, X_test_2023 = scale_columns_by_standard_scaler(X_train, X_validation, X_test, X_test_2023, standard_columns)

# Replace the calendar columns with their CyclicalFeatures encoding (originals are dropped by the helper).
time_columns =['Quarter', 'Month', 'DayofMonth', 'DayOfWeek'] 
X_train, X_validation, X_test, X_test_2023 = scale_columns_by_cyclical_scaler(X_train, X_validation, X_test, X_test_2023, time_columns)

# (rows, feature count); the feature count is used as the model's input width.
print(X_train.shape)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers and one sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# This run trains without a learning-rate scheduler callback.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(X_validation, y_validation)
)

In [None]:
# Print accuracy, F1 and AUC on the held-out test split.
calculate_model_metrics(model, X_test, y_test)

In [None]:
# Same metrics on the Year == 2023 rows, which were excluded from the train/validation/test split.
calculate_model_metrics(model, X_test_2023, y_test_2023)

## Model run feature set 1 -- No weather | Small

In [None]:
# Load the weather dataset used by the "Small" runs.
file_name = 'flight_data_weather.csv'
file_path = f"../data/{file_name}"
df_weather = pd.read_csv(file_path)

# Fill CRSDepTime row by row from the actual departure time and the departure delay.
df_weather['CRSDepTime'] = df_weather.apply(lambda x: calculate_crs_deptime(x['DepTime'], x['DepDelay']), axis=1)

In [None]:
# Feature set 1 without the weather columns, plus the ArrDel15 target (kept last).
features = [
    # Calendar fields
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # Route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # NOTE(review): presumably staffing figures — confirm against the dataset
    'Full-time', 'Part-time', 'Grand Total',
    # Scheduled (CRS) times
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    'Aircraft_Daily_Flight_Count',
    # Target
    'ArrDel15',
]

In [None]:
# Keep only the chosen columns and discard rows with any missing value.
flight_df = df_weather[features].dropna()

# Separate the ArrDel15 target from the model inputs.
y = flight_df['ArrDel15']
X = flight_df.drop(columns='ArrDel15')

In [None]:
# Encoding categorical variables: binary codes for the airports, indicator columns for Carrier.
X = binary_encode(X, ['Dest', 'Origin'])
# dtype is pinned because pandas >= 2.0 produces bool dummies by default; bool columns
# mixed with numeric ones make the frame convert to an object array, which Keras
# cannot turn into a tensor. uint8 matches the pre-2.0 default.
X = pd.get_dummies(X, columns=['Carrier'], dtype='uint8')

# 70 % train; the remaining 30 % is divided equally into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Standardise the continuous columns; the scaler inside the helper is fitted on X_train only.
# None is passed in place of a 2023 frame, and the matching return value is discarded as _.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total', 'Aircraft_Daily_Flight_Count']
X_train, X_validation, X_test, _ = scale_columns_by_standard_scaler(X_train, X_validation, X_test, None, standard_columns)

# Replace the calendar columns with their CyclicalFeatures encoding (originals are dropped by the helper).
time_columns =['Quarter', 'Month', 'DayofMonth', 'DayOfWeek'] 
X_train, X_validation, X_test, _ = scale_columns_by_cyclical_scaler(X_train, X_validation, X_test, None, time_columns)

# (rows, feature count); the feature count is used as the model's input width.
print(X_train.shape)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers and one sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# step_decay lowers the learning rate once epoch > 25 and again once epoch > 40.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Print accuracy, F1 and AUC on the held-out test split.
calculate_model_metrics(model, X_test, y_test)

## Model run feature set 1 -- Weather | Small

In [None]:
# Re-read the weather dataset so this section starts from a fresh frame.
file_name = 'flight_data_weather.csv'
file_path = f"../data/{file_name}"
df_weather = pd.read_csv(file_path)

# Fill CRSDepTime row by row from the actual departure time and the departure delay.
df_weather['CRSDepTime'] = df_weather.apply(lambda x: calculate_crs_deptime(x['DepTime'], x['DepDelay']), axis=1)

In [None]:
# Feature set 1 with the weather columns, plus the ArrDel15 target (kept last).
features = [
    # Calendar fields
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # Route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # NOTE(review): presumably staffing figures — confirm against the dataset
    'Full-time', 'Part-time', 'Grand Total',
    # Scheduled (CRS) times
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    # Weather at both airports
    'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip',
    # Airport time zones
    'dest_ianaTimeZone', 'origin_ianaTimeZone',
    'Aircraft_Daily_Flight_Count',
    # Target
    'ArrDel15',
]

In [None]:
# Keep only the chosen columns and discard rows with any missing value.
flight_df = df_weather[features].dropna()

# Separate the ArrDel15 target from the model inputs.
y = flight_df['ArrDel15']
X = flight_df.drop(columns='ArrDel15')

In [None]:
# Encoding categorical variables: binary codes for time zones and airports, indicator columns for Carrier.
X = binary_encode(X, ['origin_ianaTimeZone', 'dest_ianaTimeZone', 'Dest', 'Origin'])
# dtype is pinned because pandas >= 2.0 produces bool dummies by default; bool columns
# mixed with numeric ones make the frame convert to an object array, which Keras
# cannot turn into a tensor. uint8 matches the pre-2.0 default.
X = pd.get_dummies(X, columns=['Carrier'], dtype='uint8')

# 70 % train; the remaining 30 % is divided equally into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Standardise the continuous columns, including the four weather measurements;
# the scaler inside the helper is fitted on X_train only.
# None is passed in place of a 2023 frame, and the matching return value is discarded as _.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total', 'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip', 'Aircraft_Daily_Flight_Count']
X_train, X_validation, X_test, _ = scale_columns_by_standard_scaler(X_train, X_validation, X_test, None, standard_columns)

# Replace the calendar columns with their CyclicalFeatures encoding (originals are dropped by the helper).
time_columns =['Quarter', 'Month', 'DayofMonth', 'DayOfWeek'] 
X_train, X_validation, X_test, _ = scale_columns_by_cyclical_scaler(X_train, X_validation, X_test, None, time_columns)

# (rows, feature count); the feature count is used as the model's input width.
print(X_train.shape)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers and one sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# step_decay lowers the learning rate once epoch > 25 and again once epoch > 40.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Print accuracy, F1 and AUC on the held-out test split.
calculate_model_metrics(model, X_test, y_test)

## Model run feature set 1 & 2 -- No weather | Small

In [None]:
# Re-read the weather dataset so this section starts from a fresh frame.
file_name = 'flight_data_weather.csv'
file_path = f"../data/{file_name}"
df_weather = pd.read_csv(file_path)

# Fill CRSDepTime row by row from the actual departure time and the departure delay.
df_weather['CRSDepTime'] = df_weather.apply(lambda x: calculate_crs_deptime(x['DepTime'], x['DepDelay']), axis=1)

In [None]:
# Feature sets 1 and 2 without the weather columns, plus the ArrDel15 target (kept last).
features = [
    # Calendar fields
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # Route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # NOTE(review): presumably staffing figures — confirm against the dataset
    'Full-time', 'Part-time', 'Grand Total',
    # Scheduled (CRS) times
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    'Aircraft_Daily_Flight_Count',
    # Columns added on top of feature set 1: departure measurements
    'DepDelay', 'DepTime', 'TaxiOut', 'WheelsOff',
    # Target
    'ArrDel15',
]

In [None]:
# Keep only the chosen columns and discard rows with any missing value.
flight_df = df_weather[features].dropna()

# Separate the ArrDel15 target from the model inputs.
y = flight_df['ArrDel15']
X = flight_df.drop(columns='ArrDel15')

In [None]:
# Encoding categorical variables: binary codes for the airports, indicator columns for Carrier.
X = binary_encode(X, ['Dest', 'Origin'])
# dtype is pinned because pandas >= 2.0 produces bool dummies by default; bool columns
# mixed with numeric ones make the frame convert to an object array, which Keras
# cannot turn into a tensor. uint8 matches the pre-2.0 default.
X = pd.get_dummies(X, columns=['Carrier'], dtype='uint8')

# 70 % train; the remaining 30 % is divided equally into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Standardise the continuous columns; the scaler inside the helper is fitted on X_train only.
# None is passed in place of a 2023 frame, and the matching return value is discarded as _.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total', 'Aircraft_Daily_Flight_Count']
X_train, X_validation, X_test, _ = scale_columns_by_standard_scaler(X_train, X_validation, X_test, None, standard_columns)

# Replace the calendar columns with their CyclicalFeatures encoding (originals are dropped by the helper).
time_columns =['Quarter', 'Month', 'DayofMonth', 'DayOfWeek'] 
X_train, X_validation, X_test, _ = scale_columns_by_cyclical_scaler(X_train, X_validation, X_test, None, time_columns)

# (rows, feature count); the feature count is used as the model's input width.
print(X_train.shape)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers and one sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# step_decay lowers the learning rate once epoch > 25 and again once epoch > 40.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Print accuracy, F1 and AUC on the held-out test split.
calculate_model_metrics(model, X_test, y_test)

## Model run feature set 1 & 2 -- Weather | Small

In [None]:
# Re-read the weather dataset so this section starts from a fresh frame.
file_name = 'flight_data_weather.csv'
file_path = f"../data/{file_name}"
df_weather = pd.read_csv(file_path)

# Fill CRSDepTime row by row from the actual departure time and the departure delay.
df_weather['CRSDepTime'] = df_weather.apply(lambda x: calculate_crs_deptime(x['DepTime'], x['DepDelay']), axis=1)

In [None]:
# Feature sets 1 and 2 with the weather columns, plus the ArrDel15 target (kept last).
features = [
    # Calendar fields
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # Route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # NOTE(review): presumably staffing figures — confirm against the dataset
    'Full-time', 'Part-time', 'Grand Total',
    # Scheduled (CRS) times
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    # Weather at both airports
    'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip',
    # Airport time zones
    'dest_ianaTimeZone', 'origin_ianaTimeZone',
    'Aircraft_Daily_Flight_Count',
    # Columns added on top of feature set 1: departure measurements
    'DepDelay', 'DepTime', 'TaxiOut', 'WheelsOff',
    # Target
    'ArrDel15',
]

In [None]:
# Keep only the chosen columns and discard rows with any missing value.
flight_df = df_weather[features].dropna()

# Separate the ArrDel15 target from the model inputs.
y = flight_df['ArrDel15']
X = flight_df.drop(columns='ArrDel15')

In [None]:
# Encoding categorical variables: binary codes for time zones and airports, indicator columns for Carrier.
X = binary_encode(X, ['origin_ianaTimeZone', 'dest_ianaTimeZone', 'Dest', 'Origin'])
# dtype is pinned because pandas >= 2.0 produces bool dummies by default; bool columns
# mixed with numeric ones make the frame convert to an object array, which Keras
# cannot turn into a tensor. uint8 matches the pre-2.0 default.
X = pd.get_dummies(X, columns=['Carrier'], dtype='uint8')

# 70 % train; the remaining 30 % is divided equally into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Standardise the continuous columns, including the four weather measurements;
# the scaler inside the helper is fitted on X_train only.
# None is passed in place of a 2023 frame, and the matching return value is discarded as _.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total', 'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip', 'Aircraft_Daily_Flight_Count']
X_train, X_validation, X_test, _ = scale_columns_by_standard_scaler(X_train, X_validation, X_test, None, standard_columns)

# Replace the calendar columns with their CyclicalFeatures encoding (originals are dropped by the helper).
time_columns =['Quarter', 'Month', 'DayofMonth', 'DayOfWeek'] 
X_train, X_validation, X_test, _ = scale_columns_by_cyclical_scaler(X_train, X_validation, X_test, None, time_columns)

# (rows, feature count); the feature count is used as the model's input width.
print(X_train.shape)

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers and one sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single score in (0, 1) for the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# step_decay lowers the learning rate once epoch > 25 and again once epoch > 40.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val), not a list.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Print accuracy, F1 and AUC on the held-out test split.
calculate_model_metrics(model, X_test, y_test)