In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from feature_engine.creation import CyclicalFeatures

## Helper Functions

In [None]:
def scale_columns_by_standard_scaler(x_train_df, x_validation_df, x_test_df, columns):
    """Standardize ``columns`` using statistics learned from the training split only.

    The scaler is fitted on ``x_train_df[columns]``; the same fitted
    transformation is then applied to the validation and test splits.

    The three inputs are copied before being written to. The frames passed in
    by this notebook are slices produced by ``train_test_split``; assigning
    into them directly would modify the caller's objects and makes pandas emit
    ``SettingWithCopyWarning``.

    Args:
        x_train_df: Training features; must contain ``columns``.
        x_validation_df: Validation features; must contain ``columns``.
        x_test_df: Test features; must contain ``columns``.
        columns: List of column labels to standardize.

    Returns:
        Tuple ``(x_train_df, x_validation_df, x_test_df)`` of new DataFrames in
        which ``columns`` hold the standardized values. The inputs are left
        untouched.
    """
    scaler = StandardScaler()

    # Work on independent copies so the caller's frames are never mutated.
    x_train_df = x_train_df.copy()
    x_validation_df = x_validation_df.copy()
    x_test_df = x_test_df.copy()

    # Fit on train only; validation and test reuse the fitted parameters.
    x_train_df[columns] = scaler.fit_transform(x_train_df[columns])
    x_validation_df[columns] = scaler.transform(x_validation_df[columns])
    x_test_df[columns] = scaler.transform(x_test_df[columns])
    return x_train_df, x_validation_df, x_test_df

In [None]:
def scale_columns_by_cyclical_scaler(x_train_df, x_validation_df, x_test_df, columns):
    """Swap ``columns`` for their ``CyclicalFeatures`` encodings in all three splits.

    The encoder is fitted on the training split and then reused, unchanged,
    for the validation and test splits. Because it is created with
    ``drop_original=True``, the raw columns do not appear in its output; they
    are also dropped from each input frame before the encoded columns are
    appended on the right.

    Args:
        x_train_df: Training features; must contain ``columns``.
        x_validation_df: Validation features; must contain ``columns``.
        x_test_df: Test features; must contain ``columns``.
        columns: List of column labels to encode.

    Returns:
        Tuple ``(x_train_df, x_validation_df, x_test_df)`` of new DataFrames.
    """
    encoder = CyclicalFeatures(drop_original=True)

    def _swap_in(frame, encoded):
        # Remove the raw columns, then append the encoded ones column-wise.
        return pd.concat([frame.drop(columns, axis=1), encoded], axis=1)

    # Only the training split is used for fitting.
    train_encoded = encoder.fit_transform(x_train_df[columns])
    validation_encoded = encoder.transform(x_validation_df[columns])
    test_encoded = encoder.transform(x_test_df[columns])

    return (
        _swap_in(x_train_df, train_encoded),
        _swap_in(x_validation_df, validation_encoded),
        _swap_in(x_test_df, test_encoded),
    )

In [None]:
def binary_encode(df, columns):
    """Replace ``columns`` of ``df`` with their binary-encoded representation.

    A ``category_encoders.BinaryEncoder`` is fitted on ``df[columns]`` and its
    output is appended to the remaining columns of ``df``.

    Args:
        df: Input DataFrame containing ``columns``.
        columns: List of categorical column labels to encode.

    Returns:
        A new DataFrame: ``df`` without ``columns``, followed by the encoder's
        output columns.
    """
    # Fit and transform in one step, restricted to the categorical columns.
    encoded_part = ce.BinaryEncoder(cols=columns).fit_transform(df[columns])

    # Everything that was not encoded is kept as-is and placed first.
    untouched_part = df.drop(columns, axis=1)
    return pd.concat([untouched_part, encoded_part], axis=1)

In [None]:
def calculate_crs_deptime(deptime, depdelay):
    """Derive the scheduled departure time from the actual one and the delay.

    The result always uses the same HHMM encoding as the input. When
    subtracting the delay crosses midnight in either direction, the value
    wraps around the 24-hour clock instead of leaking a negative minute count
    or an hour of 24 or more.

    Args:
        deptime: Actual departure time encoded as HHMM (e.g. ``1530`` is 15:30).
        depdelay: Departure delay in minutes; negative for an early departure.

    Returns:
        Scheduled departure time encoded as HHMM, in the range 0..2359.
        A NaN input propagates to a NaN result.
    """
    minutes_per_day = 24 * 60

    hours, minutes = divmod(deptime, 100)
    # Python's % with a positive modulus is never negative, so a departure
    # scheduled on the previous day (e.g. 00:10 with a 30-minute delay) maps
    # to 23:40 rather than to -20.
    scheduled_minutes = (hours * 60 + minutes - depdelay) % minutes_per_day

    crs_hours, crs_minutes = divmod(scheduled_minutes, 60)
    return crs_hours * 100 + crs_minutes

In [None]:
def step_decay(epoch):
    """Piecewise-constant learning-rate schedule.

    Args:
        epoch: Epoch index supplied by the ``LearningRateScheduler`` callback.

    Returns:
        ``0.001`` while ``epoch <= 15``, ``0.0001`` for ``15 < epoch <= 30``,
        and ``0.00001`` once ``epoch > 30``.
    """
    if epoch > 30:
        return 0.00001  # final, smallest rate
    if epoch > 15:
        return 0.0001  # first drop
    return 0.001  # initial rate


In [None]:
def load_and_preprocess_data(csv_path='~/Downloads/flight_data.csv', rare_carrier_max_count=9000):
    """Load the flight CSV and apply the row filtering and feature clean-up.

    Steps:
      1. Read ``csv_path``.
      2. Keep only rows where both ``Cancelled`` and ``Diverted`` equal 0.
      3. Recompute ``CRSDepTime`` from ``DepTime`` and ``DepDelay`` with
         ``calculate_crs_deptime``.
      4. Collapse every ``Carrier`` value that occurs at most
         ``rare_carrier_max_count`` times into the single label ``'Other'``.

    Args:
        csv_path: Location of the CSV file. Defaults to the path this notebook
            originally hard-coded.
        rare_carrier_max_count: Carriers with this many rows or fewer (counted
            after filtering) are relabelled ``'Other'``.

    Returns:
        The preprocessed DataFrame.
    """
    df = pd.read_csv(csv_path)

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a filtered slice of the raw data.
    df = df[(df.Cancelled == 0) & (df.Diverted == 0)].copy()

    df['CRSDepTime'] = df.apply(lambda row: calculate_crs_deptime(row['DepTime'], row['DepDelay']), axis=1)

    # Reducing dimensionality of Carrier with custom grouping
    value_counts = df['Carrier'].value_counts()
    to_remove = value_counts[value_counts <= rare_carrier_max_count].index
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: that chained in-place form does not update `df`
    # when pandas Copy-on-Write is active.
    df['Carrier'] = df['Carrier'].replace(to_remove, 'Other')
    return df

## Preprocessing and loading

In [None]:
# Load and preprocess the flight data once; each model section below selects
# its own feature columns from this frame.
df = load_and_preprocess_data()

## Neural Net Feature Set 1

In [None]:
# Columns selected for feature set 1. The last entry, 'ArrDel15', is the
# target and is split off from the inputs in the next cell.
features = [
    # calendar fields
    'Year',
    'Quarter',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    # route and carrier
    'Dest',
    'Origin',
    'Distance',
    'Carrier',
    # presumably staffing counts -- confirm against the dataset
    'Full-time',
    'Part-time',
    'Grand Total',
    # scheduled times
    'CRSDepTime',
    'CRSArrTime',
    'CRSElapsedTime',
    # target
    'ArrDel15',
]

In [None]:
# Keep only the selected columns and discard every row that has a missing
# value in any of them.
flight_df = df[features].dropna()

# Separate the 'ArrDel15' target from the model inputs.
y = flight_df['ArrDel15']
X = flight_df.drop(columns='ArrDel15')

In [None]:
# Encoding categorical variables
X = binary_encode(X, ['Dest', 'Origin'])
# dtype=float keeps the whole feature matrix numeric. On pandas >= 2.0 the
# default dummy dtype is bool, and a frame mixing bool and numeric columns
# converts to an object array that Keras cannot turn into a tensor.
X = pd.get_dummies(X, columns=['Carrier'], dtype=float)

# 70% train; the remaining 30% is halved into validation and test (15% each).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Column groups handled by each transformer.
# NOTE(review): 'Year', 'CRSDepTime', 'CRSArrTime' and 'CRSElapsedTime' are in
# neither group and therefore reach the network unscaled -- confirm this is intended.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total']
time_columns = ['Quarter', 'Month', 'DayofMonth', 'DayOfWeek']

# Transform with StandardScaler (fitted on the training split).
X_train, X_validation, X_test = scale_columns_by_standard_scaler(
    X_train, X_validation, X_test, standard_columns
)

# Transform with CyclicalFeatures (fitted on the training split).
X_train, X_validation, X_test = scale_columns_by_cyclical_scaler(
    X_train, X_validation, X_test, time_columns
)

print(X_train.shape)

In [None]:
# Fully connected binary classifier: two ReLU hidden layers (128 and 32 units)
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The learning rate for each epoch is supplied by step_decay.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val); a list is not
    # reliably unpacked into inputs and targets across Keras versions.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Score the trained model on the test split, which was not passed to fit().
# evaluate() returns the loss followed by the metric configured in compile().
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

In [None]:
# Persist the feature-set-1 model now; `model` is rebound to a new network in
# the next section.
model.save('../models/neural_net_features_1_all.keras')

## Neural Net Feature Set 1 & 2

In [None]:
# Columns selected for feature sets 1 & 2: the feature-set-1 columns plus
# 'DepDelay', 'DepTime', 'TaxiOut' and 'WheelsOff'. The last entry,
# 'ArrDel15', is the target and is split off in the next cell.
features = [
    # calendar fields
    'Year',
    'Quarter',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    # route and carrier
    'Dest',
    'Origin',
    'Distance',
    'Carrier',
    # presumably staffing counts -- confirm against the dataset
    'Full-time',
    'Part-time',
    'Grand Total',
    # scheduled times
    'CRSDepTime',
    'CRSArrTime',
    'CRSElapsedTime',
    # additional columns for this run
    'DepDelay',
    'DepTime',
    'TaxiOut',
    'WheelsOff',
    # target
    'ArrDel15',
]

In [None]:
# Keep only the selected columns and discard every row that has a missing
# value in any of them.
flight_df = df[features].dropna()

# Separate the 'ArrDel15' target from the model inputs.
y = flight_df['ArrDel15']
X = flight_df.drop(columns='ArrDel15')

In [None]:
# Encoding categorical variables
X = binary_encode(X, ['Dest', 'Origin'])
# dtype=float keeps the whole feature matrix numeric. On pandas >= 2.0 the
# default dummy dtype is bool, and a frame mixing bool and numeric columns
# converts to an object array that Keras cannot turn into a tensor.
X = pd.get_dummies(X, columns=['Carrier'], dtype=float)

# 70% train; the remaining 30% is halved into validation and test (15% each).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Column groups handled by each transformer.
# NOTE(review): every column in neither group (e.g. 'Year', 'CRSDepTime',
# 'DepDelay', 'TaxiOut') reaches the network unscaled -- confirm this is intended.
standard_columns = ['Distance', 'Full-time', 'Part-time', 'Grand Total']
time_columns = ['Quarter', 'Month', 'DayofMonth', 'DayOfWeek']

# Transform with StandardScaler (fitted on the training split).
X_train, X_validation, X_test = scale_columns_by_standard_scaler(
    X_train, X_validation, X_test, standard_columns
)

# Transform with CyclicalFeatures (fitted on the training split).
X_train, X_validation, X_test = scale_columns_by_cyclical_scaler(
    X_train, X_validation, X_test, time_columns
)
print(X_train.shape)

In [None]:
# Same architecture as the feature-set-1 network: two ReLU hidden layers
# (128 and 32 units) followed by a single sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The learning rate for each epoch is supplied by step_decay. With 20 epochs
# only its first drop (epoch index above 15) is reached.
lr_scheduler = LearningRateScheduler(step_decay)

history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    # Keras documents validation_data as a tuple (x_val, y_val); a list is not
    # reliably unpacked into inputs and targets across Keras versions.
    validation_data=(X_validation, y_validation),
    callbacks=[lr_scheduler]
)

In [None]:
# Score the trained model on the test split, which was not passed to fit().
# evaluate() returns the loss followed by the metric configured in compile().
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)