In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense
import category_encoders as ce


In [None]:
# Input CSV location, kept as directory + file name so either part can be
# changed on its own.
filepath = "../data/"
filename = "flight_data_weather.csv"
flight_df = pd.read_csv(f"{filepath}{filename}")

In [None]:
# flight_df.columns
# flight_df.describe()

## Shared Preprocessing

In [None]:
# Reduce the cardinality of Carrier: every carrier that appears in at most
# 9000 rows is folded into a single 'Other' bucket.
value_counts = flight_df['Carrier'].value_counts()
to_remove = value_counts[value_counts <= 9000].index
# Assign the result back rather than calling replace(..., inplace=True) on the
# column selection: that chained in-place form operates on a temporary object
# under pandas Copy-on-Write and leaves flight_df unchanged (pandas 2.x already
# emits a FutureWarning for it).
flight_df['Carrier'] = flight_df['Carrier'].replace(list(to_remove), 'Other')

In [None]:
# Reducing dimensionality of categorical columns (time zones and airports) via binary encoding
def binary_encode_timezone(df, columns):
    """Swap ``columns`` of ``df`` for their binary-encoded representation.

    A ``category_encoders.BinaryEncoder`` is fitted on ``df[columns]``. The
    original columns are dropped and the encoder's output columns are appended
    to the right of the remaining columns.

    Args:
        df: DataFrame that contains every column named in ``columns``.
        columns: List of column names to encode.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    binary_columns = ce.BinaryEncoder(cols=columns).fit_transform(df[columns])
    remaining = df.drop(columns=columns)
    return pd.concat([remaining, binary_columns], axis=1)

## Model run 1

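Scheduled-time features (`CRSArrTime`, `CRSElapsedTime`) plus `Full-time`, `Part-time` and `Grand Total`; `Origin`/`Dest` and both time-zone columns are binary-encoded.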

In [None]:
# Input columns for run 1. The order is kept exactly as before because it
# fixes the column order of the matrix built from flight_df[features].
features = [
    # calendar
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # route and carrier
    'Dest', 'Origin', 'Distance', 'Carrier',
    # presumably carrier head-count figures -- TODO confirm against the data source
    'Full-time', 'Part-time', 'Grand Total',
    # scheduled times
    'CRSArrTime', 'CRSElapsedTime',
    # weather at both ends of the route
    'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip',
    # time zones
    'dest_ianaTimeZone', 'origin_ianaTimeZone',
    # aircraft utilisation
    'Aircraft_Daily_Flight_Count',
]
len(features)

In [None]:
# Target is the ArrDel15 column; inputs are the columns listed in `features`.
y = flight_df['ArrDel15']
X = flight_df[features]

# Encoding categorical variables: binary codes for the airport and time-zone
# columns, one-hot indicator columns for Carrier.
X = binary_encode_timezone(X, ['origin_ianaTimeZone', 'dest_ianaTimeZone', 'Dest', 'Origin'])
X = pd.get_dummies(X, columns=['Carrier'])

# pandas >= 2.0 returns bool indicator columns from get_dummies. A frame that
# mixes bool with numeric columns converts to a dtype=object array, which
# TensorFlow cannot turn into a tensor. Casting to one float dtype avoids that
# and fails fast here if a non-numeric column slipped into `features`.
X = X.astype('float32')

# NOTE(review): the numeric inputs reach the network unscaled -- consider
# standardising them with statistics computed on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Run 1 network: four ReLU hidden layers (128-64-32-16) feeding one sigmoid unit.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
# Single sigmoid output for the binary target; a multi-class problem would
# need one softmax unit per class instead.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Keras holds out 20% of the training rows for per-epoch validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=20,
)

In [None]:
# Score the run 1 model on the held-out test split. evaluate() returns the
# loss first, followed by the metric passed to compile().
test_scores = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = test_scores
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)
# Result noted by the author -- Test accuracy: 0.8716484308242798

## Model run 2

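Actual departure/arrival features (`DepTime`, `DepDelay`, `ArrTime`) in place of the scheduled-time columns; `Origin`/`Dest` and both time-zone columns are still binary-encoded.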

In [None]:
# Input columns for run 2. The order is kept exactly as before because it
# fixes the column order of the matrix built from flight_df[features].
# NOTE(review): DepTime, DepDelay and ArrTime look like values that are only
# known once the flight has operated -- confirm they are available at
# prediction time, otherwise they leak information about ArrDel15.
features = [
    # calendar
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    # departure / arrival
    'DepTime', 'Dest', 'DepDelay', 'ArrTime',
    # route and carrier
    'Distance', 'Carrier', 'Origin',
    # weather at both ends of the route
    'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip',
    # time zones
    'dest_ianaTimeZone', 'origin_ianaTimeZone',
    # aircraft utilisation
    'Aircraft_Daily_Flight_Count',
]
len(features)

In [None]:
# Target is the ArrDel15 column; inputs are the columns listed in `features`.
y = flight_df['ArrDel15']
X = flight_df[features]

# Encoding categorical variables: binary codes for the airport and time-zone
# columns, one-hot indicator columns for Carrier.
X = binary_encode_timezone(X, ['origin_ianaTimeZone', 'dest_ianaTimeZone', 'Dest', 'Origin'])
X = pd.get_dummies(X, columns=['Carrier'])
# X = label_encode_airport_codes(X)
print(f"Shape after encoding: {X.shape}")

# pandas >= 2.0 returns bool indicator columns from get_dummies. A frame that
# mixes bool with numeric columns converts to a dtype=object array, which
# TensorFlow cannot turn into a tensor. Casting to one float dtype avoids that
# and fails fast here if a non-numeric column slipped into `features`.
X = X.astype('float32')

# NOTE(review): the numeric inputs reach the network unscaled -- consider
# standardising them with statistics computed on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Run 2 network: two ReLU hidden layers (64-32) feeding one sigmoid unit.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
# Single sigmoid output for the binary target; a multi-class problem would
# need one softmax unit per class instead.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Keras holds out 20% of the training rows for per-epoch validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

In [None]:
# Score the run 2 model on the held-out test split. evaluate() returns the
# loss first, followed by the metric passed to compile().
test_scores = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = test_scores
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)
# Result noted by the author -- Test accuracy: 0.860675573348999

## Testing neural net params

In [None]:
# Wider variant: two ReLU hidden layers (128-64) feeding one sigmoid unit,
# trained on whatever X_train / y_train currently hold.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
# Single sigmoid output for the binary target; a multi-class problem would
# need one softmax unit per class instead.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Keras holds out 20% of the training rows for per-epoch validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=20,
)

In [None]:
# Score the 128-64 model on the held-out test split. evaluate() returns the
# loss first, followed by the metric passed to compile().
test_scores = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = test_scores
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)
# Result noted by the author -- Test accuracy: 0.8750291466712952

## More layer testing

In [None]:
# Largest variant: two ReLU hidden layers (256-128) feeding one sigmoid unit,
# trained for 100 epochs.
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu'))
# Single sigmoid output for the binary target; a multi-class problem would
# need one softmax unit per class instead.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Keras holds out 20% of the training rows for per-epoch validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
)

In [None]:
# Score the 256-128 model on the held-out test split. evaluate() returns the
# loss first, followed by the metric passed to compile().
test_scores = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = test_scores
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)
# Training log line noted by the author:
# loss: 0.2428 - accuracy: 0.8953 - val_loss: 0.2410 - val_accuracy: 0.8953

## Hyperparameter tuning

In [None]:
from hyperopt import hp, fmin, tpe, Trials, space_eval
# from tf.keras.optimizers.legacy.Adam import Adam
import numpy as np

In [None]:
def objective_function(params):
    """Train one candidate network and return the loss hyperopt should minimise.

    Args:
        params: Dict sampled from the search space. Reads the keys
            'learning_rate', 'first_hidden_layer_neurons',
            'middle_hidden_layer_neurons' and 'batch_size'.

    Returns:
        The validation loss after the final training epoch, measured on the
        20% of X_train that Keras holds out via ``validation_split``.
    """
    # Adam optimizer with the sampled learning rate.
    adam_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=params['learning_rate'])

    model = Sequential([
        Dense(params['first_hidden_layer_neurons'], activation='relu', input_shape=(X_train.shape[1],)),
        Dense(params['middle_hidden_layer_neurons'], activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=int(params['batch_size']),
        validation_split=0.2,
        verbose=0,
    )

    # Rank candidates on the validation split, not on X_test: choosing
    # hyperparameters by test loss would tune the model to the test set and
    # make the final test score optimistically biased.
    loss = float(history.history['val_loss'][-1])
    accuracy = float(history.history['val_accuracy'][-1])
    print(f"For training run with params: {params}. Validation loss: {loss}, Validation accuracy: {accuracy}")
    return loss

# Candidate values per hyperparameter. Each entry becomes an hp.choice node
# whose label is the same as its dictionary key.
_choice_options = {
    'batch_size': [32, 64],
    'first_hidden_layer_neurons': [64, 128, 256, 512],
    'middle_hidden_layer_neurons': [32, 64, 128, 256],
    'learning_rate': [0.001, 0.0001, 0.00001],
}
space = {label: hp.choice(label, options) for label, options in _choice_options.items()}

# Keep a handle on the Trials object so the per-trial history stays available
# after the search finishes.
trials = Trials()

# Run 20 evaluations of the objective with the TPE algorithm.
best = fmin(fn=objective_function,
            space=space,
            algo=tpe.suggest,
            max_evals=20,
            trials=trials)

# fmin reports hp.choice parameters as indices into their option lists, so
# printing `best` directly would show e.g. {'batch_size': 1} rather than 64.
# space_eval maps those indices back to the actual hyperparameter values.
best_params = space_eval(space, best)
print("Best: ", best_params)