<a href="https://colab.research.google.com/github/It21258794/DL_Assignment_SE4050/blob/main/Bidirectional_LSTM_Flight_Delay_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bidirectional LSTM Model**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Define the path to the data set
# NOTE(review): this location sits under /content/drive/MyDrive, which only
# exists once Google Drive has been mounted; no mount step is visible in this
# notebook — confirm it is performed before running.
path = "/content/drive/MyDrive/flights_sample_3m.csv.zip"

In [None]:
# importing the dataset
# `path` points at a .zip archive; read_csv is expected to infer the
# compression from the file suffix (its default is compression='infer').
df = pd.read_csv(path)

# checking the dataset
df.head()

In [None]:
# Keep only flights that arrived late (ARR_DELAY > 0). Rows whose ARR_DELAY is
# missing compare False against 0 and are therefore dropped as well.
# .copy() detaches the filtered result from the original frame so that the
# column assignments made in later cells operate on an independent DataFrame
# instead of a slice (which would raise SettingWithCopyWarning).
df = df[df['ARR_DELAY'] > 0].copy()

In [None]:
# show the shape of the dataset and the number of rows and columns
# (evaluated after the ARR_DELAY > 0 filter, so this describes the delayed-flight subset)
df.shape

In [None]:
# check the data types of the columns
# (df.info() also lists the non-null count of each column)
df.info()

In [None]:
# encode the categorical data
# NOTE(review): `le` and `clean_labels_encoder` are defined again, identically,
# in a later cell; that later definition is the one in effect when the helper
# is called.
le = LabelEncoder()

def clean_labels_encoder(list_of_labels, df):
    """Replace each listed column of ``df`` with integer label codes.

    The module-level ``le`` encoder is refit on every column in turn, so after
    the call it only holds the classes of the last column processed.

    Args:
        list_of_labels: Names of the columns in ``df`` to encode.
        df: DataFrame to encode; it is modified in place.

    Returns:
        The same ``df`` object with the listed columns overwritten.
    """
    for column_name in list_of_labels:
        encoded_values = le.fit_transform(df[column_name])
        df[column_name] = encoded_values
    return df

In [None]:
def process_time_columns(df):
    """Derive hour, minute and time-of-day features from the scheduled times.

    ``CRS_DEP_TIME`` and ``CRS_ARR_TIME`` are read as ``hhmm`` numbers and split
    into ``*_HOUR`` / ``*_MINUTE`` columns. The departure hour is then bucketed
    into the categorical column ``DEP_TIME_PERIOD``.

    Args:
        df: DataFrame containing ``CRS_DEP_TIME`` and ``CRS_ARR_TIME``; it is
            modified in place.

    Returns:
        The same ``df`` object with the five derived columns added.
    """
    # hhmm -> hour is the integer quotient by 100, minute is the remainder
    for prefix in ('CRS_DEP', 'CRS_ARR'):
        hhmm = df[f'{prefix}_TIME']
        df[f'{prefix}_HOUR'] = hhmm // 100
        df[f'{prefix}_MINUTE'] = hhmm % 100

    # Left-closed six-hour buckets (right=False): [0, 6) Night, [6, 12) Morning,
    # [12, 18) Afternoon, [18, 24) Evening. Hours outside [0, 24) become NaN.
    period_edges = [0, 6, 12, 18, 24]
    period_names = ['Night', 'Morning', 'Afternoon', 'Evening']
    df['DEP_TIME_PERIOD'] = pd.cut(
        df['CRS_DEP_HOUR'], bins=period_edges, labels=period_names, right=False
    )
    return df

In [None]:
def process_flight_data(df):
    """Add calendar and scheduled-time features to the flight DataFrame.

    Args:
        df: DataFrame with ``FL_DATE`` plus the columns required by
            ``process_time_columns``; it is modified in place.

    Returns:
        The DataFrame returned by ``process_time_columns``.
    """
    # Unparseable dates become NaT instead of raising (errors='coerce').
    parsed_dates = pd.to_datetime(df['FL_DATE'], errors='coerce')
    df['FL_DATE'] = parsed_dates

    # weekday numbering: 0=Monday ... 6=Sunday, so 5 and 6 mark the weekend
    df['DayOfWeek'] = df['FL_DATE'].dt.weekday
    df['IsWeekend'] = df['DayOfWeek'] >= 5

    # Scheduled departure/arrival times are expanded by the sibling helper.
    return process_time_columns(df)

In [None]:
# Add the date- and time-derived feature columns to df.
df = process_flight_data(df)

In [None]:
# Columns removed from the frame before modelling (original order preserved).
columns_to_drop = [
    'CANCELLED', 'CANCELLATION_CODE',
    'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN',
    'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS',
    'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT',
    'DOT_CODE', 'AIRLINE_CODE',
    'ORIGIN_CITY', 'DEST_CITY',
    'AIRLINE_DOT', 'FL_NUMBER', 'DIVERTED',
    'FL_DATE',
]

In [None]:
# Drop the columns listed in columns_to_drop and preview the result.
# NOTE(review): df is overwritten here, so running this cell a second time
# raises KeyError because the columns are already gone.
df = df.drop(columns=columns_to_drop)
df.head()

In [None]:
# NOTE(review): this cell repeats the `le` / `clean_labels_encoder` definition
# from an earlier cell; being the later one, it is the definition in effect
# when the helper is called.
le = LabelEncoder()

def clean_labels_encoder(list_of_labels, df):
    """Label-encode the given columns of ``df`` with the shared ``le`` encoder.

    ``le`` is refit for each column, so once the loop finishes it only retains
    the classes of the final column.

    Args:
        list_of_labels: Names of the columns to convert to integer codes.
        df: DataFrame holding those columns; it is modified in place.

    Returns:
        The same ``df`` object, with every listed column replaced by its codes.
    """
    for column_name in list_of_labels:
        codes = le.fit_transform(df[column_name])
        df[column_name] = codes
    return df

In [None]:
# clean the labels
# These four columns are replaced by the integer codes produced by LabelEncoder.
list_of_labels = ['AIRLINE','ORIGIN', 'DEST','DEP_TIME_PERIOD']
df = clean_labels_encoder(list_of_labels, df)

df.head()

In [None]:
# describe the dataset
# (summary statistics from DataFrame.describe(), computed after label encoding)
df.describe()

In [None]:
# Summary statistics of the arrival-delay target before any scaling.
min_delay = df['ARR_DELAY'].min()
max_delay = df['ARR_DELAY'].max()
mean_delay = df['ARR_DELAY'].mean()
std_delay = df['ARR_DELAY'].std()  # pandas default: sample standard deviation (ddof=1)

# One print call, one statistic per line.
print(
    f"Min delay: {min_delay}",
    f"Max delay: {max_delay}",
    f"Mean delay: {mean_delay}",
    f"Standard deviation: {std_delay}",
    sep="\n",
)

In [None]:
# fill the missing values with mean
# NOTE(review): the column means are computed over the whole frame, i.e. before
# the train/validation/test split performed in a later cell, so the fill values
# include rows that end up in the validation and test sets.
df.fillna(df.mean(), inplace=True)

# show correlation
df.corr()

In [None]:
# show the correlation in a plt figure

def show_correlation(df):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    Args:
        df: DataFrame whose ``corr()`` matrix is plotted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(20, 10))
    sns.set(context='notebook', style='whitegrid')
    correlation_matrix = df.corr()
    sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, square=False)
    plt.show()

In [None]:
# show the correlation
# (heatmap of df.corr() over all columns currently in df, including the target)
show_correlation(df)

In [None]:
# split the data into features and target
# target is ARR_DELAY

# X keeps every remaining column of df except the target.
# NOTE(review): columns such as DEP_DELAY or ARR_TIME, if present in the CSV,
# are not in columns_to_drop and would remain in X — confirm they are meant to
# be model inputs.
X = df.drop(columns=['ARR_DELAY'])
y = df['ARR_DELAY']

In [None]:
# Split into training+validation and test sets
# shuffle=False keeps the existing row order: the last 30% of rows form the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Split the training+validation set into training and validation sets
# 30% of the remaining 70% -> roughly 49% train / 21% validation / 30% test overall.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.3, shuffle=False)

In [None]:
# NOTE(review): this rescales ARR_DELAY inside df only, with a scaler fit on the
# full column. X and y were already taken from df in an earlier cell, and the
# model targets are scaled separately with scaler_y further down, so this
# appears to affect only the statistics printed in the next cell — confirm.
# Re-running the earlier X/y cell after this one would pick up the scaled column.
scaler = MinMaxScaler()
df['ARR_DELAY'] = scaler.fit_transform(df[['ARR_DELAY']])

In [None]:
# Statistics of df['ARR_DELAY'] after the MinMax scaling applied to that column.
print(
    f"After Scaling - Min delay: {df['ARR_DELAY'].min()}",
    f"After Scaling - Max delay: {df['ARR_DELAY'].max()}",
    f"After Scaling - Mean delay: {df['ARR_DELAY'].mean()}",
    f"After Scaling - Standard deviation: {df['ARR_DELAY'].std()}",
    sep="\n",
)

In [None]:
# Scale the data
# Both scalers are fit on the training split only; the validation and test
# splits are transformed with the training-set minimum/maximum.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Features
X_train = scaler_X.fit_transform(X_train)
X_val, X_test = scaler_X.transform(X_val), scaler_X.transform(X_test)

# Targets: each Series is turned into an (n, 1) array before scaling
y_train, y_val, y_test = (
    y_train.values.reshape(-1, 1),
    y_val.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
)

y_train = scaler_y.fit_transform(y_train)
y_val, y_test = scaler_y.transform(y_val), scaler_y.transform(y_test)

In [None]:
# Reshape the data for input to the LSTM model
# Each split goes from (samples, features) to (samples, 1, features), i.e. one
# timestep per sample.
X_train = X_train.reshape(len(X_train), 1, X_train.shape[-1])
X_val = X_val.reshape(len(X_val), 1, X_val.shape[-1])
X_test = X_test.reshape(len(X_test), 1, X_test.shape[-1])

In [None]:
# Define the Bidirectional LSTM model architecture
# Two stacked bidirectional LSTM layers (64 and 32 units, each followed by 20%
# dropout) feed a unidirectional 16-unit LSTM and a single-output Dense layer.
model = Sequential([
    Bidirectional(LSTM(64, return_sequences=True), input_shape=(1, X_train.shape[2])),
    Dropout(0.2),
    Bidirectional(LSTM(32, return_sequences=True)),
    Dropout(0.2),
    LSTM(16),
    Dense(1),
])

In [None]:
# Compile the model
# MSE is the training loss and MAE is tracked as an additional metric; both are
# measured in the scaled target units because y was transformed with scaler_y.
optimizer = Adam(learning_rate=0.0001)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])

In [None]:
# Train the model
# Early stopping monitors the loss on the held-out validation split and restores
# the best weights seen. The validation split (X_val, y_val) is used here rather
# than the test split, so the test set plays no part in choosing the stopping
# epoch and stays an unbiased final check for model.evaluate().
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val),
                    verbose=2, shuffle=False, callbacks=[early_stopping])

In [None]:
# Plot training & validation loss values
# Per-epoch curves recorded by model.fit in history.history.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Epochs')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Evaluate the model
# score[0] is the compiled loss (MSE) and score[1] the MAE metric, both in the
# MinMax-scaled target units.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss (MSE):', score[0])
print('Test MAE:', score[1])

In [None]:
# Predict and plot the results
# Predictions and targets are mapped back to the original ARR_DELAY units.
# NOTE(review): y_test is overwritten with its inverse-transformed values, so
# running this cell twice applies inverse_transform to y_test twice.
y_pred = model.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred)
y_test = scaler_y.inverse_transform(y_test)

In [None]:
# Overlay actual and predicted values, sample by sample in test-set order.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test, label='Actual Values', color='blue', alpha=0.7)
ax.plot(y_pred, label='Predicted Values', color='red', alpha=0.7)
ax.set_xlabel('Sample')
ax.set_ylabel('Value')
ax.set_title('Actual vs Predicted Values (Bidirectional LSTM)')
ax.legend()
plt.show()