# Flight Delay Prediction

## Data Loading and Preprocessing

In [None]:
import cudf
import cupy as cp

In [None]:
# Load the cleaned flight data: read the Parquet file (path is relative to the
# current working directory) into a cuDF DataFrame named flights_df.
# NOTE(review): the file name suggests cleaning happened upstream — confirm.
flights_df = cudf.read_parquet("cleaned_flights.parquet")

In [None]:
# Derive the hour component of each 'DATE' value and store it as 'DEPARTURE_HOUR'
# NOTE(review): assumes 'DATE' is a datetime column that carries a time of day — confirm
date_column = flights_df['DATE']
flights_df['DEPARTURE_HOUR'] = date_column.dt.hour

In [None]:
# Treat a missing 'DAILY_SNOWFALL' reading as 0 by filling the gaps in place
snowfall = flights_df['DAILY_SNOWFALL']
flights_df['DAILY_SNOWFALL'] = snowfall.fillna(0)

In [None]:
# Keep only the flights with a positive arrival delay; the copy makes
# delayed_flights independent of flights_df for the in-place edits that follow
is_delayed = flights_df['ARRIVAL_DELAY'] > 0
delayed_flights = flights_df[is_delayed].copy()

## PyTorch Implementation

In [None]:
import cupy as cp
from cuml.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split

In [None]:
# Standardize numeric columns for PyTorch models (subtract the mean, divide by
# the standard deviation), overwriting each column in delayed_flights.
# NOTE(review): the statistics are computed over all delayed flights, i.e.
# before the train/test split done later in this file — confirm that this
# leakage into the test set is acceptable.
for col in ['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK', 'DISTANCE']:
    col_mean = delayed_flights[col].mean()
    col_std = delayed_flights[col].std()
    # A constant column has a standard deviation of 0 (and a degenerate one may
    # yield NaN); dividing by it would fill the feature with NaN/inf and poison
    # training. Fall back to a scale of 1.0 so the column is only centred.
    if not col_std > 0:
        col_std = 1.0
    delayed_flights[col] = (delayed_flights[col] - col_mean) / col_std

In [None]:
# Build the model inputs: float32 numeric columns side by side with the
# one-hot encoded categorical columns, plus the arrival delay as the target
numeric_columns = ['MONTH', 'DEPARTURE_HOUR', 'DAY_OF_WEEK', 'DISTANCE', 'DAILY_SNOWFALL']
categorical_columns = ['AIRLINE', 'origin_airport/AIRPORT', 'destination_airport/AIRPORT']

numeric_feats = delayed_flights[numeric_columns].astype(cp.float32).values
one_hot_frame = cudf.get_dummies(delayed_flights[categorical_columns])
categorical_feats = one_hot_frame.values

# Feature matrix: numeric block first, then the dummy columns
X = cp.hstack([numeric_feats, categorical_feats])
# Regression target
y = delayed_flights['ARRIVAL_DELAY'].values

In [None]:
# Choose the torch backend in priority order: MPS, then CUDA, then CPU
if torch.backends.mps.is_available():
    backend_name = "mps"
else:
    backend_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend_name)

print(f"Using device: {device}")

In [None]:
# Turn the feature matrix into a float32 tensor on the selected device
X_tensor = torch.as_tensor(X, dtype=torch.float32, device=device)
# Same for the targets, then add a trailing axis so they are 2D (N, 1) for MSELoss
y_flat = torch.as_tensor(y, dtype=torch.float32, device=device)
y_tensor = y_flat.unsqueeze(dim=1)

In [None]:
# Create TensorDataset: pairs each feature row with its target so both are
# indexed and batched together
dataset = TensorDataset(X_tensor, y_tensor)

# Define train/test split ratio (80/20); the test set takes the remainder so
# the two sizes always add up to len(dataset)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Split the dataset with a seeded generator so that re-running this cell
# reproduces the same partition. With an unseeded split, a re-run reshuffles
# the rows and can move former test rows into the training set of a model that
# has already been trained, which would inflate the test metrics.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders: training batches are reshuffled every epoch, test batches
# keep a fixed order
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class FeedForwardNN(nn.Module):
    """Feedforward network with two ReLU-activated hidden layers.

    Data flows Linear -> ReLU -> Linear -> ReLU -> Linear; the last layer has
    no activation, so its raw output is returned.

    Args:
        input_size: Number of input features per sample.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept stable.
        self.layer_1 = nn.Linear(in_features=input_size, out_features=hidden_size1)
        self.relu1 = nn.ReLU()
        self.layer_2 = nn.Linear(in_features=hidden_size1, out_features=hidden_size2)
        self.relu2 = nn.ReLU()
        self.layer_3 = nn.Linear(in_features=hidden_size2, out_features=output_size)

    def forward(self, x):
        """Pass ``x`` through the three linear layers and return the result."""
        hidden = self.relu1(self.layer_1(x))
        hidden = self.relu2(self.layer_2(hidden))
        return self.layer_3(hidden)

In [None]:
# Network dimensions: the input width follows the feature matrix, the two
# hidden widths are example values, and the output is a single value per
# sample (the arrival delay)
input_size = X.shape[1]
hidden_size1, hidden_size2, output_size = 128, 64, 1

# Mean-squared-error loss for the regression target
criterion = nn.MSELoss()

# Build the network, move it to the selected device, and optimize it with Adam
model = FeedForwardNN(
    input_size=input_size,
    hidden_size1=hidden_size1,
    hidden_size2=hidden_size2,
    output_size=output_size,
)
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
# Training loop: num_epochs full passes over train_loader, one optimizer step
# per mini-batch
num_epochs = 20 # Adjust as needed

for epoch in range(num_epochs):
    model.train()  # training mode for this epoch
    batch_losses = []  # per-batch loss values, averaged at the end of the epoch

    for i, batch in enumerate(train_loader):
        # Make sure both halves of the batch live on the model's device
        inputs = batch[0].to(device)
        labels = batch[1].to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

        # Progress report every 100 batches
        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # Mean of the per-batch losses over the number of batches in the loader
    epoch_loss = sum(batch_losses) / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}] completed. Average Training Loss: {epoch_loss:.4f}')

In [None]:
# Evaluation: run the model over test_loader and report R2, MSE and RMSE
model.eval()  # Set model to evaluation mode
all_preds = []
all_labels = []

with torch.no_grad():  # inference only, no gradient tracking needed
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # Collect results on the host so they can be converted to NumPy below
        all_preds.append(outputs.cpu())
        all_labels.append(labels.cpu())

# Concatenate all predictions and labels. torch.cat is the canonical name;
# torch.concatenate is an alias that older PyTorch releases do not provide.
all_preds = torch.cat(all_preds, dim=0).numpy()
all_labels = torch.cat(all_labels, dim=0).numpy()

# Calculate metrics. Each result is converted to a plain Python float because
# the metric functions may hand back 0-d device arrays instead of floats
# (NOTE(review): return type depends on the cuML version — confirm); a plain
# float guarantees that the ':.4f' format specs below always apply.
r2 = float(r2_score(all_labels, all_preds))
mse = float(mean_squared_error(all_labels, all_preds))
rmse = mse ** 0.5

print(f'Test R-squared (R2): {r2:.4f}')
print(f'Test Mean Squared Error (MSE): {mse:.4f}')
print(f'Test Root Mean Squared Error (RMSE): {rmse:.4f}')