In [6]:
import pandas as pd
import numpy as np
import glob
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import json
from tqdm import tqdm
import warnings
import torch.nn.functional as F
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(2025)
np.random.seed(2025)

# Set device (CPU or GPU if available)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Utility function for JSON serialization
def convert_to_serializable(obj):
    """
    Convert NumPy types to Python native types for JSON serialization
    """
    if isinstance(obj, (np.integer, np.int64, np.int32, np.int16, np.int8)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, (np.ndarray,)):
        return obj.tolist()
    elif isinstance(obj, (pd.DataFrame,)):
        return obj.to_dict('records')
    elif isinstance(obj, (pd.Series,)):
        return obj.to_dict()
    elif isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    else:
        return obj

# Set data paths
flight_data_path = './cleaned_data/'
weather_data_path = './cleaned_weather_data/'
top_airports_file = './top_100_airports.csv'  # File containing top 100 airports
output_dir = './dep_delay_nn/'  # PyTorch ResNet output directory

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print("Starting year-by-year flight delay prediction models (RNN)...")
print(f"Flight data directory: {flight_data_path}")
print(f"Weather data directory: {weather_data_path}")
print(f"Top airports file: {top_airports_file}")
print(f"Model output directory: {output_dir}")

# Load top 30 airports from the top 100 airports file
try:
    # Load the airport data with the exact format provided
    top_airports = pd.read_csv(top_airports_file, low_memory=False)
    
    # The file already has a Rank column, so we can just take the top 30
    top_airports = top_airports.head(30)
    
    # The airport codes are in ORIGIN_IATA column
    top_airport_codes = set(top_airports['ORIGIN_IATA'].str.strip().tolist())
    
    print(f"Loaded top 30 airports: {', '.join(sorted(top_airport_codes))}")
    print(f"Busiest airport: {top_airports.iloc[0]['ORIGIN_IATA']} with {top_airports.iloc[0]['Times']} flights")
    print(f"30th busiest airport: {top_airports.iloc[29]['ORIGIN_IATA']} with {top_airports.iloc[29]['Times']} flights")
except Exception as e:
    print(f"Error loading top airports file: {e}")
    # Fallback: if file doesn't exist, we'll use all airports
    top_airport_codes = None
    print("Will process all airports (top airports file not available)")

Using device: cuda:0
Starting year-by-year flight delay prediction models (RNN)...
Flight data directory: ./cleaned_data/
Weather data directory: ./cleaned_weather_data/
Top airports file: ./top_100_airports.csv
Model output directory: ./dep_delay_nn/
Loaded top 30 airports: ATL, AUS, BNA, BOS, BWI, CLT, DCA, DEN, DFW, DTW, EWR, FLL, IAD, IAH, JFK, LAS, LAX, LGA, MCO, MDW, MIA, MSP, ORD, PHL, PHX, SAN, SEA, SFO, SLC, TPA
Busiest airport: ATL with 457121 flights
30th busiest airport: TPA with 97235 flights


In [7]:
class AttentionLayer(nn.Module):
    """
    Attention mechanism layer to focus on important features
    """
    def __init__(self, hidden_dim):
        super(AttentionLayer, self).__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.Tanh(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, x):
        # x shape: (batch_size, seq_len, hidden_dim)
        attention_weights = F.softmax(self.attention(x), dim=1)
        # attention_weights shape: (batch_size, seq_len, 1)

        context_vector = torch.sum(attention_weights * x, dim=1)
        # context_vector shape: (batch_size, hidden_dim)

        return context_vector, attention_weights

class RNNAttentionClassifier(nn.Module):
    """
    Flight delay classification model based on RNN and attention mechanism
    """
    def __init__(self, input_dim, hidden_dim=256, rnn_layers=2, dropout=0.3, bidirectional=True):
        super(RNNAttentionClassifier, self).__init__()

        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.rnn_layers = rnn_layers

        # Input feature embedding layer
        self.embedding = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # RNN layer (LSTM)
        self.rnn = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=rnn_layers,
            batch_first=True,
            dropout=dropout if rnn_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Attention layer
        self.attention = AttentionLayer(hidden_dim * 2 if bidirectional else hidden_dim)

        # Prediction layer
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        batch_size = x.size(0)

        # Apply embedding layer
        x = self.embedding(x)

        # Reshape for RNN (assuming we're dealing with single time step)
        x = x.unsqueeze(1)  # becomes (batch_size, 1, hidden_dim)

        # Apply RNN
        self.rnn.flatten_parameters()
        rnn_out, _ = self.rnn(x)
        # rnn_out shape: (batch_size, 1, hidden_dim*2) if bidirectional

        # Apply attention
        context, attention_weights = self.attention(rnn_out)

        # Predict delay probability
        delay_prob = self.classifier(context)
        return delay_prob

class RNNAttentionRegressor(nn.Module):
    """
    Flight delay time prediction model based on RNN and attention mechanism
    """
    def __init__(self, input_dim, hidden_dim=256, rnn_layers=2, dropout=0.3, bidirectional=True):
        super(RNNAttentionRegressor, self).__init__()

        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.rnn_layers = rnn_layers

        # Input feature embedding layer
        self.embedding = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout)
        )

        # RNN layer (GRU - may be better for time series)
        self.rnn = nn.GRU(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=rnn_layers,
            batch_first=True,
            dropout=dropout if rnn_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Attention layer
        self.attention = AttentionLayer(hidden_dim * 2 if bidirectional else hidden_dim)

        # Prediction layer
        self.regressor = nn.Sequential(
            nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, x):
        batch_size = x.size(0)

        # Apply embedding layer
        x = self.embedding(x)

        # Reshape for RNN (assuming we're dealing with single time step)
        x = x.unsqueeze(1)  # becomes (batch_size, 1, hidden_dim)

        # Apply RNN
        self.rnn.flatten_parameters()
        rnn_out, _ = self.rnn(x)
        # rnn_out shape: (batch_size, 1, hidden_dim*2) if bidirectional

        # Apply attention
        context, attention_weights = self.attention(rnn_out)

        # Predict delay time
        delay_time = self.regressor(context)
        return delay_time

# Custom Huber loss function to mitigate the impact of outliers
class HuberLoss(nn.Module):
    def __init__(self, delta=10.0):
        super(HuberLoss, self).__init__()
        self.delta = delta

    def forward(self, y_pred, y_true):
        y_pred = y_pred.squeeze()
        y_true = y_true.squeeze()
        residual = torch.abs(y_pred - y_true)
        condition = residual < self.delta
        squared_loss = 0.5 * residual ** 2
        linear_loss = self.delta * (residual - 0.5 * self.delta)
        return torch.mean(torch.where(condition, squared_loss, linear_loss))


In [8]:

# Function to load weather data
def load_weather_data():
    print("\nLoading weather data...")
    start_time = time.time()
    
    all_files = glob.glob(os.path.join(weather_data_path, "*.csv"))
    print(f"Found {len(all_files)} total weather data files")
    weather_dict = {}
    count = 0
    matching_count = 0
    
    # Process all weather files
    for file in all_files:
        try:
            # Extract airport code and date information from filename
            filename = os.path.basename(file)
            parts = filename.split('.')[0].split('_')
            
            if len(parts) >= 3:
                iata = parts[0]  # Airport code (e.g., ABI)
                year = parts[1]  # Year (e.g., 2021)
                month_name = parts[2]  # Month name (e.g., Aug)
                
                # Convert month name to number
                month_map = {
                    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
                }
                
                if month_name in month_map:
                    month = month_map[month_name]
                    
                    # Only continue with top airports if we have a list
                    if top_airport_codes is None or iata in top_airport_codes:
                        # Read the weather data
                        weather_data = pd.read_csv(file, low_memory=False)
                        
                        # Ensure DATE column exists
                        if 'DATE' not in weather_data.columns:
                            print(f"Warning: DATE column not found in {filename}")
                            continue
                        
                        # Convert DATE to datetime
                        weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
                        
                        # Create the key for the weather dictionary
                        key = f"{iata}_{year}_{month}"
                        
                        # Store the weather data
                        weather_dict[key] = weather_data
                        matching_count += 1
                else:
                    print(f"Warning: Unknown month format in {filename}")
                
                count += 1
                
                # Print progress periodically
                if count % 100 == 0:
                    print(f"Processed {count} weather files, loaded {matching_count} matching files")
        except Exception as e:
            print(f"Error loading weather file {file}: {e}")
    
    print(f"Loaded {matching_count} weather files out of {count} processed files")
    print(f"Loading weather data took: {time.time() - start_time:.2f} seconds")
    return weather_dict

# Get specific May files from the cleaned_data directory
def get_may_files():
    may_files = [
        os.path.join(flight_data_path, "May2021.csv"),
        os.path.join(flight_data_path, "May2022.csv"),
        os.path.join(flight_data_path, "May2023.csv"),
        os.path.join(flight_data_path, "May2024.csv")
    ]
    
    # Verify each file exists
    existing_files = []
    for file_path in may_files:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            print(f"Warning: File {file_path} not found")
    
    return existing_files

# Get the May 2021-2024 flight data files
flight_files = get_may_files()
print(f"\nFound {len(flight_files)} May files to process:")
for f in flight_files:
    print(f"  - {os.path.basename(f)}")

if not flight_files:
    print("No May 2021-2024 files were found. Please check file paths.")
    exit(1)

# Load all weather data once
weather_dict = load_weather_data()

# Function to extract year from filename
def extract_year_from_filename(filename):
    # Extract year from 'May2021.csv', 'May2022.csv', etc.
    base_name = os.path.basename(filename)
    year_str = base_name.replace('May', '').split('.')[0]
    return int(year_str)

# Function to create red-eye flight indicator
def create_redeye_indicator(df):
    """
    Creates a binary indicator for red-eye flights (0-6 AM scheduled departure or arrival)
    
    Args:
        df: DataFrame containing flight data with SCH_DEP_TIME and SCH_ARR_TIME
        
    Returns:
        DataFrame with IS_REDEYE column added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Initialize IS_REDEYE to 0 (not a red-eye flight)
    df['IS_REDEYE'] = 0
    
    # Convert time columns to standard format if they exist
    time_columns = []
    
    # Check for SCH_DEP_TIME
    if 'SCH_DEP_TIME' in df.columns:
        time_columns.append('SCH_DEP_TIME')
    
    # Check for SCH_ARR_TIME
    if 'SCH_ARR_TIME' in df.columns:
        time_columns.append('SCH_ARR_TIME')
    
    # Process each time column
    for col in time_columns:
        if df[col].dtype != 'float64':
            try:
                # Handle any non-numeric values
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except:
                print(f"Warning: Could not convert {col} to numeric")
    
    # Identify red-eye flights based on scheduled departure time (0-6 AM)
    if 'SCH_DEP_TIME' in time_columns:
        # Times are in HHMM format (e.g., 130 = 1:30 AM, 545 = 5:45 AM)
        redeye_departure = (df['SCH_DEP_TIME'] >= 0) & (df['SCH_DEP_TIME'] < 600)
        df.loc[redeye_departure, 'IS_REDEYE'] = 1
        
        # Count departures identified as red-eye
        dep_redeye_count = redeye_departure.sum()
        print(f"Identified {dep_redeye_count} red-eye flights based on departure time (0-6 AM)")
    
    # Identify red-eye flights based on scheduled arrival time (0-6 AM)
    if 'SCH_ARR_TIME' in time_columns:
        redeye_arrival = (df['SCH_ARR_TIME'] >= 0) & (df['SCH_ARR_TIME'] < 600)
        df.loc[redeye_arrival, 'IS_REDEYE'] = 1
        
        # Count arrivals identified as red-eye
        arr_redeye_count = redeye_arrival.sum()
        print(f"Identified {arr_redeye_count} red-eye flights based on arrival time (0-6 AM)")
    
    # Print statistics about red-eye flights
    redeye_count = df['IS_REDEYE'].sum()
    total_count = len(df)
    print(f"Total identified red-eye flights: {redeye_count} out of {total_count} total flights ({redeye_count/total_count*100:.2f}%)")
    
    # Add a more detailed time-of-day categorical feature if needed
    if 'SCH_DEP_TIME' in time_columns:
        # Create a categorical time of day feature
        df['DEP_TIME_OF_DAY'] = pd.cut(
            df['SCH_DEP_TIME'], 
            bins=[0, 600, 1200, 1800, 2400],
            labels=['Early Morning (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)'],
            include_lowest=True
        )
        
        # Print distribution of flights by time of day
        time_dist = df['DEP_TIME_OF_DAY'].value_counts()
        print("\nDistribution of flights by departure time of day:")
        for time_cat, count in time_dist.items():
            print(f"  - {time_cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    return df

# Function to prepare departure delay data
def prepare_delay_data(df):
    """
    Prepares departure delay data for modeling
    
    Args:
        df: DataFrame containing flight data with DEP_DELAY column
        
    Returns:
        DataFrame with additional delay-related columns
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Ensure DEP_DELAY is numeric
    if 'DEP_DELAY' in df.columns:
        if df['DEP_DELAY'].dtype != 'float64':
            try:
                df['DEP_DELAY'] = pd.to_numeric(df['DEP_DELAY'], errors='coerce')
            except:
                print(f"Warning: Could not convert DEP_DELAY to numeric")
    else:
        print("Warning: DEP_DELAY column not found in dataset")
        return df
    
    # Create a binary feature for on-time departure (<=0 means on time or early)
    df['IS_DELAYED'] = (df['DEP_DELAY'] > 0).astype(int)
    
    # Create a categorical delay feature
    df['DELAY_CATEGORY'] = pd.cut(
        df['DEP_DELAY'],
        bins=[-float('inf'), -15, 0, 15, 60, 120, float('inf')],
        labels=['Very Early', 'Early', 'On Time', 'Moderate Delay',
                'Significant Delay', 'Severe Delay'],
        include_lowest=True
    )
    
    # Add absolute delay (for prediction error metrics)
    df['ABS_DELAY'] = np.abs(df['DEP_DELAY'])
    
    # Print delay statistics
    delay_count = df['IS_DELAYED'].sum()
    total_count = len(df)
    delay_rate = delay_count / total_count * 100
    
    print(f"\nDelay statistics:")
    print(f"Delayed flights: {delay_count}/{total_count} ({delay_rate:.2f}%)")
    print(f"On-time or early flights: {total_count - delay_count}/{total_count} ({100 - delay_rate:.2f}%)")
    
    print("\nDelay magnitude statistics:")
    print(f"Mean delay: {df['DEP_DELAY'].mean():.2f} minutes")
    print(f"Median delay: {df['DEP_DELAY'].median():.2f} minutes")
    print(f"Min delay: {df['DEP_DELAY'].min():.2f} minutes (negative means early departure)")
    print(f"Max delay: {df['DEP_DELAY'].max():.2f} minutes")
    
    # Clip extreme values for neural network training
    # This is important for neural networks as extreme outliers can cause training issues
    upper_limit = df['DEP_DELAY'].quantile(0.995)  # 99.5th percentile
    df['DEP_DELAY_CLIPPED'] = df['DEP_DELAY'].clip(upper=upper_limit)
    
    print(f"Clipped delay values above {upper_limit:.2f} minutes for neural network training")
    print(f"Number of clipped values: {(df['DEP_DELAY'] > upper_limit).sum()}")
    
    # Print delay category distribution
    delay_cat_dist = df['DELAY_CATEGORY'].value_counts()
    print("\nDelay category distribution:")
    for cat, count in delay_cat_dist.sort_index().items():
        print(f"  - {cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    return df

# Function to create advanced time features
def create_advanced_time_features(df):
    """
    Creates advanced time features including cyclic encoding, time blocks, and peak indicators
    
    Args:
        df: DataFrame containing flight data with SCH_DEP_TIME
        
    Returns:
        DataFrame with time features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    if 'SCH_DEP_TIME' not in df.columns:
        print("Warning: SCH_DEP_TIME column not found for time features")
        return df
    
    # Ensure SCH_DEP_TIME is numeric
    if df['SCH_DEP_TIME'].dtype != 'float64':
        try:
            df['SCH_DEP_TIME'] = pd.to_numeric(df['SCH_DEP_TIME'], errors='coerce')
        except:
            print(f"Warning: Could not convert SCH_DEP_TIME to numeric")
            return df
    
    # Extract hour and minute from SCH_DEP_TIME (time format is HHMM)
    df['DEP_HOUR'] = (df['SCH_DEP_TIME'] // 100).astype(int)
    df['DEP_MINUTE'] = (df['SCH_DEP_TIME'] % 100).astype(int)
    
    # Calculate time in minutes from midnight
    df['TIME_MINS'] = df['DEP_HOUR'] * 60 + df['DEP_MINUTE']
    
    # Create normalized time of day (0-1)
    df['NORMALIZED_TIME'] = df['TIME_MINS'] / (24 * 60)
    
    # Create cyclic encodings at multiple frequencies
    # 24-hour cycle
    df['HOUR_SIN'] = np.sin(2 * np.pi * df['DEP_HOUR'] / 24)
    df['HOUR_COS'] = np.cos(2 * np.pi * df['DEP_HOUR'] / 24)
    
    # 12-hour cycle (AM/PM pattern)
    df['HALFDAY_SIN'] = np.sin(2 * np.pi * df['DEP_HOUR'] / 12)
    df['HALFDAY_COS'] = np.cos(2 * np.pi * df['DEP_HOUR'] / 12)
    
    # 6-hour cycle (captures 4 parts of day)
    df['QUARTER_DAY_SIN'] = np.sin(2 * np.pi * df['DEP_HOUR'] / 6)
    df['QUARTER_DAY_COS'] = np.cos(2 * np.pi * df['DEP_HOUR'] / 6)
    
    # Create time blocks (each block is 3 hours)
    time_blocks = {
        0: 'Late Night (0-3)',
        1: 'Late Night (0-3)',
        2: 'Late Night (0-3)',
        3: 'Early Morning (3-6)',
        4: 'Early Morning (3-6)',
        5: 'Early Morning (3-6)',
        6: 'Morning (6-9)',
        7: 'Morning (6-9)',
        8: 'Morning (6-9)',
        9: 'Mid-Day (9-12)',
        10: 'Mid-Day (9-12)',
        11: 'Mid-Day (9-12)',
        12: 'Afternoon (12-15)',
        13: 'Afternoon (12-15)',
        14: 'Afternoon (12-15)',
        15: 'Evening (15-18)',
        16: 'Evening (15-18)',
        17: 'Evening (15-18)',
        18: 'Night (18-21)',
        19: 'Night (18-21)',
        20: 'Night (18-21)',
        21: 'Late Night (21-24)',
        22: 'Late Night (21-24)',
        23: 'Late Night (21-24)'
    }
    
    # Map hours to time blocks
    df['TIME_BLOCK'] = df['DEP_HOUR'].map(time_blocks)
    
    # Create binary variables for peak times
    # Morning peak (7-9 AM)
    df['IS_MORNING_PEAK'] = ((df['DEP_HOUR'] >= 7) & (df['DEP_HOUR'] <= 9)).astype(int)
    
    # Evening peak (4-7 PM)
    df['IS_EVENING_PEAK'] = ((df['DEP_HOUR'] >= 16) & (df['DEP_HOUR'] <= 19)).astype(int)
    
    # Create busy airport time indicators (combine time and hub status)
    if 'IS_MAJOR_HUB_ORIGIN' in df.columns:
        df['HUB_MORNING_PEAK'] = df['IS_MAJOR_HUB_ORIGIN'] * df['IS_MORNING_PEAK']
        df['HUB_EVENING_PEAK'] = df['IS_MAJOR_HUB_ORIGIN'] * df['IS_EVENING_PEAK']
    
    return df

# Function to create day of week features
def create_advanced_day_features(df):
    """
    Creates advanced day of week features with cyclic encoding
    
    Args:
        df: DataFrame containing flight data with WEEK column
        
    Returns:
        DataFrame with day features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Check if we have the WEEK column with text day names
    if 'WEEK' in df.columns:
        # Create a mapping from abbreviated day names to full day names
        day_name_map = {
            'Sun': 'Sunday',
            'Mon': 'Monday',
            'Tue': 'Tuesday',
            'Wed': 'Wednesday',
            'Thu': 'Thursday',
            'Fri': 'Friday',
            'Sat': 'Saturday'
        }
        
        # Map abbreviated names to full names
        df['DAY_NAME'] = df['WEEK'].map(day_name_map)
        
        # Create weekend indicator
        df['IS_WEEKEND'] = df['WEEK'].isin(['Sat', 'Sun']).astype(int)
        
        # Create day of week encoding - using cyclic encoding for better neural network performance
        # This captures the cyclical nature of days of the week
        # Convert day names to 0-6 numerical values (Monday=0, Sunday=6)
        day_to_num = {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
        df['DAY_NUM'] = df['WEEK'].map(day_to_num)
        
        # Create cyclical features for day of week
        # Weekly cycle
        df['DAY_SIN'] = np.sin(2 * np.pi * df['DAY_NUM'] / 7)
        df['DAY_COS'] = np.cos(2 * np.pi * df['DAY_NUM'] / 7)
        
        # Weekday/weekend cycle
        df['WEEKDAY_SIN'] = np.sin(np.pi * df['IS_WEEKEND'])
        df['WEEKDAY_COS'] = np.cos(np.pi * df['IS_WEEKEND'])
        
        # Create workweek features (5-day cycle for business days)
        df['WORKWEEK_DAY'] = df['DAY_NUM'].apply(lambda x: x if x < 5 else np.nan)
        # Fill weekend days with the mean of weekdays
        work_day_mean = df['WORKWEEK_DAY'].mean()
        df['WORKWEEK_DAY'] = df['WORKWEEK_DAY'].fillna(work_day_mean)
        
        # Workweek cycle
        df['WORKWEEK_SIN'] = np.sin(2 * np.pi * df['WORKWEEK_DAY'] / 5)
        df['WORKWEEK_COS'] = np.cos(2 * np.pi * df['WORKWEEK_DAY'] / 5)
        
        # Print distribution of days
        day_counts = df['DAY_NAME'].value_counts()
        total = len(df)
        print("\nDistribution of flights by day of week:")
        for day, count in day_counts.items():
            print(f"  - {day}: {count} flights ({count/total*100:.2f}%)")
        
        # Print weekend vs. weekday distribution
        weekend_count = df['IS_WEEKEND'].sum()
        weekday_count = total - weekend_count
        print(f"\nWeekend flights: {weekend_count} ({weekend_count/total*100:.2f}%)")
        print(f"Weekday flights: {weekday_count} ({weekday_count/total*100:.2f}%)")
        
    elif 'DAY_OF_WEEK' in df.columns:
        # Assuming 1=Monday, ..., 7=Sunday or 0=Monday, ..., 6=Sunday
        max_day = df['DAY_OF_WEEK'].max()
        
        if max_day == 7:
            # 1-7 format (6,7 = weekend)
            df['IS_WEEKEND'] = ((df['DAY_OF_WEEK'] == 6) | (df['DAY_OF_WEEK'] == 7)).astype(int)
            
            # Map day numbers to names for better interpretability
            day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 
                        4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'}
            
            # For cyclic encoding, convert to 0-6 range
            df['DAY_NUM'] = df['DAY_OF_WEEK'] - 1
        else:
            # 0-6 format (5,6 = weekend)
            df['IS_WEEKEND'] = ((df['DAY_OF_WEEK'] == 5) | (df['DAY_OF_WEEK'] == 6)).astype(int)
            
            # Map day numbers to names for better interpretability
            day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 
                        3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
            
            # For cyclic encoding, we already have 0-6 range
            df['DAY_NUM'] = df['DAY_OF_WEEK']
        
        df['DAY_NAME'] = df['DAY_OF_WEEK'].map(day_names)
        
        # Add cyclic encoding
        df['DAY_SIN'] = np.sin(2 * np.pi * df['DAY_NUM'] / 7)
        df['DAY_COS'] = np.cos(2 * np.pi * df['DAY_NUM'] / 7)
        
        # Weekday/weekend cycle
        df['WEEKDAY_SIN'] = np.sin(np.pi * df['IS_WEEKEND'])
        df['WEEKDAY_COS'] = np.cos(np.pi * df['IS_WEEKEND'])
    else:
        print("Warning: No day of week column (WEEK or DAY_OF_WEEK) found")
    
    return df

# Function to create advanced airport features
def create_airport_features(df):
    """
    Creates advanced airport features
    
    Args:
        df: DataFrame containing flight data with airport information
        
    Returns:
        DataFrame with airport features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Define major hub airports
    major_hubs = ['ATL', 'DFW', 'DEN', 'ORD', 'LAX', 'CLT', 'LAS', 'PHX', 'MCO', 'SEA']
    
    # Create hub indicators
    if 'ORIGIN_IATA' in df.columns:
        df['IS_MAJOR_HUB_ORIGIN'] = df['ORIGIN_IATA'].isin(major_hubs).astype(int)
    
    if 'DEST_IATA' in df.columns:
        df['IS_MAJOR_HUB_DEST'] = df['DEST_IATA'].isin(major_hubs).astype(int)
    
    # Create hub-to-hub flight indicator
    if 'IS_MAJOR_HUB_ORIGIN' in df.columns and 'IS_MAJOR_HUB_DEST' in df.columns:
        df['IS_HUB_TO_HUB'] = (df['IS_MAJOR_HUB_ORIGIN'] & df['IS_MAJOR_HUB_DEST']).astype(int)
    
    # Create region indicators (simplistic example)
    if 'ORIGIN_IATA' in df.columns:
        # West Coast airports
        west_coast = ['LAX', 'SFO', 'SEA', 'PDX', 'SAN', 'LAS']
        # East Coast airports
        east_coast = ['JFK', 'LGA', 'EWR', 'BOS', 'DCA', 'IAD', 'MIA', 'FLL', 'ATL', 'CLT']
        # Central/Midwest airports
        central = ['ORD', 'MDW', 'DFW', 'IAH', 'DEN', 'MSP', 'DTW', 'STL']
        
        df['IS_WEST_COAST_ORIGIN'] = df['ORIGIN_IATA'].isin(west_coast).astype(int)
        df['IS_EAST_COAST_ORIGIN'] = df['ORIGIN_IATA'].isin(east_coast).astype(int)
        df['IS_CENTRAL_ORIGIN'] = df['ORIGIN_IATA'].isin(central).astype(int)
        
        df['IS_WEST_COAST_DEST'] = df['DEST_IATA'].isin(west_coast).astype(int)
        df['IS_EAST_COAST_DEST'] = df['DEST_IATA'].isin(east_coast).astype(int)
        df['IS_CENTRAL_DEST'] = df['DEST_IATA'].isin(central).astype(int)
        
        # Transcontinental flight indicator
        df['IS_TRANSCON'] = ((df['IS_WEST_COAST_ORIGIN'] & df['IS_EAST_COAST_DEST']) | 
                             (df['IS_EAST_COAST_ORIGIN'] & df['IS_WEST_COAST_DEST'])).astype(int)
    
    # Create distance categories
    if 'DISTANCE' in df.columns:
        df['DISTANCE_CAT'] = pd.cut(
            df['DISTANCE'], 
            bins=[0, 500, 1000, 1500, 2000, float('inf')],
            labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
        )
        
        # Create distance-based features for neural network
        # Normalize distance
        max_dist = df['DISTANCE'].max()
        df['NORMALIZED_DISTANCE'] = df['DISTANCE'] / max_dist
        
        # Create logarithmic distance feature
        df['LOG_DISTANCE'] = np.log1p(df['DISTANCE'])
    
    return df

# Function to create advanced weather features
def create_weather_features(df):
    """
    Creates advanced weather features
    
    Args:
        df: DataFrame containing flight data with weather information
        
    Returns:
        DataFrame with weather features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Check if we have the basic weather features
    if 'PRCP' in df.columns:
        # Create precipitation categories
        df['RAIN_SEVERITY'] = pd.cut(
            df['PRCP'],
            bins=[-0.01, 0.0, 0.1, 0.5, 1.0, float('inf')],
            labels=[0, 1, 2, 3, 4]
        ).astype(int)
    
    # Combine weather features
    if 'RAIN_SEVERITY' in df.columns and 'EXTREME_WEATHER' in df.columns:
        df['WEATHER_SCORE'] = df['RAIN_SEVERITY'] + df['EXTREME_WEATHER'] * 3
    
    # Create weather interaction features
    if 'IS_MAJOR_HUB_ORIGIN' in df.columns and 'WEATHER_SCORE' in df.columns:
        df['HUB_WEATHER_IMPACT'] = df['IS_MAJOR_HUB_ORIGIN'] * df['WEATHER_SCORE']
    
    # Create time-weather interactions
    if 'IS_MORNING_PEAK' in df.columns and 'WEATHER_SCORE' in df.columns:
        df['PEAK_WEATHER_IMPACT'] = (df['IS_MORNING_PEAK'] | df['IS_EVENING_PEAK']) * df['WEATHER_SCORE']
    
    return df

# Function to load and preprocess a single flight data file
def load_and_process_flight_data(file_path):
    """
    Load and preprocess a single flight data file
    
    Args:
        file_path: Path to the flight data file
        
    Returns:
        DataFrame with processed flight data
    """
    print(f"\nProcessing {os.path.basename(file_path)}...")
    start_time = time.time()
    
    try:
        # Load flight data
        df = pd.read_csv(file_path, low_memory=False)
        original_size = len(df)
        
        # Extract year from filename
        file_year = extract_year_from_filename(file_path)
        
        # Ensure the year is properly set
        if 'YEAR' in df.columns:
            # Verify that the year in the data matches the filename
            unique_years = df['YEAR'].unique()
            print(f"Years found in data: {unique_years}")
            
            # If data has multiple years, filter to only the year from filename
            if len(unique_years) > 1:
                df = df[df['YEAR'] == file_year]
                print(f"Filtered to only year {file_year}: {len(df)} rows")
        else:
            # If no YEAR column exists, create one based on filename
            df['YEAR'] = file_year
            print(f"Added YEAR column with value {file_year}")
        
        # Ensure we only have May data
        if 'MONTH' in df.columns:
            month_counts = df['MONTH'].value_counts()
            print(f"Months found in data: {dict(month_counts)}")
            
            if 5 in month_counts:
                df = df[df['MONTH'] == 5]
                print(f"Filtered to only May data: {len(df)} rows")
            else:
                print(f"Warning: No May data found in file, but proceeding anyway as this should be May data based on filename")
        
        # Check for DEP_DELAY column
        if 'DEP_DELAY' not in df.columns:
            print(f"DEP_DELAY column not found in {os.path.basename(file_path)}. Skipping file.")
            return None
        
        # Filter for top airports if we have the list
        if top_airport_codes is not None:
            df = df[
                df['ORIGIN_IATA'].str.strip().isin(top_airport_codes) & 
                df['DEST_IATA'].str.strip().isin(top_airport_codes)
            ]
            
            filtered_size = len(df)
            print(f"Filtered from {original_size} to {filtered_size} rows for top 30 airports")
            
            # If no data left after filtering, skip this file
            if filtered_size == 0:
                print(f"No data remaining after filtering for top 30 airports. Skipping file.")
                return None
        
        # Remove cancelled flights since they don't have actual departure times
        if 'CANCELLED' in df.columns:
            cancelled_count = df['CANCELLED'].sum()
            if cancelled_count > 0:
                df = df[df['CANCELLED'] == 0]
                print(f"Removed {cancelled_count} cancelled flights, remaining: {len(df)}")
        
        # Handle missing values in DEP_DELAY
        if df['DEP_DELAY'].isnull().any():
            missing_count = df['DEP_DELAY'].isnull().sum()
            print(f"Found {missing_count} rows with missing DEP_DELAY values. Removing them.")
            df = df.dropna(subset=['DEP_DELAY'])
            print(f"After removing rows with missing DEP_DELAY: {len(df)} rows")
        
        print(f"Processing took: {time.time() - start_time:.2f} seconds")
        return df
        
    except Exception as e:
        print(f"Error processing file {os.path.basename(file_path)}: {e}")
        return None

# Function to match weather data to flights
def match_weather_data(df):
    """
    Match weather data to flight records
    
    Args:
        df: DataFrame containing flight data
        
    Returns:
        DataFrame with weather data added
    """
    print("\nMatching weather data with flights...")
    start_time = time.time()
    
    # Make sure necessary date columns exist
    date_columns_exist = all(col in df.columns for col in ['YEAR', 'MONTH', 'DAY'])
    if not date_columns_exist:
        print("Warning: Missing one or more date columns (YEAR, MONTH, DAY)")
        print("Weather data cannot be matched")
        return df
    
    # Create a date column for matching - convert to datetime
    df['FLIGHT_DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])
    
    # Create a column to hold the weather key pattern
    df['WEATHER_KEY'] = df['ORIGIN_IATA'] + '_' + df['YEAR'].astype(str) + '_' + df['MONTH'].astype(str).str.zfill(2)
    
    # Create columns for weather features
    weather_columns = ['EXTREME_WEATHER', 'PRCP', 'WT01', 'WT03', 'WT04', 'WT05', 'WT08', 'WT11']
    for col in weather_columns:
        if col not in df.columns:
            df[col] = 0.0
    
    # Process in batches
    matched_count = 0
    batch_size = 10000
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx]
        
        for idx, row in batch.iterrows():
            try:
                # Get the weather key based on the origin airport and date
                weather_key = row['WEATHER_KEY']
                flight_date = row['FLIGHT_DATE']
                
                # Check if this key exists in our weather dictionary
                if weather_key in weather_dict:
                    weather_data = weather_dict[weather_key]
                    
                    # Find matching weather data for the flight date
                    matching_weather = weather_data[weather_data['DATE'] == flight_date]
                    
                    if not matching_weather.empty:
                        # Match available weather columns
                        for col in weather_columns:
                            if col in matching_weather.columns:
                                df.at[idx, col] = matching_weather[col].iloc[0]
                        matched_count += 1
            except Exception as e:
                # Less verbose error reporting for speed
                pass
        
        # Print progress
        print(f"Processed {end_idx}/{len(df)} rows, matched {matched_count} flights with weather data")
    
    print(f"Matched weather data for {matched_count} flights ({matched_count/len(df)*100:.2f}%)")
    print(f"Weather matching took: {time.time() - start_time:.2f} seconds")
    
    return df


Found 4 May files to process:
  - May2021.csv
  - May2022.csv
  - May2023.csv
  - May2024.csv

Loading weather data...
Found 3550 total weather data files
Processed 100 weather files, loaded 0 matching files
Processed 200 weather files, loaded 0 matching files
Processed 300 weather files, loaded 16 matching files
Processed 400 weather files, loaded 32 matching files
Processed 500 weather files, loaded 32 matching files
Processed 600 weather files, loaded 48 matching files
Processed 700 weather files, loaded 48 matching files
Processed 800 weather files, loaded 48 matching files
Processed 900 weather files, loaded 64 matching files
Processed 1000 weather files, loaded 64 matching files
Processed 1100 weather files, loaded 80 matching files
Processed 1200 weather files, loaded 96 matching files
Processed 1300 weather files, loaded 112 matching files
Processed 1400 weather files, loaded 112 matching files
Processed 1500 weather files, loaded 112 matching files
Processed 1600 weather file

In [9]:

# Function to train RNN models
def train_rnn_attention_models(X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg, year, output_dir, device):
    """
    Build and train neural network models based on RNN+Attention

    Args:
        X_train: Training feature data
        X_test: Test feature data
        y_train_class: Training class labels
        y_test_class: Test class labels
        y_train_reg: Training regression values
        y_test_reg: Test regression values
        year: Year of the data
        output_dir: Output directory for model and results
        device: Device to use (CPU/GPU)

    Returns:
        Dictionary with model results and metrics
    """
    print("\nBuilding and training RNN+Attention neural network models...")
    start_time = time.time()

    # Create year-specific output directories
    plots_dir = os.path.join(output_dir, f'plots_{year}')
    models_dir = os.path.join(output_dir, f'models_{year}')
    metrics_dir = os.path.join(output_dir, f'metrics_{year}')

    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(metrics_dir, exist_ok=True)

    # Convert data to PyTorch tensors and create data loaders
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)

    y_train_class_tensor = torch.FloatTensor(y_train_class.values).to(device)
    y_test_class_tensor = torch.FloatTensor(y_test_class.values).to(device)

    y_train_reg_tensor = torch.FloatTensor(y_train_reg.values).to(device)
    y_test_reg_tensor = torch.FloatTensor(y_test_reg.values).to(device)

    # Create datasets
    train_class_dataset = TensorDataset(X_train_tensor, y_train_class_tensor)
    test_class_dataset = TensorDataset(X_test_tensor, y_test_class_tensor)

    train_reg_dataset = TensorDataset(X_train_tensor, y_train_reg_tensor)
    test_reg_dataset = TensorDataset(X_test_tensor, y_test_reg_tensor)

    # Create data loaders
    batch_size = 1024  # Adjust based on available memory
    train_class_loader = DataLoader(train_class_dataset, batch_size=batch_size, shuffle=True)
    test_class_loader = DataLoader(test_class_dataset, batch_size=batch_size)

    train_reg_loader = DataLoader(train_reg_dataset, batch_size=batch_size, shuffle=True)
    test_reg_loader = DataLoader(test_reg_dataset, batch_size=batch_size)

    # 1. Train Classification Model
    print("\nTraining RNN+Attention classification model...")
    class_model_start_time = time.time()

    # Initialize model
    input_dim = X_train.shape[1]
    classifier = RNNAttentionClassifier(
        input_dim=input_dim,
        hidden_dim=256,
        rnn_layers=2,
        dropout=0.3,
        bidirectional=True
    ).to(device)

    # Define loss function and optimizer
    criterion_class = nn.BCELoss()
    optimizer_class = torch.optim.Adam(classifier.parameters(), lr=0.001, weight_decay=1e-5)

    # Learning rate scheduler
    scheduler_class = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_class, mode='min', factor=0.5, patience=3, verbose=True
    )

    # Training loop parameters
    num_epochs = 50
    best_val_loss = float('inf')
    patience = 10
    patience_counter = 0
    best_model_state = None

    # Metrics tracking
    train_losses_class = []
    val_losses_class = []

    # Training loop
    for epoch in range(num_epochs):
        classifier.train()
        running_loss = 0.0

        for inputs, targets in tqdm(train_class_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Zero the parameter gradients
            optimizer_class.zero_grad()

            # Forward pass
            outputs = classifier(inputs)
            outputs = outputs.squeeze()
            loss = criterion_class(outputs, targets)

            # Backward pass and optimize
            loss.backward()
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)
            optimizer_class.step()

            running_loss += loss.item() * inputs.size(0)

        epoch_train_loss = running_loss / len(train_class_dataset)
        train_losses_class.append(epoch_train_loss)

        # Validation phase
        classifier.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in test_class_loader:
                outputs = classifier(inputs)
                outputs = outputs.squeeze()
                loss = criterion_class(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

        epoch_val_loss = val_loss / len(test_class_dataset)
        val_losses_class.append(epoch_val_loss)

        # Learning rate scheduling
        scheduler_class.step(epoch_val_loss)

        # Print stats
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')

        # Save best model
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = classifier.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f'Early stopping triggered after {epoch+1} epochs')
            break

    # Load best model
    classifier.load_state_dict(best_model_state)

    # Save model
    torch.save(classifier.state_dict(), os.path.join(models_dir, f'rnn_attention_classifier_{year}.pth'))

    class_model_training_time = time.time() - class_model_start_time
    print(f"Classification model training took: {class_model_training_time:.2f} seconds")

    # 2. Train Regression Model
    print("\nTraining RNN+Attention regression model...")
    reg_model_start_time = time.time()

    # Initialize model
    regressor = RNNAttentionRegressor(
        input_dim=input_dim,
        hidden_dim=256,
        rnn_layers=2,
        dropout=0.3,
        bidirectional=True
    ).to(device)

    # Define loss function and optimizer (using custom Huber loss)
    criterion_reg = HuberLoss(delta=15.0)
    optimizer_reg = torch.optim.Adam(regressor.parameters(), lr=0.001, weight_decay=1e-5)

    # Learning rate scheduler
    scheduler_reg = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_reg, mode='min', factor=0.5, patience=3, verbose=True
    )

    # Reset training loop parameters
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    # Metrics tracking
    train_losses_reg = []
    val_losses_reg = []

    # Training loop
    for epoch in range(num_epochs):
        regressor.train()
        running_loss = 0.0

        for inputs, targets in tqdm(train_reg_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Zero the parameter gradients
            optimizer_reg.zero_grad()

            # Forward pass
            outputs = regressor(inputs)
            outputs = outputs.squeeze()
            loss = criterion_reg(outputs, targets)

            # Backward pass and optimize
            loss.backward()
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(regressor.parameters(), max_norm=1.0)
            optimizer_reg.step()

            running_loss += loss.item() * inputs.size(0)

        epoch_train_loss = running_loss / len(train_reg_dataset)
        train_losses_reg.append(epoch_train_loss)

        # Validation phase
        regressor.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in test_reg_loader:
                outputs = regressor(inputs)
                outputs = outputs.squeeze()
                loss = criterion_reg(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

        epoch_val_loss = val_loss / len(test_reg_dataset)
        val_losses_reg.append(epoch_val_loss)

        # Learning rate scheduling
        scheduler_reg.step(epoch_val_loss)

        # Print stats
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')

        # Save best model
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = regressor.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f'Early stopping triggered after {epoch+1} epochs')
            break

    # Load best model
    regressor.load_state_dict(best_model_state)

    # Save model
    torch.save(regressor.state_dict(), os.path.join(models_dir, f'rnn_attention_regressor_{year}.pth'))

    reg_model_training_time = time.time() - reg_model_start_time
    print(f"Regression model training took: {reg_model_training_time:.2f} seconds")

    # 3. Evaluate Classification Model
    print("\nEvaluating classification model...")

    classifier.eval()
    all_preds = []
    all_targets = []
    all_probs = []

    with torch.no_grad():
        for inputs, targets in test_class_loader:
            outputs = classifier(inputs)
            outputs = outputs.squeeze()

            probs = outputs.cpu().numpy()
            preds = (outputs >= 0.5).float().cpu().numpy()

            all_probs.extend(probs)
            all_preds.extend(preds)
            all_targets.extend(targets.cpu().numpy())

    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)
    all_probs = np.array(all_probs)

    # Calculate metrics
    class_accuracy = (all_preds == all_targets).mean() * 100

    # Check if there are enough classes for ROC/AUC evaluation
    if len(np.unique(all_targets)) > 1:
        class_roc_auc = roc_auc_score(all_targets, all_probs)
    else:
        print("Warning: Only one class present in test set. ROC AUC score cannot be calculated.")
        class_roc_auc = 0.0

    class_report = classification_report(all_targets, all_preds, output_dict=True)
    class_cm = confusion_matrix(all_targets, all_preds)

    print(f"Classification Accuracy: {class_accuracy:.2f}%")
    print(f"Classification ROC AUC: {class_roc_auc:.4f}")

    # Check if '1' key exists in classification report
    if '1' in class_report:
        print(f"Classification Precision (Delayed): {class_report['1']['precision']:.4f}")
        print(f"Classification Recall (Delayed): {class_report['1']['recall']:.4f}")
        print(f"Classification F1 Score (Delayed): {class_report['1']['f1-score']:.4f}")
        precision = class_report['1']['precision']
        recall = class_report['1']['recall']
        f1_score = class_report['1']['f1-score']
    else:
        print("Warning: Class '1' not present in test results. Using default metrics.")
        precision = 0.0
        recall = 0.0
        f1_score = 0.0

    # 4. Evaluate Regression Model
    print("\nEvaluating regression model...")

    regressor.eval()
    all_reg_preds = []
    all_reg_targets = []

    with torch.no_grad():
        for inputs, targets in test_reg_loader:
            outputs = regressor(inputs)
            outputs = outputs.squeeze()

            all_reg_preds.extend(outputs.cpu().numpy())
            all_reg_targets.extend(targets.cpu().numpy())

    all_reg_preds = np.array(all_reg_preds)
    all_reg_targets = np.array(all_reg_targets)

    # Calculate metrics
    reg_mse = mean_squared_error(all_reg_targets, all_reg_preds)
    reg_rmse = np.sqrt(reg_mse)
    reg_mae = mean_absolute_error(all_reg_targets, all_reg_preds)
    reg_r2 = r2_score(all_reg_targets, all_reg_preds)

    print(f"Regression Mean Squared Error: {reg_mse:.2f}")
    print(f"Regression Root Mean Squared Error: {reg_rmse:.2f} minutes")
    print(f"Regression Mean Absolute Error: {reg_mae:.2f} minutes")
    print(f"Regression R² Score: {reg_r2:.4f}")

    # 5. Create plots and visualizations

    # Plot training/validation loss for classification
    plt.figure(figsize=(16, 10))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses_class, label='Train Loss')
    plt.plot(val_losses_class, label='Validation Loss')
    plt.title(f'Classification Loss Curves - {year}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Plot training/validation loss for regression
    plt.subplot(1, 2, 2)
    plt.plot(train_losses_reg, label='Train Loss')
    plt.plot(val_losses_reg, label='Validation Loss')
    plt.title(f'Regression Loss Curves - {year}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'rnn_learning_curves_{year}.png'))
    plt.close()

    # Plot confusion matrix for classification
    plt.figure(figsize=(16, 10))
    sns.heatmap(class_cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=['Not Delayed', 'Delayed'],
               yticklabels=['Not Delayed', 'Delayed'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Delay Classification Confusion Matrix ({year})')
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'rnn_confusion_matrix_{year}.png'))
    plt.close()

    # Check if ROC curve can be plotted
    if len(np.unique(all_targets)) > 1:
        # Plot ROC curve for classification
        plt.figure(figsize=(16, 10))
        fpr, tpr, _ = roc_curve(all_targets, all_probs)
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {class_roc_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve for Delay Classification ({year})')
        plt.legend()
        plt.savefig(os.path.join(plots_dir, f'rnn_roc_curve_{year}.png'))
        plt.close()

    # Plot actual vs predicted delays for regression
    plt.figure(figsize=(16, 10))

    # Create a scatterplot with limited points for clarity
    max_points = 5000
    if len(all_reg_targets) > max_points:
        idx = np.random.choice(len(all_reg_targets), max_points, replace=False)
        sample_actual = all_reg_targets[idx]
        sample_pred = all_reg_preds[idx]
    else:
        sample_actual = all_reg_targets
        sample_pred = all_reg_preds

    plt.scatter(sample_actual, sample_pred, alpha=0.3)

    # Add perfect prediction line
    max_val = max(np.max(sample_actual), np.max(sample_pred))
    min_val = min(np.min(sample_actual), np.min(sample_pred))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')

    plt.xlabel('Actual Delay (minutes)')
    plt.ylabel('Predicted Delay (minutes)')
    plt.title(f'Actual vs Predicted Delay ({year})')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'rnn_actual_vs_predicted_{year}.png'))
    plt.close()

    # Plot delay prediction error distribution
    plt.figure(figsize=(16, 10))
    prediction_errors = all_reg_targets - all_reg_preds
    sns.histplot(prediction_errors, bins=50, kde=True)
    plt.axvline(0, color='red', linestyle='--')
    plt.xlabel('Prediction Error (minutes)')
    plt.ylabel('Frequency')
    plt.title(f'Delay Prediction Error Distribution ({year})')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'rnn_error_distribution_{year}.png'))
    plt.close()

    # Return metrics
    metrics = {
        'model_name': f'rnn_attention_{year}',
        'year': year,

        # Classification metrics
        'class_accuracy': class_accuracy,
        'class_roc_auc': class_roc_auc,
        'class_precision': precision,
        'class_recall': recall,
        'class_f1': f1_score,
        'class_training_time': class_model_training_time,
        'class_epochs': len(train_losses_class),

        # Regression metrics
        'reg_mse': reg_mse,
        'reg_rmse': reg_rmse,
        'reg_mae': reg_mae,
        'reg_r2': reg_r2,
        'reg_training_time': reg_model_training_time,
        'reg_epochs': len(train_losses_reg),

        # Confusion matrix values
        'true_negative': int(class_cm[0, 0]) if class_cm.shape == (2, 2) else 0,
        'false_positive': int(class_cm[0, 1]) if class_cm.shape == (2, 2) else 0,
        'false_negative': int(class_cm[1, 0]) if class_cm.shape == (2, 2) else 0,
        'true_positive': int(class_cm[1, 1]) if class_cm.shape == (2, 2) else 0,

        'status': 'success',
        'total_processing_time': time.time() - start_time
    }

    # Save metrics to JSON
    with open(os.path.join(metrics_dir, f'rnn_attention_model_metrics_{year}.json'), 'w') as f:
        json.dump(convert_to_serializable(metrics), f, indent=4)

    return metrics, classifier, regressor

# Function to train model for a specific year
def train_year_rnn_model(year, flight_data_file, output_dir):
    """
    Train a model for a specific year's data using RNN+Attention architecture

    Args:
        year: Year to train model for
        flight_data_file: Path to the flight data file
        output_dir: Output directory

    Returns:
        Dictionary with model results or None if error
    """
    print(f"\n{'='*80}")
    print(f"Training RNN+Attention model for year {year}")
    print(f"{'='*80}")

    # Create year-specific output directories
    year_output_dir = os.path.join(output_dir, f'year_{year}_rnn')
    os.makedirs(year_output_dir, exist_ok=True)

    start_time = time.time()

    # Step 1: Load and preprocess the year's flight data
    flight_data = load_and_process_flight_data(flight_data_file)
    if flight_data is None or len(flight_data) == 0:
        print(f"No valid flight data available for {year}. Skipping this year.")
        return None

    # Step 2: Match weather data
    flight_data = match_weather_data(flight_data)

    # Step 3: Add advanced feature enhancements
    # Add red-eye flight indicator
    print(f"\nCreating red-eye flight indicator for {year}...")
    flight_data = create_redeye_indicator(flight_data)

    # Prepare delay data
    print(f"\nPreparing delay data for {year}...")
    flight_data = prepare_delay_data(flight_data)

    # Create time block features
    print(f"\nCreating advanced time features for {year}...")
    flight_data = create_advanced_time_features(flight_data)

    # Create day features
    print(f"\nCreating advanced day features for {year}...")
    flight_data = create_advanced_day_features(flight_data)

    # Create airport features
    print(f"\nCreating advanced airport features for {year}...")
    flight_data = create_airport_features(flight_data)

    # Create weather features
    print(f"\nCreating advanced weather features for {year}...")
    flight_data = create_weather_features(flight_data)

    # Step 4: Feature selection
    print(f"\nSelecting features for delay prediction for {year}...")

    # Categorical features
    cat_features = [
        'TIME_BLOCK', 'DAY_NAME', 'MKT_AIRLINE', 'ORIGIN_IATA', 'DEST_IATA',
        'DISTANCE_CAT'
    ]

    # Numerical features - include advanced features
    num_features = [
        # Basic features
        'DISTANCE', 'PRCP', 'EXTREME_WEATHER',
        'IS_REDEYE', 'IS_WEEKEND', 'IS_MORNING_PEAK', 'IS_EVENING_PEAK',

        # Cyclic time encodings
        'HOUR_SIN', 'HOUR_COS', 'HALFDAY_SIN', 'HALFDAY_COS',
        'QUARTER_DAY_SIN', 'QUARTER_DAY_COS',

        # Cyclic day encodings
        'DAY_SIN', 'DAY_COS', 'WEEKDAY_SIN', 'WEEKDAY_COS',
        'WORKWEEK_SIN', 'WORKWEEK_COS',

        # Advanced airport features
        'IS_MAJOR_HUB_ORIGIN', 'IS_MAJOR_HUB_DEST', 'IS_HUB_TO_HUB',
        'IS_WEST_COAST_ORIGIN', 'IS_EAST_COAST_ORIGIN', 'IS_CENTRAL_ORIGIN',
        'IS_WEST_COAST_DEST', 'IS_EAST_COAST_DEST', 'IS_CENTRAL_DEST',
        'IS_TRANSCON', 'NORMALIZED_DISTANCE', 'LOG_DISTANCE',

        # Advanced weather features
        'RAIN_SEVERITY', 'WEATHER_SCORE', 'HUB_WEATHER_IMPACT', 'PEAK_WEATHER_IMPACT'
    ]

    # Ensure all selected features exist in the dataframe
    cat_features = [f for f in cat_features if f in flight_data.columns]
    num_features = [f for f in num_features if f in flight_data.columns]

    print(f"Using categorical features: {cat_features}")
    print(f"Using numerical features: {num_features}")

    # Step 5: Prepare data for modeling
    X = flight_data[cat_features + num_features].copy()
    y_class = flight_data['IS_DELAYED']

    # For regression, use the clipped delay values to avoid extreme outliers
    if 'DEP_DELAY_CLIPPED' in flight_data.columns:
        y_reg = flight_data['DEP_DELAY_CLIPPED']
        print("Using clipped delay values for regression to improve neural network training")
    else:
        y_reg = flight_data['DEP_DELAY']

    # Handle missing values
    for col in cat_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna('unknown', inplace=True)
    for col in num_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].median(), inplace=True)

    # Preprocess the data - standardize numerical features and one-hot encode categorical features
    print(f"\nPreprocessing features for neural network training...")

    # Create preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)
        ]
    )

    # Fit and transform the data
    X_processed = preprocessor.fit_transform(X)

    print(f"Processed feature shape: {X_processed.shape}")

    # Split data for training and testing
    X_train, X_test, y_train_class, y_test_class = train_test_split(
        X_processed, y_class, test_size=0.1, random_state=2025, stratify=y_class
    )

    # Split data for regression model
    _, _, y_train_reg, y_test_reg = train_test_split(
        X_processed, y_reg, test_size=0.1, random_state=2025
    )

    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")

    # Step 6: Train RNN+Attention models
    metrics, classifier, regressor = train_rnn_attention_models(
        X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg,
        year, year_output_dir, device
    )

    # Step 7: Add dataset metrics
    metrics.update({
        'total_flights': len(flight_data),
        'delayed_flights_rate': flight_data['IS_DELAYED'].mean() * 100,
        'mean_delay': flight_data['DEP_DELAY'].mean(),
        'median_delay': flight_data['DEP_DELAY'].median(),
        'max_delay': float(flight_data['DEP_DELAY'].max()),
        'min_delay': float(flight_data['DEP_DELAY'].min()),
        'feature_count': X_processed.shape[1],
        'categorical_features': len(cat_features),
        'numerical_features': len(num_features)
    })

    # Save preprocessor for inference
    import joblib
    joblib.dump(preprocessor, os.path.join(year_output_dir, f'rnn_preprocessor_{year}.joblib'))
    print(f"Preprocessor saved to {os.path.join(year_output_dir, f'rnn_preprocessor_{year}.joblib')}")

    print(f"\nRNN+Attention model training for {year} complete! Total processing time: {time.time() - start_time:.2f} seconds")
    return metrics

In [10]:
def compare_models(all_results, model_type="resnet"):
    """
    Compare models across different years with support for different model types

    Args:
        all_results: Dictionary with results for each year
        model_type: Type of model (e.g., "resnet", "rnn_attention")

    Returns:
        None (saves comparison plots)
    """
    print(f"\nComparing {model_type.upper()} models across years...")

    if not all_results or len(all_results) < 2:
        print("Not enough year models to compare.")
        return

    # Create a comparison directory
    comparison_dir = os.path.join(output_dir, f'comparison_{model_type}')
    os.makedirs(comparison_dir, exist_ok=True)

    # Extract years and sort them
    years = sorted([r['year'] for r in all_results])

    # Create DataFrames for different metrics
    class_metrics = pd.DataFrame({
        'Year': years,
        'Accuracy (%)': [r['class_accuracy'] for r in all_results],
        'AUC': [r['class_roc_auc'] for r in all_results],
        'Precision': [r['class_precision'] for r in all_results],
        'Recall': [r['class_recall'] for r in all_results],
        'F1 Score': [r['class_f1'] for r in all_results],
    })

    reg_metrics = pd.DataFrame({
        'Year': years,
        'RMSE (min)': [r['reg_rmse'] for r in all_results],
        'MAE (min)': [r['reg_mae'] for r in all_results],
        'R² Score': [r['reg_r2'] for r in all_results],
    })

    timing_metrics = pd.DataFrame({
        'Year': years,
        'Classification Training Time (s)': [r['class_training_time'] for r in all_results],
        'Regression Training Time (s)': [r['reg_training_time'] for r in all_results],
        'Classification Epochs': [r.get('class_epochs', 0) for r in all_results],
        'Regression Epochs': [r.get('reg_epochs', 0) for r in all_results],
    })

    delay_stats = pd.DataFrame({
        'Year': years,
        'Mean Delay (min)': [r['mean_delay'] for r in all_results],
        'Delayed Flights (%)': [r['delayed_flights_rate'] for r in all_results],
        'Total Flights': [r['total_flights'] for r in all_results],
    })

    # 1. Plot classification metrics
    plt.figure(figsize=(16, 10))

    # Set up bar positions
    bar_width = 0.15
    r1 = np.arange(len(years))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]
    r4 = [x + bar_width for x in r3]
    r5 = [x + bar_width for x in r4]

    # Create bars
    plt.bar(r1, class_metrics['Accuracy (%)'] / 100, width=bar_width, label='Accuracy', color='blue')
    plt.bar(r2, class_metrics['AUC'], width=bar_width, label='AUC', color='green')
    plt.bar(r3, class_metrics['Precision'], width=bar_width, label='Precision', color='red')
    plt.bar(r4, class_metrics['Recall'], width=bar_width, label='Recall', color='purple')
    plt.bar(r5, class_metrics['F1 Score'], width=bar_width, label='F1 Score', color='orange')

    # Add texts on bars
    for i, r in enumerate([r1, r2, r3, r4, r5]):
        values = class_metrics.iloc[:, i+1].values
        if i == 0:  # Accuracy needs to be divided by 100
            values = values / 100
        for j, v in enumerate(values):
            plt.text(r[j], v + 0.01, f'{v:.2f}' if i > 0 else f'{v*100:.1f}%',
                    ha='center', va='bottom', rotation=0, fontsize=8)

    # Add labels and title
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.title(f'{model_type.upper()} Classification Metrics by Year')
    plt.xticks([r + 2*bar_width for r in range(len(years))], years)
    plt.legend()
    plt.ylim(0, 1.0)  # Set y-axis limits for better visualization
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, f'{model_type}_classification_metrics_by_year.png'))
    plt.close()

    # 2. Plot regression metrics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot RMSE and MAE
    x = np.arange(len(years))
    width = 0.35

    ax1.bar(x - width/2, reg_metrics['RMSE (min)'], width, label='RMSE')
    ax1.bar(x + width/2, reg_metrics['MAE (min)'], width, label='MAE')

    # Add text labels
    for i, v in enumerate(reg_metrics['RMSE (min)']):
        ax1.text(i - width/2, v + 0.5, f'{v:.1f}', ha='center', va='bottom')
    for i, v in enumerate(reg_metrics['MAE (min)']):
        ax1.text(i + width/2, v + 0.5, f'{v:.1f}', ha='center', va='bottom')

    ax1.set_xlabel('Year')
    ax1.set_ylabel('Minutes')
    ax1.set_title(f'{model_type.upper()} Regression Error Metrics')
    ax1.set_xticks(x)
    ax1.set_xticklabels(years)
    ax1.legend()
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Plot R² Score
    bars = ax2.bar(years, reg_metrics['R² Score'], color='green')

    # Add text labels
    for bar, value in zip(bars, reg_metrics['R² Score']):
        ax2.text(bar.get_x() + bar.get_width()/2, value + 0.01, f'{value:.3f}',
                ha='center', va='bottom')

    ax2.set_xlabel('Year')
    ax2.set_ylabel('R² Score')
    ax2.set_title(f'{model_type.upper()} Regression R² Score')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, f'{model_type}_regression_metrics_by_year.png'))
    plt.close()

    # 3. Plot training times and epochs
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot training times
    ax1.bar(x - width/2, timing_metrics['Classification Training Time (s)'], width, label='Classification')
    ax1.bar(x + width/2, timing_metrics['Regression Training Time (s)'], width, label='Regression')

    # Add text labels
    for i, v in enumerate(timing_metrics['Classification Training Time (s)']):
        ax1.text(i - width/2, v + 5, f'{v:.0f}s', ha='center', va='bottom')
    for i, v in enumerate(timing_metrics['Regression Training Time (s)']):
        ax1.text(i + width/2, v + 5, f'{v:.0f}s', ha='center', va='bottom')

    ax1.set_xlabel('Year')
    ax1.set_ylabel('Training Time (seconds)')
    ax1.set_title(f'{model_type.upper()} Training Times')
    ax1.set_xticks(x)
    ax1.set_xticklabels(years)
    ax1.legend()
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Plot epochs
    ax2.bar(x - width/2, timing_metrics['Classification Epochs'], width, label='Classification')
    ax2.bar(x + width/2, timing_metrics['Regression Epochs'], width, label='Regression')

    # Add text labels
    for i, v in enumerate(timing_metrics['Classification Epochs']):
        ax2.text(i - width/2, v + 0.5, f'{v:.0f}', ha='center', va='bottom')
    for i, v in enumerate(timing_metrics['Regression Epochs']):
        ax2.text(i + width/2, v + 0.5, f'{v:.0f}', ha='center', va='bottom')

    ax2.set_xlabel('Year')
    ax2.set_ylabel('Number of Epochs')
    ax2.set_title(f'{model_type.upper()} Training Epochs')
    ax2.set_xticks(x)
    ax2.set_xticklabels(years)
    ax2.legend()
    ax2.grid(axis='y', linestyle='--', alpha=0.7)

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, f'{model_type}_training_metrics_by_year.png'))
    plt.close()

    # 4. Plot delay statistics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot mean delay
    bars1 = ax1.bar(years, delay_stats['Mean Delay (min)'], color='blue')

    # Add text labels
    for bar, value in zip(bars1, delay_stats['Mean Delay (min)']):
        ax1.text(bar.get_x() + bar.get_width()/2, value + 0.3, f'{value:.1f}',
                ha='center', va='bottom')

    ax1.set_xlabel('Year')
    ax1.set_ylabel('Minutes')
    ax1.set_title('Mean Delay by Year')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Plot delay rate
    bars2 = ax2.bar(years, delay_stats['Delayed Flights (%)'], color='red')

    # Add text labels
    for bar, value in zip(bars2, delay_stats['Delayed Flights (%)']):
        ax2.text(bar.get_x() + bar.get_width()/2, value + 0.5, f'{value:.1f}%',
                ha='center', va='bottom')

    ax2.set_xlabel('Year')
    ax2.set_ylabel('Percentage')
    ax2.set_title('Delayed Flights Rate by Year')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, f'{model_type}_delay_stats_by_year.png'))
    plt.close()

    # 5. Create a summary table for all metrics
    summary = pd.concat([
        delay_stats.set_index('Year'),
        class_metrics.set_index('Year').iloc[:, 1:],  # Skip the Year column
        reg_metrics.set_index('Year').iloc[:, 1:],    # Skip the Year column
        timing_metrics.set_index('Year').iloc[:, 1:]  # Skip the Year column
    ], axis=1)

    # Save the summary to CSV
    summary.to_csv(os.path.join(comparison_dir, f'{model_type}_delay_summary.csv'))
    print(f"Comparison summary saved to {os.path.join(comparison_dir, f'{model_type}_delay_summary.csv')}")

    # 6. If we have both ResNet and RNN results, create comparison between model types
    resnet_summary_file = os.path.join(output_dir, 'comparison', 'dep_delay_nn_summary.csv')
    if model_type == "rnn_attention" and os.path.exists(resnet_summary_file):
        try:
            # Load ResNet summary for comparison
            resnet_summary = pd.read_csv(resnet_summary_file)
            resnet_summary.set_index('Year', inplace=True)

            # Compare classification metrics
            compare_metrics(summary, resnet_summary, 'class_accuracy', 'Accuracy (%)', 'Classification Accuracy', comparison_dir)
            compare_metrics(summary, resnet_summary, 'class_roc_auc', 'AUC', 'Classification AUC', comparison_dir)

            # Compare regression metrics
            compare_metrics(summary, resnet_summary, 'reg_rmse', 'RMSE (min)', 'Regression RMSE', comparison_dir)
            compare_metrics(summary, resnet_summary, 'reg_r2', 'R² Score', 'Regression R² Score', comparison_dir)

            print(f"Model comparison (RNN vs ResNet) completed and saved to {comparison_dir}")
        except Exception as e:
            print(f"Error creating model comparison: {e}")

    print(f"{model_type.upper()} model comparison completed!")

def compare_metrics(rnn_summary, resnet_summary, metric_key, metric_label, title, output_dir):
    """
    Create comparison plots between RNN and ResNet models for a specific metric

    Args:
        rnn_summary: DataFrame with RNN model metrics
        resnet_summary: DataFrame with ResNet model metrics
        metric_key: Key of the metric to compare
        metric_label: Label for the metric
        title: Title for the plot
        output_dir: Directory to save the plot
    """
    plt.figure(figsize=(12, 6))

    # Ensure we're comparing the same years
    common_years = sorted(list(set(rnn_summary.index) & set(resnet_summary.index)))

    # Extract data for common years
    rnn_data = [rnn_summary.loc[year, metric_key] for year in common_years]
    resnet_data = [resnet_summary.loc[year, metric_key] for year in common_years]

    # Set up bar positions
    x = np.arange(len(common_years))
    width = 0.35

    # Create bars
    bars1 = plt.bar(x - width/2, rnn_data, width, label='RNN+Attention', color='blue')
    bars2 = plt.bar(x + width/2, resnet_data, width, label='ResNet', color='red')

    # Add text labels
    for i, (v1, v2) in enumerate(zip(rnn_data, resnet_data)):
        if metric_key == 'class_accuracy':
            # Format as percentage
            plt.text(i - width/2, v1 + 1, f'{v1:.1f}%', ha='center', va='bottom')
            plt.text(i + width/2, v2 + 1, f'{v2:.1f}%', ha='center', va='bottom')
        else:
            # Format as float
            plt.text(i - width/2, v1 + 0.01, f'{v1:.3f}', ha='center', va='bottom')
            plt.text(i + width/2, v2 + 0.01, f'{v2:.3f}', ha='center', va='bottom')

    # Add labels and title
    plt.xlabel('Year')
    plt.ylabel(metric_label)
    plt.title(f'Model Comparison: {title}')
    plt.xticks(x, common_years)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'comparison_{metric_key}.png'))
    plt.close()

# Main execution for RNN+Attention models
rnn_all_results = []

# Process each year's file separately
for file_path in flight_files:
    year = extract_year_from_filename(file_path)
    results = train_year_rnn_model(year, file_path, output_dir)

    if results:
        rnn_all_results.append(results)
        print(f"\nRNN+Attention model for year {year} completed successfully!")
    else:
        print(f"\nRNN+Attention model for year {year} failed.")

# After all individual models are trained, compare them
if len(rnn_all_results) > 1:
    # You can reuse the comparison function with the RNN results
    compare_models(rnn_all_results, model_type="rnn_attention")
else:
    print("\nNot enough successful RNN+Attention models to perform comparison.")

# Print final summary
print("\nYear-by-Year RNN+Attention Model Training Summary:")
for year_result in rnn_all_results:
    year = year_result['year']
    print(f"\nYear {year}:")
    print(f"  Total flights: {year_result['total_flights']:,}")
    print(f"  Classification accuracy: {year_result['class_accuracy']:.2f}%")
    print(f"  Classification AUC: {year_result['class_roc_auc']:.4f}")
    print(f"  Regression RMSE: {year_result['reg_rmse']:.2f} minutes")
    print(f"  Regression R²: {year_result['reg_r2']:.4f}")
    print(f"  Mean delay: {year_result['mean_delay']:.2f} minutes")
    print(f"  Delay rate: {year_result['delayed_flights_rate']:.2f}%")
    print(f"  Training time: {year_result['total_processing_time']:.2f} seconds")

print("\nRNN+Attention model training complete! Check output directories for detailed results.")


Training RNN+Attention model for year 2021

Processing May2021.csv...
Years found in data: [2021]
Months found in data: {5: 520059}
Filtered to only May data: 520059 rows
Filtered from 520059 to 171867 rows for top 30 airports
Removed 485.0 cancelled flights, remaining: 171382
Processing took: 2.77 seconds

Matching weather data with flights...
Processed 10000/171382 rows, matched 7633 flights with weather data
Processed 20000/171382 rows, matched 15255 flights with weather data
Processed 30000/171382 rows, matched 22811 flights with weather data
Processed 40000/171382 rows, matched 30386 flights with weather data
Processed 50000/171382 rows, matched 37926 flights with weather data
Processed 60000/171382 rows, matched 45609 flights with weather data
Processed 70000/171382 rows, matched 53275 flights with weather data
Processed 80000/171382 rows, matched 60864 flights with weather data
Processed 90000/171382 rows, matched 68367 flights with weather data
Processed 100000/171382 rows, ma

Epoch 1/50: 100%|██████████| 151/151 [00:01<00:00, 78.82it/s]


Epoch 1/50, Train Loss: 0.5936, Val Loss: 0.5717


Epoch 2/50: 100%|██████████| 151/151 [00:01<00:00, 85.42it/s]


Epoch 2/50, Train Loss: 0.5708, Val Loss: 0.5676


Epoch 3/50: 100%|██████████| 151/151 [00:01<00:00, 85.34it/s]


Epoch 3/50, Train Loss: 0.5652, Val Loss: 0.5653


Epoch 4/50: 100%|██████████| 151/151 [00:01<00:00, 88.75it/s]


Epoch 4/50, Train Loss: 0.5616, Val Loss: 0.5640


Epoch 5/50: 100%|██████████| 151/151 [00:01<00:00, 83.93it/s]


Epoch 5/50, Train Loss: 0.5586, Val Loss: 0.5623


Epoch 6/50: 100%|██████████| 151/151 [00:01<00:00, 91.54it/s]


Epoch 6/50, Train Loss: 0.5563, Val Loss: 0.5625


Epoch 7/50: 100%|██████████| 151/151 [00:01<00:00, 82.04it/s]


Epoch 7/50, Train Loss: 0.5539, Val Loss: 0.5596


Epoch 8/50: 100%|██████████| 151/151 [00:01<00:00, 87.07it/s]


Epoch 8/50, Train Loss: 0.5521, Val Loss: 0.5596


Epoch 9/50: 100%|██████████| 151/151 [00:01<00:00, 85.40it/s]


Epoch 9/50, Train Loss: 0.5500, Val Loss: 0.5592


Epoch 10/50: 100%|██████████| 151/151 [00:01<00:00, 84.06it/s]


Epoch 10/50, Train Loss: 0.5483, Val Loss: 0.5584


Epoch 11/50: 100%|██████████| 151/151 [00:01<00:00, 81.80it/s]


Epoch 11/50, Train Loss: 0.5468, Val Loss: 0.5570


Epoch 12/50: 100%|██████████| 151/151 [00:01<00:00, 85.14it/s]


Epoch 12/50, Train Loss: 0.5463, Val Loss: 0.5572


Epoch 13/50: 100%|██████████| 151/151 [00:01<00:00, 84.87it/s]


Epoch 13/50, Train Loss: 0.5440, Val Loss: 0.5579


Epoch 14/50: 100%|██████████| 151/151 [00:01<00:00, 81.77it/s]


Epoch 14/50, Train Loss: 0.5426, Val Loss: 0.5585


Epoch 15/50: 100%|██████████| 151/151 [00:01<00:00, 80.98it/s]


Epoch 15/50, Train Loss: 0.5413, Val Loss: 0.5566


Epoch 16/50: 100%|██████████| 151/151 [00:01<00:00, 84.83it/s]


Epoch 16/50, Train Loss: 0.5399, Val Loss: 0.5562


Epoch 17/50: 100%|██████████| 151/151 [00:01<00:00, 87.94it/s]


Epoch 17/50, Train Loss: 0.5391, Val Loss: 0.5578


Epoch 18/50: 100%|██████████| 151/151 [00:01<00:00, 82.77it/s]


Epoch 18/50, Train Loss: 0.5383, Val Loss: 0.5573


Epoch 19/50: 100%|██████████| 151/151 [00:01<00:00, 82.92it/s]


Epoch 19/50, Train Loss: 0.5371, Val Loss: 0.5568


Epoch 20/50: 100%|██████████| 151/151 [00:01<00:00, 88.46it/s]


Epoch 20/50, Train Loss: 0.5360, Val Loss: 0.5578


Epoch 21/50: 100%|██████████| 151/151 [00:01<00:00, 78.31it/s]


Epoch 21/50, Train Loss: 0.5309, Val Loss: 0.5561


Epoch 22/50: 100%|██████████| 151/151 [00:01<00:00, 82.86it/s]


Epoch 22/50, Train Loss: 0.5288, Val Loss: 0.5573


Epoch 23/50: 100%|██████████| 151/151 [00:01<00:00, 84.18it/s]


Epoch 23/50, Train Loss: 0.5276, Val Loss: 0.5570


Epoch 24/50: 100%|██████████| 151/151 [00:01<00:00, 85.47it/s]


Epoch 24/50, Train Loss: 0.5270, Val Loss: 0.5561


Epoch 25/50: 100%|██████████| 151/151 [00:01<00:00, 84.32it/s]


Epoch 25/50, Train Loss: 0.5234, Val Loss: 0.5566


Epoch 26/50: 100%|██████████| 151/151 [00:01<00:00, 87.09it/s]


Epoch 26/50, Train Loss: 0.5220, Val Loss: 0.5570


Epoch 27/50: 100%|██████████| 151/151 [00:01<00:00, 85.18it/s]


Epoch 27/50, Train Loss: 0.5211, Val Loss: 0.5569


Epoch 28/50: 100%|██████████| 151/151 [00:01<00:00, 83.05it/s]


Epoch 28/50, Train Loss: 0.5204, Val Loss: 0.5573


Epoch 29/50: 100%|██████████| 151/151 [00:01<00:00, 81.40it/s]


Epoch 29/50, Train Loss: 0.5179, Val Loss: 0.5577


Epoch 30/50: 100%|██████████| 151/151 [00:01<00:00, 81.28it/s]


Epoch 30/50, Train Loss: 0.5183, Val Loss: 0.5579


Epoch 31/50: 100%|██████████| 151/151 [00:01<00:00, 84.86it/s]


Epoch 31/50, Train Loss: 0.5177, Val Loss: 0.5579
Early stopping triggered after 31 epochs
Classification model training took: 60.55 seconds

Training RNN+Attention regression model...


Epoch 1/50: 100%|██████████| 151/151 [00:01<00:00, 86.02it/s]


Epoch 1/50, Train Loss: 117.6980, Val Loss: 115.5055


Epoch 2/50: 100%|██████████| 151/151 [00:01<00:00, 79.83it/s]


Epoch 2/50, Train Loss: 117.6468, Val Loss: 115.5032


Epoch 3/50: 100%|██████████| 151/151 [00:01<00:00, 76.27it/s]


Epoch 3/50, Train Loss: 117.6329, Val Loss: 115.5083


Epoch 4/50: 100%|██████████| 151/151 [00:01<00:00, 84.35it/s]


Epoch 4/50, Train Loss: 117.6249, Val Loss: 115.5056


Epoch 5/50: 100%|██████████| 151/151 [00:01<00:00, 89.29it/s]


Epoch 5/50, Train Loss: 117.6096, Val Loss: 115.5142


Epoch 6/50: 100%|██████████| 151/151 [00:01<00:00, 81.83it/s]


Epoch 6/50, Train Loss: 117.5850, Val Loss: 115.5563


Epoch 7/50: 100%|██████████| 151/151 [00:01<00:00, 79.56it/s]


Epoch 7/50, Train Loss: 117.5584, Val Loss: 115.5650


Epoch 8/50: 100%|██████████| 151/151 [00:01<00:00, 80.40it/s]


Epoch 8/50, Train Loss: 117.5415, Val Loss: 115.6051


Epoch 9/50: 100%|██████████| 151/151 [00:01<00:00, 77.88it/s]


Epoch 9/50, Train Loss: 117.5248, Val Loss: 115.6344


Epoch 10/50: 100%|██████████| 151/151 [00:01<00:00, 81.46it/s]


Epoch 10/50, Train Loss: 117.4792, Val Loss: 115.6512


Epoch 11/50: 100%|██████████| 151/151 [00:01<00:00, 80.33it/s]


Epoch 11/50, Train Loss: 117.4706, Val Loss: 115.6685


Epoch 12/50: 100%|██████████| 151/151 [00:01<00:00, 83.71it/s]


Epoch 12/50, Train Loss: 117.4373, Val Loss: 115.6902
Early stopping triggered after 12 epochs
Regression model training took: 23.91 seconds

Evaluating classification model...
Classification Accuracy: 72.61%
Classification ROC AUC: 0.7241

Evaluating regression model...
Regression Mean Squared Error: 849.15
Regression Root Mean Squared Error: 29.14 minutes
Regression Mean Absolute Error: 12.02 minutes
Regression R² Score: -0.0345
Preprocessor saved to ./dep_delay_nn/year_2021_rnn\rnn_preprocessor_2021.joblib

RNN+Attention model training for 2021 complete! Total processing time: 150.41 seconds

RNN+Attention model for year 2021 completed successfully!

Training RNN+Attention model for year 2022

Processing May2022.csv...
Years found in data: [2022]
Months found in data: {5: 602950}
Filtered to only May data: 602950 rows
Filtered from 602950 to 210079 rows for top 30 airports
Removed 4659.0 cancelled flights, remaining: 205420
Processing took: 3.09 seconds

Matching weather data with f

Epoch 1/50: 100%|██████████| 181/181 [00:02<00:00, 78.18it/s]


Epoch 1/50, Train Loss: 0.6382, Val Loss: 0.6230


Epoch 2/50: 100%|██████████| 181/181 [00:02<00:00, 82.83it/s]


Epoch 2/50, Train Loss: 0.6241, Val Loss: 0.6202


Epoch 3/50: 100%|██████████| 181/181 [00:02<00:00, 84.41it/s]


Epoch 3/50, Train Loss: 0.6202, Val Loss: 0.6192


Epoch 4/50: 100%|██████████| 181/181 [00:02<00:00, 82.56it/s]


Epoch 4/50, Train Loss: 0.6180, Val Loss: 0.6169


Epoch 5/50: 100%|██████████| 181/181 [00:02<00:00, 83.31it/s]


Epoch 5/50, Train Loss: 0.6160, Val Loss: 0.6154


Epoch 6/50: 100%|██████████| 181/181 [00:02<00:00, 80.55it/s]


Epoch 6/50, Train Loss: 0.6133, Val Loss: 0.6144


Epoch 7/50: 100%|██████████| 181/181 [00:02<00:00, 85.17it/s]


Epoch 7/50, Train Loss: 0.6119, Val Loss: 0.6141


Epoch 8/50: 100%|██████████| 181/181 [00:02<00:00, 79.68it/s]


Epoch 8/50, Train Loss: 0.6103, Val Loss: 0.6140


Epoch 9/50: 100%|██████████| 181/181 [00:02<00:00, 82.86it/s]


Epoch 9/50, Train Loss: 0.6088, Val Loss: 0.6144


Epoch 10/50: 100%|██████████| 181/181 [00:02<00:00, 81.10it/s]


Epoch 10/50, Train Loss: 0.6078, Val Loss: 0.6130


Epoch 11/50: 100%|██████████| 181/181 [00:02<00:00, 84.36it/s]


Epoch 11/50, Train Loss: 0.6064, Val Loss: 0.6127


Epoch 12/50: 100%|██████████| 181/181 [00:02<00:00, 85.75it/s]


Epoch 12/50, Train Loss: 0.6047, Val Loss: 0.6134


Epoch 13/50: 100%|██████████| 181/181 [00:02<00:00, 77.35it/s]


Epoch 13/50, Train Loss: 0.6040, Val Loss: 0.6117


Epoch 14/50: 100%|██████████| 181/181 [00:02<00:00, 83.81it/s]


Epoch 14/50, Train Loss: 0.6027, Val Loss: 0.6121


Epoch 15/50: 100%|██████████| 181/181 [00:02<00:00, 81.33it/s]


Epoch 15/50, Train Loss: 0.6017, Val Loss: 0.6107


Epoch 16/50: 100%|██████████| 181/181 [00:02<00:00, 82.63it/s]


Epoch 16/50, Train Loss: 0.6008, Val Loss: 0.6103


Epoch 17/50: 100%|██████████| 181/181 [00:02<00:00, 85.42it/s]


Epoch 17/50, Train Loss: 0.6001, Val Loss: 0.6113


Epoch 18/50: 100%|██████████| 181/181 [00:02<00:00, 83.71it/s]


Epoch 18/50, Train Loss: 0.5991, Val Loss: 0.6117


Epoch 19/50: 100%|██████████| 181/181 [00:02<00:00, 83.11it/s]


Epoch 19/50, Train Loss: 0.5980, Val Loss: 0.6126


Epoch 20/50: 100%|██████████| 181/181 [00:02<00:00, 83.99it/s]


Epoch 20/50, Train Loss: 0.5972, Val Loss: 0.6099


Epoch 21/50: 100%|██████████| 181/181 [00:02<00:00, 84.94it/s]


Epoch 21/50, Train Loss: 0.5966, Val Loss: 0.6111


Epoch 22/50: 100%|██████████| 181/181 [00:02<00:00, 79.53it/s]


Epoch 22/50, Train Loss: 0.5957, Val Loss: 0.6094


Epoch 23/50: 100%|██████████| 181/181 [00:02<00:00, 83.04it/s]


Epoch 23/50, Train Loss: 0.5944, Val Loss: 0.6090


Epoch 24/50: 100%|██████████| 181/181 [00:02<00:00, 83.37it/s]


Epoch 24/50, Train Loss: 0.5933, Val Loss: 0.6108


Epoch 25/50: 100%|██████████| 181/181 [00:02<00:00, 83.97it/s]


Epoch 25/50, Train Loss: 0.5929, Val Loss: 0.6102


Epoch 26/50: 100%|██████████| 181/181 [00:02<00:00, 83.77it/s]


Epoch 26/50, Train Loss: 0.5915, Val Loss: 0.6103


Epoch 27/50: 100%|██████████| 181/181 [00:02<00:00, 80.82it/s]


Epoch 27/50, Train Loss: 0.5909, Val Loss: 0.6099


Epoch 28/50: 100%|██████████| 181/181 [00:02<00:00, 83.75it/s]


Epoch 28/50, Train Loss: 0.5869, Val Loss: 0.6104


Epoch 29/50: 100%|██████████| 181/181 [00:02<00:00, 81.22it/s]


Epoch 29/50, Train Loss: 0.5847, Val Loss: 0.6104


Epoch 30/50: 100%|██████████| 181/181 [00:02<00:00, 82.88it/s]


Epoch 30/50, Train Loss: 0.5844, Val Loss: 0.6094


Epoch 31/50: 100%|██████████| 181/181 [00:02<00:00, 86.89it/s]


Epoch 31/50, Train Loss: 0.5835, Val Loss: 0.6103


Epoch 32/50: 100%|██████████| 181/181 [00:02<00:00, 83.11it/s]


Epoch 32/50, Train Loss: 0.5799, Val Loss: 0.6103


Epoch 33/50: 100%|██████████| 181/181 [00:02<00:00, 83.40it/s]


Epoch 33/50, Train Loss: 0.5791, Val Loss: 0.6100
Early stopping triggered after 33 epochs
Classification model training took: 77.59 seconds

Training RNN+Attention regression model...


Epoch 1/50: 100%|██████████| 181/181 [00:02<00:00, 83.70it/s]


Epoch 1/50, Train Loss: 198.3098, Val Loss: 197.3488


Epoch 2/50: 100%|██████████| 181/181 [00:02<00:00, 83.98it/s]


Epoch 2/50, Train Loss: 197.8185, Val Loss: 197.2995


Epoch 3/50: 100%|██████████| 181/181 [00:02<00:00, 81.04it/s]


Epoch 3/50, Train Loss: 197.7668, Val Loss: 197.3083


Epoch 4/50: 100%|██████████| 181/181 [00:02<00:00, 79.27it/s]


Epoch 4/50, Train Loss: 197.7687, Val Loss: 197.3158


Epoch 5/50: 100%|██████████| 181/181 [00:02<00:00, 83.55it/s]


Epoch 5/50, Train Loss: 197.7393, Val Loss: 197.3236


Epoch 6/50: 100%|██████████| 181/181 [00:02<00:00, 85.00it/s]


Epoch 6/50, Train Loss: 197.7209, Val Loss: 197.3637


Epoch 7/50: 100%|██████████| 181/181 [00:02<00:00, 79.78it/s]


Epoch 7/50, Train Loss: 197.6708, Val Loss: 197.3452


Epoch 8/50: 100%|██████████| 181/181 [00:02<00:00, 77.38it/s]


Epoch 8/50, Train Loss: 197.6257, Val Loss: 197.4048


Epoch 9/50: 100%|██████████| 181/181 [00:02<00:00, 79.65it/s]


Epoch 9/50, Train Loss: 197.6098, Val Loss: 197.3930


Epoch 10/50: 100%|██████████| 181/181 [00:02<00:00, 83.36it/s]


Epoch 10/50, Train Loss: 197.5695, Val Loss: 197.4370


Epoch 11/50: 100%|██████████| 181/181 [00:02<00:00, 81.43it/s]


Epoch 11/50, Train Loss: 197.5317, Val Loss: 197.4763


Epoch 12/50: 100%|██████████| 181/181 [00:02<00:00, 82.31it/s]


Epoch 12/50, Train Loss: 197.5023, Val Loss: 197.4759
Early stopping triggered after 12 epochs
Regression model training took: 28.64 seconds

Evaluating classification model...
Classification Accuracy: 66.33%
Classification ROC AUC: 0.7188

Evaluating regression model...
Regression Mean Squared Error: 1576.67
Regression Root Mean Squared Error: 39.71 minutes
Regression Mean Absolute Error: 18.52 minutes
Regression R² Score: -0.0657
Preprocessor saved to ./dep_delay_nn/year_2022_rnn\rnn_preprocessor_2022.joblib

RNN+Attention model training for 2022 complete! Total processing time: 179.85 seconds

RNN+Attention model for year 2022 completed successfully!

Training RNN+Attention model for year 2023

Processing May2023.csv...
Years found in data: [2023]
Months found in data: {5: 616630}
Filtered to only May data: 616630 rows
Filtered from 616630 to 220469 rows for top 30 airports
Removed 1293.0 cancelled flights, remaining: 219176
Processing took: 3.18 seconds

Matching weather data with 

Epoch 1/50: 100%|██████████| 193/193 [00:02<00:00, 77.68it/s]


Epoch 1/50, Train Loss: 0.6221, Val Loss: 0.6009


Epoch 2/50: 100%|██████████| 193/193 [00:02<00:00, 86.19it/s]


Epoch 2/50, Train Loss: 0.6082, Val Loss: 0.5997


Epoch 3/50: 100%|██████████| 193/193 [00:02<00:00, 84.42it/s]


Epoch 3/50, Train Loss: 0.6041, Val Loss: 0.5964


Epoch 4/50: 100%|██████████| 193/193 [00:02<00:00, 81.82it/s]


Epoch 4/50, Train Loss: 0.6014, Val Loss: 0.5951


Epoch 5/50: 100%|██████████| 193/193 [00:02<00:00, 82.68it/s]


Epoch 5/50, Train Loss: 0.5992, Val Loss: 0.5937


Epoch 6/50: 100%|██████████| 193/193 [00:02<00:00, 81.65it/s]


Epoch 6/50, Train Loss: 0.5973, Val Loss: 0.5928


Epoch 7/50: 100%|██████████| 193/193 [00:02<00:00, 81.83it/s]


Epoch 7/50, Train Loss: 0.5956, Val Loss: 0.5915


Epoch 8/50: 100%|██████████| 193/193 [00:02<00:00, 84.84it/s]


Epoch 8/50, Train Loss: 0.5943, Val Loss: 0.5905


Epoch 9/50: 100%|██████████| 193/193 [00:02<00:00, 81.93it/s]


Epoch 9/50, Train Loss: 0.5923, Val Loss: 0.5892


Epoch 10/50: 100%|██████████| 193/193 [00:02<00:00, 84.03it/s]


Epoch 10/50, Train Loss: 0.5907, Val Loss: 0.5887


Epoch 11/50: 100%|██████████| 193/193 [00:02<00:00, 83.19it/s]


Epoch 11/50, Train Loss: 0.5891, Val Loss: 0.5875


Epoch 12/50: 100%|██████████| 193/193 [00:02<00:00, 83.35it/s]


Epoch 12/50, Train Loss: 0.5882, Val Loss: 0.5889


Epoch 13/50: 100%|██████████| 193/193 [00:02<00:00, 83.09it/s]


Epoch 13/50, Train Loss: 0.5872, Val Loss: 0.5874


Epoch 14/50: 100%|██████████| 193/193 [00:02<00:00, 83.23it/s]


Epoch 14/50, Train Loss: 0.5861, Val Loss: 0.5874


Epoch 15/50: 100%|██████████| 193/193 [00:02<00:00, 82.87it/s]


Epoch 15/50, Train Loss: 0.5849, Val Loss: 0.5866


Epoch 16/50: 100%|██████████| 193/193 [00:02<00:00, 84.83it/s]


Epoch 16/50, Train Loss: 0.5839, Val Loss: 0.5870


Epoch 17/50: 100%|██████████| 193/193 [00:02<00:00, 84.27it/s]


Epoch 17/50, Train Loss: 0.5825, Val Loss: 0.5863


Epoch 18/50: 100%|██████████| 193/193 [00:02<00:00, 84.18it/s]


Epoch 18/50, Train Loss: 0.5820, Val Loss: 0.5852


Epoch 19/50: 100%|██████████| 193/193 [00:02<00:00, 84.33it/s]


Epoch 19/50, Train Loss: 0.5812, Val Loss: 0.5849


Epoch 20/50: 100%|██████████| 193/193 [00:02<00:00, 82.52it/s]


Epoch 20/50, Train Loss: 0.5796, Val Loss: 0.5855


Epoch 21/50: 100%|██████████| 193/193 [00:02<00:00, 82.89it/s]


Epoch 21/50, Train Loss: 0.5785, Val Loss: 0.5865


Epoch 22/50: 100%|██████████| 193/193 [00:02<00:00, 83.62it/s]


Epoch 22/50, Train Loss: 0.5783, Val Loss: 0.5857


Epoch 23/50: 100%|██████████| 193/193 [00:02<00:00, 84.40it/s]


Epoch 23/50, Train Loss: 0.5776, Val Loss: 0.5862


Epoch 24/50: 100%|██████████| 193/193 [00:02<00:00, 82.72it/s]


Epoch 24/50, Train Loss: 0.5725, Val Loss: 0.5844


Epoch 25/50: 100%|██████████| 193/193 [00:02<00:00, 86.10it/s]


Epoch 25/50, Train Loss: 0.5711, Val Loss: 0.5848


Epoch 26/50: 100%|██████████| 193/193 [00:02<00:00, 82.75it/s]


Epoch 26/50, Train Loss: 0.5703, Val Loss: 0.5849


Epoch 27/50: 100%|██████████| 193/193 [00:02<00:00, 82.22it/s]


Epoch 27/50, Train Loss: 0.5701, Val Loss: 0.5851


Epoch 28/50: 100%|██████████| 193/193 [00:02<00:00, 86.20it/s]


Epoch 28/50, Train Loss: 0.5692, Val Loss: 0.5852


Epoch 29/50: 100%|██████████| 193/193 [00:02<00:00, 82.77it/s]


Epoch 29/50, Train Loss: 0.5657, Val Loss: 0.5857


Epoch 30/50: 100%|██████████| 193/193 [00:02<00:00, 83.03it/s]


Epoch 30/50, Train Loss: 0.5656, Val Loss: 0.5846


Epoch 31/50: 100%|██████████| 193/193 [00:02<00:00, 82.69it/s]


Epoch 31/50, Train Loss: 0.5651, Val Loss: 0.5849


Epoch 32/50: 100%|██████████| 193/193 [00:02<00:00, 83.79it/s]


Epoch 32/50, Train Loss: 0.5638, Val Loss: 0.5857


Epoch 33/50: 100%|██████████| 193/193 [00:02<00:00, 83.47it/s]


Epoch 33/50, Train Loss: 0.5628, Val Loss: 0.5853


Epoch 34/50: 100%|██████████| 193/193 [00:02<00:00, 82.09it/s]


Epoch 34/50, Train Loss: 0.5620, Val Loss: 0.5859
Early stopping triggered after 34 epochs
Classification model training took: 85.09 seconds

Training RNN+Attention regression model...


Epoch 1/50: 100%|██████████| 193/193 [00:02<00:00, 85.22it/s]


Epoch 1/50, Train Loss: 178.4502, Val Loss: 181.5029


Epoch 2/50: 100%|██████████| 193/193 [00:02<00:00, 84.61it/s]


Epoch 2/50, Train Loss: 178.2558, Val Loss: 181.4534


Epoch 3/50: 100%|██████████| 193/193 [00:02<00:00, 80.29it/s]


Epoch 3/50, Train Loss: 178.2152, Val Loss: 181.4742


Epoch 4/50: 100%|██████████| 193/193 [00:02<00:00, 82.00it/s]


Epoch 4/50, Train Loss: 178.1975, Val Loss: 181.4658


Epoch 5/50: 100%|██████████| 193/193 [00:02<00:00, 79.88it/s]


Epoch 5/50, Train Loss: 178.1875, Val Loss: 181.4559


Epoch 6/50: 100%|██████████| 193/193 [00:02<00:00, 78.39it/s]


Epoch 6/50, Train Loss: 178.1477, Val Loss: 181.4764


Epoch 7/50: 100%|██████████| 193/193 [00:02<00:00, 84.19it/s]


Epoch 7/50, Train Loss: 178.0923, Val Loss: 181.4957


Epoch 8/50: 100%|██████████| 193/193 [00:02<00:00, 83.84it/s]


Epoch 8/50, Train Loss: 178.0669, Val Loss: 181.5065


Epoch 9/50: 100%|██████████| 193/193 [00:02<00:00, 81.87it/s]


Epoch 9/50, Train Loss: 178.0402, Val Loss: 181.5756


Epoch 10/50: 100%|██████████| 193/193 [00:02<00:00, 81.09it/s]


Epoch 10/50, Train Loss: 177.9989, Val Loss: 181.5468


Epoch 11/50: 100%|██████████| 193/193 [00:02<00:00, 82.24it/s]


Epoch 11/50, Train Loss: 177.9372, Val Loss: 181.6159


Epoch 12/50: 100%|██████████| 193/193 [00:02<00:00, 82.35it/s]


Epoch 12/50, Train Loss: 177.8967, Val Loss: 181.6143
Early stopping triggered after 12 epochs
Regression model training took: 30.48 seconds

Evaluating classification model...
Classification Accuracy: 69.25%
Classification ROC AUC: 0.7355

Evaluating regression model...
Regression Mean Squared Error: 1464.75
Regression Root Mean Squared Error: 38.27 minutes
Regression Mean Absolute Error: 17.53 minutes
Regression R² Score: -0.0470
Preprocessor saved to ./dep_delay_nn/year_2023_rnn\rnn_preprocessor_2023.joblib

RNN+Attention model training for 2023 complete! Total processing time: 194.73 seconds

RNN+Attention model for year 2023 completed successfully!

Training RNN+Attention model for year 2024

Processing May2024.csv...
Years found in data: [2024]
Months found in data: {5: 649428}
Filtered to only May data: 649428 rows
Filtered from 649428 to 228159 rows for top 30 airports
Removed 2994.0 cancelled flights, remaining: 225165
Processing took: 3.29 seconds

Matching weather data with 

Epoch 1/50: 100%|██████████| 198/198 [00:02<00:00, 74.54it/s]


Epoch 1/50, Train Loss: 0.6294, Val Loss: 0.6099


Epoch 2/50: 100%|██████████| 198/198 [00:02<00:00, 84.20it/s]


Epoch 2/50, Train Loss: 0.6157, Val Loss: 0.6064


Epoch 3/50: 100%|██████████| 198/198 [00:02<00:00, 82.37it/s]


Epoch 3/50, Train Loss: 0.6116, Val Loss: 0.6050


Epoch 4/50: 100%|██████████| 198/198 [00:02<00:00, 85.23it/s]


Epoch 4/50, Train Loss: 0.6091, Val Loss: 0.6035


Epoch 5/50: 100%|██████████| 198/198 [00:02<00:00, 84.19it/s]


Epoch 5/50, Train Loss: 0.6066, Val Loss: 0.6022


Epoch 6/50: 100%|██████████| 198/198 [00:02<00:00, 82.84it/s]


Epoch 6/50, Train Loss: 0.6050, Val Loss: 0.6010


Epoch 7/50: 100%|██████████| 198/198 [00:02<00:00, 83.46it/s]


Epoch 7/50, Train Loss: 0.6027, Val Loss: 0.6004


Epoch 8/50: 100%|██████████| 198/198 [00:02<00:00, 81.14it/s]


Epoch 8/50, Train Loss: 0.6017, Val Loss: 0.5995


Epoch 9/50: 100%|██████████| 198/198 [00:02<00:00, 80.38it/s]


Epoch 9/50, Train Loss: 0.6002, Val Loss: 0.5994


Epoch 10/50: 100%|██████████| 198/198 [00:02<00:00, 86.02it/s]


Epoch 10/50, Train Loss: 0.5988, Val Loss: 0.5990


Epoch 11/50: 100%|██████████| 198/198 [00:02<00:00, 80.85it/s]


Epoch 11/50, Train Loss: 0.5979, Val Loss: 0.5988


Epoch 12/50: 100%|██████████| 198/198 [00:02<00:00, 83.15it/s]


Epoch 12/50, Train Loss: 0.5964, Val Loss: 0.5979


Epoch 13/50: 100%|██████████| 198/198 [00:02<00:00, 80.29it/s]


Epoch 13/50, Train Loss: 0.5954, Val Loss: 0.5969


Epoch 14/50: 100%|██████████| 198/198 [00:02<00:00, 85.77it/s]


Epoch 14/50, Train Loss: 0.5938, Val Loss: 0.5975


Epoch 15/50: 100%|██████████| 198/198 [00:02<00:00, 85.23it/s]


Epoch 15/50, Train Loss: 0.5929, Val Loss: 0.5959


Epoch 16/50: 100%|██████████| 198/198 [00:02<00:00, 83.04it/s]


Epoch 16/50, Train Loss: 0.5920, Val Loss: 0.5963


Epoch 17/50: 100%|██████████| 198/198 [00:02<00:00, 85.05it/s]


Epoch 17/50, Train Loss: 0.5909, Val Loss: 0.5957


Epoch 18/50: 100%|██████████| 198/198 [00:02<00:00, 83.58it/s]


Epoch 18/50, Train Loss: 0.5900, Val Loss: 0.5956


Epoch 19/50: 100%|██████████| 198/198 [00:02<00:00, 84.03it/s]


Epoch 19/50, Train Loss: 0.5892, Val Loss: 0.5964


Epoch 20/50: 100%|██████████| 198/198 [00:02<00:00, 86.24it/s]


Epoch 20/50, Train Loss: 0.5880, Val Loss: 0.5943


Epoch 21/50: 100%|██████████| 198/198 [00:02<00:00, 76.01it/s]


Epoch 21/50, Train Loss: 0.5871, Val Loss: 0.5956


Epoch 22/50: 100%|██████████| 198/198 [00:02<00:00, 84.46it/s]


Epoch 22/50, Train Loss: 0.5863, Val Loss: 0.5956


Epoch 23/50: 100%|██████████| 198/198 [00:02<00:00, 83.99it/s]


Epoch 23/50, Train Loss: 0.5854, Val Loss: 0.5944


Epoch 24/50: 100%|██████████| 198/198 [00:02<00:00, 82.36it/s]


Epoch 24/50, Train Loss: 0.5843, Val Loss: 0.5956


Epoch 25/50: 100%|██████████| 198/198 [00:02<00:00, 81.28it/s]


Epoch 25/50, Train Loss: 0.5807, Val Loss: 0.5947


Epoch 26/50: 100%|██████████| 198/198 [00:02<00:00, 83.67it/s]


Epoch 26/50, Train Loss: 0.5787, Val Loss: 0.5949


Epoch 27/50: 100%|██████████| 198/198 [00:02<00:00, 85.78it/s]


Epoch 27/50, Train Loss: 0.5783, Val Loss: 0.5952


Epoch 28/50: 100%|██████████| 198/198 [00:02<00:00, 86.98it/s]


Epoch 28/50, Train Loss: 0.5768, Val Loss: 0.5944


Epoch 29/50: 100%|██████████| 198/198 [00:02<00:00, 86.41it/s]


Epoch 29/50, Train Loss: 0.5740, Val Loss: 0.5947


Epoch 30/50: 100%|██████████| 198/198 [00:02<00:00, 85.68it/s]


Epoch 30/50, Train Loss: 0.5733, Val Loss: 0.5949
Early stopping triggered after 30 epochs
Classification model training took: 76.78 seconds

Training RNN+Attention regression model...


Epoch 1/50: 100%|██████████| 198/198 [00:02<00:00, 81.27it/s]


Epoch 1/50, Train Loss: 282.2681, Val Loss: 277.0486


Epoch 2/50: 100%|██████████| 198/198 [00:02<00:00, 80.46it/s]


Epoch 2/50, Train Loss: 281.2665, Val Loss: 277.0605


Epoch 3/50: 100%|██████████| 198/198 [00:02<00:00, 87.04it/s]


Epoch 3/50, Train Loss: 281.2316, Val Loss: 277.0642


Epoch 4/50: 100%|██████████| 198/198 [00:02<00:00, 77.16it/s]


Epoch 4/50, Train Loss: 281.2075, Val Loss: 277.0618


Epoch 5/50: 100%|██████████| 198/198 [00:02<00:00, 84.58it/s]


Epoch 5/50, Train Loss: 281.1832, Val Loss: 277.0081


Epoch 6/50: 100%|██████████| 198/198 [00:02<00:00, 84.55it/s]


Epoch 6/50, Train Loss: 281.1751, Val Loss: 277.0498


Epoch 7/50: 100%|██████████| 198/198 [00:02<00:00, 85.90it/s]


Epoch 7/50, Train Loss: 281.1611, Val Loss: 277.1065


Epoch 8/50: 100%|██████████| 198/198 [00:02<00:00, 83.88it/s]


Epoch 8/50, Train Loss: 281.1283, Val Loss: 277.0750


Epoch 9/50: 100%|██████████| 198/198 [00:02<00:00, 82.82it/s]


Epoch 9/50, Train Loss: 281.1221, Val Loss: 277.0810


Epoch 10/50: 100%|██████████| 198/198 [00:02<00:00, 81.55it/s]


Epoch 10/50, Train Loss: 281.0370, Val Loss: 277.1225


Epoch 11/50: 100%|██████████| 198/198 [00:02<00:00, 81.27it/s]


Epoch 11/50, Train Loss: 281.0006, Val Loss: 277.1855


Epoch 12/50: 100%|██████████| 198/198 [00:02<00:00, 85.52it/s]


Epoch 12/50, Train Loss: 280.9641, Val Loss: 277.1365


Epoch 13/50: 100%|██████████| 198/198 [00:02<00:00, 78.90it/s]


Epoch 13/50, Train Loss: 280.9303, Val Loss: 277.2553


Epoch 14/50: 100%|██████████| 198/198 [00:02<00:00, 82.16it/s]


Epoch 14/50, Train Loss: 280.8497, Val Loss: 277.2706


Epoch 15/50: 100%|██████████| 198/198 [00:02<00:00, 83.09it/s]


Epoch 15/50, Train Loss: 280.8234, Val Loss: 277.3394
Early stopping triggered after 15 epochs
Regression model training took: 38.38 seconds

Evaluating classification model...
Classification Accuracy: 68.04%
Classification ROC AUC: 0.7449

Evaluating regression model...
Regression Mean Squared Error: 2628.27
Regression Root Mean Squared Error: 51.27 minutes
Regression Mean Absolute Error: 24.28 minutes
Regression R² Score: -0.0870
Preprocessor saved to ./dep_delay_nn/year_2024_rnn\rnn_preprocessor_2024.joblib

RNN+Attention model training for 2024 complete! Total processing time: 196.57 seconds

RNN+Attention model for year 2024 completed successfully!

Comparing RNN_ATTENTION models across years...
Comparison summary saved to ./dep_delay_nn/comparison_rnn_attention\rnn_attention_delay_summary.csv
Error creating model comparison: 'class_accuracy'
RNN_ATTENTION model comparison completed!

Year-by-Year RNN+Attention Model Training Summary:

Year 2021:
  Total flights: 171,382
  Classif

<Figure size 1200x600 with 0 Axes>