In [1]:
import pandas as pd
import numpy as np
import glob
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(2025)
np.random.seed(2025)

# Set device (CPU or GPU if available)
device = torch.device("cpu")
print(f"Using device: {device}")

# Utility function for JSON serialization
def convert_to_serializable(obj):

    if isinstance(obj, (np.integer, np.int64, np.int32, np.int16, np.int8)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, (np.ndarray,)):
        return obj.tolist()
    elif isinstance(obj, (pd.DataFrame,)):
        return obj.to_dict('records')
    elif isinstance(obj, (pd.Series,)):
        return obj.to_dict()
    elif isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    else:
        return obj

# Set data paths
flight_data_path = './cleaned_data/'
weather_data_path = './cleaned_weather_data/'
top_airports_file = './top_100_airports.csv'
output_dir = './dep_delay_nn/'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print("Starting year-by-year flight delay prediction models (PyTorch ResNet)...")
print(f"Flight data directory: {flight_data_path}")
print(f"Weather data directory: {weather_data_path}")
print(f"Top airports file: {top_airports_file}")
print(f"Model output directory: {output_dir}")

# Load top 30 airports from the top 100 airports file
try:
    top_airports = pd.read_csv(top_airports_file, low_memory=False)

    top_airports = top_airports.head(30)

    top_airport_codes = set(top_airports['ORIGIN_IATA'].str.strip().tolist())
    
    print(f"Loaded top 30 airports: {', '.join(sorted(top_airport_codes))}")
    print(f"Busiest airport: {top_airports.iloc[0]['ORIGIN_IATA']} with {top_airports.iloc[0]['Times']} flights")
    print(f"30th busiest airport: {top_airports.iloc[29]['ORIGIN_IATA']} with {top_airports.iloc[29]['Times']} flights")
except Exception as e:
    print(f"Error loading top airports file: {e}")
    top_airport_codes = None
    print("Will process all airports (top airports file not available)")

Using device: cpu
Starting year-by-year flight delay prediction models (PyTorch ResNet)...
Flight data directory: ./cleaned_data/
Weather data directory: ./cleaned_weather_data/
Top airports file: ./top_100_airports.csv
Model output directory: ./dep_delay_nn/
Loaded top 30 airports: ATL, AUS, BNA, BOS, BWI, CLT, DCA, DEN, DFW, DTW, EWR, FLL, IAD, IAH, JFK, LAS, LAX, LGA, MCO, MDW, MIA, MSP, ORD, PHL, PHX, SAN, SEA, SFO, SLC, TPA
Busiest airport: ATL with 457121 flights
30th busiest airport: TPA with 97235 flights


In [2]:
# Define ResNet-style blocks for neural networks
class ResidualBlock(nn.Module):
    def __init__(self, input_dim, hidden_dim=None):
        super(ResidualBlock, self).__init__()
        if hidden_dim is None:
            hidden_dim = input_dim
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, input_dim)
        self.bn2 = nn.BatchNorm1d(input_dim)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        identity = x
        
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        
        out += identity  # Skip connection
        out = self.relu(out)
        
        return out

class BottleneckResidualBlock(nn.Module):
    def __init__(self, input_dim, bottleneck_dim):
        super(BottleneckResidualBlock, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, bottleneck_dim)
        self.bn1 = nn.BatchNorm1d(bottleneck_dim)
        self.fc2 = nn.Linear(bottleneck_dim, bottleneck_dim)
        self.bn2 = nn.BatchNorm1d(bottleneck_dim)
        self.fc3 = nn.Linear(bottleneck_dim, input_dim)
        self.bn3 = nn.BatchNorm1d(input_dim)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        identity = x
        
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        out = self.fc3(out)
        out = self.bn3(out)
        
        out += identity  # Skip connection
        out = self.relu(out)
        
        return out
    

# Define FlightDelayClassifier with ResNet architecture
class FlightDelayClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super(FlightDelayClassifier, self).__init__()
        
        # Initial embedding layer
        self.embedding = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Residual blocks
        self.res_block1 = ResidualBlock(hidden_dim)
        self.res_block2 = ResidualBlock(hidden_dim)
        self.res_block3 = ResidualBlock(hidden_dim)
        
        # Bottleneck residual block
        self.bottleneck = BottleneckResidualBlock(hidden_dim, hidden_dim // 2)
        
        # Final prediction layers
        self.prediction = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.res_block1(x)
        x = self.res_block2(x)
        x = self.res_block3(x)
        x = self.bottleneck(x)
        x = self.prediction(x)
        return x

# Define FlightDelayRegressor with ResNet architecture
class FlightDelayRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super(FlightDelayRegressor, self).__init__()
        
        # Initial embedding layer
        self.embedding = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3)
        )
        
        # Residual blocks
        self.res_block1 = ResidualBlock(hidden_dim)
        self.res_block2 = ResidualBlock(hidden_dim)
        self.res_block3 = ResidualBlock(hidden_dim)
        
        # Bottleneck residual block
        self.bottleneck = BottleneckResidualBlock(hidden_dim, hidden_dim // 2)
        
        # Final prediction layers
        self.prediction = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.res_block1(x)
        x = self.res_block2(x)
        x = self.res_block3(x)
        x = self.bottleneck(x)
        x = self.prediction(x)
        return x

In [3]:
# Function to load weather data
def load_weather_data():
    print("\nLoading weather data...")
    start_time = time.time()
    
    all_files = glob.glob(os.path.join(weather_data_path, "*.csv"))
    print(f"Found {len(all_files)} total weather data files")
    weather_dict = {}
    count = 0
    matching_count = 0
    
    for file in all_files:
        try:
            filename = os.path.basename(file)
            parts = filename.split('.')[0].split('_')
            
            if len(parts) >= 3:
                iata = parts[0]
                year = parts[1]
                month_name = parts[2]
                
                # Convert month name to number
                month_map = {
                    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
                }
                
                if month_name in month_map:
                    month = month_map[month_name]
                    
                    if top_airport_codes is None or iata in top_airport_codes:
                        weather_data = pd.read_csv(file, low_memory=False)

                        if 'DATE' not in weather_data.columns:
                            print(f"Warning: DATE column not found in {filename}")
                            continue

                        weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])

                        key = f"{iata}_{year}_{month}"

                        weather_dict[key] = weather_data
                        matching_count += 1
                else:
                    print(f"Warning: Unknown month format in {filename}")
                
                count += 1

                if count % 100 == 0:
                    print(f"Processed {count} weather files, loaded {matching_count} matching files")
        except Exception as e:
            print(f"Error loading weather file {file}: {e}")
    
    print(f"Loaded {matching_count} weather files out of {count} processed files")
    print(f"Loading weather data took: {time.time() - start_time:.2f} seconds")
    return weather_dict

# Get specific May files from the cleaned_data directory
def get_may_files():
    may_files = [
        os.path.join(flight_data_path, "May2021.csv"),
        os.path.join(flight_data_path, "May2022.csv"),
        os.path.join(flight_data_path, "May2023.csv"),
        os.path.join(flight_data_path, "May2024.csv")
    ]

    existing_files = []
    for file_path in may_files:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            print(f"Warning: File {file_path} not found")
    
    return existing_files

# Get the May 2021-2024 flight data files
flight_files = get_may_files()
print(f"\nFound {len(flight_files)} May files to process:")
for f in flight_files:
    print(f"  - {os.path.basename(f)}")

if not flight_files:
    print("No May 2021-2024 files were found. Please check file paths.")
    exit(1)

weather_dict = load_weather_data()

# Function to extract year from filename
def extract_year_from_filename(filename):
    base_name = os.path.basename(filename)
    year_str = base_name.replace('May', '').split('.')[0]
    return int(year_str)

# Function to create red-eye flight indicator
def create_redeye_indicator(df):
    df = df.copy()
    
    df['IS_REDEYE'] = 0

    time_columns = []

    if 'SCH_DEP_TIME' in df.columns:
        time_columns.append('SCH_DEP_TIME')

    if 'SCH_ARR_TIME' in df.columns:
        time_columns.append('SCH_ARR_TIME')

    for col in time_columns:
        if df[col].dtype != 'float64':
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except:
                print(f"Warning: Could not convert {col} to numeric")
    
    # Identify red-eye flights based on scheduled departure time (0-6 AM)
    if 'SCH_DEP_TIME' in time_columns:
        redeye_departure = (df['SCH_DEP_TIME'] >= 0) & (df['SCH_DEP_TIME'] < 600)
        df.loc[redeye_departure, 'IS_REDEYE'] = 1

        dep_redeye_count = redeye_departure.sum()
        print(f"Identified {dep_redeye_count} red-eye flights based on departure time (0-6 AM)")
    
    # Identify red-eye flights based on scheduled arrival time (0-6 AM)
    if 'SCH_ARR_TIME' in time_columns:
        redeye_arrival = (df['SCH_ARR_TIME'] >= 0) & (df['SCH_ARR_TIME'] < 600)
        df.loc[redeye_arrival, 'IS_REDEYE'] = 1

        arr_redeye_count = redeye_arrival.sum()
        print(f"Identified {arr_redeye_count} red-eye flights based on arrival time (0-6 AM)")

    redeye_count = df['IS_REDEYE'].sum()
    total_count = len(df)
    print(f"Total identified red-eye flights: {redeye_count} out of {total_count} total flights ({redeye_count/total_count*100:.2f}%)")
    
    # Add a more detailed time-of-day categorical feature if needed
    if 'SCH_DEP_TIME' in time_columns:
        df['DEP_TIME_OF_DAY'] = pd.cut(
            df['SCH_DEP_TIME'], 
            bins=[0, 600, 1200, 1800, 2400],
            labels=['Early Morning (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)'],
            include_lowest=True
        )

        time_dist = df['DEP_TIME_OF_DAY'].value_counts()
        print("\nDistribution of flights by departure time of day:")
        for time_cat, count in time_dist.items():
            print(f"  - {time_cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    return df

# Function to prepare departure delay data
def prepare_delay_data(df):
    # Make a copy to avoid modifying the original
    df = df.copy()

    if 'DEP_DELAY' in df.columns:
        if df['DEP_DELAY'].dtype != 'float64':
            try:
                df['DEP_DELAY'] = pd.to_numeric(df['DEP_DELAY'], errors='coerce')
            except:
                print(f"Warning: Could not convert DEP_DELAY to numeric")
    else:
        print("Warning: DEP_DELAY column not found in dataset")
        return df
    
    # Create a binary feature for on-time departure (<=0 means on time or early)
    df['IS_DELAYED'] = (df['DEP_DELAY'] > 0).astype(int)
    
    # Create a categorical delay feature
    df['DELAY_CATEGORY'] = pd.cut(
        df['DEP_DELAY'],
        bins=[-float('inf'), -15, 0, 15, 60, 120, float('inf')],
        labels=['Very Early', 'Early', 'On Time', 'Moderate Delay',
                'Significant Delay', 'Severe Delay'],
        include_lowest=True
    )
    
    df['ABS_DELAY'] = np.abs(df['DEP_DELAY'])

    delay_count = df['IS_DELAYED'].sum()
    total_count = len(df)
    delay_rate = delay_count / total_count * 100
    
    print(f"\nDelay statistics:")
    print(f"Delayed flights: {delay_count}/{total_count} ({delay_rate:.2f}%)")
    print(f"On-time or early flights: {total_count - delay_count}/{total_count} ({100 - delay_rate:.2f}%)")
    
    print("\nDelay magnitude statistics:")
    print(f"Mean delay: {df['DEP_DELAY'].mean():.2f} minutes")
    print(f"Median delay: {df['DEP_DELAY'].median():.2f} minutes")
    print(f"Min delay: {df['DEP_DELAY'].min():.2f} minutes (negative means early departure)")
    print(f"Max delay: {df['DEP_DELAY'].max():.2f} minutes")

    upper_limit = df['DEP_DELAY'].quantile(0.995)
    df['DEP_DELAY_CLIPPED'] = df['DEP_DELAY'].clip(upper=upper_limit)
    
    print(f"Clipped delay values above {upper_limit:.2f} minutes for neural network training")
    print(f"Number of clipped values: {(df['DEP_DELAY'] > upper_limit).sum()}")

    delay_cat_dist = df['DELAY_CATEGORY'].value_counts()
    print("\nDelay category distribution:")
    for cat, count in delay_cat_dist.sort_index().items():
        print(f"  - {cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    return df

# Function to create advanced time features
def create_advanced_time_features(df):
    df = df.copy()
    
    if 'SCH_DEP_TIME' not in df.columns:
        print("Warning: SCH_DEP_TIME column not found for time features")
        return df

    if df['SCH_DEP_TIME'].dtype != 'float64':
        try:
            df['SCH_DEP_TIME'] = pd.to_numeric(df['SCH_DEP_TIME'], errors='coerce')
        except:
            print(f"Warning: Could not convert SCH_DEP_TIME to numeric")
            return df

    df['DEP_HOUR'] = (df['SCH_DEP_TIME'] // 100).astype(int)
    df['DEP_MINUTE'] = (df['SCH_DEP_TIME'] % 100).astype(int)

    df['TIME_MINS'] = df['DEP_HOUR'] * 60 + df['DEP_MINUTE']

    df['NORMALIZED_TIME'] = df['TIME_MINS'] / (24 * 60)

    df['HOUR_SIN'] = np.sin(2 * np.pi * df['DEP_HOUR'] / 24)
    df['HOUR_COS'] = np.cos(2 * np.pi * df['DEP_HOUR'] / 24)
    
    # 12-hour cycle
    df['HALFDAY_SIN'] = np.sin(2 * np.pi * df['DEP_HOUR'] / 12)
    df['HALFDAY_COS'] = np.cos(2 * np.pi * df['DEP_HOUR'] / 12)
    
    # 6-hour cycle
    df['QUARTER_DAY_SIN'] = np.sin(2 * np.pi * df['DEP_HOUR'] / 6)
    df['QUARTER_DAY_COS'] = np.cos(2 * np.pi * df['DEP_HOUR'] / 6)
    
    # Create time blocks
    time_blocks = {
        0: 'Late Night (0-3)',
        1: 'Late Night (0-3)',
        2: 'Late Night (0-3)',
        3: 'Early Morning (3-6)',
        4: 'Early Morning (3-6)',
        5: 'Early Morning (3-6)',
        6: 'Morning (6-9)',
        7: 'Morning (6-9)',
        8: 'Morning (6-9)',
        9: 'Mid-Day (9-12)',
        10: 'Mid-Day (9-12)',
        11: 'Mid-Day (9-12)',
        12: 'Afternoon (12-15)',
        13: 'Afternoon (12-15)',
        14: 'Afternoon (12-15)',
        15: 'Evening (15-18)',
        16: 'Evening (15-18)',
        17: 'Evening (15-18)',
        18: 'Night (18-21)',
        19: 'Night (18-21)',
        20: 'Night (18-21)',
        21: 'Late Night (21-24)',
        22: 'Late Night (21-24)',
        23: 'Late Night (21-24)'
    }
    
    # Map hours to time blocks
    df['TIME_BLOCK'] = df['DEP_HOUR'].map(time_blocks)

    # Morning peak (7-9 AM)
    df['IS_MORNING_PEAK'] = ((df['DEP_HOUR'] >= 7) & (df['DEP_HOUR'] <= 9)).astype(int)
    
    # Evening peak (4-7 PM)
    df['IS_EVENING_PEAK'] = ((df['DEP_HOUR'] >= 16) & (df['DEP_HOUR'] <= 19)).astype(int)

    if 'IS_MAJOR_HUB_ORIGIN' in df.columns:
        df['HUB_MORNING_PEAK'] = df['IS_MAJOR_HUB_ORIGIN'] * df['IS_MORNING_PEAK']
        df['HUB_EVENING_PEAK'] = df['IS_MAJOR_HUB_ORIGIN'] * df['IS_EVENING_PEAK']
    
    return df

# Function to create day of week features
def create_advanced_day_features(df):
    df = df.copy()

    if 'WEEK' in df.columns:
        day_name_map = {
            'Sun': 'Sunday',
            'Mon': 'Monday',
            'Tue': 'Tuesday',
            'Wed': 'Wednesday',
            'Thu': 'Thursday',
            'Fri': 'Friday',
            'Sat': 'Saturday'
        }

        df['DAY_NAME'] = df['WEEK'].map(day_name_map)

        df['IS_WEEKEND'] = df['WEEK'].isin(['Sat', 'Sun']).astype(int)

        day_to_num = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 0}
        df['DAY_NUM'] = df['WEEK'].map(day_to_num)
        
        # Weekly cycle
        df['DAY_SIN'] = np.sin(2 * np.pi * df['DAY_NUM'] / 7)
        df['DAY_COS'] = np.cos(2 * np.pi * df['DAY_NUM'] / 7)
        
        # Weekday/weekend cycle
        df['WEEKDAY_SIN'] = np.sin(np.pi * df['IS_WEEKEND'])
        df['WEEKDAY_COS'] = np.cos(np.pi * df['IS_WEEKEND'])
        
        # Create workweek features
        df['WORKWEEK_DAY'] = df['DAY_NUM'].apply(lambda x: x if x < 5 else np.nan)
        work_day_mean = df['WORKWEEK_DAY'].mean()
        df['WORKWEEK_DAY'] = df['WORKWEEK_DAY'].fillna(work_day_mean)
        
        # Workweek cycle
        df['WORKWEEK_SIN'] = np.sin(2 * np.pi * df['WORKWEEK_DAY'] / 5)
        df['WORKWEEK_COS'] = np.cos(2 * np.pi * df['WORKWEEK_DAY'] / 5)

        day_counts = df['DAY_NAME'].value_counts()
        total = len(df)
        print("\nDistribution of flights by day of week:")
        for day, count in day_counts.items():
            print(f"  - {day}: {count} flights ({count/total*100:.2f}%)")
        
        # Print weekend vs. weekday distribution
        weekend_count = df['IS_WEEKEND'].sum()
        weekday_count = total - weekend_count
        print(f"\nWeekend flights: {weekend_count} ({weekend_count/total*100:.2f}%)")
        print(f"Weekday flights: {weekday_count} ({weekday_count/total*100:.2f}%)")
        
    elif 'DAY_OF_WEEK' in df.columns:
        max_day = df['DAY_OF_WEEK'].max()
        
        if max_day == 7:
            df['IS_WEEKEND'] = ((df['DAY_OF_WEEK'] == 6) | (df['DAY_OF_WEEK'] == 7)).astype(int)

            day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 
                        4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'}

            df['DAY_NUM'] = df['DAY_OF_WEEK'] - 1
        else:
            df['IS_WEEKEND'] = ((df['DAY_OF_WEEK'] == 5) | (df['DAY_OF_WEEK'] == 6)).astype(int)

            day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 
                        3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

            df['DAY_NUM'] = df['DAY_OF_WEEK']
        
        df['DAY_NAME'] = df['DAY_OF_WEEK'].map(day_names)
        
        # Add cyclic encoding
        df['DAY_SIN'] = np.sin(2 * np.pi * df['DAY_NUM'] / 7)
        df['DAY_COS'] = np.cos(2 * np.pi * df['DAY_NUM'] / 7)
        
        # Weekday/weekend cycle
        df['WEEKDAY_SIN'] = np.sin(np.pi * df['IS_WEEKEND'])
        df['WEEKDAY_COS'] = np.cos(np.pi * df['IS_WEEKEND'])
    else:
        print("Warning: No day of week column (WEEK or DAY_OF_WEEK) found")
    
    return df

# Function to create advanced airport features
def create_airport_features(df):
    df = df.copy()

    major_hubs = ['ATL', 'DFW', 'DEN', 'ORD', 'LAX', 'CLT', 'LAS', 'PHX', 'MCO', 'SEA']

    if 'ORIGIN_IATA' in df.columns:
        df['IS_MAJOR_HUB_ORIGIN'] = df['ORIGIN_IATA'].isin(major_hubs).astype(int)
    
    if 'DEST_IATA' in df.columns:
        df['IS_MAJOR_HUB_DEST'] = df['DEST_IATA'].isin(major_hubs).astype(int)

    if 'IS_MAJOR_HUB_ORIGIN' in df.columns and 'IS_MAJOR_HUB_DEST' in df.columns:
        df['IS_HUB_TO_HUB'] = (df['IS_MAJOR_HUB_ORIGIN'] & df['IS_MAJOR_HUB_DEST']).astype(int)
    
    # Create region indicators (simplistic example)
    if 'ORIGIN_IATA' in df.columns:
        west_coast = ['LAX', 'SFO', 'SEA', 'PDX', 'SAN', 'LAS']
        east_coast = ['JFK', 'LGA', 'EWR', 'BOS', 'DCA', 'IAD', 'MIA', 'FLL', 'ATL', 'CLT']
        central = ['ORD', 'MDW', 'DFW', 'IAH', 'DEN', 'MSP', 'DTW', 'STL']
        
        df['IS_WEST_COAST_ORIGIN'] = df['ORIGIN_IATA'].isin(west_coast).astype(int)
        df['IS_EAST_COAST_ORIGIN'] = df['ORIGIN_IATA'].isin(east_coast).astype(int)
        df['IS_CENTRAL_ORIGIN'] = df['ORIGIN_IATA'].isin(central).astype(int)
        
        df['IS_WEST_COAST_DEST'] = df['DEST_IATA'].isin(west_coast).astype(int)
        df['IS_EAST_COAST_DEST'] = df['DEST_IATA'].isin(east_coast).astype(int)
        df['IS_CENTRAL_DEST'] = df['DEST_IATA'].isin(central).astype(int)

        df['IS_TRANSCON'] = ((df['IS_WEST_COAST_ORIGIN'] & df['IS_EAST_COAST_DEST']) | 
                             (df['IS_EAST_COAST_ORIGIN'] & df['IS_WEST_COAST_DEST'])).astype(int)
    
    # Create distance categories
    if 'DISTANCE' in df.columns:
        df['DISTANCE_CAT'] = pd.cut(
            df['DISTANCE'], 
            bins=[0, 500, 1000, 1500, 2000, float('inf')],
            labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
        )

        # Normalize distance
        max_dist = df['DISTANCE'].max()
        df['NORMALIZED_DISTANCE'] = df['DISTANCE'] / max_dist
        
        # Create logarithmic distance feature
        df['LOG_DISTANCE'] = np.log1p(df['DISTANCE'])
    
    return df

# Function to create advanced weather features
def create_weather_features(df):
    df = df.copy()

    if 'PRCP' in df.columns:
        df['RAIN_SEVERITY'] = pd.cut(
            df['PRCP'],
            bins=[-0.01, 0.0, 0.1, 0.5, 1.0, float('inf')],
            labels=[0, 1, 2, 3, 4]
        ).astype(int)
    
    # Combine weather features
    if 'RAIN_SEVERITY' in df.columns and 'EXTREME_WEATHER' in df.columns:
        df['WEATHER_SCORE'] = df['RAIN_SEVERITY'] + df['EXTREME_WEATHER'] * 3
    
    # Create weather interaction features
    if 'IS_MAJOR_HUB_ORIGIN' in df.columns and 'WEATHER_SCORE' in df.columns:
        df['HUB_WEATHER_IMPACT'] = df['IS_MAJOR_HUB_ORIGIN'] * df['WEATHER_SCORE']
    
    # Create time-weather interactions
    if 'IS_MORNING_PEAK' in df.columns and 'WEATHER_SCORE' in df.columns:
        df['PEAK_WEATHER_IMPACT'] = (df['IS_MORNING_PEAK'] | df['IS_EVENING_PEAK']) * df['WEATHER_SCORE']
    
    return df

# Function to load and preprocess a single flight data file
def load_and_process_flight_data(file_path):
    print(f"\nProcessing {os.path.basename(file_path)}...")
    start_time = time.time()
    
    try:
        df = pd.read_csv(file_path, low_memory=False)
        original_size = len(df)

        file_year = extract_year_from_filename(file_path)

        if 'YEAR' in df.columns:
            unique_years = df['YEAR'].unique()
            print(f"Years found in data: {unique_years}")

            if len(unique_years) > 1:
                df = df[df['YEAR'] == file_year]
                print(f"Filtered to only year {file_year}: {len(df)} rows")
        else:
            df['YEAR'] = file_year
            print(f"Added YEAR column with value {file_year}")

        if 'MONTH' in df.columns:
            month_counts = df['MONTH'].value_counts()
            print(f"Months found in data: {dict(month_counts)}")
            
            if 5 in month_counts:
                df = df[df['MONTH'] == 5]
                print(f"Filtered to only May data: {len(df)} rows")
            else:
                print(f"Warning: No May data found in file, but proceeding anyway as this should be May data based on filename")

        if 'DEP_DELAY' not in df.columns:
            print(f"DEP_DELAY column not found in {os.path.basename(file_path)}. Skipping file.")
            return None

        if top_airport_codes is not None:
            df = df[
                df['ORIGIN_IATA'].str.strip().isin(top_airport_codes) & 
                df['DEST_IATA'].str.strip().isin(top_airport_codes)
            ]
            
            filtered_size = len(df)
            print(f"Filtered from {original_size} to {filtered_size} rows for top 30 airports")

            if filtered_size == 0:
                print(f"No data remaining after filtering for top 30 airports. Skipping file.")
                return None

        if 'CANCELLED' in df.columns:
            cancelled_count = df['CANCELLED'].sum()
            if cancelled_count > 0:
                df = df[df['CANCELLED'] == 0]
                print(f"Removed {cancelled_count} cancelled flights, remaining: {len(df)}")

        if df['DEP_DELAY'].isnull().any():
            missing_count = df['DEP_DELAY'].isnull().sum()
            print(f"Found {missing_count} rows with missing DEP_DELAY values. Removing them.")
            df = df.dropna(subset=['DEP_DELAY'])
            print(f"After removing rows with missing DEP_DELAY: {len(df)} rows")
        
        print(f"Processing took: {time.time() - start_time:.2f} seconds")
        return df
        
    except Exception as e:
        print(f"Error processing file {os.path.basename(file_path)}: {e}")
        return None

# Function to match weather data to flights
def match_weather_data(df):

    print("\nMatching weather data with flights...")
    start_time = time.time()
    
    # Make sure necessary date columns exist
    date_columns_exist = all(col in df.columns for col in ['YEAR', 'MONTH', 'DAY'])
    if not date_columns_exist:
        print("Warning: Missing one or more date columns (YEAR, MONTH, DAY)")
        print("Weather data cannot be matched")
        return df
    
    # Create a date column for matching
    df['FLIGHT_DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])
    
    # Create a column to hold the weather key pattern
    df['WEATHER_KEY'] = df['ORIGIN_IATA'] + '_' + df['YEAR'].astype(str) + '_' + df['MONTH'].astype(str).str.zfill(2)
    
    # Create columns for weather features
    weather_columns = ['EXTREME_WEATHER', 'PRCP', 'WT01', 'WT03', 'WT04', 'WT05', 'WT08', 'WT11']
    for col in weather_columns:
        if col not in df.columns:
            df[col] = 0.0
    
    # Process in batches
    matched_count = 0
    batch_size = 20000
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx]
        
        for idx, row in batch.iterrows():
            try:
                weather_key = row['WEATHER_KEY']
                flight_date = row['FLIGHT_DATE']

                if weather_key in weather_dict:
                    weather_data = weather_dict[weather_key]

                    matching_weather = weather_data[weather_data['DATE'] == flight_date]
                    
                    if not matching_weather.empty:
                        for col in weather_columns:
                            if col in matching_weather.columns:
                                df.at[idx, col] = matching_weather[col].iloc[0]
                        matched_count += 1
            except Exception as e:
                pass
        
        # Print progress
        print(f"Processed {end_idx}/{len(df)} rows, matched {matched_count} flights with weather data")
    
    print(f"Matched weather data for {matched_count} flights ({matched_count/len(df)*100:.2f}%)")
    print(f"Weather matching took: {time.time() - start_time:.2f} seconds")
    
    return df


Found 4 May files to process:
  - May2021.csv
  - May2022.csv
  - May2023.csv
  - May2024.csv

Loading weather data...
Found 3550 total weather data files
Processed 100 weather files, loaded 0 matching files
Processed 200 weather files, loaded 0 matching files
Processed 300 weather files, loaded 16 matching files
Processed 400 weather files, loaded 32 matching files
Processed 500 weather files, loaded 32 matching files
Processed 600 weather files, loaded 48 matching files
Processed 700 weather files, loaded 48 matching files
Processed 800 weather files, loaded 48 matching files
Processed 900 weather files, loaded 64 matching files
Processed 1000 weather files, loaded 64 matching files
Processed 1100 weather files, loaded 80 matching files
Processed 1200 weather files, loaded 96 matching files
Processed 1300 weather files, loaded 112 matching files
Processed 1400 weather files, loaded 112 matching files
Processed 1500 weather files, loaded 112 matching files
Processed 1600 weather file

In [4]:
def train_resnet_models(X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg, year, output_dir):
    print("\nBuilding and training ResNet neural network models...")
    start_time = time.time()

    plots_dir = os.path.join(output_dir, f'plots_{year}')
    models_dir = os.path.join(output_dir, f'models_{year}')
    metrics_dir = os.path.join(output_dir, f'metrics_{year}')
    
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(metrics_dir, exist_ok=True)

    X_train_tensor = torch.FloatTensor(X_train).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)
    
    y_train_class_tensor = torch.FloatTensor(y_train_class.values).to(device)
    y_test_class_tensor = torch.FloatTensor(y_test_class.values).to(device)
    
    y_train_reg_tensor = torch.FloatTensor(y_train_reg.values).to(device)
    y_test_reg_tensor = torch.FloatTensor(y_test_reg.values).to(device)

    train_class_dataset = TensorDataset(X_train_tensor, y_train_class_tensor)
    test_class_dataset = TensorDataset(X_test_tensor, y_test_class_tensor)
    
    train_reg_dataset = TensorDataset(X_train_tensor, y_train_reg_tensor)
    test_reg_dataset = TensorDataset(X_test_tensor, y_test_reg_tensor)

    batch_size = 1024
    train_class_loader = DataLoader(train_class_dataset, batch_size=batch_size, shuffle=True)
    test_class_loader = DataLoader(test_class_dataset, batch_size=batch_size)
    
    train_reg_loader = DataLoader(train_reg_dataset, batch_size=batch_size, shuffle=True)
    test_reg_loader = DataLoader(test_reg_dataset, batch_size=batch_size)

    print("\nTraining ResNet classification model...")
    class_model_start_time = time.time()

    # Initialize model
    input_dim = X_train.shape[1]
    classifier = FlightDelayClassifier(input_dim).to(device)

    # Define loss function and optimizer
    criterion_class = nn.BCELoss()
    optimizer_class = optim.Adam(classifier.parameters(), lr=0.001, weight_decay=1e-5)

    # Learning rate scheduler
    scheduler_class = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_class, mode='min', factor=0.5, patience=3, verbose=True
    )

    # Training loop parameters
    num_epochs = 30
    best_val_loss = float('inf')
    patience = 5
    patience_counter = 0
    best_model_state = None
    
    # Metrics tracking
    train_losses_class = []
    val_losses_class = []
    
    # Training loop
    for epoch in range(num_epochs):
        classifier.train()
        running_loss = 0.0
        
        for inputs, targets in tqdm(train_class_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer_class.zero_grad()
            
            # Forward pass
            outputs = classifier(inputs)
            outputs = outputs.squeeze()
            loss = criterion_class(outputs, targets)
            
            # Backward pass and optimize
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(classifier.parameters(), max_norm=1.0)
            optimizer_class.step()
            
            running_loss += loss.item() * inputs.size(0)
        
        epoch_train_loss = running_loss / len(train_class_dataset)
        train_losses_class.append(epoch_train_loss)
        
        # Validation
        classifier.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in test_class_loader:
                outputs = classifier(inputs)
                outputs = outputs.squeeze()
                loss = criterion_class(outputs, targets)
                val_loss += loss.item() * inputs.size(0)
        
        epoch_val_loss = val_loss / len(test_class_dataset)
        val_losses_class.append(epoch_val_loss)
        
        # Learning rate scheduling
        scheduler_class.step(epoch_val_loss)
        
        # Print stats
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')
        
        # Save best model
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = classifier.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
        
        # Early stopping
        if patience_counter >= patience:
            print(f'Early stopping triggered after {epoch+1} epochs')
            break
    
    # Load best model
    classifier.load_state_dict(best_model_state)
    
    # Save model
    torch.save(classifier.state_dict(), os.path.join(models_dir, f'resnet_classifier_{year}.pth'))
    
    class_model_training_time = time.time() - class_model_start_time
    print(f"Classification model training took: {class_model_training_time:.2f} seconds")
    
    # Train Regression Model
    print("\nTraining ResNet regression model...")
    reg_model_start_time = time.time()
    
    # Initialize model
    regressor = FlightDelayRegressor(input_dim).to(device)
    
    # Define loss function and optimizer
    criterion_reg = nn.MSELoss()
    optimizer_reg = optim.Adam(regressor.parameters(), lr=0.001, weight_decay=1e-5)
    
    # Learning rate scheduler
    scheduler_reg = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_reg, mode='min', factor=0.5, patience=3, verbose=True
    )
    
    # Reset training loop parameters
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    # Metrics tracking
    train_losses_reg = []
    val_losses_reg = []
    
    # Training loop
    for epoch in range(num_epochs):
        regressor.train()
        running_loss = 0.0
        
        for inputs, targets in tqdm(train_reg_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Zero the parameter gradients
            optimizer_reg.zero_grad()
            
            # Forward pass
            outputs = regressor(inputs)
            outputs = outputs.squeeze()
            loss = criterion_reg(outputs, targets)
            
            # Backward pass and optimize
            loss.backward()
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(regressor.parameters(), max_norm=1.0)
            optimizer_reg.step()
            
            running_loss += loss.item() * inputs.size(0)
        
        epoch_train_loss = running_loss / len(train_reg_dataset)
        train_losses_reg.append(epoch_train_loss)
        
        # Validation
        regressor.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in test_reg_loader:
                outputs = regressor(inputs)
                outputs = outputs.squeeze()
                loss = criterion_reg(outputs, targets)
                val_loss += loss.item() * inputs.size(0)
        
        epoch_val_loss = val_loss / len(test_reg_dataset)
        val_losses_reg.append(epoch_val_loss)
        
        # Learning rate scheduling
        scheduler_reg.step(epoch_val_loss)
        
        # Print stats
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')
        
        # Save best model
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            best_model_state = regressor.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
        
        # Early stopping
        if patience_counter >= patience:
            print(f'Early stopping triggered after {epoch+1} epochs')
            break
    
    # Load best model
    regressor.load_state_dict(best_model_state)
    
    # Save model
    torch.save(regressor.state_dict(), os.path.join(models_dir, f'resnet_regressor_{year}.pth'))
    
    reg_model_training_time = time.time() - reg_model_start_time
    print(f"Regression model training took: {reg_model_training_time:.2f} seconds")
    
    # 3. Evaluate Classification Model
    print("\nEvaluating classification model...")
    
    classifier.eval()
    all_preds = []
    all_targets = []
    all_probs = []
    
    with torch.no_grad():
        for inputs, targets in test_class_loader:
            outputs = classifier(inputs)
            outputs = outputs.squeeze()
            
            probs = outputs.cpu().numpy()
            preds = (outputs >= 0.5).float().cpu().numpy()
            
            all_probs.extend(probs)
            all_preds.extend(preds)
            all_targets.extend(targets.cpu().numpy())
    
    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)
    all_probs = np.array(all_probs)
    
    # Calculate metrics
    class_accuracy = (all_preds == all_targets).mean() * 100

    if len(np.unique(all_targets)) > 1:
        class_roc_auc = roc_auc_score(all_targets, all_probs)
    else:
        print("Warning: Only one class present in test set. ROC AUC score cannot be calculated.")
        class_roc_auc = 0.0
    
    class_report = classification_report(all_targets, all_preds, output_dict=True)
    class_cm = confusion_matrix(all_targets, all_preds)
    
    print(f"Classification Accuracy: {class_accuracy:.2f}%")
    print(f"Classification ROC AUC: {class_roc_auc:.4f}")

    if '1' in class_report:
        print(f"Classification Precision (Delayed): {class_report['1']['precision']:.4f}")
        print(f"Classification Recall (Delayed): {class_report['1']['recall']:.4f}")
        print(f"Classification F1 Score (Delayed): {class_report['1']['f1-score']:.4f}")
        precision = class_report['1']['precision']
        recall = class_report['1']['recall']
        f1_score = class_report['1']['f1-score']
    else:
        print("Warning: Class '1' not present in test results. Using default metrics.")
        precision = 0.0
        recall = 0.0
        f1_score = 0.0
    
    # 4. Evaluate Regression Model
    print("\nEvaluating regression model...")
    
    regressor.eval()
    all_reg_preds = []
    all_reg_targets = []
    
    with torch.no_grad():
        for inputs, targets in test_reg_loader:
            outputs = regressor(inputs)
            outputs = outputs.squeeze()
            
            all_reg_preds.extend(outputs.cpu().numpy())
            all_reg_targets.extend(targets.cpu().numpy())
    
    all_reg_preds = np.array(all_reg_preds)
    all_reg_targets = np.array(all_reg_targets)
    
    # Calculate metrics
    reg_mse = mean_squared_error(all_reg_targets, all_reg_preds)
    reg_rmse = np.sqrt(reg_mse)
    reg_mae = mean_absolute_error(all_reg_targets, all_reg_preds)
    reg_r2 = r2_score(all_reg_targets, all_reg_preds)
    
    print(f"Regression Mean Squared Error: {reg_mse:.2f}")
    print(f"Regression Root Mean Squared Error: {reg_rmse:.2f} minutes")
    print(f"Regression Mean Absolute Error: {reg_mae:.2f} minutes")
    print(f"Regression R² Score: {reg_r2:.4f}")
    
    # 5. Create plots and visualizations
    
    # Plot training/validation loss for classification
    plt.figure(figsize=(16, 10))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses_class, label='Train Loss')
    plt.plot(val_losses_class, label='Validation Loss')
    plt.title(f'Classification Loss Curves - {year}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    # Plot training/validation loss for regression
    plt.subplot(1, 2, 2)
    plt.plot(train_losses_reg, label='Train Loss')
    plt.plot(val_losses_reg, label='Validation Loss')
    plt.title(f'Regression Loss Curves - {year}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'learning_curves_{year}.png'))
    plt.close()
    
    # Plot confusion matrix for classification
    plt.figure(figsize=(16, 10))
    sns.heatmap(class_cm, annot=True, fmt='d', cmap='Blues', 
               xticklabels=['Not Delayed', 'Delayed'],
               yticklabels=['Not Delayed', 'Delayed'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Delay Classification Confusion Matrix ({year})')
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'confusion_matrix_{year}.png'))
    plt.close()

    if len(np.unique(all_targets)) > 1:
        # Plot ROC curve for classification
        plt.figure(figsize=(16, 10))
        fpr, tpr, _ = roc_curve(all_targets, all_probs)
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {class_roc_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve for Delay Classification ({year})')
        plt.legend()
        plt.savefig(os.path.join(plots_dir, f'roc_curve_{year}.png'))
        plt.close()
    
    # Plot actual vs predicted delays for regression
    plt.figure(figsize=(16, 10))
    
    # Create a scatterplot with limited points for clarity
    max_points = 5000
    if len(all_reg_targets) > max_points:
        idx = np.random.choice(len(all_reg_targets), max_points, replace=False)
        sample_actual = all_reg_targets[idx]
        sample_pred = all_reg_preds[idx]
    else:
        sample_actual = all_reg_targets
        sample_pred = all_reg_preds
    
    plt.scatter(sample_actual, sample_pred, alpha=0.3)
    
    # Add perfect prediction line
    max_val = max(np.max(sample_actual), np.max(sample_pred))
    min_val = min(np.min(sample_actual), np.min(sample_pred))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')
    
    plt.xlabel('Actual Delay (minutes)')
    plt.ylabel('Predicted Delay (minutes)')
    plt.title(f'Actual vs Predicted Delay ({year})')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'actual_vs_predicted_{year}.png'))
    plt.close()
    
    # Plot delay prediction error distribution
    plt.figure(figsize=(16, 10))
    prediction_errors = all_reg_targets - all_reg_preds
    sns.histplot(prediction_errors, bins=50, kde=True)
    plt.axvline(0, color='red', linestyle='--')
    plt.xlabel('Prediction Error (minutes)')
    plt.ylabel('Frequency')
    plt.title(f'Delay Prediction Error Distribution ({year})')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, f'error_distribution_{year}.png'))
    plt.close()
    
    # Return metrics
    metrics = {
        'model_name': f'resnet_{year}',
        'year': year,
        
        # Classification metrics
        'class_accuracy': class_accuracy,
        'class_roc_auc': class_roc_auc,
        'class_precision': precision,
        'class_recall': recall,
        'class_f1': f1_score,
        'class_training_time': class_model_training_time,
        'class_epochs': len(train_losses_class),
        
        # Regression metrics
        'reg_mse': reg_mse,
        'reg_rmse': reg_rmse,
        'reg_mae': reg_mae,
        'reg_r2': reg_r2,
        'reg_training_time': reg_model_training_time,
        'reg_epochs': len(train_losses_reg),

        'true_negative': int(class_cm[0, 0]) if class_cm.shape == (2, 2) else 0,
        'false_positive': int(class_cm[0, 1]) if class_cm.shape == (2, 2) else 0,
        'false_negative': int(class_cm[1, 0]) if class_cm.shape == (2, 2) else 0,
        'true_positive': int(class_cm[1, 1]) if class_cm.shape == (2, 2) else 0,
        
        'status': 'success',
        'total_processing_time': time.time() - start_time
    }
    
    # Save metrics to JSON
    with open(os.path.join(metrics_dir, f'resnet_model_metrics_{year}.json'), 'w') as f:
        json.dump(convert_to_serializable(metrics), f, indent=4)
    
    return metrics, classifier, regressor

# Function to train model for a specific year
def train_year_model(year, flight_data_file):
    print(f"\n{'='*80}")
    print(f"Training ResNet model for year {year}")
    print(f"{'='*80}")
    
    # Create year-specific output directories
    year_output_dir = os.path.join(output_dir, f'year_{year}')
    os.makedirs(year_output_dir, exist_ok=True)
    
    start_time = time.time()
    
    # Load and preprocess the year's flight data
    flight_data = load_and_process_flight_data(flight_data_file)
    if flight_data is None or len(flight_data) == 0:
        print(f"No valid flight data available for {year}. Skipping this year.")
        return None
    
    # Match weather data
    flight_data = match_weather_data(flight_data)
    
    # Add advanced feature enhancements
    # Add red-eye flight indicator
    print(f"\nCreating red-eye flight indicator for {year}...")
    flight_data = create_redeye_indicator(flight_data)

    print(f"\nPreparing delay data for {year}...")
    flight_data = prepare_delay_data(flight_data)

    print(f"\nCreating advanced time features for {year}...")
    flight_data = create_advanced_time_features(flight_data)

    print(f"\nCreating advanced day features for {year}...")
    flight_data = create_advanced_day_features(flight_data)

    print(f"\nCreating advanced airport features for {year}...")
    flight_data = create_airport_features(flight_data)

    print(f"\nCreating advanced weather features for {year}...")
    flight_data = create_weather_features(flight_data)
    
    # Feature selection
    print(f"\nSelecting features for delay prediction for {year}...")
    
    # Categorical features
    cat_features = [
        'TIME_BLOCK', 'DAY_NAME', 'MKT_AIRLINE', 'ORIGIN_IATA', 'DEST_IATA',
        'DISTANCE_CAT', 'EXTREME_WEATHER',
        'IS_REDEYE', 'IS_WEEKEND', 'IS_MORNING_PEAK', 'IS_EVENING_PEAK',
        'IS_MAJOR_HUB_ORIGIN', 'IS_MAJOR_HUB_DEST', 'IS_HUB_TO_HUB',
        'IS_WEST_COAST_ORIGIN', 'IS_EAST_COAST_ORIGIN', 'IS_CENTRAL_ORIGIN',
        'IS_WEST_COAST_DEST', 'IS_EAST_COAST_DEST', 'IS_CENTRAL_DEST',
        'IS_TRANSCON',
    ]
    
    # Numerical features - include advanced features
    num_features = [
        # Basic features
        'DISTANCE', 'PRCP',
        
        # Cyclic time encodings
        'HOUR_SIN', 'HOUR_COS', 'HALFDAY_SIN', 'HALFDAY_COS', 
        'QUARTER_DAY_SIN', 'QUARTER_DAY_COS',
        
        # Cyclic day encodings
        'DAY_SIN', 'DAY_COS', 'WEEKDAY_SIN', 'WEEKDAY_COS',
        'WORKWEEK_SIN', 'WORKWEEK_COS',
        
        # Advanced airport features
        'NORMALIZED_DISTANCE', 'LOG_DISTANCE',
        
        # Advanced weather features
        'RAIN_SEVERITY', 'WEATHER_SCORE', 'HUB_WEATHER_IMPACT', 'PEAK_WEATHER_IMPACT'
    ]
    
    # Ensure all selected features exist in the dataframe
    cat_features = [f for f in cat_features if f in flight_data.columns]
    num_features = [f for f in num_features if f in flight_data.columns]
    
    print(f"Using categorical features: {cat_features}")
    print(f"Using numerical features: {num_features}")
    
    # Prepare data for modeling
    X = flight_data[cat_features + num_features].copy()
    y_class = flight_data['IS_DELAYED']
    
    # For regression, use the clipped delay values to avoid extreme outliers
    if 'DEP_DELAY_CLIPPED' in flight_data.columns:
        y_reg = flight_data['DEP_DELAY_CLIPPED']
        print("Using clipped delay values for regression to improve neural network training")
    else:
        y_reg = flight_data['DEP_DELAY']
    
    # Handle missing values
    for col in cat_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna('unknown', inplace=True)
    for col in num_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].median(), inplace=True)
    
    # Preprocess the data
    print(f"\nPreprocessing features for neural network training...")
    
    # Create preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)
        ]
    )
    
    # Fit and transform the data
    X_processed = preprocessor.fit_transform(X)
    
    print(f"Processed feature shape: {X_processed.shape}")
    
    # Split data for training and testing
    X_train, X_test, y_train_class, y_test_class = train_test_split(
        X_processed, y_class, test_size=0.1, random_state=2025, stratify=y_class
    )
    
    # Split data for regression model
    _, _, y_train_reg, y_test_reg = train_test_split(
        X_processed, y_reg, test_size=0.1, random_state=2025
    )
    
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    
    # Train ResNet models
    metrics, classifier, regressor = train_resnet_models(
        X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg, 
        year, year_output_dir
    )
    
    # Add dataset metrics
    metrics.update({
        'total_flights': len(flight_data),
        'delayed_flights_rate': flight_data['IS_DELAYED'].mean() * 100,
        'mean_delay': flight_data['DEP_DELAY'].mean(),
        'median_delay': flight_data['DEP_DELAY'].median(),
        'max_delay': float(flight_data['DEP_DELAY'].max()),
        'min_delay': float(flight_data['DEP_DELAY'].min()),
        'feature_count': X_processed.shape[1],
        'categorical_features': len(cat_features),
        'numerical_features': len(num_features)
    })
    
    # Save preprocessor for inference
    import joblib
    joblib.dump(preprocessor, os.path.join(year_output_dir, f'resnet_preprocessor_{year}.joblib'))
    print(f"Preprocessor saved to {os.path.join(year_output_dir, f'resnet_preprocessor_{year}.joblib')}")
    
    print(f"\nResNet model training for {year} complete! Total processing time: {time.time() - start_time:.2f} seconds")
    return metrics


In [5]:
# Function to compare ResNet models across years
def compare_resnet_models(all_results):
    print("\nComparing ResNet models across years...")
    
    if not all_results or len(all_results) < 2:
        print("Not enough year models to compare.")
        return

    comparison_dir = os.path.join(output_dir, 'comparison')
    os.makedirs(comparison_dir, exist_ok=True)

    years = sorted([r['year'] for r in all_results])
    
    # Create DataFrames for different metrics
    class_metrics = pd.DataFrame({
        'Year': years,
        'Accuracy (%)': [r['class_accuracy'] for r in all_results],
        'AUC': [r['class_roc_auc'] for r in all_results],
        'Precision': [r['class_precision'] for r in all_results],
        'Recall': [r['class_recall'] for r in all_results],
        'F1 Score': [r['class_f1'] for r in all_results],
    })
    
    reg_metrics = pd.DataFrame({
        'Year': years,
        'RMSE (min)': [r['reg_rmse'] for r in all_results],
        'MAE (min)': [r['reg_mae'] for r in all_results],
        'R² Score': [r['reg_r2'] for r in all_results],
    })
    
    timing_metrics = pd.DataFrame({
        'Year': years,
        'Classification Training Time (s)': [r['class_training_time'] for r in all_results],
        'Regression Training Time (s)': [r['reg_training_time'] for r in all_results],
        'Classification Epochs': [r.get('class_epochs', 0) for r in all_results],
        'Regression Epochs': [r.get('reg_epochs', 0) for r in all_results],
    })
    
    delay_stats = pd.DataFrame({
        'Year': years,
        'Mean Delay (min)': [r['mean_delay'] for r in all_results],
        'Delayed Flights (%)': [r['delayed_flights_rate'] for r in all_results],
        'Total Flights': [r['total_flights'] for r in all_results],
    })
    
    # Plot classification metrics
    plt.figure(figsize=(16, 10))

    bar_width = 0.15
    r1 = np.arange(len(years))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]
    r4 = [x + bar_width for x in r3]
    r5 = [x + bar_width for x in r4]

    plt.bar(r1, class_metrics['Accuracy (%)'] / 100, width=bar_width, label='Accuracy', color='blue')
    plt.bar(r2, class_metrics['AUC'], width=bar_width, label='AUC', color='green')
    plt.bar(r3, class_metrics['Precision'], width=bar_width, label='Precision', color='red')
    plt.bar(r4, class_metrics['Recall'], width=bar_width, label='Recall', color='purple')
    plt.bar(r5, class_metrics['F1 Score'], width=bar_width, label='F1 Score', color='orange')

    for i, r in enumerate([r1, r2, r3, r4, r5]):
        values = class_metrics.iloc[:, i+1].values
        if i == 0:
            values = values / 100
        for j, v in enumerate(values):
            plt.text(r[j], v + 0.01, f'{v:.2f}' if i > 0 else f'{v*100:.1f}%', 
                    ha='center', va='bottom', rotation=0, fontsize=8)

    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.title('ResNet Classification Metrics by Year')
    plt.xticks([r + 2*bar_width for r in range(len(years))], years)
    plt.legend()
    plt.ylim(0, 1.0)  # Set y-axis limits for better visualization
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'resnet_classification_metrics_by_year.png'))
    plt.close()
    
    # Plot regression metrics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot RMSE and MAE
    x = np.arange(len(years))
    width = 0.35
    
    ax1.bar(x - width/2, reg_metrics['RMSE (min)'], width, label='RMSE')
    ax1.bar(x + width/2, reg_metrics['MAE (min)'], width, label='MAE')

    for i, v in enumerate(reg_metrics['RMSE (min)']):
        ax1.text(i - width/2, v + 0.5, f'{v:.1f}', ha='center', va='bottom')
    for i, v in enumerate(reg_metrics['MAE (min)']):
        ax1.text(i + width/2, v + 0.5, f'{v:.1f}', ha='center', va='bottom')
    
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Minutes')
    ax1.set_title('ResNet Regression Error Metrics')
    ax1.set_xticks(x)
    ax1.set_xticklabels(years)
    ax1.legend()
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot R² Score
    bars = ax2.bar(years, reg_metrics['R² Score'], color='green')

    for bar, value in zip(bars, reg_metrics['R² Score']):
        ax2.text(bar.get_x() + bar.get_width()/2, value + 0.01, f'{value:.3f}', 
                ha='center', va='bottom')
    
    ax2.set_xlabel('Year')
    ax2.set_ylabel('R² Score')
    ax2.set_title('ResNet Regression R² Score')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'resnet_regression_metrics_by_year.png'))
    plt.close()
    
    # Plot training times and epochs
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot training times
    ax1.bar(x - width/2, timing_metrics['Classification Training Time (s)'], width, label='Classification')
    ax1.bar(x + width/2, timing_metrics['Regression Training Time (s)'], width, label='Regression')

    for i, v in enumerate(timing_metrics['Classification Training Time (s)']):
        ax1.text(i - width/2, v + 5, f'{v:.0f}s', ha='center', va='bottom')
    for i, v in enumerate(timing_metrics['Regression Training Time (s)']):
        ax1.text(i + width/2, v + 5, f'{v:.0f}s', ha='center', va='bottom')
    
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Training Time (seconds)')
    ax1.set_title('ResNet Training Times')
    ax1.set_xticks(x)
    ax1.set_xticklabels(years)
    ax1.legend()
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot epochs
    ax2.bar(x - width/2, timing_metrics['Classification Epochs'], width, label='Classification')
    ax2.bar(x + width/2, timing_metrics['Regression Epochs'], width, label='Regression')
    
    # Add text labels
    for i, v in enumerate(timing_metrics['Classification Epochs']):
        ax2.text(i - width/2, v + 0.5, f'{v:.0f}', ha='center', va='bottom')
    for i, v in enumerate(timing_metrics['Regression Epochs']):
        ax2.text(i + width/2, v + 0.5, f'{v:.0f}', ha='center', va='bottom')
    
    ax2.set_xlabel('Year')
    ax2.set_ylabel('Number of Epochs')
    ax2.set_title('ResNet Training Epochs')
    ax2.set_xticks(x)
    ax2.set_xticklabels(years)
    ax2.legend()
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'resnet_training_metrics_by_year.png'))
    plt.close()
    
    # Plot delay statistics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot mean delay
    bars1 = ax1.bar(years, delay_stats['Mean Delay (min)'], color='blue')
    
    # Add text labels
    for bar, value in zip(bars1, delay_stats['Mean Delay (min)']):
        ax1.text(bar.get_x() + bar.get_width()/2, value + 0.3, f'{value:.1f}', 
                ha='center', va='bottom')
    
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Minutes')
    ax1.set_title('Mean Delay by Year')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot delay rate
    bars2 = ax2.bar(years, delay_stats['Delayed Flights (%)'], color='red')

    for bar, value in zip(bars2, delay_stats['Delayed Flights (%)']):
        ax2.text(bar.get_x() + bar.get_width()/2, value + 0.5, f'{value:.1f}%', 
                ha='center', va='bottom')
    
    ax2.set_xlabel('Year')
    ax2.set_ylabel('Percentage')
    ax2.set_title('Delayed Flights Rate by Year')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'resnet_delay_stats_by_year.png'))
    plt.close()
    
    # Create a summary table for all metrics
    summary = pd.concat([
        delay_stats.set_index('Year'),
        class_metrics.set_index('Year').iloc[:, 1:],
        reg_metrics.set_index('Year').iloc[:, 1:],
        timing_metrics.set_index('Year').iloc[:, 1:]
    ], axis=1)
    
    # Save the summary to CSV
    summary.to_csv(os.path.join(comparison_dir, 'dep_delay_nn_summary.csv'))
    print(f"Comparison summary saved to {os.path.join(comparison_dir, 'dep_delay_nn_summary.csv')}")
    
    print("ResNet model comparison completed!")

# Main execution
all_results = []

# Process each year's file separately
for file_path in flight_files:
    year = extract_year_from_filename(file_path)
    results = train_year_model(year, file_path)
    
    if results:
        all_results.append(results)
        print(f"\nModel for year {year} completed successfully!")
    else:
        print(f"\nModel for year {year} failed.")

if len(all_results) > 1:
    compare_resnet_models(all_results)
else:
    print("\nNot enough successful models to perform comparison.")

# Print final summary
print("\nYear-by-Year ResNet Model Training Summary:")
for year_result in all_results:
    year = year_result['year']
    print(f"\nYear {year}:")
    print(f"  Total flights: {year_result['total_flights']:,}")
    print(f"  Classification accuracy: {year_result['class_accuracy']:.2f}%")
    print(f"  Classification AUC: {year_result['class_roc_auc']:.4f}")
    print(f"  Regression RMSE: {year_result['reg_rmse']:.2f} minutes")
    print(f"  Regression R²: {year_result['reg_r2']:.4f}")
    print(f"  Mean delay: {year_result['mean_delay']:.2f} minutes")
    print(f"  Delay rate: {year_result['delayed_flights_rate']:.2f}%")
    print(f"  Training time: {year_result['total_processing_time']:.2f} seconds")

print("\nResNet model training complete! Check output directories for detailed results.")


Training ResNet model for year 2021

Processing May2021.csv...
Years found in data: [2021]
Months found in data: {5: 520059}
Filtered to only May data: 520059 rows
Filtered from 520059 to 171867 rows for top 30 airports
Removed 485.0 cancelled flights, remaining: 171382
Processing took: 2.72 seconds

Matching weather data with flights...
Processed 20000/171382 rows, matched 15255 flights with weather data
Processed 40000/171382 rows, matched 30386 flights with weather data
Processed 60000/171382 rows, matched 45609 flights with weather data
Processed 80000/171382 rows, matched 60864 flights with weather data
Processed 100000/171382 rows, matched 75976 flights with weather data
Processed 120000/171382 rows, matched 91287 flights with weather data
Processed 140000/171382 rows, matched 106365 flights with weather data
Processed 160000/171382 rows, matched 121693 flights with weather data
Processed 171382/171382 rows, matched 130389 flights with weather data
Matched weather data for 13038

Epoch 1/30: 100%|██████████| 151/151 [00:04<00:00, 30.35it/s]


Epoch 1/30, Train Loss: 0.5937, Val Loss: 0.5704


Epoch 2/30: 100%|██████████| 151/151 [00:04<00:00, 31.62it/s]


Epoch 2/30, Train Loss: 0.5699, Val Loss: 0.5673


Epoch 3/30: 100%|██████████| 151/151 [00:04<00:00, 31.56it/s]


Epoch 3/30, Train Loss: 0.5648, Val Loss: 0.5644


Epoch 4/30: 100%|██████████| 151/151 [00:04<00:00, 30.30it/s]


Epoch 4/30, Train Loss: 0.5610, Val Loss: 0.5632


Epoch 5/30: 100%|██████████| 151/151 [00:04<00:00, 31.56it/s]


Epoch 5/30, Train Loss: 0.5587, Val Loss: 0.5616


Epoch 6/30: 100%|██████████| 151/151 [00:04<00:00, 31.31it/s]


Epoch 6/30, Train Loss: 0.5567, Val Loss: 0.5611


Epoch 7/30: 100%|██████████| 151/151 [00:04<00:00, 32.14it/s]


Epoch 7/30, Train Loss: 0.5545, Val Loss: 0.5601


Epoch 8/30: 100%|██████████| 151/151 [00:04<00:00, 32.13it/s]


Epoch 8/30, Train Loss: 0.5527, Val Loss: 0.5590


Epoch 9/30: 100%|██████████| 151/151 [00:04<00:00, 31.57it/s]


Epoch 9/30, Train Loss: 0.5515, Val Loss: 0.5609


Epoch 10/30: 100%|██████████| 151/151 [00:04<00:00, 32.41it/s]


Epoch 10/30, Train Loss: 0.5497, Val Loss: 0.5578


Epoch 11/30: 100%|██████████| 151/151 [00:04<00:00, 31.80it/s]


Epoch 11/30, Train Loss: 0.5481, Val Loss: 0.5580


Epoch 12/30: 100%|██████████| 151/151 [00:04<00:00, 31.70it/s]


Epoch 12/30, Train Loss: 0.5464, Val Loss: 0.5584


Epoch 13/30: 100%|██████████| 151/151 [00:04<00:00, 31.61it/s]


Epoch 13/30, Train Loss: 0.5460, Val Loss: 0.5563


Epoch 14/30: 100%|██████████| 151/151 [00:04<00:00, 31.65it/s]


Epoch 14/30, Train Loss: 0.5448, Val Loss: 0.5583


Epoch 15/30: 100%|██████████| 151/151 [00:04<00:00, 32.16it/s]


Epoch 15/30, Train Loss: 0.5440, Val Loss: 0.5568


Epoch 16/30: 100%|██████████| 151/151 [00:04<00:00, 30.37it/s]


Epoch 16/30, Train Loss: 0.5422, Val Loss: 0.5571


Epoch 17/30: 100%|██████████| 151/151 [00:04<00:00, 30.53it/s]


Epoch 17/30, Train Loss: 0.5412, Val Loss: 0.5580


Epoch 18/30: 100%|██████████| 151/151 [00:04<00:00, 30.57it/s]


Epoch 18/30, Train Loss: 0.5376, Val Loss: 0.5573
Early stopping triggered after 18 epochs
Classification model training took: 90.79 seconds

Training ResNet regression model...


Epoch 1/30: 100%|██████████| 151/151 [00:05<00:00, 29.24it/s]


Epoch 1/30, Train Loss: 847.7161, Val Loss: 822.7479


Epoch 2/30: 100%|██████████| 151/151 [00:04<00:00, 30.59it/s]


Epoch 2/30, Train Loss: 831.0405, Val Loss: 821.3936


Epoch 3/30: 100%|██████████| 151/151 [00:04<00:00, 30.43it/s]


Epoch 3/30, Train Loss: 830.3063, Val Loss: 821.4701


Epoch 4/30: 100%|██████████| 151/151 [00:04<00:00, 31.63it/s]


Epoch 4/30, Train Loss: 829.9547, Val Loss: 821.2881


Epoch 5/30: 100%|██████████| 151/151 [00:04<00:00, 31.98it/s]


Epoch 5/30, Train Loss: 829.8969, Val Loss: 821.0923


Epoch 6/30: 100%|██████████| 151/151 [00:04<00:00, 31.58it/s]


Epoch 6/30, Train Loss: 829.1811, Val Loss: 821.7577


Epoch 7/30: 100%|██████████| 151/151 [00:04<00:00, 31.15it/s]


Epoch 7/30, Train Loss: 828.7648, Val Loss: 822.0656


Epoch 8/30: 100%|██████████| 151/151 [00:04<00:00, 30.81it/s]


Epoch 8/30, Train Loss: 828.4775, Val Loss: 822.1150


Epoch 9/30: 100%|██████████| 151/151 [00:04<00:00, 31.79it/s]


Epoch 9/30, Train Loss: 828.3532, Val Loss: 823.1203


Epoch 10/30: 100%|██████████| 151/151 [00:04<00:00, 31.22it/s]


Epoch 10/30, Train Loss: 826.8704, Val Loss: 823.7136
Early stopping triggered after 10 epochs
Regression model training took: 50.48 seconds

Evaluating classification model...
Classification Accuracy: 72.71%
Classification ROC AUC: 0.7215

Evaluating regression model...
Regression Mean Squared Error: 823.71
Regression Root Mean Squared Error: 28.70 minutes
Regression Mean Absolute Error: 14.64 minutes
Regression R² Score: -0.0035
Preprocessor saved to ./dep_delay_nn/year_2021\resnet_preprocessor_2021.joblib

ResNet model training for 2021 complete! Total processing time: 210.91 seconds

Model for year 2021 completed successfully!

Training ResNet model for year 2022

Processing May2022.csv...
Years found in data: [2022]
Months found in data: {5: 602950}
Filtered to only May data: 602950 rows
Filtered from 602950 to 210079 rows for top 30 airports
Removed 4659.0 cancelled flights, remaining: 205420
Processing took: 3.03 seconds

Matching weather data with flights...
Processed 20000/205

Epoch 1/30: 100%|██████████| 181/181 [00:05<00:00, 31.06it/s]


Epoch 1/30, Train Loss: 0.6405, Val Loss: 0.6244


Epoch 2/30: 100%|██████████| 181/181 [00:05<00:00, 31.41it/s]


Epoch 2/30, Train Loss: 0.6251, Val Loss: 0.6229


Epoch 3/30: 100%|██████████| 181/181 [00:05<00:00, 31.85it/s]


Epoch 3/30, Train Loss: 0.6215, Val Loss: 0.6199


Epoch 4/30: 100%|██████████| 181/181 [00:05<00:00, 31.62it/s]


Epoch 4/30, Train Loss: 0.6189, Val Loss: 0.6171


Epoch 5/30: 100%|██████████| 181/181 [00:05<00:00, 32.08it/s]


Epoch 5/30, Train Loss: 0.6168, Val Loss: 0.6163


Epoch 6/30: 100%|██████████| 181/181 [00:05<00:00, 31.45it/s]


Epoch 6/30, Train Loss: 0.6148, Val Loss: 0.6152


Epoch 7/30: 100%|██████████| 181/181 [00:05<00:00, 32.26it/s]


Epoch 7/30, Train Loss: 0.6133, Val Loss: 0.6149


Epoch 8/30: 100%|██████████| 181/181 [00:05<00:00, 31.82it/s]


Epoch 8/30, Train Loss: 0.6118, Val Loss: 0.6135


Epoch 9/30: 100%|██████████| 181/181 [00:05<00:00, 31.70it/s]


Epoch 9/30, Train Loss: 0.6100, Val Loss: 0.6134


Epoch 10/30: 100%|██████████| 181/181 [00:05<00:00, 32.29it/s]


Epoch 10/30, Train Loss: 0.6082, Val Loss: 0.6125


Epoch 11/30: 100%|██████████| 181/181 [00:05<00:00, 31.72it/s]


Epoch 11/30, Train Loss: 0.6075, Val Loss: 0.6123


Epoch 12/30: 100%|██████████| 181/181 [00:05<00:00, 32.16it/s]


Epoch 12/30, Train Loss: 0.6063, Val Loss: 0.6125


Epoch 13/30: 100%|██████████| 181/181 [00:05<00:00, 31.97it/s]


Epoch 13/30, Train Loss: 0.6047, Val Loss: 0.6113


Epoch 14/30: 100%|██████████| 181/181 [00:05<00:00, 31.82it/s]


Epoch 14/30, Train Loss: 0.6042, Val Loss: 0.6108


Epoch 15/30: 100%|██████████| 181/181 [00:05<00:00, 31.54it/s]


Epoch 15/30, Train Loss: 0.6035, Val Loss: 0.6109


Epoch 16/30: 100%|██████████| 181/181 [00:05<00:00, 32.20it/s]


Epoch 16/30, Train Loss: 0.6023, Val Loss: 0.6108


Epoch 17/30: 100%|██████████| 181/181 [00:05<00:00, 31.64it/s]


Epoch 17/30, Train Loss: 0.6015, Val Loss: 0.6122


Epoch 18/30: 100%|██████████| 181/181 [00:05<00:00, 31.90it/s]


Epoch 18/30, Train Loss: 0.5999, Val Loss: 0.6116


Epoch 19/30: 100%|██████████| 181/181 [00:05<00:00, 31.92it/s]


Epoch 19/30, Train Loss: 0.5966, Val Loss: 0.6121
Early stopping triggered after 19 epochs
Classification model training took: 112.29 seconds

Training ResNet regression model...


Epoch 1/30: 100%|██████████| 181/181 [00:05<00:00, 30.82it/s]


Epoch 1/30, Train Loss: 1614.9247, Val Loss: 1515.9708


Epoch 2/30: 100%|██████████| 181/181 [00:05<00:00, 31.99it/s]


Epoch 2/30, Train Loss: 1515.5970, Val Loss: 1480.4517


Epoch 3/30: 100%|██████████| 181/181 [00:05<00:00, 30.79it/s]


Epoch 3/30, Train Loss: 1509.2241, Val Loss: 1480.1886


Epoch 4/30: 100%|██████████| 181/181 [00:05<00:00, 31.03it/s]


Epoch 4/30, Train Loss: 1508.3967, Val Loss: 1479.7611


Epoch 5/30: 100%|██████████| 181/181 [00:05<00:00, 31.13it/s]


Epoch 5/30, Train Loss: 1508.4203, Val Loss: 1480.4699


Epoch 6/30: 100%|██████████| 181/181 [00:06<00:00, 28.14it/s]


Epoch 6/30, Train Loss: 1508.0417, Val Loss: 1479.7657


Epoch 7/30: 100%|██████████| 181/181 [00:06<00:00, 27.82it/s]


Epoch 7/30, Train Loss: 1508.1159, Val Loss: 1480.1516


Epoch 8/30: 100%|██████████| 181/181 [00:06<00:00, 29.51it/s]


Epoch 8/30, Train Loss: 1508.4824, Val Loss: 1480.9817


Epoch 9/30: 100%|██████████| 181/181 [00:05<00:00, 30.74it/s]


Epoch 9/30, Train Loss: 1506.9640, Val Loss: 1480.4020
Early stopping triggered after 9 epochs
Regression model training took: 56.22 seconds

Evaluating classification model...
Classification Accuracy: 66.37%
Classification ROC AUC: 0.7138

Evaluating regression model...
Regression Mean Squared Error: 1480.40
Regression Root Mean Squared Error: 38.48 minutes
Regression Mean Absolute Error: 22.38 minutes
Regression R² Score: -0.0006
Preprocessor saved to ./dep_delay_nn/year_2022\resnet_preprocessor_2022.joblib

ResNet model training for 2022 complete! Total processing time: 244.52 seconds

Model for year 2022 completed successfully!

Training ResNet model for year 2023

Processing May2023.csv...
Years found in data: [2023]
Months found in data: {5: 616630}
Filtered to only May data: 616630 rows
Filtered from 616630 to 220469 rows for top 30 airports
Removed 1293.0 cancelled flights, remaining: 219176
Processing took: 3.31 seconds

Matching weather data with flights...
Processed 20000/21

Epoch 1/30: 100%|██████████| 193/193 [00:06<00:00, 31.40it/s]


Epoch 1/30, Train Loss: 0.6241, Val Loss: 0.6020


Epoch 2/30: 100%|██████████| 193/193 [00:06<00:00, 31.19it/s]


Epoch 2/30, Train Loss: 0.6089, Val Loss: 0.5985


Epoch 3/30: 100%|██████████| 193/193 [00:06<00:00, 30.79it/s]


Epoch 3/30, Train Loss: 0.6049, Val Loss: 0.5969


Epoch 4/30: 100%|██████████| 193/193 [00:06<00:00, 31.23it/s]


Epoch 4/30, Train Loss: 0.6023, Val Loss: 0.5949


Epoch 5/30: 100%|██████████| 193/193 [00:06<00:00, 30.81it/s]


Epoch 5/30, Train Loss: 0.6006, Val Loss: 0.5936


Epoch 6/30: 100%|██████████| 193/193 [00:06<00:00, 31.58it/s]


Epoch 6/30, Train Loss: 0.5981, Val Loss: 0.5938


Epoch 7/30: 100%|██████████| 193/193 [00:06<00:00, 30.18it/s]


Epoch 7/30, Train Loss: 0.5966, Val Loss: 0.5912


Epoch 8/30: 100%|██████████| 193/193 [00:06<00:00, 30.51it/s]


Epoch 8/30, Train Loss: 0.5950, Val Loss: 0.5904


Epoch 9/30: 100%|██████████| 193/193 [00:06<00:00, 30.04it/s]


Epoch 9/30, Train Loss: 0.5936, Val Loss: 0.5916


Epoch 10/30: 100%|██████████| 193/193 [00:06<00:00, 29.78it/s]


Epoch 10/30, Train Loss: 0.5923, Val Loss: 0.5897


Epoch 11/30: 100%|██████████| 193/193 [00:06<00:00, 30.87it/s]


Epoch 11/30, Train Loss: 0.5904, Val Loss: 0.5880


Epoch 12/30: 100%|██████████| 193/193 [00:06<00:00, 30.82it/s]


Epoch 12/30, Train Loss: 0.5893, Val Loss: 0.5879


Epoch 13/30: 100%|██████████| 193/193 [00:06<00:00, 30.29it/s]


Epoch 13/30, Train Loss: 0.5882, Val Loss: 0.5883


Epoch 14/30: 100%|██████████| 193/193 [00:06<00:00, 30.05it/s]


Epoch 14/30, Train Loss: 0.5866, Val Loss: 0.5872


Epoch 15/30: 100%|██████████| 193/193 [00:06<00:00, 29.60it/s]


Epoch 15/30, Train Loss: 0.5860, Val Loss: 0.5865


Epoch 16/30: 100%|██████████| 193/193 [00:06<00:00, 30.72it/s]


Epoch 16/30, Train Loss: 0.5842, Val Loss: 0.5874


Epoch 17/30: 100%|██████████| 193/193 [00:06<00:00, 30.03it/s]


Epoch 17/30, Train Loss: 0.5842, Val Loss: 0.5863


Epoch 18/30: 100%|██████████| 193/193 [00:06<00:00, 29.91it/s]


Epoch 18/30, Train Loss: 0.5834, Val Loss: 0.5865


Epoch 19/30: 100%|██████████| 193/193 [00:06<00:00, 30.04it/s]


Epoch 19/30, Train Loss: 0.5824, Val Loss: 0.5842


Epoch 20/30: 100%|██████████| 193/193 [00:06<00:00, 29.30it/s]


Epoch 20/30, Train Loss: 0.5816, Val Loss: 0.5854


Epoch 21/30: 100%|██████████| 193/193 [00:06<00:00, 30.44it/s]


Epoch 21/30, Train Loss: 0.5804, Val Loss: 0.5871


Epoch 22/30: 100%|██████████| 193/193 [00:06<00:00, 30.73it/s]


Epoch 22/30, Train Loss: 0.5802, Val Loss: 0.5866


Epoch 23/30: 100%|██████████| 193/193 [00:06<00:00, 30.12it/s]


Epoch 23/30, Train Loss: 0.5789, Val Loss: 0.5861


Epoch 24/30: 100%|██████████| 193/193 [00:06<00:00, 29.69it/s]


Epoch 24/30, Train Loss: 0.5750, Val Loss: 0.5855
Early stopping triggered after 24 epochs
Classification model training took: 157.81 seconds

Training ResNet regression model...


Epoch 1/30: 100%|██████████| 193/193 [00:06<00:00, 28.14it/s]


Epoch 1/30, Train Loss: 1438.8872, Val Loss: 1408.2061


Epoch 2/30: 100%|██████████| 193/193 [00:06<00:00, 30.01it/s]


Epoch 2/30, Train Loss: 1380.4762, Val Loss: 1400.6304


Epoch 3/30: 100%|██████████| 193/193 [00:06<00:00, 30.15it/s]


Epoch 3/30, Train Loss: 1378.8529, Val Loss: 1399.6069


Epoch 4/30: 100%|██████████| 193/193 [00:06<00:00, 29.37it/s]


Epoch 4/30, Train Loss: 1378.4181, Val Loss: 1399.8388


Epoch 5/30: 100%|██████████| 193/193 [00:06<00:00, 29.22it/s]


Epoch 5/30, Train Loss: 1378.4068, Val Loss: 1400.0257


Epoch 6/30: 100%|██████████| 193/193 [00:06<00:00, 29.41it/s]


Epoch 6/30, Train Loss: 1378.4954, Val Loss: 1400.0787


Epoch 7/30: 100%|██████████| 193/193 [00:06<00:00, 30.23it/s]


Epoch 7/30, Train Loss: 1377.4609, Val Loss: 1400.0746


Epoch 8/30: 100%|██████████| 193/193 [00:06<00:00, 29.82it/s]


Epoch 8/30, Train Loss: 1377.1616, Val Loss: 1400.9655
Early stopping triggered after 8 epochs
Regression model training took: 54.41 seconds

Evaluating classification model...
Classification Accuracy: 69.31%
Classification ROC AUC: 0.7348

Evaluating regression model...
Regression Mean Squared Error: 1400.97
Regression Root Mean Squared Error: 37.43 minutes
Regression Mean Absolute Error: 20.97 minutes
Regression R² Score: -0.0014
Preprocessor saved to ./dep_delay_nn/year_2023\resnet_preprocessor_2023.joblib

ResNet model training for 2023 complete! Total processing time: 299.35 seconds

Model for year 2023 completed successfully!

Training ResNet model for year 2024

Processing May2024.csv...
Years found in data: [2024]
Months found in data: {5: 649428}
Filtered to only May data: 649428 rows
Filtered from 649428 to 228159 rows for top 30 airports
Removed 2994.0 cancelled flights, remaining: 225165
Processing took: 3.32 seconds

Matching weather data with flights...
Processed 20000/22

Epoch 1/30: 100%|██████████| 198/198 [00:06<00:00, 31.93it/s]


Epoch 1/30, Train Loss: 0.6289, Val Loss: 0.6099


Epoch 2/30: 100%|██████████| 198/198 [00:06<00:00, 31.69it/s]


Epoch 2/30, Train Loss: 0.6155, Val Loss: 0.6068


Epoch 3/30: 100%|██████████| 198/198 [00:06<00:00, 29.97it/s]


Epoch 3/30, Train Loss: 0.6118, Val Loss: 0.6050


Epoch 4/30: 100%|██████████| 198/198 [00:06<00:00, 30.96it/s]


Epoch 4/30, Train Loss: 0.6093, Val Loss: 0.6030


Epoch 5/30: 100%|██████████| 198/198 [00:06<00:00, 30.54it/s]


Epoch 5/30, Train Loss: 0.6068, Val Loss: 0.6026


Epoch 6/30: 100%|██████████| 198/198 [00:06<00:00, 30.96it/s]


Epoch 6/30, Train Loss: 0.6045, Val Loss: 0.6004


Epoch 7/30: 100%|██████████| 198/198 [00:06<00:00, 30.43it/s]


Epoch 7/30, Train Loss: 0.6032, Val Loss: 0.6000


Epoch 8/30: 100%|██████████| 198/198 [00:06<00:00, 29.59it/s]


Epoch 8/30, Train Loss: 0.6022, Val Loss: 0.6006


Epoch 9/30: 100%|██████████| 198/198 [00:06<00:00, 30.17it/s]


Epoch 9/30, Train Loss: 0.6008, Val Loss: 0.5990


Epoch 10/30: 100%|██████████| 198/198 [00:06<00:00, 30.31it/s]


Epoch 10/30, Train Loss: 0.5995, Val Loss: 0.5973


Epoch 11/30: 100%|██████████| 198/198 [00:06<00:00, 29.63it/s]


Epoch 11/30, Train Loss: 0.5977, Val Loss: 0.5973


Epoch 12/30: 100%|██████████| 198/198 [00:06<00:00, 28.73it/s]


Epoch 12/30, Train Loss: 0.5966, Val Loss: 0.5985


Epoch 13/30: 100%|██████████| 198/198 [00:06<00:00, 29.56it/s]


Epoch 13/30, Train Loss: 0.5956, Val Loss: 0.5991


Epoch 14/30: 100%|██████████| 198/198 [00:06<00:00, 29.32it/s]


Epoch 14/30, Train Loss: 0.5942, Val Loss: 0.5970


Epoch 15/30: 100%|██████████| 198/198 [00:06<00:00, 29.49it/s]


Epoch 15/30, Train Loss: 0.5936, Val Loss: 0.5975


Epoch 16/30: 100%|██████████| 198/198 [00:06<00:00, 28.54it/s]


Epoch 16/30, Train Loss: 0.5924, Val Loss: 0.5960


Epoch 17/30: 100%|██████████| 198/198 [00:07<00:00, 28.20it/s]


Epoch 17/30, Train Loss: 0.5920, Val Loss: 0.5967


Epoch 18/30: 100%|██████████| 198/198 [00:06<00:00, 29.20it/s]


Epoch 18/30, Train Loss: 0.5908, Val Loss: 0.5949


Epoch 19/30: 100%|██████████| 198/198 [00:06<00:00, 29.40it/s]


Epoch 19/30, Train Loss: 0.5899, Val Loss: 0.5962


Epoch 20/30: 100%|██████████| 198/198 [00:07<00:00, 27.70it/s]


Epoch 20/30, Train Loss: 0.5891, Val Loss: 0.5954


Epoch 21/30: 100%|██████████| 198/198 [00:07<00:00, 27.48it/s]


Epoch 21/30, Train Loss: 0.5882, Val Loss: 0.5961


Epoch 22/30: 100%|██████████| 198/198 [00:06<00:00, 28.64it/s]


Epoch 22/30, Train Loss: 0.5874, Val Loss: 0.5969


Epoch 23/30: 100%|██████████| 198/198 [00:06<00:00, 29.03it/s]


Epoch 23/30, Train Loss: 0.5829, Val Loss: 0.5980
Early stopping triggered after 23 epochs
Classification model training took: 159.80 seconds

Training ResNet regression model...


Epoch 1/30: 100%|██████████| 198/198 [00:07<00:00, 27.27it/s]


Epoch 1/30, Train Loss: 2772.2285, Val Loss: 2522.4074


Epoch 2/30: 100%|██████████| 198/198 [00:07<00:00, 27.92it/s]


Epoch 2/30, Train Loss: 2566.7119, Val Loss: 2420.0447


Epoch 3/30: 100%|██████████| 198/198 [00:07<00:00, 28.00it/s]


Epoch 3/30, Train Loss: 2529.7128, Val Loss: 2418.7185


Epoch 4/30: 100%|██████████| 198/198 [00:06<00:00, 28.89it/s]


Epoch 4/30, Train Loss: 2529.0551, Val Loss: 2419.2740


Epoch 5/30: 100%|██████████| 198/198 [00:06<00:00, 29.27it/s]


Epoch 5/30, Train Loss: 2529.8910, Val Loss: 2418.2405


Epoch 6/30: 100%|██████████| 198/198 [00:07<00:00, 28.27it/s]


Epoch 6/30, Train Loss: 2528.7097, Val Loss: 2418.1348


Epoch 7/30: 100%|██████████| 198/198 [00:07<00:00, 27.76it/s]


Epoch 7/30, Train Loss: 2528.3561, Val Loss: 2419.7749


Epoch 8/30: 100%|██████████| 198/198 [00:06<00:00, 29.19it/s]


Epoch 8/30, Train Loss: 2528.6236, Val Loss: 2419.3810


Epoch 9/30: 100%|██████████| 198/198 [00:06<00:00, 28.54it/s]


Epoch 9/30, Train Loss: 2527.9915, Val Loss: 2419.0446


Epoch 10/30: 100%|██████████| 198/198 [00:06<00:00, 28.46it/s]


Epoch 10/30, Train Loss: 2526.5560, Val Loss: 2420.0756


Epoch 11/30: 100%|██████████| 198/198 [00:07<00:00, 27.72it/s]


Epoch 11/30, Train Loss: 2526.2844, Val Loss: 2420.8351
Early stopping triggered after 11 epochs
Regression model training took: 79.93 seconds

Evaluating classification model...
Classification Accuracy: 67.88%
Classification ROC AUC: 0.7419

Evaluating regression model...
Regression Mean Squared Error: 2420.84
Regression Root Mean Squared Error: 49.20 minutes
Regression Mean Absolute Error: 29.74 minutes
Regression R² Score: -0.0012
Preprocessor saved to ./dep_delay_nn/year_2024\resnet_preprocessor_2024.joblib

ResNet model training for 2024 complete! Total processing time: 328.36 seconds

Model for year 2024 completed successfully!

Comparing ResNet models across years...
Comparison summary saved to ./dep_delay_nn/comparison\dep_delay_nn_summary.csv
ResNet model comparison completed!

Year-by-Year ResNet Model Training Summary:

Year 2021:
  Total flights: 171,382
  Classification accuracy: 72.71%
  Classification AUC: 0.7215
  Regression RMSE: 28.70 minutes
  Regression R²: -0.0035
