In [9]:
import pandas as pd
import numpy as np
import glob
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from joblib import dump
import json
import warnings
warnings.filterwarnings('ignore')

# 添加转换函数用于JSON序列化
def convert_to_serializable(obj):
    """
    Convert NumPy types to Python native types for JSON serialization
    """
    if isinstance(obj, (np.integer, np.int64, np.int32, np.int16, np.int8)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32, np.float16)):
        return float(obj)
    elif isinstance(obj, (np.ndarray,)):
        return obj.tolist()
    elif isinstance(obj, (pd.DataFrame,)):
        return obj.to_dict('records')
    elif isinstance(obj, (pd.Series,)):
        return obj.to_dict()
    elif isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    else:
        return obj

# Set data paths
flight_data_path = './cleaned_data/'
weather_data_path = './cleaned_weather_data/'
top_airports_file = './top_100_airports.csv'  # File containing top 100 airports
output_dir = './rf_arrival_delay_models/'  # 更改为降落延迟模型的输出目录

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print("Starting arrival delay prediction models (Random Forest)...")
print(f"Flight data directory: {flight_data_path}")
print(f"Weather data directory: {weather_data_path}")
print(f"Top airports file: {top_airports_file}")
print(f"Model output directory: {output_dir}")

# Load top 30 airports from the top 100 airports file
try:
    # Load the airport data with the exact format provided
    top_airports = pd.read_csv(top_airports_file, low_memory=False)
    
    # The file already has a Rank column, so we can just take the top 30
    top_airports = top_airports.head(30)
    
    # The airport codes are in ORIGIN_IATA column
    top_airport_codes = set(top_airports['ORIGIN_IATA'].str.strip().tolist())
    
    print(f"Loaded top 30 airports: {', '.join(sorted(top_airport_codes))}")
    print(f"Busiest airport: {top_airports.iloc[0]['ORIGIN_IATA']} with {top_airports.iloc[0]['Times']} flights")
    print(f"30th busiest airport: {top_airports.iloc[29]['ORIGIN_IATA']} with {top_airports.iloc[29]['Times']} flights")
except Exception as e:
    print(f"Error loading top airports file: {e}")
    # Fallback: if file doesn't exist, we'll use all airports
    top_airport_codes = None
    print("Will process all airports (top airports file not available)")

Starting arrival delay prediction models (Random Forest)...
Flight data directory: ./cleaned_data/
Weather data directory: ./cleaned_weather_data/
Top airports file: ./top_100_airports.csv
Model output directory: ./rf_arrival_delay_models/
Loaded top 30 airports: ATL, AUS, BNA, BOS, BWI, CLT, DCA, DEN, DFW, DTW, EWR, FLL, IAD, IAH, JFK, LAS, LAX, LGA, MCO, MDW, MIA, MSP, ORD, PHL, PHX, SAN, SEA, SFO, SLC, TPA
Busiest airport: ATL with 457121 flights
30th busiest airport: TPA with 97235 flights


In [10]:
# Function to load weather data - adjusted for the new format ABI_2021_Aug.csv
def load_weather_data():
    print("\nLoading weather data...")
    start_time = time.time()
    
    all_files = glob.glob(os.path.join(weather_data_path, "*.csv"))
    print(f"Found {len(all_files)} total weather data files")
    weather_dict = {}
    count = 0
    matching_count = 0
    
    # Process all weather files
    for file in all_files:
        try:
            # Extract airport code and date information from filename
            filename = os.path.basename(file)
            parts = filename.split('.')[0].split('_')
            
            if len(parts) >= 3:
                iata = parts[0]  # Airport code (e.g., ABI)
                year = parts[1]  # Year (e.g., 2021)
                month_name = parts[2]  # Month name (e.g., Aug)
                
                # Convert month name to number
                month_map = {
                    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
                    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
                    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
                }
                
                if month_name in month_map:
                    month = month_map[month_name]
                    
                    # Only continue with top airports if we have a list
                    if top_airport_codes is None or iata in top_airport_codes:
                        # Read the weather data
                        weather_data = pd.read_csv(file, low_memory=False)
                        
                        # Ensure DATE column exists
                        if 'DATE' not in weather_data.columns:
                            print(f"Warning: DATE column not found in {filename}")
                            continue
                        
                        # Convert DATE to datetime
                        weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
                        
                        # Create the key for the weather dictionary
                        key = f"{iata}_{year}_{month}"
                        
                        # Store the weather data
                        weather_dict[key] = weather_data
                        matching_count += 1
                else:
                    print(f"Warning: Unknown month format in {filename}")
                
                count += 1
                
                # Print progress periodically
                if count % 100 == 0:
                    print(f"Processed {count} weather files, loaded {matching_count} matching files")
        except Exception as e:
            print(f"Error loading weather file {file}: {e}")
    
    print(f"Loaded {matching_count} weather files out of {count} processed files")
    print(f"Loading weather data took: {time.time() - start_time:.2f} seconds")
    return weather_dict

# Get specific May files from the cleaned_data directory
def get_may_files():
    may_files = [
        os.path.join(flight_data_path, "May2021.csv"),
        os.path.join(flight_data_path, "May2022.csv"),
        os.path.join(flight_data_path, "May2023.csv"),
        os.path.join(flight_data_path, "May2024.csv")
    ]
    
    # Verify each file exists
    existing_files = []
    for file_path in may_files:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            print(f"Warning: File {file_path} not found")
    
    return existing_files

# Get the May 2021-2024 flight data files
flight_files = get_may_files()
print(f"\nFound {len(flight_files)} May files to process:")
for f in flight_files:
    print(f"  - {os.path.basename(f)}")

if not flight_files:
    print("No May 2021-2024 files were found. Please check file paths.")
    exit(1)

# Load all weather data once
weather_dict = load_weather_data()

# Function to extract year from filename
def extract_year_from_filename(filename):
    # Extract year from 'May2021.csv', 'May2022.csv', etc.
    base_name = os.path.basename(filename)
    year_str = base_name.replace('May', '').split('.')[0]
    return int(year_str)

# Function to create late-night arrival indicator
def create_late_night_arrival_indicator(df):
    """
    Creates a binary indicator for late-night arrivals (22-06 scheduled arrival time)
    
    Args:
        df: DataFrame containing flight data with SCH_ARR_TIME
        
    Returns:
        DataFrame with IS_LATE_NIGHT_ARR column added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Initialize IS_LATE_NIGHT_ARR to 0 (not a late-night arrival)
    df['IS_LATE_NIGHT_ARR'] = 0
    
    # Check for SCH_ARR_TIME
    if 'SCH_ARR_TIME' not in df.columns:
        print("Warning: SCH_ARR_TIME column not found")
        return df
    
    # Convert time column to standard format if needed
    if df['SCH_ARR_TIME'].dtype != 'float64':
        try:
            # Handle any non-numeric values
            df['SCH_ARR_TIME'] = pd.to_numeric(df['SCH_ARR_TIME'], errors='coerce')
        except:
            print(f"Warning: Could not convert SCH_ARR_TIME to numeric")
    
    # Identify late-night arrivals (22:00-06:00)
    late_night_arrival = ((df['SCH_ARR_TIME'] >= 2200) | (df['SCH_ARR_TIME'] < 600))
    df.loc[late_night_arrival, 'IS_LATE_NIGHT_ARR'] = 1
    
    # Count arrivals identified as late-night
    late_night_count = late_night_arrival.sum()
    total_count = len(df)
    print(f"Identified {late_night_count} late-night arrivals (22:00-06:00)")
    print(f"Total identified late-night arrivals: {late_night_count} out of {total_count} total flights ({late_night_count/total_count*100:.2f}%)")
    
    # Add a more detailed arrival time-of-day categorical feature
    df['ARR_TIME_OF_DAY'] = pd.cut(
        df['SCH_ARR_TIME'], 
        bins=[0, 600, 1200, 1800, 2200, 2400],
        labels=['Early Morning (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-22)', 'Night (22-24)'],
        include_lowest=True
    )
    
    # Print distribution of flights by arrival time of day
    time_dist = df['ARR_TIME_OF_DAY'].value_counts()
    print("\nDistribution of flights by arrival time of day:")
    for time_cat, count in time_dist.items():
        print(f"  - {time_cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    return df

# Function to prepare arrival delay data - focus on ARR_DELAY
def prepare_arrival_delay_data(df):
    """
    Prepares arrival delay data for modeling
    
    Args:
        df: DataFrame containing flight data with ARR_DELAY column
        
    Returns:
        DataFrame with additional arrival delay-related columns
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Ensure ARR_DELAY is numeric
    if 'ARR_DELAY' in df.columns:
        if df['ARR_DELAY'].dtype != 'float64':
            try:
                df['ARR_DELAY'] = pd.to_numeric(df['ARR_DELAY'], errors='coerce')
            except:
                print(f"Warning: Could not convert ARR_DELAY to numeric")
    else:
        print("Warning: ARR_DELAY column not found in dataset")
        return df
    
    # Create a binary feature for on-time arrival (<=0 means on time or early)
    df['IS_ARR_DELAYED'] = (df['ARR_DELAY'] > 0).astype(int)
    
    # Create a categorical arrival delay feature
    df['ARR_DELAY_CATEGORY'] = pd.cut(
        df['ARR_DELAY'],
        bins=[-float('inf'), -15, 0, 15, 30, 60, 120, float('inf')],
        labels=['Very Early', 'Early', 'On Time', 'Slight Delay', 'Moderate Delay', 
                'Significant Delay', 'Severe Delay'],
        include_lowest=True
    )
    
    # Add absolute arrival delay (for prediction error metrics)
    df['ABS_ARR_DELAY'] = np.abs(df['ARR_DELAY'])
    
    # Print arrival delay statistics
    delay_count = df['IS_ARR_DELAYED'].sum()
    total_count = len(df)
    delay_rate = delay_count / total_count * 100
    
    print(f"\nArrival delay statistics:")
    print(f"Delayed arrivals: {delay_count}/{total_count} ({delay_rate:.2f}%)")
    print(f"On-time or early arrivals: {total_count - delay_count}/{total_count} ({100 - delay_rate:.2f}%)")
    
    print("\nArrival delay magnitude statistics:")
    print(f"Mean arrival delay: {df['ARR_DELAY'].mean():.2f} minutes")
    print(f"Median arrival delay: {df['ARR_DELAY'].median():.2f} minutes")
    print(f"Min arrival delay: {df['ARR_DELAY'].min():.2f} minutes (negative means early arrival)")
    print(f"Max arrival delay: {df['ARR_DELAY'].max():.2f} minutes")
    
    # Print arrival delay category distribution
    delay_cat_dist = df['ARR_DELAY_CATEGORY'].value_counts()
    print("\nArrival delay category distribution:")
    for cat, count in delay_cat_dist.sort_index().items():
        print(f"  - {cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    # Create a feature to indicate if arrival delay is worse than departure delay
    if 'DEP_DELAY' in df.columns:
        df['ARR_WORSE_THAN_DEP'] = ((df['ARR_DELAY'] - df['DEP_DELAY']) > 0).astype(int)
        worse_count = df['ARR_WORSE_THAN_DEP'].sum()
        print(f"\nFlights with arrival delay worse than departure delay: {worse_count}/{total_count} ({worse_count/total_count*100:.2f}%)")
    
    return df

# Function to create arrival time block features
def create_arrival_time_block_features(df):
    """
    Creates time block features from scheduled arrival times
    
    Args:
        df: DataFrame containing flight data with SCH_ARR_TIME
        
    Returns:
        DataFrame with arrival time block features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    if 'SCH_ARR_TIME' not in df.columns:
        print("Warning: SCH_ARR_TIME column not found for arrival time block features")
        return df
    
    # Ensure SCH_ARR_TIME is numeric
    if df['SCH_ARR_TIME'].dtype != 'float64':
        try:
            df['SCH_ARR_TIME'] = pd.to_numeric(df['SCH_ARR_TIME'], errors='coerce')
        except:
            print(f"Warning: Could not convert SCH_ARR_TIME to numeric")
            return df
    
    # Extract hour from SCH_ARR_TIME (time format is HHMM)
    df['ARR_HOUR'] = (df['SCH_ARR_TIME'] / 100).astype(int)
    
    # Create time blocks (each block is 3 hours)
    time_blocks = {
        0: 'Late Night (0-3)',
        1: 'Late Night (0-3)',
        2: 'Late Night (0-3)',
        3: 'Early Morning (3-6)',
        4: 'Early Morning (3-6)',
        5: 'Early Morning (3-6)',
        6: 'Morning (6-9)',
        7: 'Morning (6-9)',
        8: 'Morning (6-9)',
        9: 'Mid-Day (9-12)',
        10: 'Mid-Day (9-12)',
        11: 'Mid-Day (9-12)',
        12: 'Afternoon (12-15)',
        13: 'Afternoon (12-15)',
        14: 'Afternoon (12-15)',
        15: 'Evening (15-18)',
        16: 'Evening (15-18)',
        17: 'Evening (15-18)',
        18: 'Night (18-21)',
        19: 'Night (18-21)',
        20: 'Night (18-21)',
        21: 'Late Night (21-24)',
        22: 'Late Night (21-24)',
        23: 'Late Night (21-24)'
    }
    
    # Map hours to time blocks
    df['ARR_TIME_BLOCK'] = df['ARR_HOUR'].map(time_blocks)
    
    # Create binary variables for peak arrival times
    # Morning rush (8-10 AM arrivals)
    df['IS_MORNING_RUSH_ARR'] = ((df['ARR_HOUR'] >= 8) & (df['ARR_HOUR'] <= 10)).astype(int)
    
    # Evening rush (17-19 PM arrivals)
    df['IS_EVENING_RUSH_ARR'] = ((df['ARR_HOUR'] >= 17) & (df['ARR_HOUR'] <= 19)).astype(int)
    
    return df

# Function to create day of week features - updated for text day format (Sun, Mon, etc.)
def create_day_features(df):
    """
    Creates day type features from text day names (Sun, Mon, etc.)
    
    Args:
        df: DataFrame containing flight data with WEEK column as text day names
        
    Returns:
        DataFrame with day features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Check if we have the WEEK column with text day names
    if 'WEEK' in df.columns:
        # Create a mapping from abbreviated day names to full day names
        day_name_map = {
            'Sun': 'Sunday',
            'Mon': 'Monday',
            'Tue': 'Tuesday',
            'Wed': 'Wednesday',
            'Thu': 'Thursday',
            'Fri': 'Friday',
            'Sat': 'Saturday'
        }
        
        # Map abbreviated names to full names
        df['DAY_NAME'] = df['WEEK'].map(day_name_map)
        
        # Create weekend indicator
        df['IS_WEEKEND'] = df['WEEK'].isin(['Sat', 'Sun']).astype(int)
        
        # Print distribution of days
        day_counts = df['DAY_NAME'].value_counts()
        total = len(df)
        print("\nDistribution of flights by day of week:")
        for day, count in day_counts.items():
            print(f"  - {day}: {count} flights ({count/total*100:.2f}%)")
        
        # Print weekend vs. weekday distribution
        weekend_count = df['IS_WEEKEND'].sum()
        weekday_count = total - weekend_count
        print(f"\nWeekend flights: {weekend_count} ({weekend_count/total*100:.2f}%)")
        print(f"Weekday flights: {weekday_count} ({weekday_count/total*100:.2f}%)")
        
    elif 'DAY_OF_WEEK' in df.columns:
        # Assuming 1=Monday, ..., 7=Sunday or 0=Monday, ..., 6=Sunday
        max_day = df['DAY_OF_WEEK'].max()
        
        if max_day == 7:
            # 1-7 format (6,7 = weekend)
            df['IS_WEEKEND'] = ((df['DAY_OF_WEEK'] == 6) | (df['DAY_OF_WEEK'] == 7)).astype(int)
            
            # Map day numbers to names for better interpretability
            day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 
                        4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'}
        else:
            # 0-6 format (5,6 = weekend)
            df['IS_WEEKEND'] = ((df['DAY_OF_WEEK'] == 5) | (df['DAY_OF_WEEK'] == 6)).astype(int)
            
            # Map day numbers to names for better interpretability
            day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 
                        3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
        
        df['DAY_NAME'] = df['DAY_OF_WEEK'].map(day_names)
    else:
        print("Warning: No day of week column (WEEK or DAY_OF_WEEK) found")
    
    return df

# Function to create flight duration features
def create_flight_duration_features(df):
    """
    Creates features related to flight duration and arrival time
    
    Args:
        df: DataFrame containing flight data
        
    Returns:
        DataFrame with flight duration features added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Calculate scheduled flight duration if possible
    if 'SCH_ARR_TIME' in df.columns and 'SCH_DEP_TIME' in df.columns:
        try:
            # Convert to numeric if needed
            for col in ['SCH_ARR_TIME', 'SCH_DEP_TIME']:
                if df[col].dtype != 'float64':
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            
            # Calculate scheduled flight duration (handle overnight flights)
            df['SCH_DURATION'] = ((df['SCH_ARR_TIME'] - df['SCH_DEP_TIME']) % 2400)
            
            # Fix cases where arrival is before departure (overnight flights)
            # If we get a negative or very large value, add 2400 (24 hours)
            mask = df['SCH_DURATION'] < 0
            df.loc[mask, 'SCH_DURATION'] += 2400
            
            # Convert HHMM format to minutes (e.g., 130 = 1 hour 30 minutes = 90 minutes)
            df['SCH_DURATION_MINS'] = (df['SCH_DURATION'] // 100) * 60 + (df['SCH_DURATION'] % 100)
            
            # Calculate actual flight duration if possible
            if 'ACT_ARR_TIME' in df.columns and 'ACT_DEP_TIME' in df.columns:
                for col in ['ACT_ARR_TIME', 'ACT_DEP_TIME']:
                    if df[col].dtype != 'float64':
                        df[col] = pd.to_numeric(df[col], errors='coerce')
                
                df['ACT_DURATION'] = ((df['ACT_ARR_TIME'] - df['ACT_DEP_TIME']) % 2400)
                mask = df['ACT_DURATION'] < 0
                df.loc[mask, 'ACT_DURATION'] += 2400
                
                df['ACT_DURATION_MINS'] = (df['ACT_DURATION'] // 100) * 60 + (df['ACT_DURATION'] % 100)
                
                # Calculate duration difference (negative means flight was faster than scheduled)
                df['DURATION_DIFF'] = df['ACT_DURATION_MINS'] - df['SCH_DURATION_MINS']
                
                # Print statistics
                print("\nFlight duration statistics:")
                print(f"Mean scheduled duration: {df['SCH_DURATION_MINS'].mean():.1f} minutes")
                print(f"Mean actual duration: {df['ACT_DURATION_MINS'].mean():.1f} minutes")
                print(f"Mean duration difference: {df['DURATION_DIFF'].mean():.1f} minutes")
                print(f"Flights faster than scheduled: {(df['DURATION_DIFF'] < 0).sum()} ({(df['DURATION_DIFF'] < 0).mean()*100:.1f}%)")
                print(f"Flights slower than scheduled: {(df['DURATION_DIFF'] > 0).sum()} ({(df['DURATION_DIFF'] > 0).mean()*100:.1f}%)")
                
                # Create duration deviation categories
                df['DURATION_DEVIATION'] = pd.cut(
                    df['DURATION_DIFF'],
                    bins=[-float('inf'), -15, -5, 5, 15, 30, float('inf')],
                    labels=['Much Faster', 'Faster', 'On Schedule', 'Slower', 'Much Slower', 'Extremely Slower']
                )
                
                # Print duration deviation distribution
                dev_dist = df['DURATION_DEVIATION'].value_counts()
                print("\nFlight duration deviation distribution:")
                for dev, count in dev_dist.sort_index().items():
                    print(f"  - {dev}: {count} flights ({count/len(df)*100:.2f}%)")
        except Exception as e:
            print(f"Error calculating flight durations: {e}")
    
    # Create flight distance categories if DISTANCE is available
    if 'DISTANCE' in df.columns:
        df['FLIGHT_DISTANCE_CAT'] = pd.cut(
            df['DISTANCE'],
            bins=[0, 300, 600, 1000, 1500, float('inf')],
            labels=['Very Short (<300 mi)', 'Short (300-600 mi)', 'Medium (600-1000 mi)', 
                   'Long (1000-1500 mi)', 'Very Long (>1500 mi)']
        )
        
        # Print flight distance distribution
        dist_dist = df['FLIGHT_DISTANCE_CAT'].value_counts()
        print("\nFlight distance distribution:")
        for dist, count in dist_dist.sort_index().items():
            print(f"  - {dist}: {count} flights ({count/len(df)*100:.2f}%)")
    
    return df

# Function to match weather data to flights for origin airports (departure weather)
def match_weather_data(df):
    """
    Match weather data to flight records for origin airports
    
    Args:
        df: DataFrame containing flight data
        
    Returns:
        DataFrame with origin weather data added
    """
    print("\nMatching origin weather data with flights...")
    start_time = time.time()
    
    # Make sure necessary date columns exist
    date_columns_exist = all(col in df.columns for col in ['YEAR', 'MONTH', 'DAY'])
    if not date_columns_exist:
        print("Warning: Missing one or more date columns (YEAR, MONTH, DAY)")
        print("Weather data cannot be matched")
        return df
    
    # Create a date column for matching - convert to datetime
    df['FLIGHT_DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])
    
    # Create a column to hold the weather key pattern
    df['WEATHER_KEY'] = df['ORIGIN_IATA'] + '_' + df['YEAR'].astype(str) + '_' + df['MONTH'].astype(str).str.zfill(2)
    
    # Create columns for weather features
    weather_columns = ['EXTREME_WEATHER', 'PRCP', 'WT01', 'WT03', 'WT04', 'WT05', 'WT08', 'WT11']
    for col in weather_columns:
        if col not in df.columns:
            df[col] = 0.0
    
    # Process in batches
    matched_count = 0
    batch_size = 10000
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx]
        
        for idx, row in batch.iterrows():
            try:
                # Get the weather key based on the origin airport and date
                weather_key = row['WEATHER_KEY']
                flight_date = row['FLIGHT_DATE']
                
                # Check if this key exists in our weather dictionary
                if weather_key in weather_dict:
                    weather_data = weather_dict[weather_key]
                    
                    # Find matching weather data for the flight date
                    matching_weather = weather_data[weather_data['DATE'] == flight_date]
                    
                    if not matching_weather.empty:
                        # Match available weather columns for origin
                        for col in weather_columns:
                            if col in matching_weather.columns:
                                df.at[idx, col] = matching_weather[col].iloc[0]
                        matched_count += 1
            except Exception as e:
                # Less verbose error reporting for speed
                pass
        
        # Print progress
        print(f"Processed {end_idx}/{len(df)} rows, matched {matched_count} flights with origin weather data")
    
    print(f"Matched origin weather data for {matched_count} flights ({matched_count/len(df)*100:.2f}%)")
    print(f"Origin weather matching took: {time.time() - start_time:.2f} seconds")
    
    return df

# Function to match weather data to flights for destination airports (for arrival delays)
def match_destination_weather_data(df):
    """
    Match weather data to flight destination records for arrival delay analysis
    
    Args:
        df: DataFrame containing flight data
        
    Returns:
        DataFrame with destination weather data added
    """
    print("\nMatching destination weather data with flights...")
    start_time = time.time()
    
    # Make sure necessary date columns exist
    date_columns_exist = all(col in df.columns for col in ['YEAR', 'MONTH', 'DAY'])
    if not date_columns_exist:
        print("Warning: Missing one or more date columns (YEAR, MONTH, DAY)")
        print("Weather data cannot be matched")
        return df
    
    # Create a date column for matching - convert to datetime
    if 'FLIGHT_DATE' not in df.columns:
        df['FLIGHT_DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])
    
    # Create a column to hold the destination weather key pattern
    df['DEST_WEATHER_KEY'] = df['DEST_IATA'] + '_' + df['YEAR'].astype(str) + '_' + df['MONTH'].astype(str).str.zfill(2)
    
    # Create columns for weather features
    weather_columns = ['EXTREME_WEATHER', 'PRCP', 'WT01', 'WT03', 'WT04', 'WT05', 'WT08', 'WT11']
    for col in weather_columns:
        if f'DEST_{col}' not in df.columns:
            df[f'DEST_{col}'] = 0.0
    
    # Process in batches
    matched_count = 0
    batch_size = 10000
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx]
        
        for idx, row in batch.iterrows():
            try:
                # Get the weather key based on the destination airport and date
                weather_key = row['DEST_WEATHER_KEY']
                flight_date = row['FLIGHT_DATE']
                
                # Check if this key exists in our weather dictionary
                if weather_key in weather_dict:
                    weather_data = weather_dict[weather_key]
                    
                    # Find matching weather data for the flight date
                    matching_weather = weather_data[weather_data['DATE'] == flight_date]
                    
                    if not matching_weather.empty:
                        # Match available weather columns for destination
                        for col in weather_columns:
                            if col in matching_weather.columns:
                                df.at[idx, f'DEST_{col}'] = matching_weather[col].iloc[0]
                        matched_count += 1
            except Exception as e:
                # Less verbose error reporting for speed
                pass
        
        # Print progress
        print(f"Processed {end_idx}/{len(df)} rows, matched {matched_count} flights with destination weather data")
    
    print(f"Matched destination weather data for {matched_count} flights ({matched_count/len(df)*100:.2f}%)")
    print(f"Destination weather matching took: {time.time() - start_time:.2f} seconds")
    
    return df

# Function to load and preprocess a single flight data file
def load_and_process_flight_data(file_path):
    """
    Load and preprocess a single flight data file
    
    Args:
        file_path: Path to the flight data file
        
    Returns:
        DataFrame with processed flight data
    """
    print(f"\nProcessing {os.path.basename(file_path)}...")
    start_time = time.time()
    
    try:
        # Load flight data
        df = pd.read_csv(file_path, low_memory=False)
        original_size = len(df)
        
        # Extract year from filename
        file_year = extract_year_from_filename(file_path)
        
        # Ensure the year is properly set
        if 'YEAR' in df.columns:
            # Verify that the year in the data matches the filename
            unique_years = df['YEAR'].unique()
            print(f"Years found in data: {unique_years}")
            
            # If data has multiple years, filter to only the year from filename
            if len(unique_years) > 1:
                df = df[df['YEAR'] == file_year]
                print(f"Filtered to only year {file_year}: {len(df)} rows")
        else:
            # If no YEAR column exists, create one based on filename
            df['YEAR'] = file_year
            print(f"Added YEAR column with value {file_year}")
        
        # Ensure we only have May data
        if 'MONTH' in df.columns:
            month_counts = df['MONTH'].value_counts()
            print(f"Months found in data: {dict(month_counts)}")
            
            if 5 in month_counts:
                df = df[df['MONTH'] == 5]
                print(f"Filtered to only May data: {len(df)} rows")
            else:
                print(f"Warning: No May data found in file, but proceeding anyway as this should be May data based on filename")
        
        # Check for ARR_DELAY column (required for arrival delay prediction)
        if 'ARR_DELAY' not in df.columns:
            print(f"ARR_DELAY column not found in {os.path.basename(file_path)}. Skipping file.")
            return None
        
        # Filter for top airports if we have the list
        if top_airport_codes is not None:
            df = df[
                df['ORIGIN_IATA'].str.strip().isin(top_airport_codes) & 
                df['DEST_IATA'].str.strip().isin(top_airport_codes)
            ]
            
            filtered_size = len(df)
            print(f"Filtered from {original_size} to {filtered_size} rows for top 30 airports")
            
            # If no data left after filtering, skip this file
            if filtered_size == 0:
                print(f"No data remaining after filtering for top 30 airports. Skipping file.")
                return None
        
        # Remove cancelled flights since they don't have actual arrival times
        if 'CANCELLED' in df.columns:
            cancelled_count = df['CANCELLED'].sum()
            if cancelled_count > 0:
                df = df[df['CANCELLED'] == 0]
                print(f"Removed {cancelled_count} cancelled flights, remaining: {len(df)}")
        
        # Remove diverted flights since they have special handling
        if 'DIVERTED' in df.columns:
            diverted_count = df['DIVERTED'].sum()
            if diverted_count > 0:
                df = df[df['DIVERTED'] == 0]
                print(f"Removed {diverted_count} diverted flights, remaining: {len(df)}")
        
        # Verify we have the required arrival time columns
        required_columns = ['SCH_ARR_TIME', 'ARR_DELAY']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Missing required arrival time columns: {missing_columns}. Skipping file.")
            return None
        
        print(f"Processing took: {time.time() - start_time:.2f} seconds")
        return df
        
    except Exception as e:
        print(f"Error processing file {os.path.basename(file_path)}: {e}")
        return None

# Function to plot feature importances (for random forest)
def plot_feature_importance(model, feature_names, year, top_n=15, output_path=None, model_type='classification'):
    """
    Visualize the feature importances of a random forest model
    
    Args:
        model: Trained random forest model
        feature_names: Names of the features
        year: Year of the model (for title)
        top_n: Number of top features to show
        output_path: Path to save the plot
        model_type: 'classification' or 'regression'
        
    Returns:
        DataFrame with feature importance data
    """
    # Extract feature importances
    importances = model.feature_importances_
    
    # Create DataFrame with features and importances
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })
    
    # Sort by importance
    importance_df = importance_df.sort_values('Importance', ascending=False)
    
    # Take top N features
    top_features = importance_df.head(top_n)
    
    # Plot
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=top_features)
    
    plt.title(f'Top {top_n} Most Important Features for Arrival Delay (Random Forest {model_type.title()} - {year})')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    
    plt.tight_layout()
    
    if output_path:
        plt.savefig(output_path)
        print(f"Feature importance plot saved to {output_path}")
    
    plt.close()
    
    return importance_df

# Function to analyze the correlation between departure and arrival delays
def analyze_delay_correlation(df, year, output_dir):
    """
    Analyze the correlation between departure and arrival delays
    
    Args:
        df: DataFrame containing flight data with DEP_DELAY and ARR_DELAY
        year: Year of the data
        output_dir: Directory to save output plots
        
    Returns:
        Dictionary with correlation statistics
    """
    print("\nAnalyzing correlation between departure and arrival delays...")
    
    if 'DEP_DELAY' not in df.columns or 'ARR_DELAY' not in df.columns:
        print("Cannot analyze correlation: missing DEP_DELAY or ARR_DELAY columns")
        return {}
    
    # Calculate correlation
    delay_corr = df[['DEP_DELAY', 'ARR_DELAY']].corr().iloc[0, 1]
    print(f"Correlation between departure and arrival delays: {delay_corr:.4f}")
    
    # Calculate difference statistics
    df['DELAY_DIFF'] = df['ARR_DELAY'] - df['DEP_DELAY']
    mean_diff = df['DELAY_DIFF'].mean()
    median_diff = df['DELAY_DIFF'].median()
    
    print(f"Mean difference (ARR_DELAY - DEP_DELAY): {mean_diff:.2f} minutes")
    print(f"Median difference: {median_diff:.2f} minutes")
    print(f"Flights where arrival delay > departure delay: {(df['DELAY_DIFF'] > 0).sum()} ({(df['DELAY_DIFF'] > 0).mean()*100:.1f}%)")
    print(f"Flights where arrival delay < departure delay: {(df['DELAY_DIFF'] < 0).sum()} ({(df['DELAY_DIFF'] < 0).mean()*100:.1f}%)")
    
    # Create plots
    # 1. Scatter plot of departure vs arrival delay
    plt.figure(figsize=(10, 8))
    
    # Sample if there are too many points
    max_points = 5000
    if len(df) > max_points:
        sample_df = df.sample(max_points, random_state=42)
    else:
        sample_df = df
    
    plt.scatter(sample_df['DEP_DELAY'], sample_df['ARR_DELAY'], alpha=0.3)
    
    # Add perfect correlation line (y=x)
    max_delay = max(sample_df['DEP_DELAY'].max(), sample_df['ARR_DELAY'].max())
    min_delay = min(sample_df['DEP_DELAY'].min(), sample_df['ARR_DELAY'].min())
    plt.plot([min_delay, max_delay], [min_delay, max_delay], 'r--', label='Perfect Correlation')
    
    # Add a regression line
    # Calculate regression coefficients
    from scipy import stats
    slope, intercept, r_value, p_value, std_err = stats.linregress(sample_df['DEP_DELAY'], sample_df['ARR_DELAY'])
    plt.plot([min_delay, max_delay], [intercept + slope * min_delay, intercept + slope * max_delay], 
             'g-', label=f'Regression Line (y = {slope:.2f}x + {intercept:.2f})')
    
    plt.xlabel('Departure Delay (minutes)')
    plt.ylabel('Arrival Delay (minutes)')
    plt.title(f'Departure vs. Arrival Delay ({year}, r = {delay_corr:.4f})')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, f'dep_vs_arr_delay_scatter_{year}.png'))
    plt.close()
    
    # 2. Histogram of delay differences
    plt.figure(figsize=(12, 6))
    
    # Limit the range for better visualization
    df_filtered = df[(df['DELAY_DIFF'] >= -60) & (df['DELAY_DIFF'] <= 60)]
    sns.histplot(df_filtered['DELAY_DIFF'], bins=50, kde=True)
    
    plt.axvline(0, color='red', linestyle='--', label='No Difference')
    plt.axvline(mean_diff, color='green', linestyle='-', label=f'Mean Difference: {mean_diff:.2f} min')
    plt.axvline(median_diff, color='blue', linestyle='-', label=f'Median Difference: {median_diff:.2f} min')
    
    plt.xlabel('Arrival Delay - Departure Delay (minutes)')
    plt.ylabel('Frequency')
    plt.title(f'Distribution of Delay Differences ({year})')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, f'delay_difference_histogram_{year}.png'))
    plt.close()
    
    # 3. Heatmap of departure vs arrival delay
    plt.figure(figsize=(12, 10))
    
    # Create delay categories for both departure and arrival
    dep_cats = pd.cut(df['DEP_DELAY'], 
                    bins=[-100, -15, 0, 15, 30, 60, 120, 1000],
                    labels=['Early (>15m)', 'Early (0-15m)', 'Delayed (0-15m)', 'Delayed (15-30m)', 
                           'Delayed (30-60m)', 'Delayed (60-120m)', 'Delayed (>120m)'])
    
    arr_cats = pd.cut(df['ARR_DELAY'], 
                    bins=[-100, -15, 0, 15, 30, 60, 120, 1000],
                    labels=['Early (>15m)', 'Early (0-15m)', 'Delayed (0-15m)', 'Delayed (15-30m)', 
                           'Delayed (30-60m)', 'Delayed (60-120m)', 'Delayed (>120m)'])
    
    # Create cross-tabulation
    delay_cross = pd.crosstab(arr_cats, dep_cats, normalize=True) * 100
    
    # Plot heatmap
    ax = sns.heatmap(delay_cross, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=.5)
    
    plt.xlabel('Departure Delay Category')
    plt.ylabel('Arrival Delay Category')
    plt.title(f'Relationship Between Departure and Arrival Delay Categories ({year})')
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, f'delay_category_heatmap_{year}.png'))
    plt.close()
    
    return {
        'delay_correlation': delay_corr,
        'mean_delay_diff': mean_diff,
        'median_delay_diff': median_diff,
        'pct_arr_worse_than_dep': (df['DELAY_DIFF'] > 0).mean() * 100,
        'pct_arr_better_than_dep': (df['DELAY_DIFF'] < 0).mean() * 100
    }


Found 4 May files to process:
  - May2021.csv
  - May2022.csv
  - May2023.csv
  - May2024.csv

Loading weather data...
Found 3550 total weather data files
Processed 100 weather files, loaded 0 matching files
Processed 200 weather files, loaded 0 matching files
Processed 300 weather files, loaded 16 matching files
Processed 400 weather files, loaded 32 matching files
Processed 500 weather files, loaded 32 matching files
Processed 600 weather files, loaded 48 matching files
Processed 700 weather files, loaded 48 matching files
Processed 800 weather files, loaded 48 matching files
Processed 900 weather files, loaded 64 matching files
Processed 1000 weather files, loaded 64 matching files
Processed 1100 weather files, loaded 80 matching files
Processed 1200 weather files, loaded 96 matching files
Processed 1300 weather files, loaded 112 matching files
Processed 1400 weather files, loaded 112 matching files
Processed 1500 weather files, loaded 112 matching files
Processed 1600 weather file

In [11]:

# Function to train model for a specific year
def train_year_model(year, flight_data_file):
    """
    Train an arrival delay model for a specific year's data
    
    Args:
        year: Year to train model for
        flight_data_file: Path to the flight data file
        
    Returns:
        Dictionary with model results or None if error
    """
    print(f"\n{'='*80}")
    print(f"Training Arrival Delay model for year {year}")
    print(f"{'='*80}")
    
    # Create year-specific output directories
    year_output_dir = os.path.join(output_dir, f'year_{year}')
    os.makedirs(year_output_dir, exist_ok=True)
    os.makedirs(os.path.join(year_output_dir, 'metrics'), exist_ok=True)
    os.makedirs(os.path.join(year_output_dir, 'plots'), exist_ok=True)
    
    start_time = time.time()
    
    # Step 1: Load and preprocess the year's flight data
    flight_data = load_and_process_flight_data(flight_data_file)
    if flight_data is None or len(flight_data) == 0:
        print(f"No valid flight data available for {year}. Skipping this year.")
        return None
    
    # Step 2: Match departure weather data (origin airport)
    flight_data = match_weather_data(flight_data)
    
    # Step 3: Match arrival weather data (destination airport)
    flight_data = match_destination_weather_data(flight_data)
    
    # Step 4: Analyze correlation between departure and arrival delays
    corr_stats = analyze_delay_correlation(flight_data, year, os.path.join(year_output_dir, 'plots'))
    
    # Step 5: Add feature enhancements specific to arrival delays
    # Add late-night arrival indicator
    print(f"\nCreating late-night arrival indicator for {year}...")
    flight_data = create_late_night_arrival_indicator(flight_data)
    
    # Prepare arrival delay data
    print(f"\nPreparing arrival delay data for {year}...")
    flight_data = prepare_arrival_delay_data(flight_data)
    
    # Create arrival time block features
    print(f"\nCreating arrival time block features for {year}...")
    flight_data = create_arrival_time_block_features(flight_data)
    
    # Create day features
    print(f"\nCreating day features for {year}...")
    flight_data = create_day_features(flight_data)
    
    # Create flight duration features
    print(f"\nCreating flight duration features for {year}...")
    flight_data = create_flight_duration_features(flight_data)
    
    # Step 6: Feature selection for arrival delay prediction
    print(f"\nSelecting features for arrival delay prediction for {year}...")
    
    # Categorical features for arrival delay
    cat_features = ['DAY_NAME', 'ARR_TIME_BLOCK', 'MKT_AIRLINE', 
                    'ORIGIN_IATA', 'DEST_IATA', 'FLIGHT_DISTANCE_CAT', 'DURATION_DEVIATION',
                    'IS_LATE_NIGHT_ARR', 'IS_WEEKEND', 'IS_MORNING_RUSH_ARR', 'IS_EVENING_RUSH_ARR',
                    "EXTREME_WEATHER", 'DEST_EXTREME_WEATHER']
    
    # Numerical features for arrival delay - include both departure and arrival info
    num_features = [
        # Basic flight info
        'DISTANCE', 'SCH_DURATION_MINS', 
        
        # Weather at origin
        'PRCP', 
        
        # Weather at destination
        'DEST_PRCP',
        
        # Departure delay features (extremely important for arrival delay)
        'DEP_DELAY'
    ]
    
    # Ensure all selected features exist in the dataframe
    cat_features = [f for f in cat_features if f in flight_data.columns]
    num_features = [f for f in num_features if f in flight_data.columns]
    
    print(f"Using categorical features: {cat_features}")
    print(f"Using numerical features: {num_features}")
    
    # Step 7: Prepare data for modeling
    X = flight_data[cat_features + num_features].copy()
    y_class = flight_data['IS_ARR_DELAYED']
    y_reg = flight_data['ARR_DELAY']

    # 首先处理目标变量中的NaN值
    if y_reg.isnull().any():
        print(f"Warning: Found {y_reg.isnull().sum()} NaN values in ARR_DELAY. Removing these records.")
        # 创建有效数据的掩码
        valid_mask = ~y_reg.isnull()
        # 过滤数据集
        X = X[valid_mask]
        y_class = y_class[valid_mask]
        y_reg = y_reg[valid_mask]
        print(f"After removing NaN values, remaining records: {len(X)}")

    # Handle missing values in features
    for col in cat_features:
        if X[col].isnull().sum() > 0:
            # Check if it's a categorical type
            if pd.api.types.is_categorical_dtype(X[col]):
                # Get current categories and add 'unknown'
                current_categories = X[col].cat.categories.tolist()
                if 'unknown' not in current_categories:
                    new_categories = current_categories + ['unknown']
                    X[col] = X[col].cat.set_categories(new_categories)
                X[col] = X[col].fillna('unknown')
            else:
                X[col] = X[col].fillna('unknown')

    for col in num_features:
        if X[col].isnull().sum() > 0:
            print(f"Filling {X[col].isnull().sum()} NaN values in column {col} with median")
            X[col] = X[col].fillna(X[col].median())

    # Final check for any remaining NaN values
    if X.isnull().any().any():
        print("Warning: There are still NaN values in X features after preprocessing")
        # Optional: print columns with NaN values
        for col in X.columns[X.isnull().any()]:
            print(f"  - Column {col} has {X[col].isnull().sum()} NaN values")

    # Split data for classification model
    X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
        X, y_class, test_size=0.25, random_state=42, stratify=y_class
    )

    # Split data for regression model (no stratify for regression)
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
        X, y_reg, test_size=0.25, random_state=42
    )

    print(f"Training set size: {X_train_class.shape}")
    print(f"Test set size: {X_test_class.shape}")

    # Extra verification step to ensure no NaN values exist in the training data
    if np.isnan(y_train_reg.values).any():
        print("Warning: NaN values still present in y_train_reg after splitting!")
        # Replace any remaining NaNs with the median as a last resort
        median_value = np.nanmedian(y_train_reg)
        y_train_reg = y_train_reg.fillna(median_value)
        print(f"Replaced remaining NaN values with median: {median_value}")

    if np.isnan(y_test_reg.values).any():
        print("Warning: NaN values still present in y_test_reg after splitting!")
        # Replace any remaining NaNs with the median as a last resort
        median_value = np.nanmedian(y_test_reg)
        y_test_reg = y_test_reg.fillna(median_value)
        print(f"Replaced remaining NaN values with median: {median_value}")
    
    # Step 8: Define preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)
        ])
    
    # Step 9: Train classification model for arrival delay (Random Forest)
    print(f"\nTraining arrival delay classification model for {year} (Random Forest)...")
    class_model_start_time = time.time()
    
    # 使用随机森林分类器进行到达延迟分类
    class_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=10,
            min_samples_leaf=5,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ))
    ])
    
    # Train classification model
    class_model.fit(X_train_class, y_train_class)
    final_class_model = class_model
    
    class_model_training_time = time.time() - class_model_start_time
    print(f"Arrival delay classification model training took: {class_model_training_time:.2f} seconds")
    
    # Step 10: Train regression model for arrival delay (Random Forest)
    print(f"\nTraining arrival delay regression model for {year} (Random Forest)...")
    reg_model_start_time = time.time()
    
    # 使用随机森林回归器进行到达延迟时长预测
    reg_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(
            n_estimators=200,
            max_depth=20,
            min_samples_split=10,
            min_samples_leaf=5,
            random_state=42,
            n_jobs=-1
        ))
    ])
    
    # Train regression model
    reg_model.fit(X_train_reg, y_train_reg)
    final_reg_model = reg_model
    
    reg_model_training_time = time.time() - reg_model_start_time
    print(f"Arrival delay regression model training took: {reg_model_training_time:.2f} seconds")
    
    # Step 11: Evaluate classification model
    print(f"\nEvaluating arrival delay classification model for {year}...")
    
    # Make predictions
    y_pred_class = final_class_model.predict(X_test_class)
    y_prob_class = final_class_model.predict_proba(X_test_class)[:, 1]
    
    # Calculate metrics
    class_accuracy = (y_pred_class == y_test_class).mean() * 100
    class_roc_auc = roc_auc_score(y_test_class, y_prob_class)
    
    # Create classification report
    class_report = classification_report(y_test_class, y_pred_class, output_dict=True)
    
    # Create confusion matrix
    class_cm = confusion_matrix(y_test_class, y_pred_class)
    
    print(f"Arrival Delay Classification Accuracy: {class_accuracy:.2f}%")
    print(f"Arrival Delay Classification ROC AUC: {class_roc_auc:.4f}")
    print(f"Arrival Delay Classification Precision (Delayed): {class_report['1']['precision']:.4f}")
    print(f"Arrival Delay Classification Recall (Delayed): {class_report['1']['recall']:.4f}")
    print(f"Arrival Delay Classification F1 Score (Delayed): {class_report['1']['f1-score']:.4f}")
    
    # Step 12: Evaluate regression model
    print(f"\nEvaluating arrival delay regression model for {year}...")
    
    # Make predictions
    y_pred_reg = final_reg_model.predict(X_test_reg)
    
    # Calculate metrics
    reg_mse = mean_squared_error(y_test_reg, y_pred_reg)
    reg_rmse = np.sqrt(reg_mse)
    reg_mae = mean_absolute_error(y_test_reg, y_pred_reg)
    reg_r2 = r2_score(y_test_reg, y_pred_reg)
    
    print(f"Arrival Delay Regression Mean Squared Error: {reg_mse:.2f}")
    print(f"Arrival Delay Regression Root Mean Squared Error: {reg_rmse:.2f} minutes")
    print(f"Arrival Delay Regression Mean Absolute Error: {reg_mae:.2f} minutes")
    print(f"Arrival Delay Regression R² Score: {reg_r2:.4f}")
    
    # Step 13: Extract feature importances for both models
    try:
        # Get feature names from the preprocessor
        feature_names = final_class_model.named_steps['preprocessor'].get_feature_names_out()
        
        # Get the random forest classifier
        rf_classifier = final_class_model.named_steps['classifier']
        
        # Plot and save feature importances for classification
        class_importance_df = plot_feature_importance(
            rf_classifier, 
            feature_names,
            year=year,
            top_n=20,
            output_path=os.path.join(year_output_dir, 'plots', f'arrival_delay_class_feature_importance_{year}.png'),
            model_type='classification'
        )
        
        # Save feature importances to CSV
        class_importance_df.to_csv(
            os.path.join(year_output_dir, 'metrics', f"arrival_delay_class_feature_importance_{year}.csv"), 
            index=False
        )
        
        # Print top features
        print(f"\nTop 10 most important features for arrival delay classification in {year}:")
        print(class_importance_df.head(10))
        
        # Analyze day of week (DAY_NAME) feature importance
        day_features = [f for f in feature_names if 'DAY_NAME' in f]
        if day_features:
            day_importance = class_importance_df[class_importance_df['Feature'].isin(day_features)]
            print(f"\nDay of week feature importance for arrival delay classification in {year}:")
            print(day_importance)
        
    except Exception as e:
        print(f"Error extracting classification feature importances: {e}")
        class_importance_df = pd.DataFrame()
    
    # Extract feature importances for regression model
    try:
        # Get the random forest regressor
        rf_regressor = final_reg_model.named_steps['regressor']
        
        # Plot and save feature importances for regression
        reg_importance_df = plot_feature_importance(
            rf_regressor, 
            feature_names,
            year=year,
            top_n=20,
            output_path=os.path.join(year_output_dir, 'plots', f'arrival_delay_reg_feature_importance_{year}.png'),
            model_type='regression'
        )
        
        # Save feature importances to CSV
        reg_importance_df.to_csv(
            os.path.join(year_output_dir, 'metrics', f"arrival_delay_reg_feature_importance_{year}.csv"), 
            index=False
        )
        
        # Print top features
        print(f"\nTop 10 most important features for arrival delay regression in {year}:")
        print(reg_importance_df.head(10))
        
        # Analyze day of week feature importance
        if day_features:
            day_importance_reg = reg_importance_df[reg_importance_df['Feature'].isin(day_features)]
            print(f"\nDay of week feature importance for arrival delay regression in {year}:")
            print(day_importance_reg)
        
    except Exception as e:
        print(f"Error extracting regression feature importances: {e}")
        reg_importance_df = pd.DataFrame()
    
    # Step 14: Create visualization plots
    
    # Plot confusion matrix for classification model
    plt.figure(figsize=(8, 6))
    sns.heatmap(class_cm, annot=True, fmt='d', cmap='Blues', 
               xticklabels=['Not Delayed', 'Delayed'],
               yticklabels=['Not Delayed', 'Delayed'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Arrival Delay Classification Confusion Matrix ({year})')
    plt.tight_layout()
    plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_confusion_matrix_{year}.png'))
    plt.close()
    
    # Plot ROC curve for classification model
    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_test_class, y_prob_class)
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {class_roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for Arrival Delay Classification ({year})')
    plt.legend()
    plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_roc_curve_{year}.png'))
    plt.close()
    
    # Plot actual vs predicted delays for regression model
    plt.figure(figsize=(10, 6))
    
    # Create a scatterplot with limited points for clarity
    max_points = 5000
    if len(y_test_reg) > max_points:
        idx = np.random.choice(len(y_test_reg), max_points, replace=False)
        sample_actual = y_test_reg.iloc[idx]
        sample_pred = y_pred_reg[idx]
    else:
        sample_actual = y_test_reg
        sample_pred = y_pred_reg
    
    plt.scatter(sample_actual, sample_pred, alpha=0.3)
    
    # Add perfect prediction line
    max_val = max(sample_actual.max(), sample_pred.max())
    min_val = min(sample_actual.min(), sample_pred.min())
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')
    
    plt.xlabel('Actual Arrival Delay (minutes)')
    plt.ylabel('Predicted Arrival Delay (minutes)')
    plt.title(f'Actual vs Predicted Arrival Delay ({year})')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_actual_vs_predicted_{year}.png'))
    plt.close()
    
    # Plot delay by arrival time block if available
    if 'ARR_TIME_BLOCK' in flight_data.columns:
        plt.figure(figsize=(14, 8))
        time_delay = flight_data.groupby('ARR_TIME_BLOCK')['ARR_DELAY'].agg(['mean', 'count']).reset_index()
        time_delay = time_delay.sort_values('mean', ascending=False)
        
        # Plot bar chart with both mean delay and flight count
        ax1 = plt.subplot(111)
        bars = sns.barplot(x='ARR_TIME_BLOCK', y='mean', data=time_delay, ax=ax1)
        
        # Annotate with mean delay values
        for bar, mean in zip(bars.patches, time_delay['mean']):
            ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
                    f'{mean:.1f}', ha='center', va='bottom')
        
        # Create second y-axis for count
        ax2 = ax1.twinx()
        ax2.plot(time_delay.index, time_delay['count'], 'ro-', linewidth=2)
        
        # Set labels and title
        ax1.set_xlabel('Arrival Time Block')
        ax1.set_ylabel('Mean Arrival Delay (minutes)')
        ax2.set_ylabel('Number of Flights', color='r')
        plt.title(f'Mean Arrival Delay by Time of Day ({year})')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_by_time_{year}.png'))
        plt.close()
    
    # Plot delay by day of week from DAY_NAME column
    if 'DAY_NAME' in flight_data.columns:
        plt.figure(figsize=(12, 6))
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        
        week_delay = flight_data.groupby('DAY_NAME')['ARR_DELAY'].mean().reset_index()
        
        # Ensure correct day order if all days are present
        if all(day in week_delay['DAY_NAME'].values for day in day_order):
            week_delay['DAY_NAME'] = pd.Categorical(week_delay['DAY_NAME'], categories=day_order, ordered=True)
            week_delay = week_delay.sort_values('DAY_NAME')
        
        bars = sns.barplot(x='DAY_NAME', y='ARR_DELAY', data=week_delay)
        
        # Annotate with mean delay values
        for bar, mean in zip(bars.patches, week_delay['ARR_DELAY']):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
                    f'{mean:.1f}', ha='center', va='bottom')
        
        plt.xlabel('Day of Week')
        plt.ylabel('Mean Arrival Delay (minutes)')
        plt.title(f'Mean Arrival Delay by Day of Week ({year})')
        plt.tight_layout()
        plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_by_week_{year}.png'))
        plt.close()
    
    # Create duration deviation analysis if available
    if 'DURATION_DEVIATION' in flight_data.columns:
        plt.figure(figsize=(12, 6))
        
        # Calculate mean arrival delay by duration deviation category
        duration_delay = flight_data.groupby('DURATION_DEVIATION')['ARR_DELAY'].mean().reset_index()
        
        # Sort categories in a logical order
        order = ['Much Faster', 'Faster', 'On Schedule', 'Slower', 'Much Slower', 'Extremely Slower']
        order = [o for o in order if o in duration_delay['DURATION_DEVIATION'].values]
        
        duration_delay['DURATION_DEVIATION'] = pd.Categorical(
            duration_delay['DURATION_DEVIATION'], 
            categories=order, 
            ordered=True
        )
        duration_delay = duration_delay.sort_values('DURATION_DEVIATION')
        
        bars = sns.barplot(x='DURATION_DEVIATION', y='ARR_DELAY', data=duration_delay)
        
        # Annotate with mean delay values
        for bar, mean in zip(bars.patches, duration_delay['ARR_DELAY']):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
                    f'{mean:.1f}', ha='center', va='bottom')
        
        plt.xlabel('Flight Duration Deviation')
        plt.ylabel('Mean Arrival Delay (minutes)')
        plt.title(f'Impact of Flight Duration on Arrival Delay ({year})')
        plt.tight_layout()
        plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_by_duration_deviation_{year}.png'))
        plt.close()
    
    # Create heatmap of arrival delays by day and hour if available
    if 'DAY_NAME' in flight_data.columns and 'ARR_HOUR' in flight_data.columns:
        plt.figure(figsize=(14, 8))
        
        # Create pivot table
        day_hour_delay = flight_data.pivot_table(
            values='ARR_DELAY',
            index='DAY_NAME',
            columns='ARR_HOUR',
            aggfunc='mean'
        )
        
        # Reorder days to start with Monday
        day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        day_hour_delay = day_hour_delay.reindex(day_order)
        
        # Create heatmap
        sns.heatmap(day_hour_delay, cmap='YlOrRd', annot=True, fmt='.1f')
        
        plt.title(f'Mean Arrival Delay by Day and Hour ({year})')
        plt.xlabel('Hour of Day')
        plt.ylabel('Day of Week')
        plt.tight_layout()
        plt.savefig(os.path.join(year_output_dir, 'plots', f'arrival_delay_day_hour_heatmap_{year}.png'))
        plt.close()
    
    # Step 15: Save models
    dump(final_class_model, os.path.join(year_output_dir, f"arrival_delay_class_model_{year}.joblib"))
    dump(final_reg_model, os.path.join(year_output_dir, f"arrival_delay_reg_model_{year}.joblib"))
    print(f"Models saved to {year_output_dir}")
    
    # Step 16: Create summary metrics
    metrics = {
        'model_name': f'arrival_delay_rf_{year}',
        'year': year,
        
        # Dataset metrics
        'total_flights': len(flight_data),
        'arr_delayed_flights_rate': flight_data['IS_ARR_DELAYED'].mean() * 100,
        'mean_arr_delay': flight_data['ARR_DELAY'].mean(),
        'median_arr_delay': flight_data['ARR_DELAY'].median(),
        'max_arr_delay': flight_data['ARR_DELAY'].max(),
        'min_arr_delay': flight_data['ARR_DELAY'].min(),
        
        # Correlation metrics
        'dep_arr_delay_correlation': corr_stats.get('delay_correlation', None),
        'mean_delay_difference': corr_stats.get('mean_delay_diff', None),
        'pct_arr_worse_than_dep': corr_stats.get('pct_arr_worse_than_dep', None),
        
        # Classification metrics
        'class_accuracy': class_accuracy,
        'class_roc_auc': class_roc_auc,
        'class_precision': class_report['1']['precision'],
        'class_recall': class_report['1']['recall'],
        'class_f1': class_report['1']['f1-score'],
        'class_training_time': class_model_training_time,
        
        # Regression metrics
        'reg_mse': reg_mse,
        'reg_rmse': reg_rmse,
        'reg_mae': reg_mae,
        'reg_r2': reg_r2,
        'reg_training_time': reg_model_training_time,
        
        # Late night arrival metrics if available
        'late_night_arr_count': flight_data['IS_LATE_NIGHT_ARR'].sum() if 'IS_LATE_NIGHT_ARR' in flight_data.columns else None,
        'late_night_arr_pct': flight_data['IS_LATE_NIGHT_ARR'].mean() * 100 if 'IS_LATE_NIGHT_ARR' in flight_data.columns else None,
        
        'status': 'success',
        'total_processing_time': time.time() - start_time
    }
    
    # Save top important features for both models
    if not class_importance_df.empty:
        # Save top classification features
        for i in range(min(10, len(class_importance_df))):
            feat = class_importance_df.iloc[i]
            metrics[f'class_top_feature_{i+1}'] = feat['Feature']
            metrics[f'class_top_feature_{i+1}_importance'] = float(feat['Importance'])
    
    if not reg_importance_df.empty:
        # Save top regression features
        for i in range(min(10, len(reg_importance_df))):
            feat = reg_importance_df.iloc[i]
            metrics[f'reg_top_feature_{i+1}'] = feat['Feature']
            metrics[f'reg_top_feature_{i+1}_importance'] = float(feat['Importance'])
    
    # Save day of week feature importance
    if 'feature_names' in locals() and 'day_features' in locals():
        if day_features:
            metrics['day_features'] = day_features
            
            if 'day_importance' in locals() and not day_importance.empty:
                metrics['day_importance_class'] = convert_to_serializable(day_importance.to_dict('records'))
            
            if 'day_importance_reg' in locals() and not day_importance_reg.empty:
                metrics['day_importance_reg'] = convert_to_serializable(day_importance_reg.to_dict('records'))
    
    # Save metrics to JSON - using our serialization helper
    import json
    
    # Convert to serializable format
    serializable_metrics = convert_to_serializable(metrics)
    with open(os.path.join(year_output_dir, 'metrics', f'arrival_delay_metrics_{year}.json'), 'w') as f:
        json.dump(serializable_metrics, f, indent=4)
    
    print(f"\nArrival delay model training for {year} complete! Total processing time: {metrics['total_processing_time']:.2f} seconds")
    return metrics

# Function to compare models across years
def compare_year_models(all_results):
    """
    Compare arrival delay models across different years
    
    Args:
        all_results: Dictionary with results for each year
        
    Returns:
        None (saves comparison plots)
    """
    print("\nComparing arrival delay models across years...")
    
    if not all_results or len(all_results) < 2:
        print("Not enough year models to compare.")
        return
    
    # Create a comparison directory
    comparison_dir = os.path.join(output_dir, 'comparison')
    os.makedirs(comparison_dir, exist_ok=True)
    
    # Extract years and sort them
    years = sorted([r['year'] for r in all_results])
    
    # Create DataFrames for different metrics
    class_metrics = pd.DataFrame({
        'Year': years,
        'Accuracy (%)': [r['class_accuracy'] for r in all_results],
        'AUC': [r['class_roc_auc'] for r in all_results],
        'Precision': [r['class_precision'] for r in all_results],
        'Recall': [r['class_recall'] for r in all_results],
        'F1 Score': [r['class_f1'] for r in all_results],
    })
    
    reg_metrics = pd.DataFrame({
        'Year': years,
        'RMSE (min)': [r['reg_rmse'] for r in all_results],
        'MAE (min)': [r['reg_mae'] for r in all_results],
        'R² Score': [r['reg_r2'] for r in all_results],
    })
    
    delay_stats = pd.DataFrame({
        'Year': years,
        'Mean Arrival Delay (min)': [r['mean_arr_delay'] for r in all_results],
        'Arrival Delay Rate (%)': [r['arr_delayed_flights_rate'] for r in all_results],
        'Total Flights': [r['total_flights'] for r in all_results],
    })
    
    # Add correlation data if available
    corr_data = []
    for r in all_results:
        if 'dep_arr_delay_correlation' in r and r['dep_arr_delay_correlation'] is not None:
            corr_data.append({
                'Year': r['year'],
                'Correlation': r['dep_arr_delay_correlation'],
                'Mean Diff (min)': r['mean_delay_difference'],
                'Arrival Worse (%)': r['pct_arr_worse_than_dep']
            })
    
    if corr_data:
        corr_df = pd.DataFrame(corr_data)
    
    # 1. Plot classification metrics
    plt.figure(figsize=(12, 8))
    
    # Set up bar positions
    bar_width = 0.15
    r1 = np.arange(len(years))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]
    r4 = [x + bar_width for x in r3]
    r5 = [x + bar_width for x in r4]
    
    # Create bars
    plt.bar(r1, class_metrics['Accuracy (%)'] / 100, width=bar_width, label='Accuracy', color='blue')
    plt.bar(r2, class_metrics['AUC'], width=bar_width, label='AUC', color='green')
    plt.bar(r3, class_metrics['Precision'], width=bar_width, label='Precision', color='red')
    plt.bar(r4, class_metrics['Recall'], width=bar_width, label='Recall', color='purple')
    plt.bar(r5, class_metrics['F1 Score'], width=bar_width, label='F1 Score', color='orange')
    
    # Add texts on bars
    for i, r in enumerate([r1, r2, r3, r4, r5]):
        values = class_metrics.iloc[:, i+1].values
        if i == 0:  # Accuracy needs to be multiplied by 100
            values = values / 100
        for j, v in enumerate(values):
            plt.text(r[j], v + 0.01, f'{v:.2f}' if i > 0 else f'{v*100:.1f}%', 
                    ha='center', va='bottom', rotation=0, fontsize=8)
    
    # Add labels and title
    plt.xlabel('Year')
    plt.ylabel('Score')
    plt.title('Arrival Delay Classification Metrics by Year')
    plt.xticks([r + 2*bar_width for r in range(len(years))], years)
    plt.legend()
    plt.ylim(0, 1.0)  # Set y-axis limits for better visualization
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'arrival_delay_class_metrics_by_year.png'))
    plt.close()
    
    # 2. Plot regression metrics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot RMSE and MAE
    x = np.arange(len(years))
    width = 0.35
    
    ax1.bar(x - width/2, reg_metrics['RMSE (min)'], width, label='RMSE')
    ax1.bar(x + width/2, reg_metrics['MAE (min)'], width, label='MAE')
    
    # Add text labels
    for i, v in enumerate(reg_metrics['RMSE (min)']):
        ax1.text(i - width/2, v + 0.5, f'{v:.1f}', ha='center', va='bottom')
    for i, v in enumerate(reg_metrics['MAE (min)']):
        ax1.text(i + width/2, v + 0.5, f'{v:.1f}', ha='center', va='bottom')
    
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Minutes')
    ax1.set_title('Arrival Delay Regression Error Metrics')
    ax1.set_xticks(x)
    ax1.set_xticklabels(years)
    ax1.legend()
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot R² Score
    bars = ax2.bar(years, reg_metrics['R² Score'], color='green')
    
    # Add text labels
    for bar, value in zip(bars, reg_metrics['R² Score']):
        ax2.text(bar.get_x() + bar.get_width()/2, value + 0.01, f'{value:.3f}', 
                ha='center', va='bottom')
    
    ax2.set_xlabel('Year')
    ax2.set_ylabel('R² Score')
    ax2.set_title('Arrival Delay Regression R² Score')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'arrival_delay_reg_metrics_by_year.png'))
    plt.close()
    
    # 3. Plot delay statistics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot mean delay
    bars1 = ax1.bar(years, delay_stats['Mean Arrival Delay (min)'], color='blue')
    
    # Add text labels
    for bar, value in zip(bars1, delay_stats['Mean Arrival Delay (min)']):
        ax1.text(bar.get_x() + bar.get_width()/2, value + 0.3, f'{value:.1f}', 
                ha='center', va='bottom')
    
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Minutes')
    ax1.set_title('Mean Arrival Delay by Year')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Plot delay rate
    bars2 = ax2.bar(years, delay_stats['Arrival Delay Rate (%)'], color='red')
    
    # Add text labels
    for bar, value in zip(bars2, delay_stats['Arrival Delay Rate (%)']):
        ax2.text(bar.get_x() + bar.get_width()/2, value + 0.5, f'{value:.1f}%', 
                ha='center', va='bottom')
    
    ax2.set_xlabel('Year')
    ax2.set_ylabel('Percentage')
    ax2.set_title('Arrival Delay Rate by Year')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(comparison_dir, 'arrival_delay_stats_by_year.png'))
    plt.close()
    
    # 4. Plot correlation statistics if available
    if corr_data:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        
        # Plot correlation coefficient
        bars1 = ax1.bar(corr_df['Year'].astype(str), corr_df['Correlation'], color='purple')
        
        # Add text labels
        for bar, value in zip(bars1, corr_df['Correlation']):
            ax1.text(bar.get_x() + bar.get_width()/2, value + 0.01, f'{value:.3f}', 
                    ha='center', va='bottom')
        
        ax1.set_xlabel('Year')
        ax1.set_ylabel('Correlation Coefficient')
        ax1.set_title('Departure-Arrival Delay Correlation by Year')
        ax1.grid(axis='y', linestyle='--', alpha=0.7)
        
        # Plot percentage of flights where arrival delay is worse than departure
        bars2 = ax2.bar(corr_df['Year'].astype(str), corr_df['Arrival Worse (%)'], color='orange')
        
        # Add text labels
        for bar, value in zip(bars2, corr_df['Arrival Worse (%)']):
            ax2.text(bar.get_x() + bar.get_width()/2, value + 0.5, f'{value:.1f}%', 
                    ha='center', va='bottom')
        
        ax2.set_xlabel('Year')
        ax2.set_ylabel('Percentage')
        ax2.set_title('Flights Where Arrival Delay > Departure Delay')
        ax2.grid(axis='y', linestyle='--', alpha=0.7)
        
        # Save figure
        plt.tight_layout()
        plt.savefig(os.path.join(comparison_dir, 'arrival_departure_correlation_by_year.png'))
        plt.close()
    
    # 5. Create a summary table for all metrics
    summary_data = pd.concat([
        delay_stats.set_index('Year'),
        class_metrics.set_index('Year').iloc[:, 1:],  # Skip the Year column
        reg_metrics.set_index('Year').iloc[:, 1:]     # Skip the Year column
    ], axis=1)
    
    # Add correlation data if available
    if corr_data:
        corr_summary = corr_df.set_index('Year').iloc[:, :3]  # Get correlation columns
        summary_data = pd.concat([summary_data, corr_summary], axis=1)
    
    # Save the summary to CSV
    summary_data.to_csv(os.path.join(comparison_dir, 'arrival_delay_model_comparison.csv'))
    print(f"Comparison summary saved to {os.path.join(comparison_dir, 'arrival_delay_model_comparison.csv')}")
    
    # 6. Create feature importance visualization across years
    try:
        # Collect top features for classification and regression
        class_features_by_year = {}
        reg_features_by_year = {}
        
        for result in all_results:
            year = result['year']
            
            # Get classification features
            class_features = []
            for i in range(1, 6):  # Get top 5 features
                feat_key = f'class_top_feature_{i}'
                imp_key = f'class_top_feature_{i}_importance'
                
                if feat_key in result and imp_key in result:
                    class_features.append({
                        'feature': result[feat_key],
                        'importance': result[imp_key]
                    })
            
            if class_features:
                class_features_by_year[year] = class_features
            
            # Get regression features
            reg_features = []
            for i in range(1, 6):  # Get top 5 features
                feat_key = f'reg_top_feature_{i}'
                imp_key = f'reg_top_feature_{i}_importance'
                
                if feat_key in result and imp_key in result:
                    reg_features.append({
                        'feature': result[feat_key],
                        'importance': result[imp_key]
                    })
            
            if reg_features:
                reg_features_by_year[year] = reg_features
        
        # Create visualization function
        def plot_top_features_by_year(features_by_year, model_type, output_path):
            if not features_by_year:
                return
            
            fig, axes = plt.subplots(len(features_by_year), 1, figsize=(12, 4*len(features_by_year)))
            
            # If there's only one year, we need to convert axes to a list
            if len(features_by_year) == 1:
                axes = [axes]
            
            for i, (year, features) in enumerate(sorted(features_by_year.items())):
                # Extract feature names and importance values
                feat_names = [f['feature'] for f in features]
                importances = [f['importance'] for f in features]
                
                # Sort by importance
                sorted_indices = np.argsort(importances)[::-1]  # Reverse to get descending order
                feat_names = [feat_names[j] for j in sorted_indices]
                importances = [importances[j] for j in sorted_indices]
                
                # Create horizontal bar chart
                bars = axes[i].barh(range(len(feat_names)), importances, align='center')
                
                # Add value labels
                for bar, value in zip(bars, importances):
                    axes[i].text(value + 0.01, bar.get_y() + bar.get_height()/2, 
                              f'{value:.3f}', va='center')
                
                # Set y-ticks and labels
                axes[i].set_yticks(range(len(feat_names)))
                axes[i].set_yticklabels(feat_names)
                
                # Set title
                axes[i].set_title(f'Year {year}')
                
                # Set x-label only for the bottom subplot
                if i == len(features_by_year) - 1:
                    axes[i].set_xlabel('Feature Importance')
            
            plt.suptitle(f'Top 5 Most Important Features for Arrival Delay {model_type.title()} by Year', 
                        fontsize=16, y=1.02)
            plt.tight_layout()
            plt.savefig(output_path, bbox_inches='tight')
            plt.close()
        
        # Plot top features for classification and regression
        plot_top_features_by_year(
            class_features_by_year, 
            'classification',
            os.path.join(comparison_dir, 'arrival_delay_class_top_features_by_year.png')
        )
        
        plot_top_features_by_year(
            reg_features_by_year, 
            'regression',
            os.path.join(comparison_dir, 'arrival_delay_reg_top_features_by_year.png')
        )
        
        # Save feature info to JSON
        features_data = {
            'classification': class_features_by_year,
            'regression': reg_features_by_year
        }
        
        with open(os.path.join(comparison_dir, 'arrival_delay_feature_importance_by_year.json'), 'w') as f:
            json.dump(convert_to_serializable(features_data), f, indent=4)
        
    except Exception as e:
        print(f"Error creating feature importance visualization: {e}")
    
    print("Arrival delay model comparison completed!")

In [12]:

# Main execution
all_results = []

# Process each year's file separately
for file_path in flight_files:
    year = extract_year_from_filename(file_path)
    results = train_year_model(year, file_path)
    
    if results:
        all_results.append(results)
        print(f"\nArrival delay model for year {year} completed successfully!")
    else:
        print(f"\nArrival delay model for year {year} failed.")

# After all individual models are trained, compare them
if len(all_results) > 1:
    compare_year_models(all_results)
else:
    print("\nNot enough successful models to perform comparison.")

# Print final summary
print("\nYear-by-Year Arrival Delay Model Training Summary:")
for year_result in all_results:
    year = year_result['year']
    print(f"\nYear {year}:")
    print(f"  Total flights: {year_result['total_flights']:,}")
    print(f"  Arrival delay classification accuracy: {year_result['class_accuracy']:.2f}%")
    print(f"  Arrival delay classification AUC: {year_result['class_roc_auc']:.4f}")
    print(f"  Arrival delay regression RMSE: {year_result['reg_rmse']:.2f} minutes")
    print(f"  Arrival delay regression R²: {year_result['reg_r2']:.4f}")
    print(f"  Mean arrival delay: {year_result['mean_arr_delay']:.2f} minutes")
    print(f"  Arrival delay rate: {year_result['arr_delayed_flights_rate']:.2f}%")
    
    if 'dep_arr_delay_correlation' in year_result and year_result['dep_arr_delay_correlation'] is not None:
        print(f"  Departure-arrival delay correlation: {year_result['dep_arr_delay_correlation']:.4f}")
    
    # Print top 3 features for arrival delay classification
    print(f"  Top 3 features for arrival delay classification:")
    for i in range(1, 4):
        feature_key = f'class_top_feature_{i}'
        importance_key = f'class_top_feature_{i}_importance'
        if feature_key in year_result and importance_key in year_result:
            print(f"    {i}. {year_result[feature_key]}: {year_result[importance_key]:.4f}")

print("\nTraining complete! Check output directories for detailed results.")


Training Arrival Delay model for year 2021

Processing May2021.csv...


Years found in data: [2021]
Months found in data: {5: 520059}
Filtered to only May data: 520059 rows
Filtered from 520059 to 171867 rows for top 30 airports
Removed 485.0 cancelled flights, remaining: 171382
Removed 482.0 diverted flights, remaining: 170900
Processing took: 2.80 seconds

Matching origin weather data with flights...
Processed 10000/170900 rows, matched 7634 flights with origin weather data
Processed 20000/170900 rows, matched 15264 flights with origin weather data
Processed 30000/170900 rows, matched 22827 flights with origin weather data
Processed 40000/170900 rows, matched 30384 flights with origin weather data
Processed 50000/170900 rows, matched 37950 flights with origin weather data
Processed 60000/170900 rows, matched 45641 flights with origin weather data
Processed 70000/170900 rows, matched 53248 flights with origin weather data
Processed 80000/170900 rows, matched 60891 flights with origin weather data
Processed 90000/170900 rows, matched 68409 flights with ori