In [19]:
import pandas as pd
import numpy as np
import glob
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier 
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

# Set data paths
flight_data_path = './cleaned_data/'
weather_data_path = './cleaned_weather_data/'
top_airports_file = './top_100_airports.csv' 
output_dir = './cancelled_prob_rf_models/'  

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(output_dir, 'metrics'), exist_ok=True)
os.makedirs(os.path.join(output_dir, 'plots'), exist_ok=True)

print("Starting flight cancellation prediction models with red-eye flight detection (Random Forest)...")
print(f"Flight data directory: {flight_data_path}")
print(f"Weather data directory: {weather_data_path}")
print(f"Top airports file: {top_airports_file}")
print(f"Model output directory: {output_dir}")

Starting flight cancellation prediction models with red-eye flight detection (Random Forest)...
Flight data directory: ./cleaned_data/
Weather data directory: ./cleaned_weather_data/
Top airports file: ./top_100_airports.csv
Model output directory: ./cancelled_prob_rf_models/


In [20]:

# Load top 30 airports from the top 100 airports file
try:
    # Load the airport data with the exact format provided
    top_airports = pd.read_csv(top_airports_file, low_memory=False)
    
    # The file already has a Rank column, so we can just take the top 30
    top_airports = top_airports.head(30)
    
    # The airport codes are in ORIGIN_IATA column
    top_airport_codes = set(top_airports['ORIGIN_IATA'].str.strip().tolist())
    
    print(f"Loaded top 30 airports: {', '.join(sorted(top_airport_codes))}")
    print(f"Busiest airport: {top_airports.iloc[0]['ORIGIN_IATA']} with {top_airports.iloc[0]['Times']} flights")
    print(f"30th busiest airport: {top_airports.iloc[29]['ORIGIN_IATA']} with {top_airports.iloc[29]['Times']} flights")
except Exception as e:
    print(f"Error loading top airports file: {e}")
    # Fallback: if file doesn't exist, we'll use all airports
    top_airport_codes = None
    print("Will process all airports (top airports file not available)")

Loaded top 30 airports: ATL, AUS, BNA, BOS, BWI, CLT, DCA, DEN, DFW, DTW, EWR, FLL, IAD, IAH, JFK, LAS, LAX, LGA, MCO, MDW, MIA, MSP, ORD, PHL, PHX, SAN, SEA, SFO, SLC, TPA
Busiest airport: ATL with 457121 flights
30th busiest airport: TPA with 97235 flights


In [21]:

# Function to load weather data for top airports only - specifically for May 2021-2024
def load_weather_data():
    print("\nLoading weather data for May 2021-2024...")
    start_time = time.time()
    
    all_files = glob.glob(os.path.join(weather_data_path, "*.csv"))
    print(f"Found {len(all_files)} total weather data files")
    weather_dict = {}
    count = 0
    matching_count = 0
    
    # Define target years and month
    target_years = ['2021', '2022', '2023', '2024']
    target_month = 'May'
    
    for file in all_files:
        try:
            # Extract IATA code from filename
            filename = os.path.basename(file)
            parts = filename.split('.')[0].split('_')
            
            if len(parts) >= 3:
                iata = parts[0]
                year = parts[1]
                month = parts[2]
                
                # Only load weather data for May 2021-2024 and top airports
                if (year in target_years and 
                    month == target_month and
                    (top_airport_codes is None or iata in top_airport_codes)):
                    
                    key = f"{iata}_{year}_{month}"
                    
                    # Add low_memory=False to handle mixed types warning
                    weather_dict[key] = pd.read_csv(file, low_memory=False)
                    matching_count += 1
                
                count += 1
                    
                # Print progress periodically
                if count % 100 == 0:
                    print(f"Processed {count} weather files, loaded {matching_count} matching files")
        except Exception as e:
            print(f"Error loading weather file {file}: {e}")
    
    print(f"Loaded {matching_count} weather files for May 2021-2024 out of {count} processed files")
    print(f"Loading weather data took: {time.time() - start_time:.2f} seconds")
    return weather_dict

In [22]:

# Get specific May files from the cleaned_data directory based on the file list you shared
def get_may_files():
    may_files = [
        os.path.join(flight_data_path, "May2021.csv"),
        os.path.join(flight_data_path, "May2022.csv"),
        os.path.join(flight_data_path, "May2023.csv"),
        os.path.join(flight_data_path, "May2024.csv")
    ]
    
    # Verify each file exists
    existing_files = []
    for file_path in may_files:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            print(f"Warning: File {file_path} not found")
    
    return existing_files

# Get the May 2021-2024 flight data files
flight_files = get_may_files()
print(f"\nFound {len(flight_files)} May files to process:")
for f in flight_files:
    print(f"  - {os.path.basename(f)}")

if not flight_files:
    print("No May 2021-2024 files were found. Please check file paths.")
    exit(1)

# Load all weather data once (shared across all models)
weather_dict = load_weather_data()

# Function to extract year from filename (for logging purposes only)
def extract_year_from_filename(filename):
    base_name = os.path.basename(filename)
    year_str = base_name.replace('May', '').split('.')[0]
    return int(year_str)

# Function to create red-eye flight indicator
def create_redeye_indicator(df):
    """
    Creates a binary indicator for red-eye flights (0-6 AM departure or arrival)
    
    Args:
        df: DataFrame containing flight data with departure and arrival times
        
    Returns:
        DataFrame with IS_REDEYE column added
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Initialize IS_REDEYE to 0 (not a red-eye flight)
    df['IS_REDEYE'] = 0
    
    # Check for departure time columns
    dep_time_cols = [col for col in df.columns if 'DEP_TIME' in col.upper()]
    
    # Check for arrival time columns
    arr_time_cols = [col for col in df.columns if 'ARR_TIME' in col.upper()]
    
    # If we have departure time
    if dep_time_cols:
        dep_time_col = dep_time_cols[0]  # Use the first one found
        
        # Convert to float if it's not already
        if df[dep_time_col].dtype != 'float64':
            try:
                df[dep_time_col] = df[dep_time_col].astype(float)
            except:
                # Handle any errors in conversion
                print(f"Warning: Could not convert {dep_time_col} to float")
        
        # Check for departure time between 0 and 600 (0:00 to 6:00 AM)
        # Times are usually in HHMM format (e.g., 1:30 AM = 130, 5:45 AM = 545)
        redeye_departure = (df[dep_time_col] >= 0) & (df[dep_time_col] < 600)
        df.loc[redeye_departure, 'IS_REDEYE'] = 1
    
    # If we have arrival time
    if arr_time_cols:
        arr_time_col = arr_time_cols[0]  # Use the first one found
        
        # Convert to float if it's not already
        if df[arr_time_col].dtype != 'float64':
            try:
                df[arr_time_col] = df[arr_time_col].astype(float)
            except:
                # Handle any errors in conversion
                print(f"Warning: Could not convert {arr_time_col} to float")
        
        # Check for arrival time between 0 and 600 (0:00 to 6:00 AM)
        redeye_arrival = (df[arr_time_col] >= 0) & (df[arr_time_col] < 600)
        df.loc[redeye_arrival, 'IS_REDEYE'] = 1
    
    # Print statistics about red-eye flights
    redeye_count = df['IS_REDEYE'].sum()
    total_count = len(df)
    print(f"Identified {redeye_count} red-eye flights out of {total_count} total flights ({redeye_count/total_count*100:.2f}%)")
    
    return df


Found 4 May files to process:
  - May2021.csv
  - May2022.csv
  - May2023.csv
  - May2024.csv

Loading weather data for May 2021-2024...
Found 3550 total weather data files
Processed 100 weather files, loaded 0 matching files
Processed 200 weather files, loaded 0 matching files
Processed 300 weather files, loaded 4 matching files
Processed 400 weather files, loaded 8 matching files
Processed 500 weather files, loaded 8 matching files
Processed 600 weather files, loaded 12 matching files
Processed 700 weather files, loaded 12 matching files
Processed 800 weather files, loaded 12 matching files
Processed 900 weather files, loaded 16 matching files
Processed 1000 weather files, loaded 16 matching files
Processed 1100 weather files, loaded 20 matching files
Processed 1200 weather files, loaded 24 matching files
Processed 1300 weather files, loaded 28 matching files
Processed 1400 weather files, loaded 28 matching files
Processed 1500 weather files, loaded 28 matching files
Processed 1600 

In [23]:

# Function to train a Random Forest model for a single CSV file
def train_model_for_file(file_path, file_index, total_files):
    file_name = os.path.basename(file_path)
    model_name = os.path.splitext(file_name)[0]
    file_year = extract_year_from_filename(file_name)  # Just for logging
    
    print(f"\nProcessing file {file_index+1}/{total_files}: {file_name} (May {file_year})")
    start_time = time.time()
    
    # Load flight data from this file
    try:
        flight_df = pd.read_csv(file_path, low_memory=False)
        original_size = len(flight_df)
        
        # Ensure we only have May data (in case the file contains other months)
        if 'MONTH' in flight_df.columns:
            month_counts = flight_df['MONTH'].value_counts()
            print(f"Months found in data: {dict(month_counts)}")
            
            if 5 in month_counts:
                flight_df = flight_df[flight_df['MONTH'] == 5]
                print(f"Filtered to only May data: {len(flight_df)} rows")
            else:
                print(f"Warning: No May data found in file, but proceeding anyway as this should be May data based on filename")
        
        # Filter for top airports if we have the list
        if top_airport_codes is not None:
            flight_df = flight_df[
                flight_df['ORIGIN_IATA'].str.strip().isin(top_airport_codes) & 
                flight_df['DEST_IATA'].str.strip().isin(top_airport_codes)
            ]
            
            filtered_size = len(flight_df)
            print(f"Filtered from {original_size} to {filtered_size} rows for top 30 airports")
            
            # If no data left after filtering, skip this file
            if filtered_size == 0:
                print(f"No data remaining after filtering for top 30 airports. Skipping file.")
                return {
                    'file_name': file_name,
                    'status': 'skipped',
                    'reason': 'empty_after_filtering'
                }
    except Exception as e:
        print(f"Error loading flight data file {file_path}: {e}")
        return {
            'file_name': file_name,
            'status': 'error',
            'reason': str(e)
        }
    
    # Add red-eye flight indicator
    print("Creating red-eye flight indicator...")
    flight_df = create_redeye_indicator(flight_df)
    
    # Basic information
    print(f"Final dataset shape: {flight_df.shape}")
    
    # Data preprocessing
    print("Preprocessing data...")
    
    # Create target variable
    if 'CANCELLED' in flight_df.columns:
        flight_df['IS_CANCELLED'] = flight_df['CANCELLED'].astype(int)
    else:
        print("No CANCELLED column found. Skipping file.")
        return {
            'file_name': file_name,
            'status': 'skipped',
            'reason': 'no_cancelled_column'
        }
    
    # Ensure WEEK column exists
    if 'WEEK' not in flight_df.columns:
        if 'DAY_OF_WEEK' in flight_df.columns:
            flight_df['WEEK'] = flight_df['DAY_OF_WEEK']
        elif 'DAY' in flight_df.columns and 'MONTH' in flight_df.columns and 'YEAR' in flight_df.columns:
            # Create DATE column if not exists
            if 'DATE' not in flight_df.columns:
                flight_df['DATE'] = pd.to_datetime(flight_df[['YEAR', 'MONTH', 'DAY']])
            # Extract day of week (0=Monday, 6=Sunday)
            flight_df['WEEK'] = flight_df['DATE'].dt.dayofweek
        else:
            print("Cannot create WEEK column. Required columns missing. Skipping file.")
            return {
                'file_name': file_name,
                'status': 'skipped',
                'reason': 'missing_columns_for_week'
            }
    
    # Analyze cancellation rates
    cancelled_count = flight_df['IS_CANCELLED'].sum()
    total_count = len(flight_df)
    
    # Skip if there are no cancellations (can't train a model)
    if cancelled_count == 0:
        print(f"No cancelled flights in this dataset. Skipping file.")
        return {
            'file_name': file_name,
            'status': 'skipped',
            'reason': 'no_cancelled_flights'
        }
    
    cancellation_rate = cancelled_count / total_count * 100
    print(f"Overall cancellation rate: {cancelled_count}/{total_count} ({cancellation_rate:.2f}%)")
    
    # Analyze cancellation rates by red-eye status
    redeye_df = flight_df[flight_df['IS_REDEYE'] == 1]
    non_redeye_df = flight_df[flight_df['IS_REDEYE'] == 0]
    
    if len(redeye_df) > 0:
        redeye_cancel_rate = redeye_df['IS_CANCELLED'].mean() * 100
        print(f"Red-eye flights cancellation rate: {redeye_cancel_rate:.2f}%")
    
    if len(non_redeye_df) > 0:
        non_redeye_cancel_rate = non_redeye_df['IS_CANCELLED'].mean() * 100
        print(f"Non-red-eye flights cancellation rate: {non_redeye_cancel_rate:.2f}%")
    
    # Match weather data with flights
    print("Matching weather data with flights...")
    
    # Create necessary keys and date columns
    if 'YEAR' in flight_df.columns and 'MONTH' in flight_df.columns and 'DAY' in flight_df.columns:
        flight_df['WEATHER_KEY'] = flight_df['ORIGIN_IATA'] + '_' + flight_df['YEAR'].astype(str) + '_' + flight_df['MONTH'].astype(str).str.zfill(2)
        if 'FLIGHT_DATE' not in flight_df.columns:
            flight_df['FLIGHT_DATE'] = pd.to_datetime(flight_df[['YEAR', 'MONTH', 'DAY']])
    
    # Create columns for extreme weather and precipitation
    flight_df['EXTREME_WEATHER'] = 0  # Using exact name as specified
    flight_df['PRCP'] = 0.0  # Using exact name as specified
    
    # Process in batches
    matched_count = 0
    batch_size = 5000  # Smaller batch size for individual files
    
    for start_idx in range(0, len(flight_df), batch_size):
        end_idx = min(start_idx + batch_size, len(flight_df))
        batch = flight_df.iloc[start_idx:end_idx]
        
        for idx, row in batch.iterrows():
            try:
                weather_key = row['WEATHER_KEY']
                flight_date = row['FLIGHT_DATE']
                
                if weather_key in weather_dict:
                    weather_data = weather_dict[weather_key]
                    # Convert to datetime if not already
                    if not pd.api.types.is_datetime64_any_dtype(weather_data['DATE']):
                        weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
                    
                    # Find matching weather data for the flight date
                    matching_weather = weather_data[weather_data['DATE'] == flight_date]
                    
                    if not matching_weather.empty:
                        if 'EXTREME_WEATHER' in matching_weather.columns:
                            flight_df.at[idx, 'EXTREME_WEATHER'] = matching_weather['EXTREME_WEATHER'].iloc[0]
                        if 'PRCP' in matching_weather.columns:
                            flight_df.at[idx, 'PRCP'] = matching_weather['PRCP'].iloc[0]
                        matched_count += 1
            except Exception as e:
                # Less verbose error reporting for individual files
                pass
    
    print(f"Matched weather data for {matched_count} flights ({matched_count/len(flight_df)*100:.2f}%)")
    
    # Feature selection - Using only the specified variables, now including IS_REDEYE but not YEAR
    print("Selecting features...")
    
    # Categorical features - Use specified ones but not YEAR
    cat_features = ["YEAR", "WEEK", 'MKT_AIRLINE', 'ORIGIN_IATA', 'DEST_IATA', 'IS_REDEYE', 'IS_WEEKEND', 'IS_MORNING_PEAK', 'IS_EVENING_PEAK','EXTREME_WEATHER']
    # Numerical features - Use specified ones plus IS_REDEYE
    num_features = ['DISTANCE',  'PRCP']
    
    # Ensure all selected features exist in the dataframe
    cat_features = [f for f in cat_features if f in flight_df.columns]
    num_features = [f for f in num_features if f in flight_df.columns]
    
    if not cat_features or not num_features:
        print("Missing required features. Skipping file.")
        return {
            'file_name': file_name,
            'status': 'skipped',
            'reason': 'missing_required_features'
        }
    
    print(f"Using categorical features: {cat_features}")
    print(f"Using numerical features: {num_features}")
    
    # Create X (features) and y (target)
    X = flight_df[cat_features + num_features].copy()
    y = flight_df['IS_CANCELLED'].copy()
    
    # Handle missing values
    for col in cat_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna('unknown', inplace=True)
    for col in num_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].median(), inplace=True)
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    
    # Train model
    print("Training Random Forest model...")
    model_start_time = time.time()
    
    # Define preprocessing steps
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)
        ])

    # Create and train Random Forest model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=100,  # Number of trees in the forest
            max_depth=10,      # Maximum depth of the trees
            min_samples_split=10,  # Minimum samples required to split an internal node
            min_samples_leaf=5,    # Minimum samples required at a leaf node
            class_weight='balanced',  # Handle class imbalance
            random_state=2025,     # For reproducibility
            n_jobs=-1            # Use all available cores
        ))
    ])

    # Train model
    model.fit(X_train, y_train)
    final_model = model
    
    # Predictions
    y_pred = final_model.predict(X_test)
    y_prob = final_model.predict_proba(X_test)[:, 1]
    
    # Feature importance from Random Forest
    try:
        # Get feature names from the preprocessor
        feature_names = final_model.named_steps['preprocessor'].get_feature_names_out()
        
        # Get the trained Random Forest model
        rf_model = final_model.named_steps['classifier']
        
        # Get feature importances from the model
        importances = rf_model.feature_importances_
        
        # Create a DataFrame to store feature importances
        feature_importance = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        })
        
        # Sort by importance
        feature_importance = feature_importance.sort_values('Importance', ascending=False)
        
    except Exception as e:
        print(f"Error extracting feature importances: {e}")
        feature_importance = pd.DataFrame()
    
    model_training_time = time.time() - model_start_time
    print(f"Model training took: {model_training_time:.2f} seconds")
    
    # Evaluate model
    print("Evaluating model...")
    
    # Check if we have predictions for evaluation
    if not isinstance(y_pred, np.ndarray) or len(y_pred) == 0:
        print("No predictions available. Skipping evaluation.")
        return {
            'file_name': file_name,
            'status': 'error',
            'reason': 'prediction_failed'
        }
    
    # Calculate metrics
    try:
        accuracy = (y_pred == y_test).mean() * 100
        roc_auc = roc_auc_score(y_test, y_prob)
        
        # Create classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        
        # Create confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        
        print(f"Accuracy: {accuracy:.2f}%")
        print(f"ROC AUC: {roc_auc:.4f}")
        
        # Feature importance analysis
        if not feature_importance.empty:
            # Print top features
            print("\nTop 10 most important features:")
            print(feature_importance.head(10))
            
            # Look specifically for IS_REDEYE importance
            redeye_importance = feature_importance[feature_importance['Feature'] == 'IS_REDEYE']
            if not redeye_importance.empty:
                importance = redeye_importance.iloc[0]['Importance']
                print(f"\nRed-eye flight importance: {importance:.6f}")
                print(f"IS_REDEYE importance rank: {feature_importance[feature_importance['Feature'] == 'IS_REDEYE'].index[0] + 1} out of {len(feature_importance)}")
            
            # Save feature importance to CSV
            feature_importance.to_csv(os.path.join(output_dir, 'metrics', f"{model_name}_feature_importance.csv"), index=False)
            
            # Plot feature importance
            plt.figure(figsize=(16, 10))
            top_features = feature_importance.head(15)  # Top 15 features
            sns.barplot(x='Importance', y='Feature', data=top_features)
            plt.title(f'Top 15 Feature Importances for {model_name}')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_feature_importance.png"))
            plt.close()
        else:
            print("Could not extract feature importances")
        
        # Plot ROC curve
        plt.figure(figsize=(16, 10))
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve for {model_name} (Random Forest)')
        plt.legend()
        plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_roc_curve.png"))
        plt.close()
        
        # Plot confusion matrix
        plt.figure(figsize=(16, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=['Not Cancelled', 'Cancelled'],
                   yticklabels=['Not Cancelled', 'Cancelled'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix for {model_name}')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_confusion_matrix.png"))
        plt.close()
        
        # Plot cancellation rate comparison (red-eye vs non-red-eye)
        if len(redeye_df) > 0 and len(non_redeye_df) > 0:
            plt.figure(figsize=(16, 10))
            categories = ['Non-Red-Eye', 'Red-Eye']
            rates = [non_redeye_cancel_rate, redeye_cancel_rate]
            counts = [len(non_redeye_df), len(redeye_df)]
            
            bars = plt.bar(categories, rates, color=['skyblue', 'navy'])
            
            # Add value labels on top of bars
            for i, (bar, rate, count) in enumerate(zip(bars, rates, counts)):
                plt.text(i, rate + 0.5, f"{rate:.2f}%\n({count} flights)", 
                         ha='center', va='bottom')
            
            plt.ylabel('Cancellation Rate (%)')
            plt.title(f'Cancellation Rate Comparison: Red-Eye vs. Non-Red-Eye Flights ({model_name})')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_redeye_comparison.png"))
            plt.close()
        
        # Generate partial dependence plot for IS_REDEYE feature
        # This helps understand how the model's prediction changes with the red-eye flight status
        try:
            # Create feature-specific datasets
            redeye_X_test = X_test.copy()
            redeye_X_test['IS_REDEYE'] = 1
            
            non_redeye_X_test = X_test.copy()
            non_redeye_X_test['IS_REDEYE'] = 0
            
            # Get predictions for both scenarios
            redeye_probs = final_model.predict_proba(redeye_X_test)[:, 1]
            non_redeye_probs = final_model.predict_proba(non_redeye_X_test)[:, 1]
            
            # Calculate the average effect
            avg_redeye_effect = np.mean(redeye_probs - non_redeye_probs)
            
            # Plot distribution of effects
            plt.figure(figsize=(16, 10))
            
            # Plot the differences for each sample
            differences = redeye_probs - non_redeye_probs
            sns.histplot(differences, bins=30, kde=True)
            
            plt.axvline(avg_redeye_effect, color='red', linestyle='--', 
                       label=f'Average effect: {avg_redeye_effect:.4f}')
            plt.xlabel('Change in Cancellation Probability (Red-Eye - Non-Red-Eye)')
            plt.ylabel('Frequency')
            plt.title(f'Red-Eye Flight Effect on Cancellation Probability ({model_name})')
            plt.legend()
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_redeye_effect.png"))
            plt.close()
            
            print(f"Average effect of being a red-eye flight on cancellation probability: {avg_redeye_effect:.4f}")
            
        except Exception as e:
            print(f"Error creating red-eye effect plot: {e}")
        
        # Save model
        model_path = os.path.join(output_dir, f"{model_name}_model.joblib")
        dump(final_model, model_path)
        print(f"Model saved to {model_path}")
        
        # Save metrics summary
        metrics = {
            'file_name': file_name,
            'model_name': model_name,
            'file_year': file_year,  # Just for reference, not used as a feature
            'accuracy': accuracy,
            'roc_auc': roc_auc,
            'precision': report['1']['precision'],
            'recall': report['1']['recall'],
            'f1_score': report['1']['f1-score'],
            'cancellation_rate': cancellation_rate,
            'training_time': model_training_time,
            'training_size': len(X_train),
            'test_size': len(X_test),
            'status': 'success'
        }
        
        # Add red-eye specific metrics
        metrics['redeye_count'] = len(redeye_df)
        metrics['redeye_percentage'] = len(redeye_df) / len(flight_df) * 100
        metrics['redeye_cancel_rate'] = redeye_cancel_rate if len(redeye_df) > 0 else None
        metrics['non_redeye_cancel_rate'] = non_redeye_cancel_rate if len(non_redeye_df) > 0 else None
        
        # If IS_REDEYE feature importance available, add it
        if not redeye_importance.empty:
            metrics['redeye_importance'] = importance
            metrics['redeye_rank'] = feature_importance[feature_importance['Feature'] == 'IS_REDEYE'].index[0] + 1
        
        # Add red-eye effect from partial dependence if available
        if 'avg_redeye_effect' in locals():
            metrics['redeye_effect'] = avg_redeye_effect
        
        # Save confusion matrix values
        metrics['true_negative'] = cm[0, 0]
        metrics['false_positive'] = cm[0, 1]
        metrics['false_negative'] = cm[1, 0]
        metrics['true_positive'] = cm[1, 1]
        
        # Save top 5 most important features
        if not feature_importance.empty:
            for i in range(min(5, len(feature_importance))):
                feat = feature_importance.iloc[i]
                metrics[f'top_feature_{i+1}'] = feat['Feature']
                metrics[f'top_feature_{i+1}_importance'] = feat['Importance']
        
        print(f"Processing of {file_name} completed in {time.time() - start_time:.2f} seconds")
        return metrics
    
    except Exception as e:
        print(f"Error in evaluation: {e}")
        return {
            'file_name': file_name,
            'status': 'error',
            'reason': str(e)
        }

In [24]:
# Sequential processing of the May files
results = []
for i, file_path in enumerate(flight_files):
    result = train_model_for_file(file_path, i, len(flight_files))
    results.append(result)

# Summarize results
print("\nSummary of Random Forest model training:")
success_count = sum(1 for r in results if r.get('status') == 'success')
error_count = sum(1 for r in results if r.get('status') == 'error')
skipped_count = sum(1 for r in results if r.get('status') == 'skipped')

print(f"Successfully trained models: {success_count}/{len(results)}")
print(f"Failed models: {error_count}/{len(results)}")
print(f"Skipped files: {skipped_count}/{len(results)}")

# Create a summary DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(output_dir, 'cancelled_prob_rf_summary.csv'), index=False)

# Calculate average metrics for successful models
if success_count > 0:
    successful_results = [r for r in results if r.get('status') == 'success']
    avg_accuracy = sum(r.get('accuracy', 0) for r in successful_results) / success_count
    avg_roc_auc = sum(r.get('roc_auc', 0) for r in successful_results) / success_count
    avg_precision = sum(r.get('precision', 0) for r in successful_results) / success_count
    avg_recall = sum(r.get('recall', 0) for r in successful_results) / success_count
    
    print("\nAverage metrics across all successful models:")
    print(f"Accuracy: {avg_accuracy:.2f}%")
    print(f"ROC AUC: {avg_roc_auc:.4f}")
    print(f"Precision: {avg_precision:.4f}")
    print(f"Recall: {avg_recall:.4f}")
    
    # For successful models, identify most common important features
    feature_counts = {}
    for result in successful_results:
        for i in range(1, 6):  # Top 5 features
            feature_key = f'top_feature_{i}'
            if feature_key in result:
                feature = result[feature_key]
                if feature in feature_counts:
                    feature_counts[feature] += 1
                else:
                    feature_counts[feature] = 1
    
    if feature_counts:
        print("\nMost common important features across all models:")
        sorted_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
        for feature, count in sorted_features[:10]:  # Top 10 most common
            print(f"{feature}: Appears in {count} models ({count/success_count*100:.1f}%)")
    
    # Analyze red-eye flight effects across all models
    if all('redeye_importance' in r for r in successful_results):
        importances = [r['redeye_importance'] for r in successful_results]
        avg_importance = sum(importances) / len(importances)
        
        print(f"\nRed-eye flight importance (across all models):")
        print(f"Average IS_REDEYE importance: {avg_importance:.6f}")
        
        # Check red-eye effect from partial dependence if available
        if all('redeye_effect' in r for r in successful_results):
            effects = [r['redeye_effect'] for r in successful_results]
            avg_effect = sum(effects) / len(effects)
            
            effect_dir = "increases" if avg_effect > 0 else "decreases"
            print(f"Being a red-eye flight generally {effect_dir} cancellation probability by {abs(avg_effect):.4f}")
    
    # Plot combined red-eye vs non-red-eye cancellation rate comparison
    if all('redeye_cancel_rate' in r and 'non_redeye_cancel_rate' in r for r in successful_results):
        redeye_rates = [r['redeye_cancel_rate'] for r in successful_results if r['redeye_cancel_rate'] is not None]
        non_redeye_rates = [r['non_redeye_cancel_rate'] for r in successful_results if r['non_redeye_cancel_rate'] is not None]
        
        if redeye_rates and non_redeye_rates:
            avg_redeye_rate = sum(redeye_rates) / len(redeye_rates)
            avg_non_redeye_rate = sum(non_redeye_rates) / len(non_redeye_rates)
            
            # Total counts across all models
            redeye_counts = [r.get('redeye_count', 0) for r in successful_results]
            total_redeye = sum(redeye_counts)
            total_non_redeye = sum(r.get('training_size', 0) + r.get('test_size', 0) for r in successful_results) - total_redeye
            
            plt.figure(figsize=(16, 10))
            categories = ['Non-Red-Eye', 'Red-Eye']
            rates = [avg_non_redeye_rate, avg_redeye_rate]
            
            bars = plt.bar(categories, rates, color=['skyblue', 'navy'])
            
            # Add value labels
            for i, (bar, rate, count) in enumerate(zip(bars, rates, [total_non_redeye, total_redeye])):
                plt.text(i, rate + 0.5, f"{rate:.2f}%\n({count} flights)", 
                         ha='center', va='bottom')
            
            plt.ylabel('Average Cancellation Rate (%)')
            plt.title('Average Cancellation Rate Comparison: Red-Eye vs. Non-Red-Eye Flights')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', 'average_redeye_comparison.png'))
            plt.close()
            print("\nAverage red-eye vs non-red-eye cancellation rate comparison saved to average_redeye_comparison.png")

print("\nRandom Forest model training with red-eye flight detection complete!")
print(f"Full summary saved to {os.path.join(output_dir, 'cancelled_prob_rf_summary.csv')}")


Processing file 1/4: May2021.csv (May 2021)
Months found in data: {5: 520059}
Filtered to only May data: 520059 rows
Filtered from 520059 to 171867 rows for top 30 airports
Creating red-eye flight indicator...
Identified 7202 red-eye flights out of 171867 total flights (4.19%)
Final dataset shape: (171867, 52)
Preprocessing data...
Overall cancellation rate: 485/171867 (0.28%)
Red-eye flights cancellation rate: 0.22%
Non-red-eye flights cancellation rate: 0.28%
Matching weather data with flights...
Matched weather data for 0 flights (0.00%)
Selecting features...
Using categorical features: ['YEAR', 'WEEK', 'MKT_AIRLINE', 'ORIGIN_IATA', 'DEST_IATA', 'IS_REDEYE', 'EXTREME_WEATHER']
Using numerical features: ['DISTANCE', 'PRCP']
Training set size: (154680, 9)
Test set size: (17187, 9)
Training Random Forest model...
Model training took: 1.93 seconds
Evaluating model...
Accuracy: 92.23%
ROC AUC: 0.8659

Top 10 most important features:
                 Feature  Importance
13   cat__MKT_AIR

  plt.tight_layout()


Average effect of being a red-eye flight on cancellation probability: -0.0107
Model saved to ./cancelled_prob_rf_models/May2021_model.joblib
Processing of May2021.csv completed in 9.39 seconds

Processing file 2/4: May2022.csv (May 2022)
Months found in data: {5: 602950}
Filtered to only May data: 602950 rows
Filtered from 602950 to 210079 rows for top 30 airports
Creating red-eye flight indicator...
Identified 13296 red-eye flights out of 210079 total flights (6.33%)
Final dataset shape: (210079, 51)
Preprocessing data...
Overall cancellation rate: 4659/210079 (2.22%)
Red-eye flights cancellation rate: 1.56%
Non-red-eye flights cancellation rate: 2.26%
Matching weather data with flights...
Matched weather data for 0 flights (0.00%)
Selecting features...
Using categorical features: ['YEAR', 'WEEK', 'MKT_AIRLINE', 'ORIGIN_IATA', 'DEST_IATA', 'IS_REDEYE', 'EXTREME_WEATHER']
Using numerical features: ['DISTANCE', 'PRCP']
Training set size: (189071, 9)
Test set size: (21008, 9)
Training Ra