In [15]:
import pandas as pd
import numpy as np
import glob
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

flight_data_path = './cleaned_data/'
weather_data_path = './cleaned_weather_data/'
top_airports_file = './top_100_airports.csv'
output_dir = './cancelled_prob_lr_models/'

os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(output_dir, 'metrics'), exist_ok=True)
os.makedirs(os.path.join(output_dir, 'plots'), exist_ok=True)

print("Starting flight cancellation prediction models with red-eye flight detection (Logistic Regression)...")
print(f"Flight data directory: {flight_data_path}")
print(f"Weather data directory: {weather_data_path}")
print(f"Top airports file: {top_airports_file}")
print(f"Model output directory: {output_dir}")

Starting flight cancellation prediction models with red-eye flight detection (Logistic Regression)...
Flight data directory: ./cleaned_data/
Weather data directory: ./cleaned_weather_data/
Top airports file: ./top_100_airports.csv
Model output directory: ./cancelled_prob_lr_models/


In [16]:
try:
    top_airports = pd.read_csv(top_airports_file, low_memory=False)
    
    top_airports = top_airports.head(30)
    
    top_airport_codes = set(top_airports['ORIGIN_IATA'].str.strip().tolist())
    
    print(f"Loaded top 30 airports: {', '.join(sorted(top_airport_codes))}")
    print(f"Busiest airport: {top_airports.iloc[0]['ORIGIN_IATA']} with {top_airports.iloc[0]['Times']} flights")
    print(f"30th busiest airport: {top_airports.iloc[29]['ORIGIN_IATA']} with {top_airports.iloc[29]['Times']} flights")
except Exception as e:
    print(f"Error loading top airports file: {e}")
    top_airport_codes = None
    print("Will process all airports (top airports file not available)")

Loaded top 30 airports: ATL, AUS, BNA, BOS, BWI, CLT, DCA, DEN, DFW, DTW, EWR, FLL, IAD, IAH, JFK, LAS, LAX, LGA, MCO, MDW, MIA, MSP, ORD, PHL, PHX, SAN, SEA, SFO, SLC, TPA
Busiest airport: ATL with 457121 flights
30th busiest airport: TPA with 97235 flights


In [17]:

def load_weather_data():
    print("\nLoading weather data for May 2021-2024...")
    start_time = time.time()
    
    all_files = glob.glob(os.path.join(weather_data_path, "*.csv"))
    print(f"Found {len(all_files)} total weather data files")
    weather_dict = {}
    count = 0
    matching_count = 0
    
    target_years = ['2021', '2022', '2023', '2024']
    target_month = 'May'
    
    for file in all_files:
        try:
            filename = os.path.basename(file)
            parts = filename.split('.')[0].split('_')
            
            if len(parts) >= 3:
                iata = parts[0]
                year = parts[1]
                month = parts[2]
                
                # Only load weather data for May 2021-2024 and top airports
                if (year in target_years and 
                    month == target_month and
                    (top_airport_codes is None or iata in top_airport_codes)):
                    
                    key = f"{iata}_{year}_{month}"
                    
                    weather_dict[key] = pd.read_csv(file, low_memory=False)
                    matching_count += 1
                
                count += 1
                    
                if count % 100 == 0:
                    print(f"Processed {count} weather files, loaded {matching_count} matching files")
        except Exception as e:
            print(f"Error loading weather file {file}: {e}")
    
    print(f"Loaded {matching_count} weather files for May 2021-2024 out of {count} processed files")
    print(f"Loading weather data took: {time.time() - start_time:.2f} seconds")
    return weather_dict

# Get specific May files from the cleaned_data directory based on the file list you shared
def get_may_files():
    may_files = [
        os.path.join(flight_data_path, "May2021.csv"),
        os.path.join(flight_data_path, "May2022.csv"),
        os.path.join(flight_data_path, "May2023.csv"),
        os.path.join(flight_data_path, "May2024.csv")
    ]
    
    existing_files = []
    for file_path in may_files:
        if os.path.exists(file_path):
            existing_files.append(file_path)
        else:
            print(f"Warning: File {file_path} not found")
    
    return existing_files

In [18]:
# Get the May 2021-2024 flight data files
flight_files = get_may_files()
print(f"\nFound {len(flight_files)} May files to process:")
for f in flight_files:
    print(f"  - {os.path.basename(f)}")

if not flight_files:
    print("No May 2021-2024 files were found. Please check file paths.")
    exit(1)

weather_dict = load_weather_data()

# Function to extract year from filename (for logging purposes only)
def extract_year_from_filename(filename):
    base_name = os.path.basename(filename)
    year_str = base_name.replace('May', '').split('.')[0]
    return int(year_str)


Found 4 May files to process:
  - May2021.csv
  - May2022.csv
  - May2023.csv
  - May2024.csv

Loading weather data for May 2021-2024...
Found 3550 total weather data files
Processed 100 weather files, loaded 0 matching files
Processed 200 weather files, loaded 0 matching files
Processed 300 weather files, loaded 4 matching files
Processed 400 weather files, loaded 8 matching files
Processed 500 weather files, loaded 8 matching files
Processed 600 weather files, loaded 12 matching files
Processed 700 weather files, loaded 12 matching files
Processed 800 weather files, loaded 12 matching files
Processed 900 weather files, loaded 16 matching files
Processed 1000 weather files, loaded 16 matching files
Processed 1100 weather files, loaded 20 matching files
Processed 1200 weather files, loaded 24 matching files
Processed 1300 weather files, loaded 28 matching files
Processed 1400 weather files, loaded 28 matching files
Processed 1500 weather files, loaded 28 matching files
Processed 1600 

In [19]:
def create_redeye_indicator(df):
    df = df.copy()
    
    df['IS_REDEYE'] = 0
    
    time_columns = []
    
    if 'SCH_DEP_TIME' in df.columns:
        time_columns.append('SCH_DEP_TIME')
    
    if 'SCH_ARR_TIME' in df.columns:
        time_columns.append('SCH_ARR_TIME')
    
    for col in time_columns:
        if df[col].dtype != 'float64':
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except:
                print(f"Warning: Could not convert {col} to numeric")
    
    if 'SCH_DEP_TIME' in time_columns:
        redeye_departure = (df['SCH_DEP_TIME'] >= 0) & (df['SCH_DEP_TIME'] < 600)
        df.loc[redeye_departure, 'IS_REDEYE'] = 1
        
        dep_redeye_count = redeye_departure.sum()
        print(f"Identified {dep_redeye_count} red-eye flights based on departure time (0-6 AM)")
    
    if 'SCH_ARR_TIME' in time_columns:
        redeye_arrival = (df['SCH_ARR_TIME'] >= 0) & (df['SCH_ARR_TIME'] < 600)
        df.loc[redeye_arrival, 'IS_REDEYE'] = 1
        
        arr_redeye_count = redeye_arrival.sum()
        print(f"Identified {arr_redeye_count} red-eye flights based on arrival time (0-6 AM)")
    
    redeye_count = df['IS_REDEYE'].sum()
    total_count = len(df)
    print(f"Total identified red-eye flights: {redeye_count} out of {total_count} total flights ({redeye_count/total_count*100:.2f}%)")
    
    if 'SCH_DEP_TIME' in time_columns:
        df['DEP_TIME_OF_DAY'] = pd.cut(
            df['SCH_DEP_TIME'], 
            bins=[0, 600, 1200, 1800, 2400],
            labels=['Early Morning (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)'],
            include_lowest=True
        )
        
        time_dist = df['DEP_TIME_OF_DAY'].value_counts()
        print("\nDistribution of flights by departure time of day:")
        for time_cat, count in time_dist.items():
            print(f"  - {time_cat}: {count} flights ({count/total_count*100:.2f}%)")
    
    return df

In [20]:
def train_model_for_file(file_path, file_index, total_files):
    file_name = os.path.basename(file_path)
    model_name = os.path.splitext(file_name)[0]
    file_year = extract_year_from_filename(file_name)  
    
    print(f"\nProcessing file {file_index+1}/{total_files}: {file_name} (May {file_year})")
    start_time = time.time()
    
    try:
        flight_df = pd.read_csv(file_path, low_memory=False)
        original_size = len(flight_df)
        
        if 'MONTH' in flight_df.columns:
            month_counts = flight_df['MONTH'].value_counts()
            print(f"Months found in data: {dict(month_counts)}")
            
            if 5 in month_counts:
                flight_df = flight_df[flight_df['MONTH'] == 5]
                print(f"Filtered to only May data: {len(flight_df)} rows")
            else:
                print(f"Warning: No May data found in file, but proceeding anyway as this should be May data based on filename")
        
        if top_airport_codes is not None:
            flight_df = flight_df[
                flight_df['ORIGIN_IATA'].str.strip().isin(top_airport_codes) & 
                flight_df['DEST_IATA'].str.strip().isin(top_airport_codes)
            ]
            
            filtered_size = len(flight_df)
            print(f"Filtered from {original_size} to {filtered_size} rows for top 30 airports")
            
            if filtered_size == 0:
                print(f"No data remaining after filtering for top 30 airports. Skipping file.")
                return {
                    'file_name': file_name,
                    'status': 'skipped',
                    'reason': 'empty_after_filtering'
                }
    except Exception as e:
        print(f"Error loading flight data file {file_path}: {e}")
        return {
            'file_name': file_name,
            'status': 'error',
            'reason': str(e)
        }
    
    print("Creating red-eye flight indicator...")
    flight_df = create_redeye_indicator(flight_df)
    
    print(f"Final dataset shape: {flight_df.shape}")
    
    print("Preprocessing data...")
    
    if 'CANCELLED' in flight_df.columns:
        flight_df['IS_CANCELLED'] = flight_df['CANCELLED'].astype(int)
    else:
        print("No CANCELLED column found. Skipping file.")
        return {
            'file_name': file_name,
            'status': 'skipped',
            'reason': 'no_cancelled_column'
        }
    
    if 'WEEK' not in flight_df.columns:
        if 'DAY' in flight_df.columns and 'MONTH' in flight_df.columns and 'YEAR' in flight_df.columns:
            if 'DATE' not in flight_df.columns:
                flight_df['DATE'] = pd.to_datetime(flight_df[['YEAR', 'MONTH', 'DAY']])
            flight_df['WEEK'] = flight_df['DATE'].dt.dayofweek
        else:
            print("Cannot create WEEK column. Required columns missing. Skipping file.")
            return {
                'file_name': file_name,
                'status': 'skipped',
                'reason': 'missing_columns_for_week'
            }
    
    cancelled_count = flight_df['IS_CANCELLED'].sum()
    total_count = len(flight_df)
    
    if cancelled_count == 0:
        print(f"No cancelled flights in this dataset. Skipping file.")
        return {
            'file_name': file_name,
            'status': 'skipped',
            'reason': 'no_cancelled_flights'
        }
    
    cancellation_rate = cancelled_count / total_count * 100
    print(f"Overall cancellation rate: {cancelled_count}/{total_count} ({cancellation_rate:.2f}%)")
    
    redeye_df = flight_df[flight_df['IS_REDEYE'] == 1]
    non_redeye_df = flight_df[flight_df['IS_REDEYE'] == 0]
    
    if len(redeye_df) > 0:
        redeye_cancel_rate = redeye_df['IS_CANCELLED'].mean() * 100
        print(f"Red-eye flights cancellation rate: {redeye_cancel_rate:.2f}%")
    
    if len(non_redeye_df) > 0:
        non_redeye_cancel_rate = non_redeye_df['IS_CANCELLED'].mean() * 100
        print(f"Non-red-eye flights cancellation rate: {non_redeye_cancel_rate:.2f}%")
    
    print("Matching weather data with flights...")
    
    if 'YEAR' in flight_df.columns and 'MONTH' in flight_df.columns and 'DAY' in flight_df.columns:
        flight_df['WEATHER_KEY'] = flight_df['ORIGIN_IATA'] + '_' + flight_df['YEAR'].astype(str) + '_' + flight_df['MONTH'].astype(str).str.zfill(2)
        if 'FLIGHT_DATE' not in flight_df.columns:
            flight_df['FLIGHT_DATE'] = pd.to_datetime(flight_df[['YEAR', 'MONTH', 'DAY']])
    
    flight_df['EXTREME_WEATHER'] = 0  
    flight_df['PRCP'] = 0.0
    
    matched_count = 0
    batch_size = 5000
    
    for start_idx in range(0, len(flight_df), batch_size):
        end_idx = min(start_idx + batch_size, len(flight_df))
        batch = flight_df.iloc[start_idx:end_idx]
        
        for idx, row in batch.iterrows():
            try:
                weather_key = row['WEATHER_KEY']
                flight_date = row['FLIGHT_DATE']
                
                if weather_key in weather_dict:
                    weather_data = weather_dict[weather_key]
                    if not pd.api.types.is_datetime64_any_dtype(weather_data['DATE']):
                        weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
                    
                    matching_weather = weather_data[weather_data['DATE'] == flight_date]
                    
                    if not matching_weather.empty:
                        if 'EXTREME_WEATHER' in matching_weather.columns:
                            flight_df.at[idx, 'EXTREME_WEATHER'] = matching_weather['EXTREME_WEATHER'].iloc[0]
                        if 'PRCP' in matching_weather.columns:
                            flight_df.at[idx, 'PRCP'] = matching_weather['PRCP'].iloc[0]
                        matched_count += 1
            except Exception as e:
                pass
    
    print(f"Matched weather data for {matched_count} flights ({matched_count/len(flight_df)*100:.2f}%)")
    
    print("Selecting features...")
    
    cat_features = ["YEAR", 'WEEK', 'MKT_AIRLINE', 'ORIGIN_IATA', 'DEST_IATA','EXTREME_WEATHER',  'IS_REDEYE', 'IS_WEEKEND', 'IS_MORNING_PEAK', 'IS_EVENING_PEAK']
    num_features = ['DISTANCE','PRCP']
    
    
    cat_features = [f for f in cat_features if f in flight_df.columns]
    num_features = [f for f in num_features if f in flight_df.columns]
    
    if not cat_features or not num_features:
        print("Missing required features. Skipping file.")
        return {
            'file_name': file_name,
            'status': 'skipped',
            'reason': 'missing_required_features'
        }
    
    print(f"Using categorical features: {cat_features}")
    print(f"Using numerical features: {num_features}")
    
    X = flight_df[cat_features + num_features].copy()
    y = flight_df['IS_CANCELLED'].copy()
    
    for col in cat_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna('unknown', inplace=True)
    for col in num_features:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].median(), inplace=True)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2025, stratify=y)
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    
    print("Training logistic regression model...")
    model_start_time = time.time()
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)
        ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=500, class_weight='balanced', C=1.0))
    ])

    model.fit(X_train, y_train)
    final_model = model
    
    y_pred = final_model.predict(X_test)
    y_prob = final_model.predict_proba(X_test)[:, 1]
    
    try:
        feature_names = final_model.named_steps['preprocessor'].get_feature_names_out()
    except:
        feature_names = []
        feature_names.extend(num_features)  
        for i, feature in enumerate(cat_features):
            encoder = final_model.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot']
            categories = encoder.categories_[i]
            feature_names.extend([f"{feature}_{cat}" for cat in categories])
    
    coefficients = final_model.named_steps['classifier'].coef_[0]
    
    model_training_time = time.time() - model_start_time
    print(f"Model training took: {model_training_time:.2f} seconds")
    
    print("Evaluating model...")
    
    if not isinstance(y_pred, np.ndarray) or len(y_pred) == 0:
        print("No predictions available. Skipping evaluation.")
        return {
            'file_name': file_name,
            'status': 'error',
            'reason': 'prediction_failed'
        }
    
    try:
        accuracy = (y_pred == y_test).mean() * 100
        roc_auc = roc_auc_score(y_test, y_prob)
        
        report = classification_report(y_test, y_pred, output_dict=True)
        
        cm = confusion_matrix(y_test, y_pred)
        
        print(f"Accuracy: {accuracy:.2f}%")
        print(f"ROC AUC: {roc_auc:.4f}")
        
        if len(feature_names) == len(coefficients):
            feature_importance = pd.DataFrame({
                'Feature': feature_names,
                'Coefficient': coefficients
            })
            feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
            feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)
            
            print("\nTop 10 most important features:")
            print(feature_importance.head(10))
            
            redeye_importance = feature_importance[feature_importance['Feature'] == 'IS_REDEYE']
            if not redeye_importance.empty:
                coef = redeye_importance.iloc[0]['Coefficient']
                effect = "increases" if coef > 0 else "decreases"
                print(f"\nRed-eye flight effect: Being a red-eye flight {effect} cancellation risk")
                print(f"IS_REDEYE coefficient: {coef:.4f}")
                print(f"IS_REDEYE importance rank: {feature_importance[feature_importance['Feature'] == 'IS_REDEYE'].index[0] + 1} out of {len(feature_importance)}")
            
            feature_importance.to_csv(os.path.join(output_dir, 'metrics', f"{model_name}_feature_importance.csv"), index=False)
            
            # Plot feature importance
            plt.figure(figsize=(16, 10))
            top_features = feature_importance.head(15)  # Top 15 features
            sns.barplot(x='Coefficient', y='Feature', data=top_features)
            plt.axvline(x=0, color='gray', linestyle='--')
            plt.title(f'Top 15 Feature Coefficients for {model_name}')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_feature_importance.png"))
            plt.close()
        else:
            print(f"Feature names ({len(feature_names)}) and coefficients ({len(coefficients)}) length mismatch")
        
        # Plot ROC curve
        plt.figure(figsize=(16, 10))
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve for {model_name} (Logistic Regression)')
        plt.legend()
        plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_roc_curve.png"))
        plt.close()
        
        # Plot confusion matrix
        plt.figure(figsize=(16, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=['Not Cancelled', 'Cancelled'],
                   yticklabels=['Not Cancelled', 'Cancelled'])
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix for {model_name}')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_confusion_matrix.png"))
        plt.close()
        
        # Plot cancellation rate comparison
        if len(redeye_df) > 0 and len(non_redeye_df) > 0:
            plt.figure(figsize=(16, 10))
            categories = ['Non-Red-Eye', 'Red-Eye']
            rates = [non_redeye_cancel_rate, redeye_cancel_rate]
            counts = [len(non_redeye_df), len(redeye_df)]
            
            bars = plt.bar(categories, rates, color=['skyblue', 'navy'])
            
            for i, (bar, rate, count) in enumerate(zip(bars, rates, counts)):
                plt.text(i, rate + 0.5, f"{rate:.2f}%\n({count} flights)", 
                         ha='center', va='bottom')
            
            plt.ylabel('Cancellation Rate (%)')
            plt.title(f'Cancellation Rate Comparison: Red-Eye vs. Non-Red-Eye Flights ({model_name})')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', f"{model_name}_redeye_comparison.png"))
            plt.close()
        
        model_path = os.path.join(output_dir, f"{model_name}_model.joblib")
        dump(final_model, model_path)
        print(f"Model saved to {model_path}")
        
        # Save metrics summary
        metrics = {
            'file_name': file_name,
            'model_name': model_name,
            'file_year': file_year,  
            'accuracy': accuracy,
            'roc_auc': roc_auc,
            'precision': report['1']['precision'],
            'recall': report['1']['recall'],
            'f1_score': report['1']['f1-score'],
            'cancellation_rate': cancellation_rate,
            'training_time': model_training_time,
            'training_size': len(X_train),
            'test_size': len(X_test),
            'status': 'success'
        }
        
        metrics['redeye_count'] = len(redeye_df)
        metrics['redeye_percentage'] = len(redeye_df) / len(flight_df) * 100
        metrics['redeye_cancel_rate'] = redeye_cancel_rate if len(redeye_df) > 0 else None
        metrics['non_redeye_cancel_rate'] = non_redeye_cancel_rate if len(non_redeye_df) > 0 else None
        
        if not redeye_importance.empty:
            metrics['redeye_coefficient'] = coef
            metrics['redeye_rank'] = feature_importance[feature_importance['Feature'] == 'IS_REDEYE'].index[0] + 1
        
        # Save confusion matrix values
        metrics['true_negative'] = cm[0, 0]
        metrics['false_positive'] = cm[0, 1]
        metrics['false_negative'] = cm[1, 0]
        metrics['true_positive'] = cm[1, 1]
        
        # Save top 5 most important features
        if len(feature_names) == len(coefficients):
            for i in range(min(5, len(feature_importance))):
                feat = feature_importance.iloc[i]
                metrics[f'top_feature_{i+1}'] = feat['Feature']
                metrics[f'top_feature_{i+1}_coef'] = feat['Coefficient']
        
        print(f"Processing of {file_name} completed in {time.time() - start_time:.2f} seconds")
        return metrics
    
    except Exception as e:
        print(f"Error in evaluation: {e}")
        return {
            'file_name': file_name,
            'status': 'error',
            'reason': str(e)
        }

In [21]:
# Sequential processing of the May files
results = []
for i, file_path in enumerate(flight_files):
    result = train_model_for_file(file_path, i, len(flight_files))
    results.append(result)

# Summarize results
print("\nSummary of logistic regression model training:")
success_count = sum(1 for r in results if r.get('status') == 'success')
error_count = sum(1 for r in results if r.get('status') == 'error')
skipped_count = sum(1 for r in results if r.get('status') == 'skipped')

print(f"Successfully trained models: {success_count}/{len(results)}")
print(f"Failed models: {error_count}/{len(results)}")
print(f"Skipped files: {skipped_count}/{len(results)}")

# Create a summary DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(output_dir, 'cancelled_prob_lr_summary.csv'), index=False)

# Calculate average metrics for successful models
if success_count > 0:
    successful_results = [r for r in results if r.get('status') == 'success']
    avg_accuracy = sum(r.get('accuracy', 0) for r in successful_results) / success_count
    avg_roc_auc = sum(r.get('roc_auc', 0) for r in successful_results) / success_count
    avg_precision = sum(r.get('precision', 0) for r in successful_results) / success_count
    avg_recall = sum(r.get('recall', 0) for r in successful_results) / success_count
    
    print("\nAverage metrics across all successful models:")
    print(f"Accuracy: {avg_accuracy:.2f}%")
    print(f"ROC AUC: {avg_roc_auc:.4f}")
    print(f"Precision: {avg_precision:.4f}")
    print(f"Recall: {avg_recall:.4f}")
    
    # For successful models, identify most common important features
    feature_counts = {}
    for result in successful_results:
        for i in range(1, 6):  # Top 5 features
            feature_key = f'top_feature_{i}'
            if feature_key in result:
                feature = result[feature_key]
                if feature in feature_counts:
                    feature_counts[feature] += 1
                else:
                    feature_counts[feature] = 1
    
    if feature_counts:
        print("\nMost common important features across all models:")
        sorted_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
        for feature, count in sorted_features[:10]:  # Top 10 most common
            print(f"{feature}: Appears in {count} models ({count/success_count*100:.1f}%)")
    
    if all('redeye_coefficient' in r for r in successful_results):
        coefficients = [r['redeye_coefficient'] for r in successful_results]
        avg_coef = sum(coefficients) / len(coefficients)
        effect = "increases" if avg_coef > 0 else "decreases"
        
        print(f"\nRed-eye flight effect (across all models): Being a red-eye flight generally {effect} cancellation risk")
        print(f"Average IS_REDEYE coefficient: {avg_coef:.4f}")
        
        positive_count = sum(1 for c in coefficients if c > 0)
        negative_count = sum(1 for c in coefficients if c < 0)
        print(f"Models where red-eye increases cancellation risk: {positive_count}/{len(coefficients)}")
        print(f"Models where red-eye decreases cancellation risk: {negative_count}/{len(coefficients)}")
    
    # Plot combined red-eye vs non-red-eye cancellation rate comparison
    if all('redeye_cancel_rate' in r and 'non_redeye_cancel_rate' in r for r in successful_results):
        redeye_rates = [r['redeye_cancel_rate'] for r in successful_results if r['redeye_cancel_rate'] is not None]
        non_redeye_rates = [r['non_redeye_cancel_rate'] for r in successful_results if r['non_redeye_cancel_rate'] is not None]
        
        if redeye_rates and non_redeye_rates:
            avg_redeye_rate = sum(redeye_rates) / len(redeye_rates)
            avg_non_redeye_rate = sum(non_redeye_rates) / len(non_redeye_rates)
            
            redeye_counts = [r.get('redeye_count', 0) for r in successful_results]
            total_redeye = sum(redeye_counts)
            total_non_redeye = sum(r.get('training_size', 0) + r.get('test_size', 0) for r in successful_results) - total_redeye
            
            plt.figure(figsize=(16, 10))
            categories = ['Non-Red-Eye', 'Red-Eye']
            rates = [avg_non_redeye_rate, avg_redeye_rate]
            
            bars = plt.bar(categories, rates, color=['skyblue', 'navy'])
            
            for i, (bar, rate, count) in enumerate(zip(bars, rates, [total_non_redeye, total_redeye])):
                plt.text(i, rate + 0.5, f"{rate:.2f}%\n({count} flights)", 
                         ha='center', va='bottom')
            
            plt.ylabel('Average Cancellation Rate (%)')
            plt.title('Average Cancellation Rate Comparison: Red-Eye vs. Non-Red-Eye Flights')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'plots', 'average_redeye_comparison.png'))
            plt.close()
            print("\nAverage red-eye vs non-red-eye cancellation rate comparison saved to average_redeye_comparison.png")

print("\nLogistic regression model training with red-eye flight detection complete!")
print(f"Full summary saved to {os.path.join(output_dir, 'cancelled_prob_lr_summary.csv')}")


Processing file 1/4: May2021.csv (May 2021)
Months found in data: {5: 520059}
Filtered to only May data: 520059 rows
Filtered from 520059 to 171867 rows for top 30 airports
Creating red-eye flight indicator...
Identified 2861 red-eye flights based on departure time (0-6 AM)
Identified 4657 red-eye flights based on arrival time (0-6 AM)
Total identified red-eye flights: 7202 out of 171867 total flights (4.19%)

Distribution of flights by departure time of day:
  - Morning (6-12): 67434 flights (39.24%)
  - Afternoon (12-18): 61417 flights (35.74%)
  - Evening (18-24): 36112 flights (21.01%)
  - Early Morning (0-6): 6904 flights (4.02%)
Final dataset shape: (171867, 53)
Preprocessing data...
Overall cancellation rate: 485/171867 (0.28%)
Red-eye flights cancellation rate: 0.22%
Non-red-eye flights cancellation rate: 0.28%
Matching weather data with flights...
Matched weather data for 0 flights (0.00%)
Selecting features...
Using categorical features: ['YEAR', 'WEEK', 'MKT_AIRLINE', 'ORIG

  plt.tight_layout()


Model saved to ./cancelled_prob_lr_models/May2021_model.joblib
Processing of May2021.csv completed in 7.84 seconds

Processing file 2/4: May2022.csv (May 2022)
Months found in data: {5: 602950}
Filtered to only May data: 602950 rows
Filtered from 602950 to 210079 rows for top 30 airports
Creating red-eye flight indicator...
Identified 6180 red-eye flights based on departure time (0-6 AM)
Identified 7586 red-eye flights based on arrival time (0-6 AM)
Total identified red-eye flights: 13296 out of 210079 total flights (6.33%)

Distribution of flights by departure time of day:
  - Morning (6-12): 79757 flights (37.97%)
  - Afternoon (12-18): 71342 flights (33.96%)
  - Evening (18-24): 48014 flights (22.86%)
  - Early Morning (0-6): 10966 flights (5.22%)
Final dataset shape: (210079, 52)
Preprocessing data...
Overall cancellation rate: 4659/210079 (2.22%)
Red-eye flights cancellation rate: 1.56%
Non-red-eye flights cancellation rate: 2.26%
Matching weather data with flights...
Matched weat