In [1]:
# Import standard libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import logging

# Suppress logs from cmdstanpy
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

# Suppress logs from prophet
logging.getLogger('prophet').setLevel(logging.WARNING)

# Optionally, suppress other libraries if they are also too verbose
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('seaborn').setLevel(logging.WARNING)



# Import the flight_forecasting module
import flight_forecasting as ff

# For displaying plots within the notebook
%matplotlib inline

# Optional: Set Seaborn style for better aesthetics
sns.set_theme()


## Flight Data Import
List all the CSV filenames corresponding to different origin airports. Ensure that these files are present in the `INPUT_DIR` specified in `flight_forecasting.py`.

In [2]:
# List of CSV filenames corresponding to different origin airports
CSV_FILES = [
    'Origin Airport Baltimore, MD BaltimoreWashington International Thurgood Marshall (BWI).csv',
    'Origin Airport Chicago, IL Chicago Midway International (MDW).csv',
    'Origin Airport Dallas, TX Dallas Love Field (DAL).csv',
    'Origin Airport Denver, CO Denver International (DEN).csv',
    'Origin Airport Las Vegas, NV Harry Reid International (LAS).csv'
]

# Update the flight_forecasting module's CSV_FILES if necessary
ff.CSV_FILES = CSV_FILES

## Processing and Forecasting

- Load each CSV file, clean the data by handling missing values, and filter for the specified month (June by default).
- Aggregate the number of flights per year for the specified month.
- Iterate through each CSV file, perform forecasting using various models, evaluate their performance, and visualize the results.
  - Forecasting Models tested:
    - ARIMA
    - Exponential Smoothing
    - Linear Regression
    - Naive Forecast
    - Prophet

In [3]:
# Initialize a list to store predictions for 2025
prediction_summary = []

# Iterate through each CSV file and perform forecasting
for csv_file in ff.CSV_FILES:
    # Extract airport name from the filename
    airport_name = csv_file.split('.csv')[0].replace('Origin Airport ', '').strip()
    
    print(f"\n{'='*80}\nProcessing Airport: {airport_name}\n{'='*80}")
    
    # Load data
    filepath = os.path.join(ff.INPUT_DIR, csv_file)
    df_raw = ff.load_data(filepath)
    if df_raw is None:
        print(f"Skipping {airport_name} due to loading issues.")
        continue  # Skip to the next file if loading failed
    
    # Clean and prepare data
    df_clean = ff.clean_and_prepare_data(df_raw, month=ff.FORECAST_MONTH)
    
    # Display first few rows of cleaned data
    print("\nFirst few rows of the cleaned DataFrame:")
    display(df_clean.head())
    
    # Aggregate flights per year
    flights_per_year = ff.aggregate_flights(df_clean)
    
    # Ensure complete years from 2000 to 2024
    all_years = pd.DataFrame({'Year': range(2000, 2025)})
    flights_per_year = pd.merge(all_years, flights_per_year, on='Year', how='left')
    flights_per_year['Number_of_Flights'] = flights_per_year['Number_of_Flights'].fillna(0).astype(int)
    
    print("\nComplete Flights Per Year (June only):")
    display(flights_per_year)
    
    # Perform walk-forward validation
    results = ff.walk_forward_validation(flights_per_year)
    
    # Display forecasting results
    print("\nForecasting Results:")
    display(results)
    
    # Calculate performance summary
    results_clean = results.dropna()
    performance_summary = results_clean.groupby('Model').agg({
        'MAE': 'mean',
        'RMSE': 'mean'
    }).reset_index()
    
    print("\nModel Performance Summary:")
    display(performance_summary)
    
    # Identify the best model based on RMSE
    best_model_row = performance_summary.loc[performance_summary['RMSE'].idxmin()]
    best_model = best_model_row['Model']
    best_rmse = best_model_row['RMSE']
    
    print(f"\nBest Model for {airport_name}: {best_model} with RMSE = {best_rmse:.2f}")
    
    # Forecasting for 2025 Using the Best Model
    print(f"\nForecasting 2025 for {airport_name} using {best_model}...\n")
    
    # Prepare data for forecasting 2025
    # Create a DataFrame for training up to 2024
    train_data_2025 = flights_per_year.copy()
    
    # Depending on the model, fit and forecast accordingly
    if best_model == 'Naïve Forecast':
        # For Naïve Forecast, prediction is the last known value
        prediction_2025 = ff.forecast_naive(train_data_2025)
    elif best_model == 'Linear Regression':
        prediction_2025 = ff.forecast_linear_regression(train_data_2025, test_year=2025)
    elif best_model == 'ARIMA(1,1,1)':
        prediction_2025 = ff.forecast_arima(train_data_2025, test_year=2025)
    elif best_model == 'Exponential Smoothing':
        prediction_2025 = ff.forecast_exponential_smoothing(train_data_2025, test_year=2025)
    elif best_model == 'Prophet':
        prediction_2025 = ff.forecast_prophet(train_data_2025, test_year=2025)
    else:
        print(f"Unknown model: {best_model}. Cannot forecast 2025.")
        prediction_2025 = np.nan
    
    # Handle cases where prediction could not be made
    if np.isnan(prediction_2025):
        print(f"Prediction for 2025 could not be made using {best_model}.")
        prediction_2025_display = "Prediction Failed"
    else:
        prediction_2025_display = int(prediction_2025)
    
    # Append the prediction to the summary table
    prediction_summary.append({
        'Airport': airport_name,
        '2025 Predicted Flights': prediction_2025_display,
        'Model Used': best_model
    })
    
    print(f"2025 Predicted Flights for {airport_name}: {prediction_2025_display}")
    
    # Save forecasting results
    ff.save_forecasts(results, ff.OUTPUT_DIR, airport_name)
    
    # Save performance summary
    airport_output_dir = os.path.join(ff.OUTPUT_DIR, airport_name)
    performance_summary_path = os.path.join(airport_output_dir, f'{airport_name}_Performance_Summary.csv')
    performance_summary.to_csv(performance_summary_path, index=False)
    print(f"Performance summary saved to {performance_summary_path}")
    
    # Visualize performance metrics (saved as images)
    ff.visualize_performance(performance_summary, airport_name, ff.OUTPUT_DIR)
    
    # Visualize predictions vs actuals (saved as images)
    ff.visualize_predictions(results_clean, flights_per_year, airport_name, ff.OUTPUT_DIR)
    
    # -----------------------------------------------
    # **Enhanced Line Plot with Actuals and 2025 Forecast**
    # -----------------------------------------------
    
    # 1. Line Plot with Forecasts (Inline)
    plt.figure(figsize=(14,8))
    
    # Plot Actuals with a distinct color and thicker line
    plt.plot(
        flights_per_year['Year'], 
        flights_per_year['Number_of_Flights'], 
        label='Actual', 
        marker='o', 
        color='black', 
        linewidth=2
    )
    
    # Plot Predictions for each model
    models = results_clean['Model'].unique()
    
    for model in models:
        model_predictions = results_clean[results_clean['Model'] == model][['Year', 'Predicted']]
        plt.plot(
            model_predictions['Year'], 
            model_predictions['Predicted'], 
            label=model, 
            marker='o', 
            linewidth=1
        )
    
    # Plot 2025 Forecast from the Best Model
    if not np.isnan(prediction_2025):
        plt.plot(
            2025, 
            prediction_2025, 
            label='2025 Forecast', 
            marker='X', 
            markersize=12, 
            linestyle='--', 
            color='red'
        )
        plt.annotate(
            '2025 Forecast', 
            xy=(2025, prediction_2025), 
            xytext=(2025, prediction_2025 + (0.05 * prediction_2025)),
            arrowprops=dict(facecolor='red', shrink=0.05), 
            horizontalalignment='center'
        )
    
    plt.title(f'Forecasted vs Actual Number of Flights in June for {airport_name}', fontsize=16)
    plt.xlabel('Year', fontsize=14)
    plt.ylabel('Number of Flights', fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    plt.close()
    
    # 2. Display RMSE and MAE in a Table
    print("Model Performance Summary:")
    display(performance_summary)
    
    # -----------------------------------------------
    # **Optional: Display Separator**
    # -----------------------------------------------
    print("\n" + "="*80 + "\n")

# After the loop, compile the prediction_summary list into a DataFrame
summary_df = pd.DataFrame(prediction_summary)

# Display the summary table
print("\n=== 2025 Flight Predictions Summary ===")
display(summary_df)

# Define the path to save the summary table
summary_table_path = os.path.join(ff.OUTPUT_DIR, '2025_Flight_Predictions_Summary.csv')

# Save the summary DataFrame to CSV
summary_df.to_csv(summary_table_path, index=False)
print(f"\nSummary table saved to {summary_table_path}")



Processing Airport: Baltimore, MD BaltimoreWashington International Thurgood Marshall (BWI)
Loaded data from D:/GitHub Repos/630-SOUTHWEST-SCHEDULING/Data/Origin Airport Baltimore, MD BaltimoreWashington International Thurgood Marshall (BWI).csv successfully.

First few rows of the cleaned DataFrame:


Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes),Date,Month,Year
0,WN,6/1/2003,85.0,N431,LAX,17:45,17:50,380.0,325.0,5.0,18:00,10.0,0.0,0.0,0.0,0.0,0.0,2003-06-01,6,2003
1,WN,6/1/2003,846.0,N410,LAX,9:40,10:00,380.0,315.0,20.0,10:07,7.0,0.0,0.0,0.0,0.0,0.0,2003-06-01,6,2003
2,WN,6/1/2004,85.0,N438,LAX,17:50,19:33,355.0,325.0,103.0,19:43,10.0,9.0,0.0,0.0,0.0,64.0,2004-06-01,6,2004
3,WN,6/1/2004,285.0,N709SW,SAN,19:10,20:35,355.0,315.0,85.0,20:44,9.0,3.0,0.0,0.0,0.0,42.0,2004-06-01,6,2004
4,WN,6/1/2004,590.0,N453,SAN,11:05,11:05,355.0,335.0,0.0,11:18,13.0,0.0,0.0,0.0,0.0,0.0,2004-06-01,6,2004



Complete Flights Per Year (June only):


Unnamed: 0,Year,Destination Airport,Number_of_Flights
0,2000,,0
1,2001,,0
2,2002,,0
3,2003,LAX,60
4,2004,LAX,62
...,...,...,...
87,2024,CHS,107
88,2024,LAX,84
89,2024,MIA,97
90,2024,SAN,83



Processing Year: 2001
Exponential Smoothing failed for year 2001: index 1 is out of bounds for axis 0 with size 1
Prophet model failed for year 2001: Dataframe has less than 2 non-NaN rows.

Processing Year: 2002
ARIMA model failed for year 2002: too many indices for array: array is 0-dimensional, but 1 were indexed

Processing Year: 2003


23:58:11 - cmdstanpy - INFO - Chain [1] start processing
23:58:11 - cmdstanpy - INFO - Chain [1] done processing



Processing Year: 2004

Processing Year: 2004


23:58:11 - cmdstanpy - INFO - Chain [1] start processing
23:58:11 - cmdstanpy - INFO - Chain [1] done processing
23:58:11 - cmdstanpy - INFO - Chain [1] start processing
23:58:11 - cmdstanpy - INFO - Chain [1] done processing



Processing Year: 2005

Processing Year: 2005


23:58:11 - cmdstanpy - INFO - Chain [1] start processing
23:58:11 - cmdstanpy - INFO - Chain [1] done processing
23:58:11 - cmdstanpy - INFO - Chain [1] start processing
23:58:12 - cmdstanpy - INFO - Chain [1] done processing



Processing Year: 2006

Processing Year: 2006


23:58:12 - cmdstanpy - INFO - Chain [1] start processing
23:58:12 - cmdstanpy - INFO - Chain [1] done processing


KeyboardInterrupt: 