In [None]:
# Below is the forecasting method

In [1]:
import pandas as pd
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.model_selection import ExpandingWindowSplitter, ForecastingGridSearchCV
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
import re  # Importing regular expressions
from sklearn.metrics import mean_squared_error  # Importing MSE from sklearn

# Step 1: Read the raw CSV export into a DataFrame
print("Loading the dataset...")
data = pd.read_csv("['EGLL']_2023-04-01_00-00-00_till_2023-08-29_00-00-00_stack.csv")

# Parse the 'TimeStamp' column as datetimes and promote it to the index
print("Converting timestamps and setting index...")
data['TimeStamp'] = pd.to_datetime(data['TimeStamp'])
data = data.set_index('TimeStamp')

# Step 2: Normalise column names - every run of non-word characters becomes '_'
non_word_run = re.compile(r'\W+')
data.columns = [non_word_run.sub('_', name) for name in data.columns]
print("\nCleaned Column Names:")
print(data.columns)

# Step 3: Give the index an explicit frequency when it has none.
# '15min' is the assumed sampling interval - adjust it to the real one.
# method='pad' fills rows created by the reindexing with the last prior observation.
if data.index.freq is None:
    print("Frequency is not set. Attempting to set it...")
    try:
        data = data.asfreq('15min', method='pad')
        print(f"Frequency set to: {data.index.freq}")
    except Exception as freq_error:
        # Best effort: report the problem and carry on with the frame unchanged.
        print(f"Error setting frequency: {freq_error}")

# Step 4: Select the forecasting targets and discard incomplete rows
print("Preparing the data...")
holding_columns = ['Holding_Time_Big', 'Holding_Time_Ock', 'Holding_Time_Bov', 'Holding_Time_Lam']

# Any row missing at least one target value is removed
print("Dropping rows with NaN values in target columns...")
data = data.dropna(subset=holding_columns)

# Step 5: Chronological 80/20 split - shuffle=False keeps the time order intact
print("Splitting data into training and testing sets...")
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False)

Loading the dataset...
Converting timestamps and setting index...

Cleaned Column Names:
Index(['Unnamed_0', 'Holding_Time_Big', 'Holding_Time_Ock', 'Holding_Time_Bov',
       'Holding_Time_Lam', 'Big', 'Ock', 'Bov', 'Lam', 'Decimal_Hours', 'day',
       'month', 'year', 'day_of_week', 'Day_0', 'Day_1', 'Day_2', 'Day_3',
       'Day_4', 'Day_5', 'Day_6', 'Big_Max', 'Big_Min', 'Ock_Max', 'Ock_Min',
       'Bov_Max', 'Bov_Min', 'Lam_Max', 'Lam_Min', 'WTC_L', 'WTC_M', 'WTC_H',
       'WTC_J', 'Engine_Jet', 'Engine_Turboprop_shaft', 'Runway_09L',
       'Runway_09R', 'Runway_27L', 'Runway_27R', 'No_of_Landings_1HR',
       'No_stack', 'ceiling', 'wind', 'precip', 'freezing', 'phenomena',
       'wind_dir', 'wind_speed', 'Crosswind_Component', 'Headwind_Component',
       'departures_delayIndex', 'arrivals_numCancelled',
       'arrivals_delayIndex'],
      dtype='object')
Frequency is not set. Attempting to set it...
Frequency set to: <15 * Minutes>
Preparing the data...
Dropping rows with

In [2]:
# Step 3: Define the forecaster and grid search functions

def create_forecaster(window_length=5):
    """Build a recursive reduction forecaster around a default LightGBM regressor.

    Parameters
    ----------
    window_length : int, default 5
        Passed straight through to ``make_reduction`` as ``window_length``.

    Returns
    -------
    The forecaster object returned by ``make_reduction`` (unfitted).
    """
    return make_reduction(
        lgb.LGBMRegressor(),
        window_length=window_length,
        strategy="recursive",
    )

def grid_search_forecaster(train, test, target_column, param_grid):
    """Tune, fit and evaluate a forecaster for a single target column.

    A forecaster from ``create_forecaster()`` is tuned with
    ``ForecastingGridSearchCV`` over ``param_grid``, using an expanding-window
    splitter whose initial window is 70% of ``len(train)`` and a symmetric
    MAPE as the scoring metric. The fitted search object then forecasts
    ``len(test)`` steps ahead and the forecast is scored against
    ``test[target_column]``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and hold-out frames; both must contain ``target_column``.
    target_column : str
        Name of the column to forecast.
    param_grid : dict
        Hyper-parameter grid handed to ``ForecastingGridSearchCV``.

    Returns
    -------
    tuple
        ``(mae, mape, mse, y_pred, best_params)``. ``mape`` is the symmetric
        MAPE (the metric is built with ``symmetric=True``), even though the
        printed label reads "MAPE".
    """
    print(f"Creating forecaster for {target_column}...")
    base_forecaster = create_forecaster()

    # Expanding-window CV: the first fold trains on the leading 70% of `train`
    print(f"Setting up cross-validation for {target_column}...")
    first_window = int(len(train) * 0.7)
    splitter = ExpandingWindowSplitter(initial_window=first_window)

    print(f"Starting grid search for {target_column}...")
    search = ForecastingGridSearchCV(
        base_forecaster,
        strategy="refit",
        cv=splitter,
        param_grid=param_grid,
        scoring=MeanAbsolutePercentageError(symmetric=True),
    )

    print(f"Fitting the model for {target_column}...")
    search.fit(train[target_column])
    print(f"Best params for {target_column}: {search.best_params_}")

    # Relative horizon 1..len(test): one forecast per hold-out row
    print(f"Forecasting for {target_column}...")
    horizon = np.arange(1, len(test) + 1)
    y_pred = search.predict(fh=horizon)

    print(f"Calculating performance metrics for {target_column}...")
    actual = test[target_column]
    mae = np.mean(np.abs(actual - y_pred))
    smape_metric = MeanAbsolutePercentageError(symmetric=True)
    mape = smape_metric(actual, y_pred)
    mse = mean_squared_error(actual, y_pred)

    print(f"MAE for {target_column}: {mae}")
    print(f"MAPE for {target_column}: {mape}")
    print(f"MSE for {target_column}: {mse}")

    return mae, mape, mse, y_pred, search.best_params_

# Step 4: Perform grid search for each holding column

# Candidate window lengths handed to the grid search
param_grid = {
    "window_length": [5, 10, 15, 20, 25, 30]  # Grid search over these window lengths
}

# Per-column evaluation results: metrics, test-period predictions, best window length
results = {}

print("Starting grid search for each holding column...")
for column in holding_columns:
    print(f"\nProcessing column: {column}")
    mae, mape, mse, predictions, best_params = grid_search_forecaster(train_data, test_data, column, param_grid)
    results[column] = {"MAE": mae, "MAPE": mape, "MSE": mse, "Predictions": predictions, "Best_Window_Length": best_params["window_length"]}

# Step 5: Forecast for multiple time steps into the future

# Define the future intervals (15-minute steps for 4 hours)
future_intervals = np.arange(1, 17)  # 16 intervals (15, 30, ..., 240 minutes)

# Per-column forecasts for the 16 steps after the last available observation
future_predictions = {}

print("\nStarting future predictions for each holding column...")
for column in holding_columns:
    print(f"Forecasting future intervals for {column}...")
    # Rebuild the forecaster with the window length selected by the grid search
    best_forecaster = create_forecaster(window_length=results[column]["Best_Window_Length"])
    # Fit on the complete series (train + test). Fitting on train_data alone
    # would anchor the horizon at the end of the training split, so the
    # "future" forecast would only repeat the first 16 test-period predictions
    # already stored in results[column]["Predictions"].
    best_forecaster.fit(data[column])

    # Predict the 16 intervals that follow the last observation in `data`
    future_pred = best_forecaster.predict(fh=future_intervals)
    future_predictions[column] = future_pred

# Step 6: Display the future predictions
for column in holding_columns:
    print(f"\nFuture predictions for {column}:")
    print(future_predictions[column])

print("\nProcess completed successfully.")

Starting grid search for each holding column...

Processing column: Holding_Time_Big
Creating forecaster for Holding_Time_Big...
Setting up cross-validation for Holding_Time_Big...
Starting grid search for Holding_Time_Big...
Fitting the model for Holding_Time_Big...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 660
[LightGBM] [Info] Number of data points in the train set: 11581, number of used features: 15
[LightGBM] [Info] Start training from score 0.926949
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 660
[LightGBM] [Info] Number of data points in the train set: 11581, number of used features: 15
[LightGBM] [Info] Start training from s