Train: 2006-04-01 to 2040-12-31; test: 2050-04-01 to 2080-11-30

In [39]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error

Error of different LSTM Models During this Test Period

In [34]:
# Map each model label to the folder that holds its LSTM prediction CSV.
prediction_dirs = dict.fromkeys(
    (f'Model_{i}' for i in range(3, 9)), 'LSTM_MME_prediction'
)
prediction_dirs['Model_85'] = 'LSTM_CHESS_prediction'

# Evaluation window; both bounds are inclusive.
start_date = '2050-04-01'
end_date = '2080-11-30'

# model label -> RMSE over the evaluation window
rmse_results = {}

for model_name, folder in prediction_dirs.items():
    file_path = f'{folder}/{model_name.lower()}_test_predictions.csv'

    # A missing prediction file is reported and skipped rather than aborting the run.
    try:
        df = pd.read_csv(file_path, parse_dates=['date'])
    except FileNotFoundError:
        print(f"{model_name} - Prediction file not found.")
        continue

    # Restrict to rows whose date falls inside the evaluation window.
    in_window = df['date'].between(start_date, end_date)
    df = df[in_window]

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(df['Actual'], df['Predicted'])
    rmse = np.sqrt(mse)
    rmse_results[model_name] = rmse
    print(f"{model_name} - RMSE (2050-04-01 to 2080-11-30): {rmse:.3f}")

Model_3 - RMSE (2050-04-01 to 2080-11-30): 2.080
Model_4 - RMSE (2050-04-01 to 2080-11-30): 1.897
Model_5 - RMSE (2050-04-01 to 2080-11-30): 1.902
Model_6 - RMSE (2050-04-01 to 2080-11-30): 1.933
Model_7 - RMSE (2050-04-01 to 2080-11-30): 1.979
Model_8 - RMSE (2050-04-01 to 2080-11-30): 1.954
Model_85 - RMSE (2050-04-01 to 2080-11-30): 3.135


Error of different RF Models During this Test Period

In [11]:
# Folder holding each model's RF prediction CSV, keyed by model label.
prediction_dirs = dict.fromkeys(
    (f'Model_{i}' for i in range(3, 9)), 'RF_MME_prediction'
)
prediction_dirs['Model_85'] = 'RF_CHESS_prediction'

# Evaluation window; both bounds are inclusive.
start_date = '2050-04-01'
end_date = '2080-11-30'

# model label -> RMSE over the evaluation window
# (rebinding this name discards whatever an earlier cell stored in it)
rmse_results = {}

for model_name, folder in prediction_dirs.items():
    file_path = f'{folder}/{model_name.lower()}_test_predictions.csv'

    # Skip models whose prediction file is absent, reporting the gap.
    try:
        df = pd.read_csv(file_path, parse_dates=['date'])
    except FileNotFoundError:
        print(f"{model_name} - Prediction file not found.")
        continue

    # Keep only rows dated within the evaluation window.
    in_window = df['date'].between(start_date, end_date)
    df = df[in_window]

    # Root of the mean squared error between observed and predicted values.
    mse = mean_squared_error(df['Actual'], df['Predicted'])
    rmse = np.sqrt(mse)
    rmse_results[model_name] = rmse
    print(f"{model_name} - RMSE (2050-04-01 to 2080-11-30): {rmse:.3f}")


Model_3 - RMSE (2050-04-01 to 2080-11-30): 2.034
Model_4 - RMSE (2050-04-01 to 2080-11-30): 1.982
Model_5 - RMSE (2050-04-01 to 2080-11-30): 1.967
Model_6 - RMSE (2050-04-01 to 2080-11-30): 2.023
Model_7 - RMSE (2050-04-01 to 2080-11-30): 2.038
Model_8 - RMSE (2050-04-01 to 2080-11-30): 2.060
Model_85 - RMSE (2050-04-01 to 2080-11-30): 2.584


Extracting Training Data for Each Model

In [41]:
# Model ID configuration
day_models = [3, 4, 5, 6, 7, 8]
month_model = 85
lstm_dir_day = 'LSTM_MME_prediction'
rf_dir_day = 'RF_MME_prediction'
lstm_dir_month = 'LSTM_CHESS_prediction'
rf_dir_month = 'RF_CHESS_prediction'
start_date = '2006-04-01'
end_date = '2040-12-31'

# ----------- Load the daily LSTM training predictions once per model -----------
# Each file supplies the 'Predicted' feature, the 'Actual' target and the
# dates used for the common-date intersection, so it is parsed a single time
# here and reused below.
lstm_day_frames = {}
for model_id in day_models:
    file_lstm = os.path.join(lstm_dir_day, f'model_{model_id}_train_predictions.csv')
    df = pd.read_csv(file_lstm, parse_dates=['date'])
    lstm_day_frames[model_id] = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

# ----------- Collect common prediction dates across all models -----------
# Only the LSTM files define the date set; RF rows are left-joined onto it.
date_sets = [set(frame['date']) for frame in lstm_day_frames.values()]

# Dates shared by all models
common_dates = sorted(set.intersection(*date_sets))
df_full_dates = pd.DataFrame({'date': common_dates})

# ----------- Daily interpolation of monthly LSTM predictions -----------
# NOTE(review): reindex() keeps only the rows whose date already appears in
# common_dates, and method='linear' spaces values by row position rather than
# by calendar time. Confirm both are intended for the model_85 series.
file_lstm_85 = os.path.join(lstm_dir_month, f'model_{month_model}_train_predictions.csv')
df_lstm_85_raw = pd.read_csv(file_lstm_85, parse_dates=['date'])
df_lstm_85_raw = df_lstm_85_raw[(df_lstm_85_raw['date'] >= start_date) & (df_lstm_85_raw['date'] <= end_date)]

df_lstm_85_interp = df_lstm_85_raw.set_index('date').reindex(df_full_dates['date'])
df_lstm_85_interp['Predicted'] = df_lstm_85_interp['Predicted'].interpolate(method='linear', limit_direction='both')
# Keep only the join key and the feature: any other column in the file (for
# example 'Actual') would otherwise collide with the real target during the
# merges below and be renamed with _x/_y suffixes.
df_lstm_85_interp = (
    df_lstm_85_interp.rename_axis('date')
    .reset_index()
    .rename(columns={'Predicted': 'LSTM_85'})[['date', 'LSTM_85']]
)

# ----------- Daily interpolation of monthly RF predictions -----------
file_rf_85 = os.path.join(rf_dir_month, f'model_{month_model}_train_predictions.csv')
df_rf_85_raw = pd.read_csv(file_rf_85, parse_dates=['date'])
df_rf_85_raw = df_rf_85_raw[(df_rf_85_raw['date'] >= start_date) & (df_rf_85_raw['date'] <= end_date)]

df_rf_85_interp = df_rf_85_raw.set_index('date').reindex(df_full_dates['date'])
df_rf_85_interp['Predicted'] = df_rf_85_interp['Predicted'].interpolate(method='linear', limit_direction='both')
df_rf_85_interp = (
    df_rf_85_interp.rename_axis('date')
    .reset_index()
    .rename(columns={'Predicted': 'RF_85'})[['date', 'RF_85']]
)

# ----------- Build stacking feature files -----------
for model_id in day_models:
    # LSTM predictions and actual values both come from the already-loaded LSTM frame
    lstm_frame = lstm_day_frames[model_id]
    df_lstm = lstm_frame[['date', 'Predicted']].rename(columns={'Predicted': 'LSTM_day'})
    df_actual = lstm_frame[['date', 'Actual']]

    # RF predictions
    file_rf = os.path.join(rf_dir_day, f'model_{model_id}_train_predictions.csv')
    df_rf = pd.read_csv(file_rf, parse_dates=['date'])
    df_rf = df_rf[(df_rf['date'] >= start_date) & (df_rf['date'] <= end_date)]
    df_rf = df_rf[['date', 'Predicted']].rename(columns={'Predicted': 'RF_day'})

    # Merge all features onto the common dates; a date missing from any
    # source yields NaN in that column because every join is a left join.
    df_merge = df_full_dates.merge(df_lstm, on='date', how='left')
    df_merge = df_merge.merge(df_rf, on='date', how='left')
    df_merge = df_merge.merge(df_lstm_85_interp, on='date', how='left')
    df_merge = df_merge.merge(df_rf_85_interp, on='date', how='left')
    df_merge = df_merge.merge(df_actual, on='date', how='left')

    # Reorder columns
    df_merge = df_merge[['date', 'LSTM_day', 'RF_day', 'LSTM_85', 'RF_85', 'Actual']]

    # Save to Excel
    output_name = f'stacking_data_model_train_{model_id}.xlsx'
    df_merge.to_excel(output_name, index=False)


Extracting Testing Data for Each Model

In [37]:
# Model ID configuration
day_models = [3, 4, 5, 6, 7, 8]
month_model = 85
lstm_dir_day = 'LSTM_MME_prediction'
rf_dir_day = 'RF_MME_prediction'
lstm_dir_month = 'LSTM_CHESS_prediction'
rf_dir_month = 'RF_CHESS_prediction'
start_date = '2050-04-01'
end_date = '2080-11-30'

# ----------- Load the daily LSTM test predictions once per model -----------
# Each file supplies the 'Predicted' feature, the 'Actual' target and the
# dates used for the common-date intersection, so it is parsed a single time
# here and reused below.
lstm_day_frames = {}
for model_id in day_models:
    file_lstm = os.path.join(lstm_dir_day, f'model_{model_id}_test_predictions.csv')
    df = pd.read_csv(file_lstm, parse_dates=['date'])
    lstm_day_frames[model_id] = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

# ----------- Collect the actual prediction dates shared across all models -----------
# Only the LSTM files define the date set; RF rows are left-joined onto it.
date_sets = [set(frame['date']) for frame in lstm_day_frames.values()]

# Find the intersection of dates across all models
common_dates = sorted(set.intersection(*date_sets))
df_full_dates = pd.DataFrame({'date': common_dates})

# ----------- Interpolate monthly LSTM predictions to daily resolution -----------
# NOTE(review): reindex() keeps only the rows whose date already appears in
# common_dates, and method='linear' spaces values by row position rather than
# by calendar time. Confirm both are intended for the model_85 series.
file_lstm_85 = os.path.join(lstm_dir_month, f'model_{month_model}_test_predictions.csv')
df_lstm_85_raw = pd.read_csv(file_lstm_85, parse_dates=['date'])
df_lstm_85_raw = df_lstm_85_raw[(df_lstm_85_raw['date'] >= start_date) & (df_lstm_85_raw['date'] <= end_date)]

# Interpolate to the common_dates
df_lstm_85_interp = df_lstm_85_raw.set_index('date').reindex(df_full_dates['date'])
df_lstm_85_interp['Predicted'] = df_lstm_85_interp['Predicted'].interpolate(method='linear', limit_direction='both')
# Keep only the join key and the feature: any other column in the file (for
# example 'Actual') would otherwise collide with the real target during the
# merges below and be renamed with _x/_y suffixes.
df_lstm_85_interp = (
    df_lstm_85_interp.rename_axis('date')
    .reset_index()
    .rename(columns={'Predicted': 'LSTM_85'})[['date', 'LSTM_85']]
)

# ----------- Interpolate monthly RF predictions to daily resolution -----------
file_rf_85 = os.path.join(rf_dir_month, f'model_{month_model}_test_predictions.csv')
df_rf_85_raw = pd.read_csv(file_rf_85, parse_dates=['date'])
df_rf_85_raw = df_rf_85_raw[(df_rf_85_raw['date'] >= start_date) & (df_rf_85_raw['date'] <= end_date)]

df_rf_85_interp = df_rf_85_raw.set_index('date').reindex(df_full_dates['date'])
df_rf_85_interp['Predicted'] = df_rf_85_interp['Predicted'].interpolate(method='linear', limit_direction='both')
df_rf_85_interp = (
    df_rf_85_interp.rename_axis('date')
    .reset_index()
    .rename(columns={'Predicted': 'RF_85'})[['date', 'RF_85']]
)

# ----------- Generate stacking input data for each model -----------
for model_id in day_models:
    # LSTM predictions and actual values both come from the already-loaded LSTM frame
    lstm_frame = lstm_day_frames[model_id]
    df_lstm = lstm_frame[['date', 'Predicted']].rename(columns={'Predicted': 'LSTM_day'})
    df_actual = lstm_frame[['date', 'Actual']]

    # RF predictions
    file_rf = os.path.join(rf_dir_day, f'model_{model_id}_test_predictions.csv')
    df_rf = pd.read_csv(file_rf, parse_dates=['date'])
    df_rf = df_rf[(df_rf['date'] >= start_date) & (df_rf['date'] <= end_date)]
    df_rf = df_rf[['date', 'Predicted']].rename(columns={'Predicted': 'RF_day'})

    # Merge all fields (strictly based on dates in df_full_dates); a date
    # missing from any source yields NaN in that column because every join
    # is a left join.
    df_merge = df_full_dates.merge(df_lstm, on='date', how='left')
    df_merge = df_merge.merge(df_rf, on='date', how='left')
    df_merge = df_merge.merge(df_lstm_85_interp, on='date', how='left')
    df_merge = df_merge.merge(df_rf_85_interp, on='date', how='left')
    df_merge = df_merge.merge(df_actual, on='date', how='left')

    # Reorder columns
    df_merge = df_merge[['date', 'LSTM_day', 'RF_day', 'LSTM_85', 'RF_85', 'Actual']]

    # Save the file
    output_name = f'stacking_data_model_test_{model_id}.xlsx'
    df_merge.to_excel(output_name, index=False)


## Stacking

In [40]:
# List of model IDs
day_models = [3, 4, 5, 6, 7, 8]

# Columns fed to the stacking model; identical for every model ID, so defined once.
features = ['LSTM_day', 'RF_day', 'LSTM_85', 'RF_85']

# Store RMSE results for each model
results = []

# Iterate through each model
for model_id in day_models:
    # Read training data
    train_file = f'stacking_data_model_train_{model_id}.xlsx'
    train_df = pd.read_excel(train_file)

    # Read testing data
    test_file = f'stacking_data_model_test_{model_id}.xlsx'
    test_df = pd.read_excel(test_file)

    # Define features and target
    X_train = train_df[features]
    y_train = train_df['Actual']
    X_test = test_df[features]
    y_test = test_df['Actual']

    # Train Random Forest Regressor as stacking model (fixed seed for repeatable fits)
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Compute RMSE as sqrt(MSE), the same way the per-model RMSE cells do.
    # The `squared=False` shortcut is deprecated and absent from newer
    # scikit-learn releases, so it is avoided here.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Store results
    results.append({
        'model_id': model_id,
        'rmse': rmse
    })

# Output RMSE results for all models
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model_id,rmse
0,3,2.110478
1,4,2.064789
2,5,2.073191
3,6,2.100484
4,7,2.143762
5,8,2.152854
