In [6]:
import matplotlib.pyplot as plt
plt.ioff()

<contextlib.ExitStack at 0x7fe270377400>

## 24 Hour ahead Lag Experiment

In [7]:
# Loading Data
from src.data_preparation import load_data

PROCESSED_DATA_PATH = "data/processed/df_24h_lag_BLV_spatial_images.csv"
df = load_data(PROCESSED_DATA_PATH, date_col="measurement_time")

df.drop(columns=["Unnamed: 0", "timestamp"], inplace=True)



df_interpolated = df.interpolate(method='linear')

df_interpolated.fillna(0, inplace=True)

features_to_check = ['ghi', 'dni', 'solar_zenith', 'GHI_cs', 'DNI_cs', 'CSI_ghi', 
                                   'CSI_dni',
                                   'nam_ghi', 'nam_dni', 'nam_cc', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover', 'AVG(R)',
       'STD(R)', 'ENT(R)', 'AVG(G)', 'STD(G)', 'ENT(G)', 'AVG(B)', 'STD(B)',
       'ENT(B)', 'AVG(RB)', 'STD(RB)', 'ENT(RB)', 'AVG(NRB)', 'STD(NRB)',
       'ENT(NRB)']

# 2. Calculate the number of NaN values for each feature
# We use .isna() instead of == 0
nans_per_feature = df_interpolated[features_to_check].isna().sum()

# 3. Filter to get only features that actually have NaN values
features_with_nans = nans_per_feature[nans_per_feature > 0]

# 4. Report the findings for which features have NaNs
if features_with_nans.empty:
    print("No NaN (missing) values found in any of the specified features.")
else:
    print("--- Features With NaN Values ---")
    print("The following features have NaN values, with the total count for each:")
    # Sort for clearer output
    print(features_with_nans.sort_values(ascending=False))
    print("\n" + "="*40 + "\n")

    # 5. Analyze the distribution of hours for rows containing NaNs
    print("--- Distribution of NaN-Value Records by Hour ---")
    
    # Create a boolean mask for rows that contain *at least one* NaN
    # in the specified columns
    rows_with_any_nan = df_interpolated[features_to_check].isna().any(axis=1)
    
    if rows_with_any_nan.sum() > 0:
        # Get the index for these rows
        nan_rows_index = df_interpolated.index[rows_with_any_nan]
        
        # Extract the hour from the DatetimeIndex and get the value counts
        hour_distribution = nan_rows_index.hour.value_counts().sort_index()
        
        print("Distribution of records (rows) containing at least one NaN, by hour:")
        print(hour_distribution)
        
        # Optional: Print total number of affected rows
        print(f"\nTotal number of rows with at least one NaN: {rows_with_any_nan.sum()}")
    else:
        # This case shouldn't be hit if features_with_nans was not empty,
        # but it's good practice to include.
        print("No rows found with NaN values (this is unexpected, check logic).")



Loaded 12,611 records
Date range: 2014-01-02 14:00:00+00:00 to 2016-12-30 00:00:00+00:00
Timezone: UTC
No NaN (missing) values found in any of the specified features.


  df_interpolated = df.interpolate(method='linear')


In [8]:
import torch
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)

device

'cuda'

In [None]:

df_phase1 = df_interpolated[['CSI_ghi','time_gap_hours',
       'time_gap_norm',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']]


df_phase2 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h']]

df_phase3 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc']]

df_phase4 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc', '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover']]

df_phase5 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc','80_cloud_cover',  '56_cloud_cover',
        '20_cloud_cover', '88_cloud_cover']]

df_phase6 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc', '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover', 'AVG(R)',
       'STD(R)', 'ENT(R)', 'AVG(G)', 'STD(G)', 'ENT(G)', 'AVG(B)', 'STD(B)',
       'ENT(B)', 'AVG(RB)', 'STD(RB)', 'ENT(RB)', 'AVG(NRB)', 'STD(NRB)',
       'ENT(NRB)'
      ]]


In [10]:

# 2. Rolling Origin Split (More Suitable in TimeSeries) like K-Folds
from src.data_preparation import fixed_holdout_split, rolling_origin_evaluation,save_splits_info

rollingSplits_df_24h = rolling_origin_evaluation(df=df, start_train = '2014-01-2',
    end_train = '2016-12-31')
save_splits_info({}, rollingSplits_df_24h, experiment_name="exp-003")



ROLLING ORIGIN EVALUATION
Total folds: 35
Frequency: MS
Data range: 2014-01-02 to 2016-12-30

Fold Summary:
  Fold 1: Train [2014-01-02 to 2014-02-01] (336 records) → Val [2014-02-02 to 2014-02-28] (203 records)
  Fold 2: Train [2014-01-02 to 2014-03-01] (551 records) → Val [2014-03-02 to 2014-03-31] (349 records)
  Fold 3: Train [2014-01-02 to 2014-04-01] (923 records) → Val [2014-04-02 to 2014-04-30] (337 records)
  Fold 4: Train [2014-01-02 to 2014-05-01] (1,283 records) → Val [2014-05-02 to 2014-05-31] (349 records)
  Fold 5: Train [2014-01-02 to 2014-06-01] (1,655 records) → Val [2014-06-02 to 2014-06-30] (337 records)
  Fold 6: Train [2014-01-02 to 2014-07-01] (2,015 records) → Val [2014-07-02 to 2014-07-31] (349 records)
  Fold 7: Train [2014-01-02 to 2014-08-01] (2,387 records) → Val [2014-08-02 to 2014-08-31] (349 records)
  Fold 8: Train [2014-01-02 to 2014-09-01] (2,759 records) → Val [2014-09-02 to 2014-09-30] (337 records)
  Fold 9: Train [2014-01-02 to 2014-10-01] (3,119

----

In [11]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p1_ReLU_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "ReLU"

    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False,

}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas1_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 7), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 7), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
INF

----

In [12]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p1_Tanh_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False,

}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})


pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp02_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas1_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 7), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 7), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
INF

In [13]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p1_Tanh_scaleTarg_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True,

}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})


pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp03_results, summary = pipeline.run()

INFO:src.utils:Saved arrays to data/phas1_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 7), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 7), Y shape: (1056, 1, 12)



--- Step 2: Building model arrays (X, Y) ---


INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
INFO:src.pipeline:Train samples: 22, Val samples: 19
INFO:src.pipeline:X_train scaled range: [-1.000, 1.000]
INFO:src.pipeline:Y_train scaled range: [-1.000, 1.000]
INFO:src.pipeline:Model parameters: 180,364
INFO:src.engine:Using HuberLoss (Smooth L1 Loss)
INFO:src.engine:Ep

-----


-----

### Exp 04 Using Phase 2 Datafeatures 


In [14]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase2.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase2, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph2_X, ph2_Y, ph2_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph2_X, ph2_Y,
    pd.DataFrame(index=pd.to_datetime(ph2_labels_list, utc=True)),
    filename_prefix='phas2_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p2_exp01_Essn_BLV",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas2_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp02_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas2_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 47), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas2_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 47), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

#### Exp 05: Phase 3

In [15]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase3.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase3, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph3_X, ph3_Y, ph3_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph3_X, ph3_Y,
    pd.DataFrame(index=pd.to_datetime(ph3_labels_list, utc=True)),
    filename_prefix='phas3_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p3_exp01_Essn_BLV",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas3_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph3_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas3_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 49), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas3_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 49), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [16]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase4.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase4, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph4_X, ph4_Y, ph4_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph4_X, ph4_Y,
    pd.DataFrame(index=pd.to_datetime(ph4_labels_list, utc=True)),
    filename_prefix='phas4_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p4_exp01_Essn_BLV_5NAM_CCANDGHI",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas4_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
pipeline.reference_df = fixed_df  # <- full fixed-grid dataframe (with DateTimeIndex)
ph4_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas4_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 57), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas4_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 57), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [17]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase5.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase5, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph5_X, ph5_Y, ph5_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph5_X, ph5_Y,
    pd.DataFrame(index=pd.to_datetime(ph5_labels_list, utc=True)),
    filename_prefix='phas5_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p5_exp01_Essn_BLV_5NAM_CC",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas5_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph5_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas5_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 53), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas5_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 53), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [18]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [19]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_Atten_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": True,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_Atten_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [20]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_UniLSTM_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": False,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
pipeline.reference_df = fixed_df  # <- full fixed-grid dataframe (with DateTimeIndex)ph6_UNI_exp03_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)


----

In [21]:
df_phase6.columns

Index(['solar_zenith', 'CSI_ghi', 'time_gap_hours', 'time_gap_norm',
       'day_boundary_flag', 'hour_progression', 'absolute_hour', 'season_flag',
       'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h',
       'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h', 'V_CSI_ghi_9h',
       'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h', 'L_CSI_ghi_10h',
       'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h', 'B_CSI_ghi_12h',
       'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h', 'V_CSI_ghi_13h',
       'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h', 'L_CSI_ghi_14h',
       'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h', 'B_CSI_ghi_16h',
       'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h', 'V_CSI_ghi_17h',
       'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h', 'L_CSI_ghi_18h',
       'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h', 'nam_ghi', 'nam_cc',
       '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover', '20_dwsw',
       '20_cloud_cover', '88

----

In [23]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p6_LeakyReLU_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "LeakyReLU"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp03_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [24]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p6_NoTargScale_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1056, 7, 12, 72), Y shape: (1056, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

In [25]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 14
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_24h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 14,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "24hLag_BiLSTM_p6_NoTargScale_14Horizon_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-003/exp-003rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1049, 14, 12, 72), Y shape: (1049, 1, 12)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1049, 14, 12, 72), Y shape: (1049, 1, 12)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (12611 rows).
INFO:src.utils:Loaded 35 folds from exp-003/exp-003rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===

----

In [28]:
from pathlib import Path
import pandas as pd
import numpy as np

# ====== CONFIG ======
EXPERIMENTS_DIR = Path("experiments")      # <-- set your root experiments dir
TAG = "validation"                         # or "test" if you have test runs
METRICS_FILE = f"metrics_GHI_{TAG}.csv"    # we focus on GHI as requested
SAVE_SUMMARY = True                        # save consolidated CSV next to EXPERIMENTS_DIR
SUMMARY_CSV_NAME = f"experiments_skill_summary_GHI_{TAG}.csv"
# ====================

def _safe_skill(ours, ref):
    """Skill = 1 - ours/ref (positive => Ours better). NaN-safe."""
    ours = float(ours) if ours is not None else np.nan
    ref  = float(ref)  if ref  is not None else np.nan
    if not np.isfinite(ours) or not np.isfinite(ref) or ref == 0:
        return np.nan
    return 1.0 - (ours / ref)

def _read_metrics(path: Path) -> pd.DataFrame | None:
    """Read metrics CSV, robust to minor casing/spacing in index/columns."""
    try:
        df = pd.read_csv(path, index_col=0)
        # normalize index/columns (RMSE/MAE rows, Ours/NAM/SP columns)
        df.index = [str(i).strip().upper() for i in df.index]
        df.columns = [str(c).strip().title() for c in df.columns]
        return df
    except Exception as e:
        print(f"[WARN] Failed to read {path}: {e}")
        return None

def _collect_fold_skills(metrics_df: pd.DataFrame) -> dict:
    """
    Extract per-fold skills for GHI RMSE/MAE:
      - Ours vs SP
      - Ours vs NAM
    """
    out = {}
    # Expect rows "RMSE","MAE" and columns "Ours","Nam","Sp"
    for metric in ("RMSE", "MAE"):
        if metric not in metrics_df.index:
            # some files may have only RMSE, etc.
            continue
        ours = metrics_df.loc[metric, "Ours"] if "Ours" in metrics_df.columns else np.nan
        nam  = metrics_df.loc[metric, "Nam"]  if "Nam"  in metrics_df.columns else np.nan
        sp   = metrics_df.loc[metric, "Sp"]   if "Sp"   in metrics_df.columns else np.nan

        out[f"{metric}_skill_Ours_vs_SP"]  = _safe_skill(ours, sp)
        out[f"{metric}_skill_Ours_vs_NAM"] = _safe_skill(ours, nam)
    return out

def summarize_experiments(experiments_dir: Path, metrics_file: str) -> pd.DataFrame:
    """
    Walk experiments/*/benchmarks/fold_*/{metrics_file}, aggregate skills per experiment.
    Returns a DataFrame with mean ± std for each skill.
    """
    rows = []
    for exp_dir in sorted([p for p in experiments_dir.iterdir() if p.is_dir()]):
        bench = exp_dir / "benchmarks"
        if not bench.is_dir():
            continue

        fold_paths = sorted([p for p in bench.iterdir() if p.is_dir() and p.name.startswith("fold_")])
        if not fold_paths:
            continue

        fold_skills = []
        for fdir in fold_paths:
            mpath = fdir / metrics_file
            if not mpath.is_file():
                # silently skip missing folds
                continue
            mdf = _read_metrics(mpath)
            if mdf is None:
                continue
            skills = _collect_fold_skills(mdf)
            if skills:
                fold_skills.append(skills)

        if not fold_skills:
            continue

        # Convert to DataFrame for aggregation
        fs = pd.DataFrame(fold_skills)
        # Compute mean and std per skill, drop all-NaN cols
        means = fs.mean(numeric_only=True)
        stds  = fs.std(numeric_only=True)

        # Build one summary row per experiment
        row = {
            "experiment": exp_dir.name,
            "n_folds_used": int(fs.shape[0]),
        }
        # Pretty string columns (mean ± std) for quick reading
        for k in means.index:
            m = means[k]
            s = stds[k]
            row[f"{k}_mean"] = m
            row[f"{k}_std"]  = s
            row[f"{k}_pretty"] = f"{m:.3f} ± {s:.3f}" if np.isfinite(m) else "NaN"

        rows.append(row)

    if not rows:
        print("[INFO] No experiments with metrics found.")
        return pd.DataFrame()

    summary_df = pd.DataFrame(rows).sort_values("experiment").reset_index(drop=True)
    return summary_df

summary = summarize_experiments(EXPERIMENTS_DIR, METRICS_FILE)

# Reorder/compact columns for readability
if not summary.empty:
    pretty_cols = ["experiment", "n_folds_used",
                   "RMSE_skill_Ours_vs_SP_pretty",
                   "MAE_skill_Ours_vs_SP_pretty",
                   "RMSE_skill_Ours_vs_NAM_pretty",
                   "MAE_skill_Ours_vs_NAM_pretty"]
    # If any are missing because of missing metrics, keep whatever exists
    pretty_cols = [c for c in pretty_cols if c in summary.columns]

    display_cols = pretty_cols or list(summary.columns)
    display(summary[display_cols])

    if SAVE_SUMMARY:
        out_path = EXPERIMENTS_DIR / SUMMARY_CSV_NAME
        # Save numeric means/stds (drop _pretty columns)
        numeric_cols = [c for c in summary.columns if not c.endswith("_pretty")]
        summary[numeric_cols].to_csv(out_path, index=False)
        print(f"[OK] Saved summary CSV to: {out_path.resolve()}")
else:
    summary

Unnamed: 0,experiment,n_folds_used,RMSE_skill_Ours_vs_SP_pretty,MAE_skill_Ours_vs_SP_pretty,RMSE_skill_Ours_vs_NAM_pretty,MAE_skill_Ours_vs_NAM_pretty
0,1hLag_BiLSTM_p1_ReLU_exp01_essentialF_20251111...,35,-0.245 ± 1.026,-0.914 ± 1.810,0.015 ± 0.185,0.027 ± 0.210
1,1hLag_BiLSTM_p1_Tanh_exp01_essentialF_20251111...,35,-0.278 ± 1.217,-0.985 ± 2.157,0.007 ± 0.199,0.019 ± 0.219
2,1hLag_BiLSTM_p1_Tanh_scaleTarg_exp01_essential...,17,-0.096 ± 0.373,-0.566 ± 0.839,-0.012 ± 0.204,0.037 ± 0.261
3,24hLag_BiLSTM_Atten_p6_exp01_Essn_BLV_5NAM_CCA...,35,0.106 ± 0.188,0.005 ± 0.281,0.222 ± 0.230,0.307 ± 0.284
4,24hLag_BiLSTM_p1_ReLU_exp01_essentialF_2025111...,35,-0.005 ± 0.533,-0.430 ± 1.056,0.193 ± 0.118,0.143 ± 0.204
5,24hLag_BiLSTM_p1_Tanh_exp01_essentialF_2025111...,35,0.095 ± 0.178,-0.170 ± 0.352,0.224 ± 0.175,0.212 ± 0.243
6,24hLag_BiLSTM_p1_Tanh_scaleTarg_exp01_essentia...,35,0.023 ± 0.431,-0.321 ± 0.861,0.202 ± 0.149,0.189 ± 0.206
7,24hLag_BiLSTM_p2_exp01_Essn_BLV_20251111_010156,35,0.188 ± 0.294,-0.048 ± 0.535,0.328 ± 0.150,0.334 ± 0.192
8,24hLag_BiLSTM_p3_exp01_Essn_BLV_20251111_011219,35,0.126 ± 0.285,-0.041 ± 0.509,0.268 ± 0.160,0.325 ± 0.184
9,24hLag_BiLSTM_p4_exp01_Essn_BLV_5NAM_CCANDGHI_...,35,0.110 ± 0.207,-0.012 ± 0.327,0.238 ± 0.197,0.311 ± 0.233


[OK] Saved summary CSV to: /home/muhammadhassan/App_v02/experiments/experiments_skill_summary_GHI_validation.csv
