In [13]:
import matplotlib.pyplot as plt
plt.ioff()

<contextlib.ExitStack at 0x7f2514e60c10>

## 3 Hour ahead Lag Experiment

In [14]:
# Loading Data
from src.data_preparation import load_data

PROCESSED_DATA_PATH = "data/processed/df_1h_lag_BLV_spatial_images.csv"
df = load_data(PROCESSED_DATA_PATH, date_col="measurement_time")

df.drop(columns=["Unnamed: 0", "timestamp"], inplace=True)



df_interpolated = df.interpolate(method='linear')

df_interpolated.fillna(0, inplace=True)

features_to_check = ['ghi', 'dni', 'solar_zenith', 'GHI_cs', 'DNI_cs', 'CSI_ghi', 
                                   'CSI_dni',
                                   'nam_ghi', 'nam_dni', 'nam_cc', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover', 'AVG(R)',
       'STD(R)', 'ENT(R)', 'AVG(G)', 'STD(G)', 'ENT(G)', 'AVG(B)', 'STD(B)',
       'ENT(B)', 'AVG(RB)', 'STD(RB)', 'ENT(RB)', 'AVG(NRB)', 'STD(NRB)',
       'ENT(NRB)']

# 2. Calculate the number of NaN values for each feature
# We use .isna() instead of == 0
nans_per_feature = df_interpolated[features_to_check].isna().sum()

# 3. Filter to get only features that actually have NaN values
features_with_nans = nans_per_feature[nans_per_feature > 0]

# 4. Report the findings for which features have NaNs
if features_with_nans.empty:
    print("No NaN (missing) values found in any of the specified features.")
else:
    print("--- Features With NaN Values ---")
    print("The following features have NaN values, with the total count for each:")
    # Sort for clearer output
    print(features_with_nans.sort_values(ascending=False))
    print("\n" + "="*40 + "\n")

    # 5. Analyze the distribution of hours for rows containing NaNs
    print("--- Distribution of NaN-Value Records by Hour ---")
    
    # Create a boolean mask for rows that contain *at least one* NaN
    # in the specified columns
    rows_with_any_nan = df_interpolated[features_to_check].isna().any(axis=1)
    
    if rows_with_any_nan.sum() > 0:
        # Get the index for these rows
        nan_rows_index = df_interpolated.index[rows_with_any_nan]
        
        # Extract the hour from the DatetimeIndex and get the value counts
        hour_distribution = nan_rows_index.hour.value_counts().sort_index()
        
        print("Distribution of records (rows) containing at least one NaN, by hour:")
        print(hour_distribution)
        
        # Optional: Print total number of affected rows
        print(f"\nTotal number of rows with at least one NaN: {rows_with_any_nan.sum()}")
    else:
        # This case shouldn't be hit if features_with_nans was not empty,
        # but it's good practice to include.
        print("No rows found with NaN values (this is unexpected, check logic).")



Loaded 11,560 records
Date range: 2014-01-03 14:00:00+00:00 to 2016-12-30 23:00:00+00:00
Timezone: UTC
No NaN (missing) values found in any of the specified features.


  df_interpolated = df.interpolate(method='linear')


In [15]:
import torch
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)

device

'cuda'

In [16]:
df_phase1 = df_interpolated[['CSI_ghi','time_gap_hours',
       'time_gap_norm',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']]


df_phase2 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h']]

df_phase3 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc']]

df_phase4 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc', '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover']]

df_phase5 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc','80_cloud_cover',  '56_cloud_cover',
        '20_cloud_cover', '88_cloud_cover']]

df_phase6 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
       'time_gap_norm', 'day_boundary_flag', 'hour_progression',
       'absolute_hour',
       'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       'nam_ghi', 'nam_cc', '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover', 'AVG(R)',
       'STD(R)', 'ENT(R)', 'AVG(G)', 'STD(G)', 'ENT(G)', 'AVG(B)', 'STD(B)',
       'ENT(B)', 'AVG(RB)', 'STD(RB)', 'ENT(RB)', 'AVG(NRB)', 'STD(NRB)',
       'ENT(NRB)'
      ]]


In [17]:

# 2. Rolling Origin Split (More Suitable in TimeSeries) like K-Folds
from src.data_preparation import fixed_holdout_split, rolling_origin_evaluation,save_splits_info

rollingSplits_df_1h = rolling_origin_evaluation(df=df, start_train = '2014-01-2',
    end_train = '2016-12-31')
save_splits_info({}, rollingSplits_df_1h, experiment_name="exp-001")



ROLLING ORIGIN EVALUATION
Total folds: 35
Frequency: MS
Data range: 2014-01-03 to 2016-12-30

Fold Summary:
  Fold 1: Train [2014-01-03 to 2014-02-01] (297 records) → Val [2014-02-02 to 2014-02-28] (186 records)
  Fold 2: Train [2014-01-03 to 2014-03-01] (494 records) → Val [2014-03-02 to 2014-03-31] (319 records)
  Fold 3: Train [2014-01-03 to 2014-04-01] (835 records) → Val [2014-04-02 to 2014-04-30] (308 records)
  Fold 4: Train [2014-01-03 to 2014-05-01] (1,165 records) → Val [2014-05-02 to 2014-05-31] (319 records)
  Fold 5: Train [2014-01-03 to 2014-06-01] (1,506 records) → Val [2014-06-02 to 2014-06-30] (308 records)
  Fold 6: Train [2014-01-03 to 2014-07-01] (1,836 records) → Val [2014-07-02 to 2014-07-31] (319 records)
  Fold 7: Train [2014-01-03 to 2014-08-01] (2,177 records) → Val [2014-08-02 to 2014-08-31] (319 records)
  Fold 8: Train [2014-01-03 to 2014-09-01] (2,518 records) → Val [2014-09-02 to 2014-09-30] (308 records)
  Fold 9: Train [2014-01-03 to 2014-10-01] (2,848

----

In [18]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p1_ReLU_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "ReLU"

    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False,

}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp01_results, summary = pipeline.run()

INFO:src.utils:Saved arrays to data/phas1_data_*.npy



--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:X shape: (1055, 7, 11, 7), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 7), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-001/exp-001rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
INFO:src.pipeline:Train samples: 21, Val samples: 18
INF

----

In [19]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p1_Tanh_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False,

}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})


pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp02_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 7), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 7), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-001/exp-001rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
INF

In [20]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p1_Tanh_scaleTarg_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True,

}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})


pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp03_results, summary = pipeline.run()

INFO:src.utils:Saved arrays to data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 7), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 7), Y shape: (1055, 1, 11)



--- Step 2: Building model arrays (X, Y) ---


INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-001/exp-001rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
INFO:src.pipeline:Train samples: 21, Val samples: 18
INFO:src.pipeline:X_train scaled range: [-1.000, 1.000]
INFO:src.pipeline:Y_train scaled range: [-1.000, 1.000]
INFO:src.pipeline:Model parameters: 179,851
INFO:src.engine:Using HuberLoss (Smooth L1 Loss)
INFO:src.engine:Ep

KeyboardInterrupt: 

-----


-----

### Exp 04 Using Phase 2 Datafeatures 


In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase2.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase2, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph2_X, ph2_Y, ph2_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph2_X, ph2_Y,
    pd.DataFrame(index=pd.to_datetime(ph2_labels_list, utc=True)),
    filename_prefix='phas2_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p2_exp01_Essn_BLV",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas2_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph1_exp02_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas2_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 47), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas2_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 47), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

#### Exp 05: Phase 3

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase3.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase3, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph3_X, ph3_Y, ph3_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph3_X, ph3_Y,
    pd.DataFrame(index=pd.to_datetime(ph3_labels_list, utc=True)),
    filename_prefix='phas3_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p3_exp01_Essn_BLV",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas3_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph3_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas3_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 49), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas3_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 49), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase4.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase4, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph4_X, ph4_Y, ph4_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph4_X, ph4_Y,
    pd.DataFrame(index=pd.to_datetime(ph4_labels_list, utc=True)),
    filename_prefix='phas4_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p4_exp01_Essn_BLV_5NAM_CCANDGHI",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas4_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

pipeline = SolarForecastingPipeline(LSTM_CONFIG)
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
pipeline.reference_df = fixed_df  # <- full fixed-grid dataframe (with DateTimeIndex)
ph4_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas4_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 57), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas4_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 57), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase5.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase5, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph5_X, ph5_Y, ph5_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph5_X, ph5_Y,
    pd.DataFrame(index=pd.to_datetime(ph5_labels_list, utc=True)),
    filename_prefix='phas5_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p5_exp01_Essn_BLV_5NAM_CC",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas5_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph5_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas5_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 53), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas5_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 53), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_Atten_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": True,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_Atten_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_UniLSTM_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": False,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
pipeline.reference_df = fixed_df  # <- full fixed-grid dataframe (with DateTimeIndex)ph6_UNI_exp03_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)


----

In [None]:
df_phase6.columns

Index(['solar_zenith', 'CSI_ghi', 'time_gap_hours', 'time_gap_norm',
       'day_boundary_flag', 'hour_progression', 'absolute_hour', 'season_flag',
       'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'B_CSI_ghi_8h',
       'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h', 'V_CSI_ghi_9h',
       'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h', 'L_CSI_ghi_10h',
       'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h', 'B_CSI_ghi_12h',
       'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h', 'V_CSI_ghi_13h',
       'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h', 'L_CSI_ghi_14h',
       'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h', 'B_CSI_ghi_16h',
       'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h', 'V_CSI_ghi_17h',
       'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h', 'L_CSI_ghi_18h',
       'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h', 'nam_ghi', 'nam_cc',
       '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover', '20_dwsw',
       '20_cloud_cover', '88

In [None]:

from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p6_exp01_Essn_BLV_5NAM_CCANDGHI_images_NAIVE_NAM_COMP",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}


# If your column names differ, set them in the config:

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()




--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

In [None]:

from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p6_exp02_arch128n_4l_Essn_BLV_5NAM_CCANDGHI_images_NAIVE_NAM_COMP",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 128,
        "num_layers": 4,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}


# If your column names differ, set them in the config:

LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp02_results, summary = pipeline.run()




--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

----

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p6_LeakyReLU_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "LeakyReLU"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": True
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp03_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

---

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p6_NoTargScale_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 72), Y shape: (1055, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===
I

In [None]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 14
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase6.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase6, timestamp_col="measurement_time", expected_T=None)  # or set T
    


# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph6_X, ph6_Y, ph6_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph6_X, ph6_Y,
    pd.DataFrame(index=pd.to_datetime(ph6_labels_list, utc=True)),
    filename_prefix='phas6_data',  
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 14,
        "horizon_days": 1,
    }
)

# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "1hLag_BiLSTM_p6_NoTargScale_14Horizon_exp01_Essn_BLV_5NAM_CCANDGHI_images",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
        "use_attention": False,
        # "steps_per_day": 11,
        "activation_function": "Tanh"

    },
    "data_prefix": "phas6_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-001/exp-001rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber", 
    "early_stopping_patience": 20,
    "max_folds": 35,
    "scale_target": False
}
LSTM_CONFIG.update({
    "truth_csi_col": "actual_csi",
    "truth_ghi_col": None,     # or None to reconstruct as CSI * clear_sky_ghi
    "ghi_cs_col":    "clear_sky_ghi",
    "nam_ghi_col":   "nam_ghi",
    "nam_csi_col":   None,        # optional; benchmark will derive if missing
})



pipeline = SolarForecastingPipeline(LSTM_CONFIG)
ph6_exp01_results, summary = pipeline.run()


--- Step 2: Building model arrays (X, Y) ---


INFO:src.utils:Saved arrays to data/phas6_data_*.npy
INFO:src.utils:X shape: (1048, 14, 11, 72), Y shape: (1048, 1, 11)
INFO:src.pipeline:Loading data...
INFO:src.utils:Loaded arrays from data/phas6_data_*.npy
INFO:src.utils:X shape: (1048, 14, 11, 72), Y shape: (1048, 1, 11)
INFO:src.pipeline:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:src.pipeline:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json
INFO:src.pipeline:
=== Running Fold 1 ===

----

## 3 Hours Back