In [1]:
# Loading Data
from src.data_preparation import load_data

PROCESSED_DATA_PATH = "data/processed/df_1h_lag_BLV_spatial_images.csv"
df = load_data(PROCESSED_DATA_PATH, date_col="measurement_time")

df.drop(columns=["Unnamed: 0", "timestamp"], inplace=True)



df_interpolated = df.interpolate(method='linear')

df_interpolated.fillna(0, inplace=True)

features_to_check = ['ghi', 'dni', 'solar_zenith', 'GHI_cs', 'DNI_cs', 'CSI_ghi', 
                                   'CSI_dni',
                                   'nam_ghi', 'nam_dni', 'nam_cc', 'B_CSI_ghi_8h', 'V_CSI_ghi_8h', 'L_CSI_ghi_8h', 'B_CSI_ghi_9h',
       'V_CSI_ghi_9h', 'L_CSI_ghi_9h', 'B_CSI_ghi_10h', 'V_CSI_ghi_10h',
       'L_CSI_ghi_10h', 'B_CSI_ghi_11h', 'V_CSI_ghi_11h', 'L_CSI_ghi_11h',
       'B_CSI_ghi_12h', 'V_CSI_ghi_12h', 'L_CSI_ghi_12h', 'B_CSI_ghi_13h',
       'V_CSI_ghi_13h', 'L_CSI_ghi_13h', 'B_CSI_ghi_14h', 'V_CSI_ghi_14h',
       'L_CSI_ghi_14h', 'B_CSI_ghi_15h', 'V_CSI_ghi_15h', 'L_CSI_ghi_15h',
       'B_CSI_ghi_16h', 'V_CSI_ghi_16h', 'L_CSI_ghi_16h', 'B_CSI_ghi_17h',
       'V_CSI_ghi_17h', 'L_CSI_ghi_17h', 'B_CSI_ghi_18h', 'V_CSI_ghi_18h',
       'L_CSI_ghi_18h', 'B_CSI_ghi_19h', 'V_CSI_ghi_19h', 'L_CSI_ghi_19h',
       '80_dwsw', '80_cloud_cover', '56_dwsw', '56_cloud_cover',
       '20_dwsw', '20_cloud_cover', '88_dwsw', '88_cloud_cover', 'AVG(R)',
       'STD(R)', 'ENT(R)', 'AVG(G)', 'STD(G)', 'ENT(G)', 'AVG(B)', 'STD(B)',
       'ENT(B)', 'AVG(RB)', 'STD(RB)', 'ENT(RB)', 'AVG(NRB)', 'STD(NRB)',
       'ENT(NRB)']

# 2. Calculate the number of NaN values for each feature
# We use .isna() instead of == 0
nans_per_feature = df_interpolated[features_to_check].isna().sum()

# 3. Filter to get only features that actually have NaN values
features_with_nans = nans_per_feature[nans_per_feature > 0]

# 4. Report the findings for which features have NaNs
if features_with_nans.empty:
    print("No NaN (missing) values found in any of the specified features.")
else:
    print("--- Features With NaN Values ---")
    print("The following features have NaN values, with the total count for each:")
    # Sort for clearer output
    print(features_with_nans.sort_values(ascending=False))
    print("\n" + "="*40 + "\n")

    # 5. Analyze the distribution of hours for rows containing NaNs
    print("--- Distribution of NaN-Value Records by Hour ---")
    
    # Create a boolean mask for rows that contain *at least one* NaN
    # in the specified columns
    rows_with_any_nan = df_interpolated[features_to_check].isna().any(axis=1)
    
    if rows_with_any_nan.sum() > 0:
        # Get the index for these rows
        nan_rows_index = df_interpolated.index[rows_with_any_nan]
        
        # Extract the hour from the DatetimeIndex and get the value counts
        hour_distribution = nan_rows_index.hour.value_counts().sort_index()
        
        print("Distribution of records (rows) containing at least one NaN, by hour:")
        print(hour_distribution)
        
        # Optional: Print total number of affected rows
        print(f"\nTotal number of rows with at least one NaN: {rows_with_any_nan.sum()}")
    else:
        # This case shouldn't be hit if features_with_nans was not empty,
        # but it's good practice to include.
        print("No rows found with NaN values (this is unexpected, check logic).")



Loaded 11,560 records
Date range: 2014-01-03 14:00:00+00:00 to 2016-12-30 23:00:00+00:00
Timezone: UTC
No NaN (missing) values found in any of the specified features.


  df_interpolated = df.interpolate(method='linear')


In [2]:
import torch
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)

device

'cuda'

In [3]:
df_phase1 = df_interpolated[['solar_zenith', 'CSI_ghi','time_gap_hours',
        'time_gap_norm', 'day_boundary_flag', 'hour_progression',
        'absolute_hour',
        'season_flag', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']]






In [4]:
from src.preprocessing import build_model_arrays, to_fixedgrid_multiindex
import pandas as pd
from src.utils import DataManager
from src.pipeline import SolarForecastingPipeline

# --- Block 1: Define variables ---

TARGET_COL = "CSI_ghi"  # <-- FIX 1: Define Target FIRST
history_days = 7
horizon_days = 1

# This list now correctly excludes the target
feature_cols = [c for c in df_phase1.columns.tolist() if c != TARGET_COL]

# --- Block 2: Build model arrays (Your code was correct here) ---


print("\n--- Step 2: Building model arrays (X, Y) ---")
fixed_df = to_fixedgrid_multiindex(df_phase1, timestamp_col="measurement_time", expected_T=None)  # or set T
    






--- Step 2: Building model arrays (X, Y) ---


In [5]:
fixed_df.isna().sum()

solar_zenith         0
CSI_ghi              0
time_gap_hours       0
time_gap_norm        0
day_boundary_flag    0
hour_progression     0
absolute_hour        0
season_flag          0
hour_sin             0
hour_cos             0
month_sin            0
month_cos            0
dtype: int64

In [6]:
TARGET_COL , feature_cols

('CSI_ghi',
 ['solar_zenith',
  'time_gap_hours',
  'time_gap_norm',
  'day_boundary_flag',
  'hour_progression',
  'absolute_hour',
  'season_flag',
  'hour_sin',
  'hour_cos',
  'month_sin',
  'month_cos'])

In [7]:
fixed_df.columns

Index(['solar_zenith', 'CSI_ghi', 'time_gap_hours', 'time_gap_norm',
       'day_boundary_flag', 'hour_progression', 'absolute_hour', 'season_flag',
       'hour_sin', 'hour_cos', 'month_sin', 'month_cos'],
      dtype='object')

In [8]:
fixed_df.isna().sum()

solar_zenith         0
CSI_ghi              0
time_gap_hours       0
time_gap_norm        0
day_boundary_flag    0
hour_progression     0
absolute_hour        0
season_flag          0
hour_sin             0
hour_cos             0
month_sin            0
month_cos            0
dtype: int64

In [9]:
import numpy as np
# _tensor_from_norm NaN Issue
norm_df = fixed_df.copy()
norm_df = norm_df.sort_index()

dates = list(norm_df.index.get_level_values("date").unique())
K = int(norm_df.index.get_level_values("bin_id").max()) + 1
# the number of features, the third dim.
F = len(feature_cols)

# (num_days, K_bins, F). Filled with (Not a Number)
X = np.full((len(dates), K, F), np.nan, dtype=float)


In [10]:
# builds the tensor one feature at a time.
for j, col in enumerate(feature_cols):
    if col not in norm_df.columns:
        continue

    # (date, bin_id) -> value; drop target_time by taking first. 2D matrix (mat) where rows are dates and columns are bin IDs.
    mat = (
            norm_df[col]                                                                # 1. Select Series
            .groupby(level=["date", "bin_id"])                                          # 2. Group by (date, bin)
            .first()                         # one value per (date, bin_id)             # 3. Collapse time level
            .unstack("bin_id")               # rows: date, cols: bin_id                 # 4. Pivot bins to columns
            .reindex(index=dates, columns=range(K))                                     # 5. Enforce shape
        )

X[:, :, j] = mat.values # takes the 2D matrix of data for the current feature and slots it perfectly into its designated "slice" of the final 3D tensor X.


In [11]:
np.isnan(X).any(), np.isnan(X).sum()

(np.True_, np.int64(116942))

In [12]:
print("--- 1. Total NaN Analysis ---")
total_nans = np.isnan(X).sum()
total_elements = X.size
percent_nan = (total_nans / total_elements) * 100
print(f"Total elements in X: {total_elements}")
print(f"Total NaN values:    {total_nans}")
print(f"Overall NaN percentage: {percent_nan:.2f}%\n")


print("--- 2. NaN Distribution per Bin ---")
# Sum NaNs along the 'days' (0) and 'features' (2) axes
nans_per_bin = np.isnan(X).sum(axis=(0, 2))
# Total possible data points for one bin = num_days * num_features
total_per_bin = X.shape[0] * X.shape[2]
percent_per_bin = (nans_per_bin / total_per_bin) * 100

print(f"Total data points per bin: {total_per_bin}")
print("NaN counts and percentage per bin:")
for i in range(K):
    print(f"  Bin {i}: {nans_per_bin[i]:>4} NaNs ({percent_per_bin[i]:6.2f}%)")
print("\n")


print("--- 3. NaN Distribution per Feature ---")
# Sum NaNs along the 'days' (0) and 'bins' (1) axes
nans_per_feature = np.isnan(X).sum(axis=(0, 1))
# Total possible data points for one feature = num_days * num_bins
total_per_feature = X.shape[0] * X.shape[1]
percent_per_feature = (nans_per_feature / total_per_feature) * 100

print(f"Total data points per feature: {total_per_feature}")
print("NaN counts and percentage per feature:")
for j, col in enumerate(feature_cols):
    print(f"  {col:<15}: {nans_per_feature[j]:>4} NaNs ({percent_per_feature[j]:6.2f}%)")
print("\n")


print("--- 4. Detailed 2D Breakdown (NaN Counts per Bin and Feature) ---")
# Sum NaNs only along the 'days' (0) axis
# This gives a 2D matrix of shape (K_bins, F_features)
nan_matrix = np.isnan(X).sum(axis=0)

# Use pandas for a nice printout
nan_summary_df = pd.DataFrame(nan_matrix, columns=feature_cols, dtype=int)
nan_summary_df.index.name = 'Bin ID'
print(nan_summary_df)
print("\n")

--- 1. Total NaN Analysis ---
Total elements in X: 128502
Total NaN values:    116942
Overall NaN percentage: 91.00%

--- 2. NaN Distribution per Bin ---
Total data points per bin: 11682
NaN counts and percentage per bin:
  Bin 0: 10620 NaNs ( 90.91%)
  Bin 1: 10631 NaNs ( 91.00%)
  Bin 2: 10631 NaNs ( 91.00%)
  Bin 3: 10631 NaNs ( 91.00%)
  Bin 4: 10631 NaNs ( 91.00%)
  Bin 5: 10631 NaNs ( 91.00%)
  Bin 6: 10631 NaNs ( 91.00%)
  Bin 7: 10631 NaNs ( 91.00%)
  Bin 8: 10631 NaNs ( 91.00%)
  Bin 9: 10631 NaNs ( 91.00%)
  Bin 10: 10643 NaNs ( 91.11%)


--- 3. NaN Distribution per Feature ---
Total data points per feature: 11682
NaN counts and percentage per feature:
  solar_zenith   : 11682 NaNs (100.00%)
  time_gap_hours : 11682 NaNs (100.00%)
  time_gap_norm  : 11682 NaNs (100.00%)
  day_boundary_flag: 11682 NaNs (100.00%)
  hour_progression: 11682 NaNs (100.00%)
  absolute_hour  : 11682 NaNs (100.00%)
  season_flag    : 11682 NaNs (100.00%)
  hour_sin       : 11682 NaNs (100.00%)
  hour

In summary:
- 1039 days (1062 - 23) have complete data (all 11 bins).
- 12 days (23 - 11) are missing only the last bin (Bin 10).
- 11 days are missing all bins except the first one (Bins 1-10).

In [13]:
X_imputed_zero = np.nan_to_num(X, nan=0.0)

In [14]:
np.isnan(X_imputed_zero).any(), np.isnan(X_imputed_zero).sum()

(np.False_, np.int64(0))

In [15]:
from src.preprocessing import _tensor_from_norm
norm_df = fixed_df.copy()
# Once for all the input features (feature_cols) to create a 3D tensor X_all with the shape (num_days, K, F).
X_all, dates = _tensor_from_norm(norm_df, feature_cols=feature_cols)
# Once for just the single target variable (target_col) to create a tensor y_all.
y_all, _ = _tensor_from_norm(norm_df, feature_cols=[TARGET_COL])
# squeezes out the last dimension, making y_all a more convenient 2D matrix of shape (num_days, K).
y_all = y_all[:,:,0]

In [16]:
np.isnan(X_all).any(), np.isnan(y_all).any()

(np.False_, np.False_)

In [17]:
# You correctly passed the filtered 'feature_cols' here, fixing the data leak!
ph1_X, ph1_Y, ph1_labels_list = build_model_arrays(
        fixed_df,
        feature_cols=feature_cols,  
        target_col=TARGET_COL,
        history_days=history_days,
        horizon_days=horizon_days,
    )

In [18]:
np.isnan(ph1_X).any()

np.False_

In [19]:
# --- Block 3: Save arrays ---
data_manager = DataManager()
data_manager.save_arrays(
    ph1_X, ph1_Y,
    pd.DataFrame(index=pd.to_datetime(ph1_labels_list, utc=True)),
    filename_prefix='phas1_data',  # <-- You are saving as 'phas2_data'
    feature_cols=feature_cols,
    target_col=TARGET_COL,
    metadata={
        "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
        "timestamp_col": "measurement_time",
        "feature_set": feature_cols,
        "history_days": 7,
        "horizon_days": 1,
    }
)

INFO:src.utils:Saved arrays to data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 11), Y shape: (1055, 1, 11)


In [20]:
# --- Block 4: Configure and Run ---
LSTM_CONFIG = {
    "experiment_name": "UniLSTM_p1_exp01_essentialF",
    "model_type": "LSTM",
    "model_config": {
        "hidden_size": 64,
        "num_layers": 2,
        "dropout": 0.35,
        "bidirectional": True,
    },
    "data_prefix": "phas1_data",  # <-- FIX 2: Match the filename_prefix
    "splits_file": "exp-004/exp-004rolling_origin_splits.json",
    "feature_cols": feature_cols,
    "feature_selection": feature_cols, # <-- This is correct!
    "target_col": TARGET_COL,
    "batch_size": 32,
    "num_epochs": 50,
    "learning_rate": 0.001,
    "loss_function": "Huber",  # <-- This setting needs the fix below
    "early_stopping_patience": 20,
    "max_folds": 35,
}



----


#### Function Testing

In [21]:
pipeline = SolarForecastingPipeline(LSTM_CONFIG)

In [22]:
import logging
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Save configuration LSTM_CONFIG, and initilize the experiment directories.
pipeline.tracker.save_config(pipeline.config)

logger.info("Loading data...")

# Load Packed X, Y, Labels, and metadata
"""
{
  "X_shape": [
    1055,
    7,
    11,
    3
  ],
  "Y_shape": [
    1055,
    1,
    11
  ],
  "X_dtype": "float64",
  "Y_dtype": "float64",
  "saved_at": "2025-11-07T15:53:45.042544",
  "has_labels": true,
  "feature_cols": [
    "solar_zenith",
    "absolute_hour",
    "season_flag"
  ],
  "target_col": "CSI_ghi",
  "input_csv": "data/processed/df_1h_lag_BLV_spatial_images.csv",
  "timestamp_col": "measurement_time",
  "feature_set": [
    "solar_zenith",
    "absolute_hour",
    "season_flag"
  ],
  "history_days": 7,
  "horizon_days": 1
}
"""
X, Y, labels, metadata = pipeline.data_manager.load_arrays(
            filename_prefix=pipeline.config.get('data_prefix')
        )





INFO:__main__:Loading data...
INFO:src.utils:Loaded arrays from data/phas1_data_*.npy
INFO:src.utils:X shape: (1055, 7, 11, 11), Y shape: (1055, 1, 11)


In [23]:
import numpy as np
np.isnan(X).any()

np.False_

In [24]:
# Selected Features based in the data.
saved_features = metadata.get("feature_cols")
# Selected Features passed from User.
expected_features = pipeline.config.get("feature_cols")

if expected_features is not None:
    if saved_features is not None and list(expected_features) != list(saved_features):
        raise ValueError(
                    "Feature mismatch between configuration and saved arrays. "
                    f"Config expects {expected_features}, arrays contain {saved_features}."
            )
else:
    pipeline.config["feature_cols"] = saved_features

labels_index = labels.index if labels is not None else None
if labels_index is None:
    logger.warning(
                "Labels dataframe not available; NAM comparison metrics will be skipped."
            )
    pipeline.reference_df = None
else:
    # Loading Raw data that uesed to cook the X, Y, Labels Tensors.
    from src.evaluation_utils import _load_processed_dataframe
    csv_path = metadata.get("input_csv", PROCESSED_DATA_PATH) 
    base_df = _load_processed_dataframe(csv_path)
    try:
        from src.evaluation_utils import build_reference_from_existing
        logger.info("Building simple reference from existing columns (no pvlib, no regridding)...")
        # Use the SAME merged dataframe you used to build arrays (before windowing)
        # Suppose it's called `merged_df` or `base_df` in your pipeline
        pipeline.reference_df = build_reference_from_existing(
                    base_df,                         # <-- your processed/merged modeling df
                    time_col="measurement_time",
                    nam_time_col="nam_target_time",
                    meas_ghi_col="ghi",
                    nam_ghi_col="nam_ghi",
                    cs_ghi_col="GHI_cs",
                    actual_csi_col="CSI_ghi",
                )
        logger.info("Reference dataframe prepared (%d rows).", len(pipeline.reference_df))
    except Exception as exc:
            logger.warning("Failed to construct simple reference: %s", exc)
            pipeline.reference_df = None


# Load splits
splits_data = pipeline.data_manager.load_rolling_splits(
            pipeline.config.get('splits_file', 'rolling_origin_splits.json')
        )

# Run each fold
fold_results = []
max_folds = pipeline.config.get('max_folds', len(splits_data['folds']))

INFO:__main__:Building simple reference from existing columns (no pvlib, no regridding)...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ref["nam_csi"].replace([np.inf, -np.inf], np.nan, inplace=True)
INFO:__main__:Reference dataframe prepared (11560 rows).
INFO:src.utils:Loaded 35 folds from exp-004/exp-004rolling_origin_splits.json


In [25]:
import numpy as np
fold_1_data = splits_data['folds'][0]
fold_id = fold_1_data["fold_id"]

if labels is not None:
    train_idx, val_idx = pipeline.data_manager.get_fold_indices(
            X, labels, fold_1_data
    )

np.isnan(X).any()

np.False_

In [26]:
# Run fold
fold_result = pipeline.run_fold(
                X,
                Y,
                train_idx,
                val_idx,
                fold_id,
                labels_index=labels_index,
                reference_df=pipeline.reference_df,
            )
fold_results.append(fold_result)

INFO:src.pipeline:
=== Running Fold 1 ===
INFO:src.pipeline:Train samples: 21, Val samples: 18
INFO:src.pipeline:Model parameters: 182,667
INFO:src.engine:Using HuberLoss (Smooth L1 Loss)
INFO:src.engine:Epoch [10/50] - Train Loss: 0.270908, Val Loss: 0.256904, Train MAE: 0.626994, Val MAE: 0.580839, LR: 0.001000
INFO:src.engine:Epoch [20/50] - Train Loss: 0.110409, Val Loss: 0.138143, Train MAE: 0.371620, Val MAE: 0.432294, LR: 0.001000
INFO:src.engine:Epoch [30/50] - Train Loss: 0.078178, Val Loss: 0.128315, Train MAE: 0.324863, Val MAE: 0.419106, LR: 0.001000
INFO:src.engine:Epoch [40/50] - Train Loss: 0.066869, Val Loss: 0.126452, Train MAE: 0.270943, Val MAE: 0.415075, LR: 0.000500
INFO:src.engine:Early stopping at epoch 45
INFO:src.pipeline:Fold 1 - Validation Metrics:
INFO:src.pipeline:  MAE: 0.421724
INFO:src.pipeline:  RMSE: 0.510666
INFO:src.pipeline:  MAPE: 3708741.250000
INFO:src.pipeline:  per_horizon_MAE: [0.421724]


---

In [27]:
# This will now work after you apply the fixes below
# pipeline = SolarForecastingPipeline(LSTM_CONFIG)
# _, summary = pipeline.run()