In [None]:
import numpy as np
import pandas as pd
import sys
from pathlib import Path
import xarray as xr
import torch
import os

# Append base path.  May need to be modified if the folder structure changes
sys.path.append("../../HelioFM")

from train_spectformer import get_config
from utils.data import build_scalers
from datasets.helio import HelioNetCDFDataset
import eve_dataloader

In [None]:
# Load the experiment configuration from its YAML file.
config_path = "./ds_configs/config_resnet_18.yaml"
config = get_config(config_path)

# Build the scalers from the scaler description stored in the config.
scaler_info = config.data.scalers
scalers = build_scalers(info=scaler_info)

### Preparing Data for Neural Network Training and Testing ###

In this section, we demonstrate how to prepare the **training data** (`X_train`, `Y_train`) for modeling by extracting the relevant AIA and HMI images (`ts`) and the target EVE spectra (`target`) from the dataset. This process includes parsing temporal slices, reshaping them as input tensors, and storing them for efficient reuse during model training.

To prepare **validation data** and **testing data**, a similar approach can be followed. The only required adjustments are:

- Set `phase = "val"` or `"test"` when initializing the dataset
- Set `ds_time_column = "val_time"` or `"test_time"` to ensure the correct timestamp alignment

Additionally, the dataset uses a default **time matching tolerance** of `"6m"`:

```python
ds_time_tolerance = "6m"
```

In [None]:
# See eve_dataloader.py for the full list of dataloader parameters and return values.

# Arguments required by the parent HelioNetCDFDataset class.
parent_kwargs = dict(
    index_path=config.data.train_data_path,
    time_delta_input_minutes=config.data.time_delta_input_minutes,
    time_delta_target_minutes=config.data.time_delta_target_minutes,
    n_input_timestamps=config.data.n_input_timestamps,
    rollout_steps=config.rollout_steps,
    channels=config.data.channels,
    drop_hmi_probablity=config.drop_hmi_probablity,
    num_mask_aia_channels=config.num_mask_aia_channels,
    use_latitude_in_learned_flow=config.use_latitude_in_learned_flow,
    scalers=scalers,
    phase="train",
)

# Downstream (DS) specific arguments; add your own below.
ds_kwargs = dict(
    ds_eve_index_path="../../hfmds/data/AIA_EVE_dataset_combined.nc",
    ds_time_column="train_time",
    ds_time_tolerance="6m",
    ds_match_direction="forward",
)

train_dataset = eve_dataloader.EVEDSDataset(**parent_kwargs, **ds_kwargs)

print("Sample Size:", len(train_dataset))

In [None]:
# ===============================
# Configuration
# ===============================

# Per the notebook's notes, X_train is N samples of 13 channels, each a
# 4k x 4k image, so it is saved as a torch tensor rather than as text.
X_TRAIN_FILE = "X_train.pt"

# Per the notebook's notes, Y_train is N spectra of 1343 wavelength bins.
Y_TRAIN_FILE = "Y_train.csv"


# ===============================
# Sample extraction helper
# ===============================
def stack_samples(dataset, time_index=0, verbose=True):
    """Collect every sample of ``dataset`` into stacked input/target tensors.

    The same helper can be reused for the validation and test datasets.

    Parameters
    ----------
    dataset
        Indexable dataset supporting ``len()`` whose items are 2-tuples; the
        first element is a mapping with the keys ``'ts'`` and ``'target'``.
    time_index : int, default 0
        Index along axis 1 of ``item['ts']`` that is kept for each sample.
    verbose : bool, default True
        Print one progress line per loaded sample.

    Returns
    -------
    tuple of torch.Tensor
        ``(X, Y)`` where ``X`` stacks the selected ``ts`` slices and ``Y``
        stacks the targets along a new leading sample axis.

    Raises
    ------
    RuntimeError
        If ``dataset`` contains no samples.
    """
    inputs = []   # one ts[:, time_index, :, :] slice per sample
    targets = []  # one target spectrum per sample

    for i in range(len(dataset)):
        item, _ = dataset[i]

        # NOTE(review): the notebook states item['ts'] is (13, 2, 4096, 4096);
        # only the slice at `time_index` on axis 1 is kept.
        ts_single = item['ts'][:, time_index, :, :]
        if isinstance(ts_single, torch.Tensor):
            # Slicing a tensor returns a view that keeps the whole multi-
            # timestamp buffer alive; clone so only the kept slice is retained.
            ts_tensor = ts_single.clone()
        else:
            ts_tensor = torch.tensor(ts_single)
        inputs.append(ts_tensor)

        spectra = item['target']
        spectra_tensor = torch.tensor(spectra) if not isinstance(spectra, torch.Tensor) else spectra
        targets.append(spectra_tensor)

        if verbose:
            print(f"Sample {i} loaded")

    if not inputs:
        # torch.stack fails on an empty list with a less helpful message.
        raise RuntimeError("Dataset is empty: there are no samples to stack.")

    # The per-sample lists go out of scope on return, so only the stacked
    # tensors stay in memory.
    return torch.stack(inputs), torch.stack(targets)


# ===============================
# Build the training tensors
# ===============================
# Expected shapes per the notebook's notes: (N, 13, 4096, 4096) and (N, 1343).
X_train, Y_train = stack_samples(train_dataset)

print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)

# ===============================
# OPTIONAL: Save tensors to disk
# ===============================
# torch.save(X_train, X_TRAIN_FILE)

# df_y = pd.DataFrame(Y_train.cpu().numpy())
# df_y.to_csv(Y_TRAIN_FILE, index=False)
