In [10]:
import pandas as pd
from dataset_postprocessors import CustomMaskPostprocessor, HeartRateInterpolationPostprocessor

# Load the test split and the standardization parameters from the shared mount.
df = pd.read_parquet('/mnt/shared/mhc_dataset_out/splits/test_dataset.parquet')
standardization_df = pd.read_csv("/mnt/shared/mhc_dataset_out/standardization_params.csv")

# Turn each stored 'file_uris' entry back into a Python object.
# NOTE(review): eval() executes whatever expression the parquet file contains.
# If the column only holds stringified literals (presumably lists of paths --
# TODO confirm), ast.literal_eval would be the safer parser.
df["file_uris"] = df["file_uris"].apply(eval)

# Keep only the first 50 rows for this conversion run.
dataset_df = df.head(50)

# Map each index label of the first six rows to its (mean, std_dev) pair.
scaler_stats = {
    f_idx: (row["mean"], row["std_dev"])
    for f_idx, row in standardization_df.iloc[:6].iterrows()
}

# Both postprocessors are built with the same heart-rate index / feature count.
postprocessor_kwargs = dict(heart_rate_original_index=5, expected_raw_features=6)
p0 = CustomMaskPostprocessor(**postprocessor_kwargs)
p1 = HeartRateInterpolationPostprocessor(**postprocessor_kwargs)

In [14]:
import sys
import os
import logging
import datasets

# Put the notebook's parent directory on sys.path so that the `src` package
# (src.torch_dataset, src.huggingface_dataset) can be imported below.
# Adjust the relative path if the notebook is moved to a different depth.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.torch_dataset import FlattenedMhcDataset
from src.huggingface_dataset import create_and_save_hf_dataset_as_gluonTS_style

# --- Configuration ---

# Logging is configured at ERROR level, so (assuming basicConfig takes effect
# in this kernel) the logger.info(...) progress messages below are not shown;
# lower the level to see them.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - [%(name)s] %(message)s',
    level=logging.ERROR,
)
logger = logging.getLogger(__name__)

# Root directory handed to FlattenedMhcDataset; the files referenced by
# 'file_uris' are expected to live under it.
root_dir = '/mnt/shared/mhc_dataset/'

# Destination of the saved Hugging Face dataset and the cache directory used
# while building it.
output_hf_dataset_path = "/mnt/nvme/mhc_tmp"  # test_hf
cache_dir = "/mnt/nvme/hf_cache_tmp_1"

# Feature indices to keep (None would mean "use all features");
# e.g. [0, 1, 5] selects a subset.
selected_features = [*range(6)]

# Whether the torch dataset should also return the mask
# (assumes the mask exists in the source files).
include_mask = True

# Standardization parameters from the earlier cell; currently not passed to
# the dataset (the corresponding argument is commented out further down).
feature_stats = scaler_stats

# --- Dataset Creation ---
logger.info("Starting Hugging Face dataset conversion process...")

# 1. Build the FlattenedMhcDataset from the dataframe prepared in the earlier cell
logger.info(f"Loading FlattenedMhcDataset with root directory: {root_dir}")
try:
    torch_dataset = FlattenedMhcDataset(
        dataframe=dataset_df,
        root_dir=root_dir,
        include_mask=include_mask,
        feature_indices=selected_features,
        # Caching is switched off for this small test-sized dataset.
        use_cache=False,
        # feature_stats=feature_stats  (left disabled)
    )
    logger.info(f"Successfully instantiated FlattenedMhcDataset with {len(torch_dataset)} samples.")

    # Peek at the first sample, or warn when there is nothing to peek at.
    if len(torch_dataset) == 0:
        logger.warning("The created torch_dataset is empty!")
    else:
        sample0 = torch_dataset[0]
        logger.info(f"Sample 0 data shape: {sample0['data'].shape}")
        if include_mask:
            logger.info(f"Sample 0 mask shape: {sample0['mask'].shape}")

except Exception as e:
    # Log with traceback, then propagate so the notebook stops here instead of
    # continuing without a usable torch_dataset.
    logger.exception(f"Error instantiating FlattenedMhcDataset: {e}")
    raise

# 2. Create and Save the Hugging Face Dataset
#
# Converts `torch_dataset` into a GluonTS-style Hugging Face dataset at
# `output_hf_dataset_path`, then reloads it from disk as `reloaded_dataset`
# and prints a short summary. Any failure is logged with its traceback and
# re-raised, so later cells never run against a missing or stale
# `reloaded_dataset` (same policy as the dataset-instantiation step above).
logger.info(f"Attempting to create and save Hugging Face dataset to: {output_hf_dataset_path}")
try:
    create_and_save_hf_dataset_as_gluonTS_style(
        torch_dataset=torch_dataset,
        save_path=output_hf_dataset_path,
        # Derive the feature count from the selection made in the configuration
        # section instead of repeating the literal 6. When no selection is
        # given, None is passed (the function is expected to infer the count
        # in that case -- TODO confirm against its implementation).
        num_features=len(selected_features) if selected_features else None,
        include_mask_as_dynamic_feature=False,
        # Optional arguments (uncomment/adjust if needed):
        cache_dir=cache_dir,
        # num_proc=8,
        keep_in_memory=False,
        set_masked_target_to_nan=True,
        input_mask_key='mask'
    )
    logger.info("Successfully created and saved Hugging Face dataset.")

    # 3. Verification (Optional): reload what was just written and show it
    logger.info(f"Verifying saved dataset at {output_hf_dataset_path}...")
    reloaded_dataset = datasets.load_from_disk(output_hf_dataset_path)
    print("\nReloaded Hugging Face Dataset Info:")
    print(reloaded_dataset)
    if len(reloaded_dataset) > 0:
        print("\nFirst sample from reloaded dataset:")
        print(reloaded_dataset[0])
    else:
        print("\nReloaded dataset is empty.")

except FileNotFoundError as e:
    logger.error(f"FileNotFoundError during dataset creation/saving: {e}. "
                 f"Check if the 'root_dir' ('{root_dir}') is correct and "
                 f"if the '.npy' files listed in 'dataset_df' exist at the expected locations.", exc_info=True)
    raise  # Stop the notebook: downstream cells need a freshly reloaded dataset
except ValueError as e:
    logger.error(f"ValueError during dataset creation/saving: {e}. "
                 f"This might be due to issues inferring 'num_features' or problems with the data shapes.", exc_info=True)
    raise  # Stop the notebook: downstream cells need a freshly reloaded dataset
except Exception as e:
    logger.error(f"An unexpected error occurred during Hugging Face dataset creation/saving: {e}", exc_info=True)
    raise  # Stop the notebook: downstream cells need a freshly reloaded dataset



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 50 examples [00:01, 33.86 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 1316.19 examples/s]


Reloaded Hugging Face Dataset Info:
Dataset({
    features: ['target', 'start', 'freq', 'item_id', 'feat_static_real'],
    num_rows: 50
})

First sample from reloaded dataset:
{'target': [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0,




In [9]:
# Row count of the subset passed to FlattenedMhcDataset (built with df.head(50)).
len(dataset_df)

50

In [25]:
import numpy as np
# Sanity check: every reloaded 'target' must convert to a float64 NumPy array.
# Any row that does not is reported with its index and actual dtype; no output
# means all rows passed.
for i in range(len(reloaded_dataset)):
    dt = np.array(reloaded_dataset[i]["target"]).dtype
    if dt != np.float64:
        # The message names the dtype the condition actually tests for.
        print(f"Non-float64 dtype found at index {i}: {dt}")


In [15]:
import numpy as np
# Largest non-NaN value in the first series of sample 0's 'target'.
np.nanmax(np.array(reloaded_dataset[0]["target"][0]))

np.float64(5.0)

In [4]:
# Echo the directory the Hugging Face dataset was saved to and reloaded from.
output_hf_dataset_path

'/mnt/nvme/mhc_tmp'