In [2]:
# | default_exp preprocess.bigdata.dataloader

In [17]:
import gc
import logging
import os
import pickle
import time
from pathlib import Path
from typing import Optional

import h5py
import numpy as np
import pandas as pd
import psutil
import pytorch_lightning as pl
import ray
import torch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

logger = logging.getLogger(__name__)

device = "cuda" if torch.cuda.is_available() else "cpu"
torch._dynamo.config.suppress_errors = True


class HDF5Writer:
    """Manages thread-safe appending to a single HDF5 file."""

    def __init__(self, file_path, input_size, horizon):
        self.file_path = Path(file_path)
        self.input_size = input_size
        self.horizon = horizon
        self.file = None
        self.x_dset = None
        self.y_dset = None
        self.current_idx = 0

    def open(self):
        if self.file is None:
            self.file = h5py.File(self.file_path, "a")
            self.x_dset = self.file.get("x")
            self.y_dset = self.file.get("y")
            if self.x_dset is None:
                self.x_dset = self.file.create_dataset(
                    "x",
                    shape=(0, self.input_size),
                    maxshape=(None, self.input_size),
                    dtype=np.float32,
                    compression="lzf",
                )
                self.y_dset = self.file.create_dataset(
                    "y",
                    shape=(0, self.horizon),
                    maxshape=(None, self.horizon),
                    dtype=np.float32,
                    compression="lzf",
                )
            self.current_idx = self.x_dset.shape[0]

    def append(self, x_windows, y_windows):
        self.open()
        num_windows = len(x_windows)
        self.x_dset.resize(self.current_idx + num_windows, axis=0)
        self.y_dset.resize(self.current_idx + num_windows, axis=0)
        self.x_dset[self.current_idx : self.current_idx + num_windows] = x_windows
        self.y_dset[self.current_idx : self.current_idx + num_windows] = y_windows
        self.current_idx += num_windows
        self.file.flush()
        logger.debug(f"Appended {num_windows} windows to {self.file_path}")

    def close(self):
        if self.file is not None:
            self.file.close()
            self.file = None
            self.x_dset = None
            self.y_dset = None
            gc.collect()


@ray.remote
class HDF5WriterActor:
    def __init__(self, file_path, input_size, horizon):
        self.writer = HDF5Writer(file_path, input_size, horizon)

    def append(self, x_windows, y_windows):
        self.writer.append(x_windows, y_windows)

    def close(self):
        self.writer.close()


class TSPreprocessor:
    def __init__(
        self,
        df,
        input_size,
        horizon,
        target_col="y_scaled",
        train_split=0.7,
        val_split=0.15,
        split_type="horizontal",
        step_size=1,
        cache_dir=".",
        use_cache=True,
        experiment_name="default_experiment",
        num_workers=4,
        chunk_size=2000,
        adaptive_step=False,
        downsample_factor=1,
        large_dataset=True,
        save_by_window_count=False,
        max_windows_per_file=2000,
        batch_size=200,
    ):
        logger.info("Initializing TSPreprocessor")
        self.df = df
        self.target_col = target_col if target_col in df.columns else "y"
        self.input_size = input_size
        self.horizon = horizon
        self.train_split = train_split
        self.val_split = val_split
        self.split_type = split_type
        self.step_size = step_size
        self.cache_dir = Path(cache_dir) / experiment_name
        self.cache_dir.mkdir(exist_ok=True, parents=True)
        self.use_cache = use_cache
        self.num_workers = min(num_workers, os.cpu_count() or 4)
        self.experiment_name = experiment_name
        self.chunk_size = chunk_size
        self.adaptive_step = adaptive_step
        self.downsample_factor = downsample_factor
        self.large_dataset = large_dataset
        self.save_by_window_count = save_by_window_count
        self.max_windows_per_file = max_windows_per_file
        self.batch_size = batch_size

        if not 0 < train_split + val_split <= 1:
            raise ValueError("train_split + val_split must be between 0 and 1")
        if step_size < 1:
            raise ValueError("step_size must be >= 1")
        if downsample_factor < 1:
            raise ValueError("downsample_factor must be >= 1")
        if max_windows_per_file < 1:
            raise ValueError("max_windows_per_file must be >= 1")
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")

        # Initialize Ray
        if not ray.is_initialized():
            ray.init(num_cpus=self.num_workers, ignore_reinit_error=True)

        self.train_windows, self.val_windows, self.test_windows = self._process_data()

        if len(self.train_windows) == 0:
            logger.error(
                "No training windows generated. Check input_size, horizon, or data lengths."
            )
            raise ValueError(
                "Training dataset is empty. Ensure series lengths are sufficient for input_size + horizon."
            )

        del self.df
        gc.collect()

    def _generate_windows_in_memory(self, series):
        series_len = len(series)
        if series_len < self.input_size + self.horizon:
            return []

        max_idx = series_len - self.input_size - self.horizon + 1
        if max_idx <= 0:
            return []

        window_starts = np.arange(0, max_idx, self.step_size, dtype=np.int32)
        window_ends = window_starts + self.input_size
        horizon_ends = window_ends + self.horizon

        valid_windows = horizon_ends <= series_len
        window_starts = window_starts[valid_windows]
        window_ends = window_ends[valid_windows]
        horizon_ends = horizon_ends[valid_windows]

        x_windows = np.lib.stride_tricks.sliding_window_view(series, window_shape=self.input_size)[
            window_starts
        ]
        y_windows = np.stack([series[we:he] for we, he in zip(window_ends, horizon_ends)])

        return list(zip(x_windows, y_windows))

    @staticmethod
    def _generate_windows_optimized(
        series,
        unique_id,
        split,
        writer_actor,
        chunk_size,
        input_size,
        horizon,
        step_size,
        adaptive_step,
    ):
        series_len = len(series)
        if series_len < input_size + horizon:
            logger.debug(
                f"[{unique_id}] Skipping series for {split} split (length={series_len} < {input_size + horizon})"
            )
            return 0

        max_idx = series_len - input_size - horizon + 1
        if max_idx <= 0:
            logger.debug(f"[{unique_id}] No valid windows for {split} split (max_idx={max_idx})")
            return 0

        step_size = step_size
        if adaptive_step:
            step_size = max(1, series_len // 1000)

        chunk_size = chunk_size or max(100, min(1000, series_len // 10))
        window_count = 0

        for start in range(0, max_idx, chunk_size * step_size):
            end = min(start + chunk_size * step_size, max_idx)
            window_starts = np.arange(start, end, step_size, dtype=np.int32)
            window_ends = window_starts + input_size
            horizon_ends = window_ends + horizon

            valid_windows = horizon_ends <= series_len
            window_starts = window_starts[valid_windows]
            window_ends = window_ends[valid_windows]
            horizon_ends = horizon_ends[valid_windows]

            x_windows = np.lib.stride_tricks.sliding_window_view(series, window_shape=input_size)[
                window_starts
            ]
            y_windows = np.stack([series[we:he] for we, he in zip(window_ends, horizon_ends)])

            ray.get(writer_actor.append.remote(x_windows, y_windows))
            window_count += len(x_windows)
            logger.debug(f"[{unique_id}] Appended {len(x_windows)} windows for {split} split")

            del x_windows, y_windows
            gc.collect()

        logger.debug(f"[{unique_id}] Total {window_count} windows appended for {split} split")
        return window_count

    def _process_one_series_in_memory(self, unique_id_group):
        unique_id, group = unique_id_group
        series = group[self.target_col].values.astype(np.float32)

        windows = self._generate_windows_in_memory(series)
        del series
        gc.collect()

        if not windows:
            logger.warning(f"[{unique_id}] Series too short to generate any windows")
            return [], [], [], unique_id

        if self.split_type == "horizontal":
            num_windows = len(windows)
            train_end = int(num_windows * self.train_split)
            val_end = train_end + int(num_windows * self.val_split)
            return (windows[:train_end], windows[train_end:val_end], windows[val_end:], unique_id)
        else:
            return windows, [], [], unique_id

    @staticmethod
    @ray.remote(num_cpus=1)
    def _process_batch_optimized(
        unique_id_groups,
        writer_actors,
        input_size,
        horizon,
        step_size,
        adaptive_step,
        downsample_factor,
        chunk_size,
        split_type,
        train_split,
        val_split,
        target_col,
    ):
        results = []
        for unique_id_group in unique_id_groups:
            unique_id, group = unique_id_group
            series = group[target_col].values.astype(np.float32)

            if len(series) < input_size + horizon:
                logger.debug(
                    f"[{unique_id}] Skipping series (length={len(series)} < {input_size + horizon})"
                )
                continue

            if downsample_factor > 1:
                series = series[::downsample_factor]

            step_size = step_size
            if adaptive_step:
                step_size = max(1, len(series) // 1000)

            if split_type == "horizontal":
                series_len = len(series)
                num_windows = max(0, (series_len - input_size - horizon + 1) // step_size)
                train_end = int(num_windows * train_split)
                val_end = train_end + int(num_windows * val_split)

                result = []
                for split, start, end in [
                    ("train", 0, train_end * step_size),
                    ("val", train_end * step_size, val_end * step_size),
                    ("test", val_end * step_size, series_len),
                ]:
                    window_count = TSPreprocessor._generate_windows_optimized(
                        series[start:end],
                        unique_id,
                        split,
                        writer_actors[split],
                        chunk_size,
                        input_size,
                        horizon,
                        step_size,
                        adaptive_step,
                    )
                    result.append((window_count, unique_id, split))
                results.append(result)
            else:
                window_count = TSPreprocessor._generate_windows_optimized(
                    series,
                    unique_id,
                    "all",
                    writer_actors["all"],
                    chunk_size,
                    input_size,
                    horizon,
                    step_size,
                    adaptive_step,
                )
                result = [(window_count, unique_id, "all")]
                results.append(result)
        return results

    def _process_data(self):
        logger.info("Processing data for all unique_ids")
        cache_file = self.cache_dir / "preprocessed_windows.pkl"

        if self.use_cache and cache_file.exists():
            logger.info("Loading preprocessed windows from cache")
            with open(cache_file, "rb") as f:
                data = pickle.load(f)
            return data["train_windows"], data["val_windows"], data["test_windows"]

        grouped = list(self.df.groupby("unique_id"))
        logger.info(f"Found {len(grouped)} unique IDs")

        # Pre-filter series that are too short
        valid_grouped = []
        lengths = []
        for unique_id, group in grouped:
            series_len = len(group[self.target_col])
            lengths.append(series_len)
            if series_len >= self.input_size + self.horizon:
                valid_grouped.append((unique_id, group))
            else:
                logger.debug(
                    f"[{unique_id}] Excluded: series length {series_len} < {self.input_size + self.horizon}"
                )
        logger.info(f"After filtering, {len(valid_grouped)} valid series remain")
        if lengths:
            lengths = np.array(lengths)
            logger.info(
                f"Series length stats: min={lengths.min()}, max={lengths.max()}, mean={lengths.mean():.2f}, "
                f"median={np.median(lengths):.2f}, too_short={np.sum(lengths < self.input_size + self.horizon)}"
            )

        if not valid_grouped:
            logger.error("No series are long enough to generate windows.")
            raise ValueError("No valid series found for window generation.")

        train_windows, val_windows, test_windows = [], [], []
        train_ids, val_ids, test_ids = [], [], []

        if self.split_type == "vertical":
            logger.info("Applying vertical split")
            unique_ids = [unique_id for unique_id, _ in valid_grouped]
            np.random.shuffle(unique_ids)
            total_series = len(unique_ids)
            train_end = int(total_series * self.train_split)
            val_end = train_end + int(total_series * self.val_split)

            train_ids = unique_ids[:train_end]
            val_ids = unique_ids[train_end:val_end]
            test_ids = unique_ids[val_end:]
            logger.info(
                f"Train IDs: {len(train_ids)}, Val IDs: {len(val_ids)}, Test IDs: {len(test_ids)}"
            )

        if self.large_dataset:
            # Initialize HDF5 writer actors
            writer_actors = {
                "train": HDF5WriterActor.remote(
                    self.cache_dir / "train_windows.h5", self.input_size, self.horizon
                ),
                "val": HDF5WriterActor.remote(
                    self.cache_dir / "val_windows.h5", self.input_size, self.horizon
                ),
                "test": HDF5WriterActor.remote(
                    self.cache_dir / "test_windows.h5", self.input_size, self.horizon
                ),
                "all": HDF5WriterActor.remote(
                    self.cache_dir / "all_windows.h5", self.input_size, self.horizon
                ),
            }

            # Batch unique IDs
            batches = [
                valid_grouped[i : i + self.batch_size]
                for i in range(0, len(valid_grouped), self.batch_size)
            ]
            logger.debug(f"Created {len(batches)} batches")

            # Submit Ray tasks
            futures = [
                TSPreprocessor._process_batch_optimized.remote(
                    batch,
                    writer_actors,
                    self.input_size,
                    self.horizon,
                    self.step_size,
                    self.adaptive_step,
                    self.downsample_factor,
                    self.chunk_size,
                    self.split_type,
                    self.train_split,
                    self.val_split,
                    self.target_col,
                )
                for batch in batches
            ]
            logger.debug(f"Submitted {len(futures)} Ray tasks")

            # Collect results
            pbar = tqdm(total=len(futures), desc="Processing batches", dynamic_ncols=True)
            total_windows = {"train": 0, "val": 0, "test": 0, "all": 0}
            while futures:
                done, futures = ray.wait(futures, num_returns=min(len(futures), 1))
                batch_results = ray.get(done)[0]
                for result in batch_results:
                    for window_count, unique_id, split in result:
                        if window_count > 0:
                            total_windows[split] += window_count
                            if self.split_type == "horizontal":
                                if split == "train":
                                    train_windows.append(
                                        (self.cache_dir / "train_windows.h5", unique_id)
                                    )
                                elif split == "val":
                                    val_windows.append(
                                        (self.cache_dir / "val_windows.h5", unique_id)
                                    )
                                elif split == "test":
                                    test_windows.append(
                                        (self.cache_dir / "test_windows.h5", unique_id)
                                    )
                            else:
                                if unique_id in train_ids:
                                    train_windows.append(
                                        (self.cache_dir / "all_windows.h5", unique_id)
                                    )
                                elif unique_id in val_ids:
                                    val_windows.append(
                                        (self.cache_dir / "all_windows.h5", unique_id)
                                    )
                                elif unique_id in test_ids:
                                    test_windows.append(
                                        (self.cache_dir / "all_windows.h5", unique_id)
                                    )
                pbar.update(len(done))
            pbar.close()

            # Close writer actors
            for actor in writer_actors.values():
                ray.get(actor.close.remote())

            logger.info(
                f"Total windows: Train={total_windows['train']}, Val={total_windows['val']}, Test={total_windows['test']}"
            )

        else:
            from concurrent.futures import ProcessPoolExecutor

            with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
                results = list(
                    tqdm(
                        executor.map(self._process_one_series_in_memory, valid_grouped),
                        total=len(valid_grouped),
                        desc="Processing series",
                    )
                )

            for train_w, val_w, test_w, unique_id in results:
                if self.split_type == "horizontal":
                    train_windows.extend(train_w)
                    val_windows.extend(val_w)
                    test_windows.extend(test_w)
                else:
                    if unique_id in train_ids:
                        train_windows.extend(train_w)
                    elif unique_id in val_ids:
                        val_windows.extend(val_w)
                    elif unique_id in test_ids:
                        test_windows.extend(test_w)

        train_windows = np.array(train_windows, dtype=object)
        val_windows = np.array(val_windows, dtype=object)
        test_windows = np.array(test_windows, dtype=object)

        logger.info(
            f"Train windows: {len(train_windows)}, Val windows: {len(val_windows)}, Test windows: {len(test_windows)}"
        )

        if len(train_windows) == 0:
            logger.error(
                "No training windows generated after processing. Check split_type, train_split, or data."
            )
            raise ValueError("Training dataset is empty. Adjust parameters or verify data.")

        if self.use_cache:
            logger.info("Saving preprocessed windows metadata to cache")
            with open(cache_file, "wb") as f:
                pickle.dump(
                    {
                        "train_windows": train_windows,
                        "val_windows": val_windows,
                        "test_windows": test_windows,
                    },
                    f,
                )

        return train_windows, val_windows, test_windows


class UnivariateTSDataset(Dataset):
    def __init__(self, windows, device: str = None, large_dataset: bool = True):
        logger.info("Initializing UnivariateTSDataset")
        self.windows = windows
        self.device = device
        self.large_dataset = large_dataset
        if large_dataset:
            # Group by HDF5 file to avoid redundant opens
            self.h5_file_map = {}
            for h5_file, _ in windows:
                h5_file = str(h5_file)  # Ensure string path
                if h5_file not in self.h5_file_map:
                    with h5py.File(h5_file, "r") as f:
                        self.h5_file_map[h5_file] = {"size": f["x"].shape[0], "offset": 0}
            # Compute cumulative offsets
            current_offset = 0
            for h5_file in self.h5_file_map:
                self.h5_file_map[h5_file]["offset"] = current_offset
                current_offset += self.h5_file_map[h5_file]["size"]
        else:
            self.x = np.stack([w[0] for w in windows], axis=0).astype(np.float32)
            self.y = np.stack([w[1] for w in windows], axis=0).astype(np.float32)
            if device:
                logger.info("Preloading tensors to device: %s", device)
                self.x_tensor = torch.from_numpy(self.x).to(device)
                self.y_tensor = torch.from_numpy(self.y).to(device)
            else:
                self.x_tensor = self.y_tensor = None

    def __len__(self):
        if self.large_dataset:
            return sum(info["size"] for info in self.h5_file_map.values())
        return len(self.x)

    def __getitem__(self, idx):
        start_time = time.time()
        mem_before = psutil.virtual_memory().used / 1024**3
        if self.large_dataset:
            # Find the HDF5 file and local index
            for h5_file, info in self.h5_file_map.items():
                if idx < info["offset"] + info["size"]:
                    local_idx = idx - info["offset"]
                    with h5py.File(h5_file, "r") as f:
                        x = f["x"][local_idx]
                        y = f["y"][local_idx]
                        x_tensor = torch.from_numpy(x)
                        y_tensor = torch.from_numpy(y)
                        if self.device:
                            x_tensor = x_tensor.to(self.device)
                            y_tensor = y_tensor.to(self.device)
                            if self.device == "cuda":
                                torch.cuda.empty_cache()
                        mem_after = psutil.virtual_memory().used / 1024**3
                        # logger.info(
                        #     f"Loaded batch idx={idx}, time={time.time() - start_time:.2f}s, "
                        #     f"RAM delta={mem_after - mem_before:.2f}GB, "
                        #     f"x.shape={x_tensor.shape}, y.shape={y_tensor.shape}"
                        # )
                        gc.collect()
                        return x_tensor, y_tensor
            raise IndexError("Index out of range")
        else:
            x_tensor = self.x_tensor[idx] if self.device else torch.from_numpy(self.x[idx])
            y_tensor = self.y_tensor[idx] if self.device else torch.from_numpy(self.y[idx])
            if self.device == "cuda":
                torch.cuda.empty_cache()
            mem_after = psutil.virtual_memory().used / 1024**3
            # logger.info(
            #     f"Loaded batch idx={idx}, time={time.time() - start_time:.2f}s, "
            #     f"RAM delta={mem_after - mem_before:.2f}GB, "
            #     f"x.shape={x_tensor.shape}, y.shape={y_tensor.shape}"
            # )
            gc.collect()
            return x_tensor, y_tensor

    def __del__(self):
        if self.large_dataset:
            gc.collect()


class UnivariateTSDataModule(pl.LightningDataModule):
    def __init__(
        self,
        preprocessor: TSPreprocessor,
        batch_size=32,
        num_workers=1,
        pin_memory=False,
        prefetch_factor=1,
        persistent_workers=False,
        gpu_preload=False,
        experiment_name="default_experiment",
    ):
        logger.info("Initializing UnivariateTSDataModule")
        super().__init__()
        self.save_hyperparameters(ignore=["preprocessor"])
        self.preprocessor = preprocessor
        self.batch_size = batch_size
        self.num_workers = min(num_workers, torch.get_num_threads())
        self.pin_memory = pin_memory and not gpu_preload
        self.prefetch_factor = prefetch_factor
        self.persistent_workers = persistent_workers
        self.gpu_preload = gpu_preload
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.experiment_name = experiment_name

    def setup(self, stage: str = None):
        logger.info(f"Setting up datamodule for stage: {stage}")
        if stage in ("fit", None):
            self.train_dataset = UnivariateTSDataset(
                self.preprocessor.train_windows,
                device=self.device if self.gpu_preload else None,
                large_dataset=self.preprocessor.large_dataset,
            )
            self.val_dataset = UnivariateTSDataset(
                self.preprocessor.val_windows,
                device=self.device if self.gpu_preload else None,
                large_dataset=self.preprocessor.large_dataset,
            )
        if stage in ("validate", None):
            self.val_dataset = UnivariateTSDataset(
                self.preprocessor.val_windows,
                device=self.device if self.gpu_preload else None,
                large_dataset=self.preprocessor.large_dataset,
            )
        if stage in ("test", None):
            self.test_dataset = UnivariateTSDataset(
                self.preprocessor.test_windows,
                device=self.device if self.gpu_preload else None,
                large_dataset=self.preprocessor.large_dataset,
            )

    def _create_dataloader(self, dataset, shuffle=False):
        logger.info("Creating dataloader")
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            prefetch_factor=self.prefetch_factor,
            persistent_workers=self.persistent_workers,
            drop_last=shuffle,
        )

    def train_dataloader(self):
        logger.info("Getting train dataloader")
        return self._create_dataloader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        logger.info("Getting val dataloader")
        return self._create_dataloader(self.val_dataset)

    def test_dataloader(self):
        logger.info("Getting test dataloader")
        return self._create_dataloader(self.test_dataset)

In [18]:
%%time
# Example usage
## Data loading and preprocessing df.columns - unique_id,ds,y
from datasetsforecast.m5 import M5

df = M5().load("../data")[0]  # , group="Monthly"
df.sort_values(["unique_id", "ds"], inplace=True)


horizon = 180  # <-- FORECAST HORIZON
input_size = horizon * 5

batch_size = 64
step_size = 7

# ----------
# from datasetsforecast.m3 import M3

# df = M3().load("../data", group="Monthly")[0]  #
# df.sort_values(["unique_id", "ds"], inplace=True)

# horizon = 12  # <-- FORECAST HORIZON
# input_size = horizon * 5

# batch_size = 32
# step_size = 3

import os

# from ts.preprocess.bigdata.dataloader import TSPreprocessor, UnivariateTSDataModule

preprocessor = TSPreprocessor(
    df=df,
    input_size=input_size,
    horizon=horizon,
    target_col="y_scaled",
    train_split=0.7,
    val_split=0.15,
    split_type="vertical",
    step_size=1,
    cache_dir=".",
    use_cache=True,
    experiment_name="m5_experiment",
    num_workers=os.cpu_count() - 2,
    chunk_size=1000,
    adaptive_step=False,
    downsample_factor=1,
    save_by_window_count=True,
    max_windows_per_file=1000,
    batch_size=batch_size,
)

ds = UnivariateTSDataModule(
    preprocessor=preprocessor,
    batch_size=32,
    num_workers=os.cpu_count(),
    pin_memory=True,
    prefetch_factor=1,
    persistent_workers=True,
    gpu_preload=False,
    experiment_name="m5_experiment",
)

INFO:__main__:Initializing TSPreprocessor
INFO:__main__:Processing data for all unique_ids
INFO:__main__:Loading preprocessed windows from cache
INFO:__main__:Initializing UnivariateTSDataModule


CPU times: user 1.8 s, sys: 1.41 s, total: 3.21 s
Wall time: 3.2 s




In [21]:
%%time
import time

ds = UnivariateTSDataModule(
    preprocessor=preprocessor,
    batch_size=32,  # Reduced
    num_workers=1,  # Reduced
    pin_memory=True,  # Disabled
    prefetch_factor=1,
    persistent_workers=False,  # Disabled
    gpu_preload=False,
    experiment_name="m5_experiment",
)

# Profile DataLoader
start_time = time.time()
mem_before = psutil.virtual_memory().used / 1024**3  # GB
ds.setup("fit")
train_loader = ds.train_dataloader()

# Iterate over a few batches to profile
num_batches = 5
for i, (x, y) in enumerate(train_loader):
    mem_after = psutil.virtual_memory().used / 1024**3  # GB

    logger.info(
        f"Batch {i+1}/{num_batches}, time={time.time() - start_time:.2f}s, "
        f"RAM delta={mem_after - mem_before:.2f}GB, "
        f"x.shape={x.shape}, y.shape={y.shape}"
    )
    if i + 1 >= num_batches:
        break

INFO:__main__:Initializing UnivariateTSDataModule
INFO:__main__:Setting up datamodule for stage: fit
INFO:__main__:Initializing UnivariateTSDataset
INFO:__main__:Initializing UnivariateTSDataset
INFO:__main__:Getting train dataloader
INFO:__main__:Creating dataloader
INFO:__main__:Batch 1/5, time=7.07s, RAM delta=0.85GB, x.shape=torch.Size([32, 900]), y.shape=torch.Size([32, 180])
INFO:__main__:Batch 2/5, time=13.16s, RAM delta=0.84GB, x.shape=torch.Size([32, 900]), y.shape=torch.Size([32, 180])
INFO:__main__:Batch 3/5, time=19.26s, RAM delta=0.81GB, x.shape=torch.Size([32, 900]), y.shape=torch.Size([32, 180])
INFO:__main__:Batch 4/5, time=25.37s, RAM delta=0.83GB, x.shape=torch.Size([32, 900]), y.shape=torch.Size([32, 180])
INFO:__main__:Batch 5/5, time=31.66s, RAM delta=0.82GB, x.shape=torch.Size([32, 900]), y.shape=torch.Size([32, 180])


CPU times: user 1.3 s, sys: 427 ms, total: 1.73 s
Wall time: 37 s


In [4]:
%%time
ds.setup("fit")
train_loader = ds.train_dataloader()
for x, y in train_loader:
    print(x.shape, y.shape)
    break

INFO:ts.preprocess.bigdata.dataloader:Setting up datamodule for stage: fit
INFO:ts.preprocess.bigdata.dataloader:Initializing UnivariateTSDataset
INFO:ts.preprocess.bigdata.dataloader:Initializing UnivariateTSDataset
INFO:ts.preprocess.bigdata.dataloader:Getting train dataloader
INFO:ts.preprocess.bigdata.dataloader:Creating dataloader


torch.Size([32, 900]) torch.Size([32, 180])
CPU times: user 27.3 s, sys: 5.67 s, total: 33 s
Wall time: 3min 47s


INFO:ts.preprocess.bigdata.dataloader:Initializing UnivariateTSDataModule
INFO:ts.preprocess.bigdata.dataloader:Setting up datamodule for stage: fit
INFO:ts.preprocess.bigdata.dataloader:Initializing UnivariateTSDataset
INFO:ts.preprocess.bigdata.dataloader:Initializing UnivariateTSDataset
INFO:ts.preprocess.bigdata.dataloader:Getting train dataloader
INFO:ts.preprocess.bigdata.dataloader:Creating dataloader
Exception ignored in: <function UnivariateTSDataset.__del__ at 0x77930a75f6a0>
Traceback (most recent call last):
  File "/home/pranav-pc/projects/ts/ts/preprocess/bigdata/dataloader.py", line 607, in __del__
    h5_file.close()
  File "/home/pranav-pc/projects/ts/.venv/lib/python3.12/site-packages/h5py/_hl/files.py", line 632, in close
    self.id.close()
KeyboardInterrupt: 


torch.Size([32, 900]) torch.Size([32, 180])


In [9]:
ds.__dict__

{'_log_hyperparams': True,
 'prepare_data_per_node': True,
 'allow_zero_length_dataloader_with_multiple_devices': False,
 'trainer': None,
 '_hparams_name': 'kwargs',
 '_hparams': "batch_size":         64
 "experiment_name":    m5_experiment
 "gpu_preload":        False
 "num_workers":        30
 "persistent_workers": True
 "pin_memory":         True
 "prefetch_factor":    2,
 '_hparams_initial': "batch_size":         64
 "experiment_name":    m5_experiment
 "gpu_preload":        False
 "num_workers":        30
 "persistent_workers": True
 "pin_memory":         True
 "prefetch_factor":    2,
 'preprocessor': <ts.preprocess.bigdata.dataloader.TSPreprocessor at 0x7ff6a24db470>,
 'batch_size': 64,
 'num_workers': 24,
 'pin_memory': True,
 'prefetch_factor': 2,
 'persistent_workers': True,
 'gpu_preload': False,
 'device': device(type='cpu'),
 'experiment_name': 'm5_experiment',
 'train_dataset': <ts.preprocess.bigdata.dataloader.UnivariateTSDataset at 0x7ff625d71160>,
 'val_dataset': <ts.

In [7]:
%%time
ds.setup()
for x, y in ds.train_dataloader():
    break

ValueError: num_samples should be a positive integer value, but got num_samples=0