In [3]:
# | default_exp preprocess.dataloader

In [4]:
# | export

import gc

# from torch.serialization import safe_globals
import os
import pickle
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import DataLoader, Dataset

In [5]:
# | export
import logging
import warnings

# Configure logging
# NOTE: basicConfig configures the *root* logger as an import-time side effect.
# With the level at ERROR, the logger.info / logger.warning calls in this file
# are not emitted unless the level is lowered (assuming no handler was
# installed on the root logger before this line runs).
logging.basicConfig(level=logging.ERROR)  # Change to DEBUG for more details

# Module-scoped logger used by every class below.
logger = logging.getLogger(__name__)

In [6]:
# | hide


from fastcore.test import test_eq
from nbdev.showdoc import show_doc

In [7]:
# | export
# Module-level default device string: "cuda" when a CUDA device is visible, else "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): this sets a private torch._dynamo config flag at import time —
# presumably so torch.compile failures fall back to eager execution instead of
# raising; confirm this is still wanted for every importer of this module.
torch._dynamo.config.suppress_errors = True

In [8]:
# | export


class TSRegressionDataset(Dataset):
    """Sliding-window regression dataset over one or more univariate series.

    The frame is grouped by ``unique_id``; within every group the ``y`` column
    is cut into samples of ``in_features`` input steps followed by
    ``out_features`` target steps, advancing ``window`` rows between samples.
    Samples are indexed group after group, in ``groupby`` order.

    Args:
        df: DataFrame with at least the columns ``unique_id`` and ``y``.
        in_features: Number of consecutive ``y`` values in each input ``x``.
        out_features: Number of consecutive ``y`` values in each target ``y``.
        window: Stride (in rows) between the starts of consecutive samples.
    """

    def __init__(self, df, in_features, out_features, window=1):
        super().__init__()

        self.data = df
        self.in_features = in_features
        self.out_features = out_features
        self.window = window

        # One frame per series, re-indexed from 0 so positional slicing is simple.
        self.groups = {uid: group.reset_index(drop=True) for uid, group in df.groupby("unique_id")}

    def _num_windows(self, series_len):
        """Return how many samples a series of ``series_len`` rows yields (never negative)."""
        span = self.in_features + self.out_features
        if series_len < span:
            # A series too short for a single sample contributes nothing; without
            # this guard the floor-division below goes negative and corrupts both
            # the total length and the index arithmetic in __getitem__.
            return 0
        return (series_len - span) // self.window + 1

    def __len__(self):
        """Total number of samples across all series.

        Raises:
            AssertionError: if the whole frame has fewer rows than one sample needs.
        """
        assert self.in_features + self.out_features <= len(
            self.data
        ), f"in_features + out_features should not be greater than series len {self.data.shape[0]}"

        return sum(self._num_windows(len(g)) for g in self.groups.values())

    def __getitem__(self, idx):
        """Return the ``(x, y)`` float32 tensors of sample ``idx``.

        Negative indices count from the end, as for sequences.

        Raises:
            IndexError: if ``idx`` does not address an existing sample.
        """
        requested = idx
        if idx < 0:
            idx += sum(self._num_windows(len(g)) for g in self.groups.values())

        if idx >= 0:
            for series in self.groups.values():
                n_windows = self._num_windows(len(series))

                if idx < n_windows:
                    start = idx * self.window
                    split = start + self.in_features
                    values = series["y"].to_numpy()
                    x = torch.tensor(values[start:split], dtype=torch.float32)
                    y = torch.tensor(
                        values[split : split + self.out_features], dtype=torch.float32
                    )
                    return x, y

                # Not in this series: make the index relative to the next one.
                idx -= n_windows

        # Raising (instead of implicitly returning None) lets samplers and
        # plain iteration detect the end of the dataset.
        raise IndexError(f"index {requested} is out of range for TSRegressionDataset")

In [9]:
# | export


class TSDataLoader(pl.LightningDataModule):
    """Lightning data module producing sliding-window train/val/test loaders.

    Args:
        df: DataFrame whose columns are exactly ``['unique_id', 'ds', 'y']``.
        in_features: Number of input steps per sample.
        out_features: Number of target steps per sample.
        window: Stride between consecutive samples.
        batch_size: Batch size used by all three loaders.
        num_workers: Worker processes per ``DataLoader`` (default 32, the
            value that was previously hard-coded). Use 0 for in-process loading.
        pin_memory: Forwarded to every ``DataLoader``.
    """

    def __init__(
        self,
        df,
        in_features,
        out_features,
        window=1,
        batch_size=32,
        num_workers=32,
        pin_memory=True,
    ):
        super().__init__()
        self.data = df
        self.in_features = in_features
        self.out_features = out_features
        self.window = window
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def setup(self, stage=None):
        """Sort the frame, split it 70/15/15 by row position and build the datasets.

        Raises:
            AssertionError: if the frame's columns are not ``['unique_id', 'ds', 'y']``.
        """
        assert list(self.data.columns) == [
            "unique_id",
            "ds",
            "y",
        ], "Columns must be ['unique_id', 'ds', 'y']"

        self.data = self.data.sort_values(["unique_id", "ds"])

        # Train / Val / Test Split (70/15/15)
        # NOTE: the cut is positional over the frame sorted by (unique_id, ds),
        # so with several series the three splits hold different series rather
        # than different time ranges of each series.
        train_sz = int(len(self.data) * 0.7)
        val_sz = int(len(self.data) * 0.15)

        self.train_df = self.data.iloc[:train_sz]
        self.val_df = self.data.iloc[train_sz : train_sz + val_sz]
        self.test_df = self.data.iloc[train_sz + val_sz :]

        self.train = self._make_dataset(self.train_df)
        self.val = self._make_dataset(self.val_df)
        self.test = self._make_dataset(self.test_df)

    def _make_dataset(self, frame):
        """Wrap one split in a ``TSRegressionDataset`` with this module's window settings."""
        return TSRegressionDataset(frame, self.in_features, self.out_features, self.window)

    def _make_loader(self, dataset, shuffle):
        """Build a ``DataLoader`` with the settings shared by all three splits."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            pin_memory=self.pin_memory,
            num_workers=self.num_workers,
            # DataLoader rejects persistent_workers=True when there are no workers.
            persistent_workers=self.num_workers > 0,
        )

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation split."""
        return self._make_loader(self.val, shuffle=False)

    def test_dataloader(self):
        """Ordered loader over the test split."""
        return self._make_loader(self.test, shuffle=False)

In [10]:
# | export
# Smoke test: runs in the notebook kernel and when the exported module is
# executed as a script, but not when the module is imported.
if __name__ == "__main__":
    from neuralforecast.utils import AirPassengersDF as df

    # 12 input steps, 6 target steps, stride 1, batches of 32.
    in_features, out_features, window, batch_sz = 12, 6, 1, 32
    ds = TSDataLoader(df, in_features, out_features, window, batch_sz)
    ds.setup()
    # Print the (x, y) batch shapes of one pass over the training loader.
    for features, labels in ds.train_dataloader():
        print(features.shape, labels.shape)

torch.Size([32, 12]) torch.Size([32, 6])
torch.Size([32, 12]) torch.Size([32, 6])
torch.Size([19, 12]) torch.Size([19, 6])


In [2]:
# | export


class UnivariateTSDataset(Dataset):
    """In-memory dataset of pre-computed ``(input, target)`` windows.

    Args:
        windows: Iterable of ``(x, y)`` pairs of equal-length arrays. May be
            empty, in which case the dataset has length 0.
        device: Optional device. When given, all windows are copied to that
            device once at construction and samples are served from there;
            otherwise samples are created from the NumPy arrays on access.
    """

    def __init__(self, windows, device: Optional[str] = None):
        logger.info("Initializing UnivariateTSDataset")
        windows = list(windows)
        if windows:
            self.x = np.stack([w[0] for w in windows], axis=0).astype(np.float32)
            self.y = np.stack([w[1] for w in windows], axis=0).astype(np.float32)
        else:
            # np.stack refuses an empty list; an empty split (e.g. no
            # validation windows for a short series) is a valid, empty dataset.
            self.x = np.empty((0, 0), dtype=np.float32)
            self.y = np.empty((0, 0), dtype=np.float32)

        self.device = device
        if device:
            logger.info("Preloading tensors to device: %s", device)
            self.x_tensor = torch.from_numpy(self.x).to(device)
            self.y_tensor = torch.from_numpy(self.y).to(device)
        else:
            self.x_tensor = self.y_tensor = None

    def __len__(self):
        """Number of windows held by the dataset."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensors of window ``idx``."""
        if self.device:
            return self.x_tensor[idx], self.y_tensor[idx]
        # torch.from_numpy shares memory with the stored float32 arrays.
        return torch.from_numpy(self.x[idx]), torch.from_numpy(self.y[idx])


class UnivariateTSDataModule(pl.LightningDataModule):
    """Lightning data module that windows, scales and splits univariate series.

    The input frame is grouped by ``unique_id``. Each series is optionally
    scaled, cut into ``(input_size, horizon)`` windows and assigned to the
    train/val/test datasets either per window (``split_type="horizontal"``) or
    per series (``split_type="vertical"``).

    Args:
        df: DataFrame with a ``unique_id`` column and the ``target_col`` column.
        input_size: Number of steps in each input window.
        horizon: Number of steps in each target window.
        target_col: Name of the value column.
        batch_size: Batch size for all loaders.
        num_workers: Requested loader workers; capped at ``torch.get_num_threads()``.
        train_split: Fraction of windows (or series) used for training.
        val_split: Fraction of windows (or series) used for validation; the
            remainder is the test split.
        normalize: Whether to scale each series before windowing.
        scaler_type: ``"minmax"`` or ``"standard"``.
        split_type: ``"horizontal"`` or ``"vertical"``.
        step_size: Stride between consecutive window starts (>= 1).
        pin_memory: Forwarded to ``DataLoader`` unless ``gpu_preload`` is set.
        prefetch_factor: Forwarded to ``DataLoader`` when workers are used.
        persistent_workers: Forwarded to ``DataLoader`` when workers are used.
        gpu_preload: Copy whole datasets to ``self.device`` up front.
        cache_dir: Root directory for the split cache and scaler files.
        use_cache: Load/save the built datasets from/to ``split_data.pt``.
        persist_scaler: Save fitted scalers to disk and reuse existing files.
        experiment_name: Sub-directory of ``cache_dir`` used by this module.

    Raises:
        ValueError: if ``train_split + val_split`` is not in ``(0, 1]`` or
            ``step_size < 1``.
    """

    _SCALER_SUFFIX = "_scaler"

    def __init__(
        self,
        df,
        input_size,
        horizon,
        target_col="y",
        batch_size=32,
        num_workers=8,
        train_split=0.7,
        val_split=0.15,
        normalize=True,
        scaler_type="minmax",
        split_type="horizontal",
        step_size=1,
        pin_memory=True,
        prefetch_factor=2,
        persistent_workers=True,
        gpu_preload=False,
        cache_dir=".",
        use_cache=True,
        persist_scaler=False,
        experiment_name="default_experiment",
    ):
        logger.info("Initializing UnivariateTSDataModule")
        super().__init__()
        self.save_hyperparameters(ignore=["df"])

        # Validate before creating any directory so a bad configuration leaves
        # no trace on disk.
        if not 0 < train_split + val_split <= 1:
            raise ValueError("train_split + val_split must be between 0 and 1")
        if step_size < 1:
            raise ValueError("step_size must be >= 1")

        self.df = df
        self.input_size = input_size
        self.horizon = horizon
        self.target_col = target_col
        self.batch_size = batch_size
        self.num_workers = min(num_workers, torch.get_num_threads())
        self.train_split = train_split
        self.val_split = val_split
        self.normalize = normalize
        self.scaler_type = scaler_type
        self.split_type = split_type
        self.step_size = step_size
        self.pin_memory = pin_memory
        self.prefetch_factor = prefetch_factor
        self.persistent_workers = persistent_workers
        self.gpu_preload = gpu_preload
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cache_dir = Path(cache_dir) / experiment_name
        self.cache_dir.mkdir(exist_ok=True, parents=True)
        self.use_cache = use_cache
        self.persist_scaler = persist_scaler
        self.experiment_name = experiment_name

        self.scaler_dir = self.cache_dir / "scalers"
        self.scaler_dir.mkdir(exist_ok=True, parents=True)

        # Scalers used by the last setup(), keyed by str(unique_id). Keeping
        # them in memory lets the (inverse) transforms work even when
        # persist_scaler is False and nothing was written to disk.
        self._scalers = {}

    def _generate_windows(self, series):
        """Cut ``series`` into ``(x, y)`` windows.

        Returns:
            A list of ``(x, y)`` array pairs with ``len(x) == input_size`` and
            ``len(y) == horizon``; empty if the series is too short.
        """
        logger.info("Generating windows from series")
        series_len = len(series)
        if series_len < self.input_size + self.horizon:
            logger.warning("Series too short to generate any windows")
            return []

        # Every start below max_idx leaves room for a full input + horizon, so
        # no further validity filtering is needed.
        max_idx = series_len - self.input_size - self.horizon + 1
        window_starts = np.arange(0, max_idx, self.step_size)

        sliding = np.lib.stride_tricks.sliding_window_view
        x_windows = sliding(series, window_shape=self.input_size)[window_starts]
        # The target of a window starts right where its input ends.
        y_windows = sliding(series, window_shape=self.horizon)[window_starts + self.input_size]

        return list(zip(x_windows, y_windows))

    def _scaler_path(self, unique_id):
        """Path of the pickled scaler belonging to ``unique_id``."""
        return self.scaler_dir / f"{unique_id}{self._SCALER_SUFFIX}.pkl"

    def _scale_series(self, unique_id, series):
        """Scale one series, loading or fitting (and optionally saving) its scaler.

        Raises:
            ValueError: if a new scaler is needed and ``scaler_type`` is unknown.
        """
        scaler_file = self._scaler_path(unique_id)
        if self.persist_scaler and scaler_file.exists():
            logger.info(f"Loading scaler for {unique_id}")
            # SECURITY: pickle executes arbitrary code on load; only point
            # cache_dir at directories you trust.
            with open(scaler_file, "rb") as f:
                scaler = pickle.load(f)
            scaled = scaler.transform(series.reshape(-1, 1)).flatten()
        else:
            if self.scaler_type == "minmax":
                scaler = MinMaxScaler()
            elif self.scaler_type == "standard":
                scaler = StandardScaler()
            else:
                raise ValueError("scaler_type must be 'minmax' or 'standard'")

            # NOTE: the scaler is fitted on the whole series, i.e. including
            # the part that later lands in the val/test windows.
            scaled = scaler.fit_transform(series.reshape(-1, 1)).flatten()
            if self.persist_scaler:
                logger.info(f"Saving scaler for {unique_id}")
                with open(scaler_file, "wb") as f:
                    pickle.dump(scaler, f)

        self._scalers[str(unique_id)] = scaler
        return scaled

    def _get_scaler(self, unique_id):
        """Return the scaler for ``unique_id`` (memory first, then disk) or ``None``."""
        scaler = self._scalers.get(str(unique_id))
        if scaler is not None:
            return scaler

        scaler_file = self._scaler_path(unique_id)
        if scaler_file.exists():
            # SECURITY: see _scale_series — pickle files must come from a trusted source.
            with open(scaler_file, "rb") as f:
                return pickle.load(f)
        return None

    def setup(self, stage=None):
        """Build (or load from cache) the train/val/test datasets.

        Raises:
            ValueError: if ``split_type`` or ``scaler_type`` is invalid.
        """
        logger.info("Setting up the datamodule")
        cache_file = self.cache_dir / "split_data.pt"

        if self.use_cache and cache_file.exists():
            logger.info("Loading dataset splits from cache")
            # NOTE: the cache is keyed by experiment_name only; it is reused
            # even if input_size, horizon, etc. changed since it was written.
            # SECURITY: weights_only=False unpickles arbitrary objects.
            data = torch.load(cache_file, weights_only=False)
            self.train_dataset = data["train"]
            self.val_dataset = data["val"]
            self.test_dataset = data["test"]
            return

        if self.split_type not in ("horizontal", "vertical"):
            raise ValueError("split_type must be 'horizontal' or 'vertical'")

        grouped = self.df.groupby("unique_id")
        train_windows, val_windows, test_windows = [], [], []
        train_ids, val_ids, test_ids = set(), set(), set()

        if self.split_type == "vertical":
            logger.info("Applying vertical split")
            unique_ids = list(grouped.groups.keys())
            # Uses NumPy's global RNG: seed it beforehand for a reproducible split.
            np.random.shuffle(unique_ids)
            total_series = len(unique_ids)
            train_end = int(total_series * self.train_split)
            val_end = train_end + int(total_series * self.val_split)

            # Sets give O(1) membership tests inside the per-series loop.
            train_ids = set(unique_ids[:train_end])
            val_ids = set(unique_ids[train_end:val_end])
            test_ids = set(unique_ids[val_end:])

        for unique_id, group in grouped:
            logger.info(f"Processing unique_id: {unique_id}")
            series = group[self.target_col].values.astype(np.float32)

            if self.normalize:
                series = self._scale_series(unique_id, series)

            windows = self._generate_windows(series)
            if not windows:
                logger.warning(f"{unique_id} - Series too short for windowing")
                continue

            if self.split_type == "horizontal":
                num_windows = len(windows)
                train_end = int(num_windows * self.train_split)
                val_end = train_end + int(num_windows * self.val_split)
                train_windows.extend(windows[:train_end])
                val_windows.extend(windows[train_end:val_end])
                test_windows.extend(windows[val_end:])
            elif unique_id in train_ids:
                train_windows.extend(windows)
            elif unique_id in val_ids:
                val_windows.extend(windows)
            elif unique_id in test_ids:
                test_windows.extend(windows)

        logger.info("Creating dataset objects")
        preload_device = self.device if self.gpu_preload else None
        self.train_dataset = UnivariateTSDataset(train_windows, device=preload_device)
        self.val_dataset = UnivariateTSDataset(val_windows, device=preload_device)
        self.test_dataset = UnivariateTSDataset(test_windows, device=preload_device)

        logger.info(
            f"Train windows: {len(self.train_dataset)}, Val windows: {len(self.val_dataset)}, Test windows: {len(self.test_dataset)}"
        )

        if self.use_cache and not cache_file.exists():
            logger.info("Saving dataset splits to cache")
            torch.save(
                {"train": self.train_dataset, "val": self.val_dataset, "test": self.test_dataset},
                cache_file,
            )

        logger.info("Cleaning up original dataframe")
        # self.df is intentionally kept; only unreferenced temporaries are collected.
        gc.collect()

    def _create_dataloader(self, dataset, shuffle=False):
        """Build a ``DataLoader`` for ``dataset`` with this module's settings.

        The last incomplete batch is dropped only for shuffled (training) loaders.
        """
        logger.info("Creating dataloader")
        workers = self.num_workers
        if self.gpu_preload and self.device.type == "cuda" and workers > 0:
            # Samples are already CUDA tensors; worker processes must not
            # touch them, so load in the main process.
            logger.warning("gpu_preload is enabled; forcing num_workers=0")
            workers = 0

        loader_kwargs = dict(
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=workers,
            pin_memory=self.pin_memory and not self.gpu_preload,
            drop_last=shuffle,
        )
        if workers > 0:
            # DataLoader raises ValueError if either option is set without workers.
            loader_kwargs["prefetch_factor"] = self.prefetch_factor
            loader_kwargs["persistent_workers"] = self.persistent_workers

        return DataLoader(dataset, **loader_kwargs)

    def train_dataloader(self):
        """Shuffled loader over the training windows."""
        logger.info("Getting train dataloader")
        return self._create_dataloader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation windows."""
        logger.info("Getting val dataloader")
        return self._create_dataloader(self.val_dataset)

    def test_dataloader(self):
        """Ordered loader over the test windows."""
        logger.info("Getting test dataloader")
        return self._create_dataloader(self.test_dataset)

    def inverse_transform(self, data, unique_id):
        """Map scaled values of ``unique_id`` back to the original scale.

        Returns ``data`` unchanged (with a warning) when no scaler is known.
        """
        logger.info(f"Inverse transforming data for unique_id: {unique_id}")
        scaler = self._get_scaler(unique_id)
        if scaler is not None:
            return scaler.inverse_transform(data.reshape(-1, 1)).flatten()
        logger.warning(f"Scaler for {unique_id} not found. Returning original data.")
        return data

    def _load_all_scalers(self):
        """Load every pickled scaler in ``scaler_dir`` into a ``{unique_id: scaler}`` dict."""
        scaler_map = {}
        for file in self.scaler_dir.glob(f"*{self._SCALER_SUFFIX}.pkl"):
            # Strip only the trailing suffix; str.replace would also remove
            # "_scaler" occurring inside the id itself.
            uid = file.stem[: -len(self._SCALER_SUFFIX)]
            with open(file, "rb") as f:
                scaler_map[uid] = pickle.load(f)
        return scaler_map

    def pre_prediction_transform(self, series: np.ndarray, unique_id: str) -> np.ndarray:
        """Scale a raw series of ``unique_id`` with its known scaler.

        Returns ``series`` unchanged (with a warning) when no scaler is known.
        """
        scaler = self._get_scaler(unique_id)
        if scaler is not None:
            return scaler.transform(series.reshape(-1, 1)).flatten()
        logger.warning(f"Scaler for {unique_id} not found. Returning original series.")
        return series

    def post_prediction_transform(self, prediction: np.ndarray, unique_id: str) -> np.ndarray:
        """Map a scaled prediction back to the original scale (see ``inverse_transform``)."""
        return self.inverse_transform(prediction, unique_id)

NameError: name 'Dataset' is not defined