In [None]:
# | default_exp preprocess.dataloader

In [None]:
# | export
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
# | hide
import logging
import warnings

from fastcore.test import test_eq
from nbdev.showdoc import show_doc

In [None]:
# | export
device = "cuda" if torch.cuda.is_available() else "cpu"
torch._dynamo.config.suppress_errors = True

In [None]:
# | export


class TSRegressionDataset(Dataset):
    """Sliding-window regression samples drawn from a long-format time-series frame.

    The frame is split by its ``unique_id`` column and every series is windowed
    on its own, so a sample never straddles two series. Sample ``i`` of a series
    starts at row ``i * window``: the first ``in_features`` values of the ``y``
    column form the input and the next ``out_features`` values form the target.

    Args:
        df: DataFrame with at least the columns ``unique_id`` and ``y``. Rows
            are consumed in the order given within each series.
        in_features: Number of consecutive ``y`` values in each input.
        out_features: Number of consecutive ``y`` values in each target.
        window: Stride, in rows, between the starts of consecutive samples.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """

    def __init__(self, df, in_features, out_features, window=1):
        super().__init__()

        if window < 1:
            # A zero stride divides by zero when counting samples and a
            # negative one would walk the series backwards.
            raise ValueError(f"window must be >= 1, got {window}")

        self.data = df
        self.in_features = in_features
        self.out_features = out_features
        self.window = window

        # One zero-indexed frame per series, keyed by its unique_id.
        self.groups = {uid: group.reset_index(drop=True) for uid, group in df.groupby("unique_id")}

    def _n_samples(self, series_len):
        """Return how many complete samples a series of ``series_len`` rows yields."""
        span = self.in_features + self.out_features
        # Clamp at zero: a series too short for a single sample must contribute
        # nothing, not a negative count that shrinks the total and shifts the
        # index mapping of every series after it.
        return max(0, (series_len - span) // self.window + 1)

    def __len__(self):
        """Return the total number of samples across all series.

        Raises:
            AssertionError: If one sample is longer than the whole frame.
        """
        # `<=` rather than `<`: a frame exactly one sample long still yields it.
        assert self.in_features + self.out_features <= len(
            self.data
        ), f"in_features + out_features should not be greater than series len {self.data.shape[0]}"

        return sum(self._n_samples(len(g)) for g in self.groups.values())

    def __getitem__(self, idx):
        """Return the ``(x, y)`` float32 tensor pair for global sample ``idx``.

        Samples are numbered series by series, in the iteration order of
        ``self.groups``. Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len, len)``. Raising (instead
                of falling through and returning ``None``) also lets plain
                ``for sample in dataset`` iteration terminate.
        """
        requested = idx
        if idx < 0:
            idx += sum(self._n_samples(len(g)) for g in self.groups.values())

        if idx >= 0:
            for series in self.groups.values():
                n_here = self._n_samples(len(series))

                if idx < n_here:
                    start = idx * self.window
                    split = start + self.in_features
                    x = torch.tensor(
                        series["y"].iloc[start:split].values,
                        dtype=torch.float32,
                    )
                    y = torch.tensor(
                        series["y"].iloc[split : split + self.out_features].values,
                        dtype=torch.float32,
                    )
                    return x, y

                # Not in this series: make the index relative to the next one.
                idx -= n_here

        raise IndexError(f"index {requested} is out of range")

In [None]:
# | export


class TSDataLoader(pl.LightningDataModule):
    """Lightning data module that windows a long-format frame into train/val/test loaders.

    Args:
        df: DataFrame whose columns are exactly ``['unique_id', 'ds', 'y']``.
        in_features: Number of consecutive ``y`` values in each model input.
        out_features: Number of consecutive ``y`` values in each target.
        window: Stride, in rows, between consecutive samples.
        batch_size: Batch size used by all three loaders.
        num_workers: Worker processes per loader. Defaults to 32, the value
            that used to be hard-coded; pass 0 to load in the main process.
    """

    def __init__(self, df, in_features, out_features, window=1, batch_size=32, num_workers=32):
        super().__init__()
        self.data = df
        self.in_features = in_features
        self.out_features = out_features
        self.window = window
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Sort the frame, split it 70/15/15 by row position and build the datasets.

        Args:
            stage: Accepted for the Lightning hook signature; not used here.

        Raises:
            AssertionError: If the frame's columns are not exactly
                ``['unique_id', 'ds', 'y']`` in that order.
        """
        assert list(self.data.columns) == [
            "unique_id",
            "ds",
            "y",
        ], "Columns must be ['unique_id', 'ds', 'y']"

        self.data = self.data.sort_values(["unique_id", "ds"])

        # Train / Val / Test Split (70/15/15)
        # NOTE(review): the cut points are row positions in the whole sorted
        # frame, not per series. With several unique_ids a boundary can fall in
        # the middle of a series and whole series can land in a single split —
        # confirm this is intended for multi-series input.
        train_sz = int(len(self.data) * 0.7)
        val_sz = int(len(self.data) * 0.15)

        self.train_df = self.data.iloc[:train_sz]
        self.val_df = self.data.iloc[train_sz : train_sz + val_sz]
        # The test split takes whatever rows remain after train and val.
        self.test_df = self.data.iloc[train_sz + val_sz :]

        self.train = TSRegressionDataset(
            self.train_df, self.in_features, self.out_features, self.window
        )
        self.val = TSRegressionDataset(
            self.val_df, self.in_features, self.out_features, self.window
        )
        self.test = TSRegressionDataset(
            self.test_df, self.in_features, self.out_features, self.window
        )

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader over ``dataset`` with this module's shared settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            pin_memory=True,
            num_workers=self.num_workers,
            # DataLoader refuses persistent_workers=True when num_workers is 0.
            persistent_workers=self.num_workers > 0,
        )

    def train_dataloader(self):
        """Return the shuffled loader over the training split."""
        return self._make_loader(self.train, shuffle=True)

    def val_dataloader(self):
        """Return the in-order loader over the validation split."""
        return self._make_loader(self.val, shuffle=False)

    def test_dataloader(self):
        """Return the in-order loader over the test split."""
        return self._make_loader(self.test, shuffle=False)

In [None]:
# | export
if __name__ == "__main__":
    # Smoke run: window the AirPassengersDF frame and print the shape of every
    # training batch.
    from neuralforecast.utils import AirPassengersDF as df

    in_features = 12
    out_features = 6
    window = 1
    batch_sz = 32

    ds = TSDataLoader(
        df,
        in_features=in_features,
        out_features=out_features,
        window=window,
        batch_size=batch_sz,
    )
    ds.setup()

    for features, labels in ds.train_dataloader():
        print(features.shape, labels.shape)

torch.Size([32, 12]) torch.Size([32, 6])
torch.Size([32, 12]) torch.Size([32, 6])
torch.Size([19, 12]) torch.Size([19, 6])
