In [50]:
# ## Prepare data 
# import torch
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import LabelEncoder
# from datasetsforecast.m4 import M4

# # Load Labels
# label_df = pd.read_parquet('../data/evaluation_df.parquet').set_index('unique_id')['best_model']

# # Load M4 dataset
# groups = ['Yearly', 'Monthly', 'Quarterly', 'Hourly', 'Weekly', 'Daily']
# m4_df_bucket = []
# for group in groups:
#     await M4.async_download('data', group=group)
#     Y_df, *_ = M4.load(directory='data', group=group)
#     m4_df_bucket.append(Y_df)
# m4_dataset = pd.concat(m4_df_bucket)

# m4_dataset = m4_dataset.sort_values(['unique_id', 'ds']).drop_duplicates(subset=['unique_id', 'ds'])
# no_of_datapoints = m4_dataset.groupby('unique_id').apply(len).to_dict()



# # Convert to wide format (fixed)
# m4_dataset = m4_dataset.pivot(index='unique_id', columns='ds', values='y')

# # Merge with labels
# m4_dataset = m4_dataset.merge(label_df, left_index=True, right_index=True, how='right')

# best_model  = m4_dataset['best_model'].to_dict()
# df_min = m4_dataset.drop('best_model',axis=1).min(axis=1)
# df_max = m4_dataset.drop('best_model',axis=1).max(axis=1)

# m4_dataset = (m4_dataset.drop('best_model',axis=1) - df_min.values.reshape(-1,1))/(df_max-df_min).values.reshape(-1,1)

# m4_dataset['best_model'] =  m4_dataset.index.map(best_model)
# m4_dataset['no_of_datapoints'] = m4_dataset.index.map(no_of_datapoints)

# m4_dataset.sort_values('no_of_datapoints',inplace=True)
# m4_dataset.drop('no_of_datapoints',axis=1,inplace=True)
# # clean up the memory
# del df_max
# del df_min
# del m4_df_bucket, no_of_datapoints

# # Encode categorical labels
# le = LabelEncoder()
# m4_dataset['best_model'] = le.fit_transform(m4_dataset['best_model'])
# m4_dataset.columns = m4_dataset.columns.astype(str)

# # Save as optimized Parquet
# m4_dataset.to_parquet("../data/m4_preprocessed.parquet", engine='fastparquet', compression='snappy')

# print("✅ Preprocessing Complete! Data saved as Parquet.")


In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import torchmetrics

# Enable CUDNN optimizations
torch.backends.cudnn.benchmark = True

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Special padding token (ensure it's outside normal data range)
PAD_TOKEN = -9999.0

# Custom dataset for time series classification
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        # Replace NaN values with PAD_TOKEN
        X = [np.nan_to_num(x, nan=PAD_TOKEN) for x in X]

        # Convert to tensors
        self.X = [torch.tensor(x, dtype=torch.float32, device=device) for x in X]
        self.y = torch.tensor(y, dtype=torch.long, device=device)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    @staticmethod
    def collate_fn(batch):
        """Custom collate function to dynamically adjust sequence lengths per batch."""
        X_batch, y_batch = zip(*batch)

        # Determine max sequence length in this batch, excluding padding tokens
        max_seq_len = max(len(x[~(x[:, 0] == PAD_TOKEN)]) for x in X_batch)

        # Ensure each sequence has the same feature dimension (based on dynamic max length)
        num_features = max(x.shape[1] for x in X_batch)

        # Truncate or pad each sequence
        X_padded = []
        masks = []
        for x in X_batch:
            x = x[~(x[:, 0] == PAD_TOKEN)]  # Remove padding
            x = x[:, :num_features]  # Ensure consistent feature count

            if x.shape[0] > max_seq_len:
                x = x[:max_seq_len]  # Truncate
            pad_size = max_seq_len - x.shape[0]
            x_padded = torch.cat([x, torch.full((pad_size, num_features), PAD_TOKEN, device=device)])

            mask = (x_padded[:, 0] != PAD_TOKEN).float()  # Mask: 1 for real data, 0 for padding

            X_padded.append(x_padded)
            masks.append(mask)

        # Stack into tensors
        X_padded = torch.stack(X_padded)
        masks = torch.stack(masks)
        y_batch = torch.stack(y_batch)

        return X_padded, masks, y_batch


# Transformer-based time series classification model
class TransformerModel(pl.LightningModule):
    def __init__(self, num_classes, d_model=64, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Linear(d_model, d_model)  # Input dim will be dynamic
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(d_model, num_classes)
        self.criterion = nn.CrossEntropyLoss()

        # Metrics
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes).to(device)
        self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes).to(device)
        self.train_precision = torchmetrics.Precision(task="multiclass", num_classes=num_classes).to(device)
        self.train_recall = torchmetrics.Recall(task="multiclass", num_classes=num_classes).to(device)

    def forward(self, x, mask):
        x = self.embedding(x)  # [batch, seq_len, d_model]
        x = x.permute(1, 0, 2)  # [seq_len, batch, d_model]
        x = self.transformer_encoder(x, src_key_padding_mask=mask.bool())  # Apply masking
        x = x.permute(1, 2, 0)  # [batch, d_model, seq_len]
        x = self.global_avg_pool(x).squeeze(-1)  # [batch, d_model]
        return self.fc(x)  # [batch, num_classes]

    def training_step(self, batch, batch_idx):
        x, mask, y = batch
        y_hat = self(x, mask)
        loss = self.criterion(y_hat, y)

        # Compute metrics
        preds = torch.argmax(y_hat, dim=1)
        self.train_acc.update(preds, y)
        self.train_f1.update(preds, y)
        self.train_precision.update(preds, y)
        self.train_recall.update(preds, y)

        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", self.train_acc, prog_bar=True)
        self.log("train_f1", self.train_f1, prog_bar=True)
        self.log("train_precision", self.train_precision, prog_bar=True)
        self.log("train_recall", self.train_recall, prog_bar=True)

        return loss

    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=1e-3)


# Load dataset
m4_dataset = pd.read_parquet("../data/m4_preprocessed.parquet")

# Prepare dataset
X_data = [x for x in m4_dataset.drop("best_model", axis=1).values]
y_data = m4_dataset["best_model"].values

# Encode labels
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(y_data)

# Create dataset and dataloader
dataset = TimeSeriesDataset(X_data, y_data)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True, num_workers=4, pin_memory=True, collate_fn=TimeSeriesDataset.collate_fn)

# Initialize model and trainer
num_classes = len(np.unique(y_data))
model = TransformerModel(num_classes=num_classes).to(device)

trainer = pl.Trainer(max_epochs=50, accelerator="gpu", devices=1, precision=16)
trainer.fit(model, dataloader)


/home/pranav-pc/.cache/pypoetry/virtualenvs/ts-EBaOKu-T-py3.12/lib/python3.12/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INF

Training: |                                                                                                   …

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/pranav-pc/.cache/pypoetry/virtualenvs/ts-EBaOKu-T-py3.12/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/pranav-pc/.cache/pypoetry/virtualenvs/ts-EBaOKu-T-py3.12/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_50398/1384673802.py", line 34, in __getitem__
    return self.X[idx], self.y[idx]
                        ~~~~~~^^^^^
RuntimeError: CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



  no_of_datapoints = m4_dataset.groupby('unique_id').apply(len).to_dict()


✅ Preprocessing Complete! Data saved as Parquet.
