In [50]:
## Prepare data 
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasetsforecast.m4 import M4

# Load Labels
label_df = pd.read_parquet('../data/evaluation_df.parquet').set_index('unique_id')['best_model']

# Load M4 dataset
groups = ['Yearly', 'Monthly', 'Quarterly', 'Hourly', 'Weekly', 'Daily']
m4_df_bucket = []
for group in groups:
    await M4.async_download('data', group=group)
    Y_df, *_ = M4.load(directory='data', group=group)
    m4_df_bucket.append(Y_df)
m4_dataset = pd.concat(m4_df_bucket)

m4_dataset = m4_dataset.sort_values(['unique_id', 'ds']).drop_duplicates(subset=['unique_id', 'ds'])
no_of_datapoints = m4_dataset.groupby('unique_id').apply(len).to_dict()

# Convert to wide format (fixed)
m4_dataset = m4_dataset.pivot(index='unique_id', columns='ds', values='y')

# Merge with labels
m4_dataset = m4_dataset.merge(label_df, left_index=True, right_index=True, how='right')

best_model  = m4_dataset['best_model'].to_dict()
df_min = m4_dataset.drop('best_model',axis=1).min(axis=1)
df_max = m4_dataset.drop('best_model',axis=1).max(axis=1)

m4_dataset = (m4_dataset.drop('best_model',axis=1) - df_min.values.reshape(-1,1))/(df_max-df_min).values.reshape(-1,1)

m4_dataset['best_model'] =  m4_dataset.index.map(best_model)
m4_dataset['no_of_datapoints'] = m4_dataset.index.map(no_of_datapoints)

m4_dataset.sort_values('no_of_datapoints',inplace=True)
m4_dataset.drop('no_of_datapoints',axis=1,inplace=True)
# clean up the memory
del df_max
del df_min
del m4_df_bucket, no_of_datapoints

# Encode categorical labels
le = LabelEncoder()
m4_dataset['best_model'] = le.fit_transform(m4_dataset['best_model'])
m4_dataset.columns = m4_dataset.columns.astype(str)

# Save as optimized Parquet
m4_dataset.to_parquet("../data/m4_preprocessed.parquet", engine='fastparquet', compression='snappy')

print("✅ Preprocessing Complete! Data saved as Parquet.")


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import torchmetrics

# Enable CUDNN optimizations
torch.backends.cudnn.benchmark = True

# Define device
device = 'cpu'  # Change to 'cuda' if GPU is available

# Special padding token (ensure it's outside normal data range)
PAD_TOKEN = -9999.0

# Custom dataset for time series classification
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        # Replace NaN values with PAD_TOKEN
        X = [np.nan_to_num(x, nan=PAD_TOKEN) for x in X]

        # Convert to tensors
        self.X = [torch.tensor(x, dtype=torch.float32, device=device) for x in X]
        self.y = torch.tensor(y, dtype=torch.long, device=device)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    @staticmethod
    def collate_fn(batch):
        """Custom collate function to dynamically adjust sequence lengths per batch."""
        X_batch, y_batch = zip(*batch)

        # Convert to numpy arrays & ensure uniform shapes
        X_batch = [np.array(x.cpu(), dtype=np.float32) for x in X_batch]

        # Ensure sequences have at least 2 dimensions (time_steps, features)
        for i, x in enumerate(X_batch):
            if x.ndim == 1:  # If 1D, reshape to (len, 1)
                X_batch[i] = x.reshape(-1, 1)

        # Determine max sequence length in this batch, excluding padding tokens
        max_seq_len = max(len(x[~(x[:, 0] == PAD_TOKEN)]) for x in X_batch)

        # Ensure each sequence has the same feature dimension (based on dynamic max length)
        num_features = max(x.shape[1] for x in X_batch)

        # Truncate or pad each sequence
        X_padded = []
        masks = []
        for x in X_batch:
            x = x[~(x[:, 0] == PAD_TOKEN)]  # Remove padding
            x = x[:, :num_features]  # Ensure consistent feature count

            if x.shape[0] > max_seq_len:
                x = x[:max_seq_len]  # Truncate
            pad_size = max_seq_len - x.shape[0]
            x_padded = np.concatenate([x, np.full((pad_size, num_features), PAD_TOKEN, dtype=np.float32)], axis=0)

            mask = (x_padded[:, 0] != PAD_TOKEN).astype(np.float32)  # Mask: 1 for real data, 0 for padding

            X_padded.append(x_padded)
            masks.append(mask)

        # Convert to tensors
        X_padded = torch.tensor(np.array(X_padded), dtype=torch.float32, device=device)
        masks = torch.tensor(np.array(masks), dtype=torch.float32, device=device)
        y_batch = torch.tensor(np.array(y_batch), dtype=torch.long, device=device)

        return X_padded, masks, y_batch


# Transformer-based time series classification model
class TransformerModel(pl.LightningModule):
    def __init__(self, num_classes, num_features, d_model=64, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Linear(num_features, d_model)  # FIX: Map input features to d_model
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(d_model, num_classes)
        self.criterion = nn.CrossEntropyLoss()

        # Metrics
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes).to(device)
        self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes).to(device)
        self.train_precision = torchmetrics.Precision(task="multiclass", num_classes=num_classes).to(device)
        self.train_recall = torchmetrics.Recall(task="multiclass", num_classes=num_classes).to(device)

    def forward(self, x, mask):
        x = self.embedding(x)  # FIX: Ensure correct feature mapping
        x = x.permute(1, 0, 2)  # [seq_len, batch, d_model]
        x = self.transformer_encoder(x, src_key_padding_mask=mask.bool())
        x = x.permute(1, 2, 0)  # [batch, d_model, seq_len]
        x = self.global_avg_pool(x).squeeze(-1)  # [batch, d_model]
        return self.fc(x)  # [batch, num_classes]

    def training_step(self, batch, batch_idx):
        x, mask, y = batch
        y_hat = self(x, mask)
        loss = self.criterion(y_hat, y)

        # Compute metrics
        preds = torch.argmax(y_hat, dim=1)
        self.train_acc.update(preds, y)
        self.train_f1.update(preds, y)
        self.train_precision.update(preds, y)
        self.train_recall.update(preds, y)

        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", self.train_acc, prog_bar=True)
        self.log("train_f1", self.train_f1, prog_bar=True)
        self.log("train_precision", self.train_precision, prog_bar=True)
        self.log("train_recall", self.train_recall, prog_bar=True)

        return loss

    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=1e-3)


# Load dataset
m4_dataset = pd.read_parquet("../data/m4_preprocessed.parquet")

# Prepare dataset
X_data = [x for x in m4_dataset.drop("best_model", axis=1).values]
y_data = m4_dataset["best_model"].values


# Create dataset and dataloader
dataset = TimeSeriesDataset(X_data, y_data)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True, collate_fn=TimeSeriesDataset.collate_fn)

num_classes = len(np.unique(y_data))
num_features = X_data[0].shape[1]  # Get feature size dynamically
model = TransformerModel(num_classes=num_classes, num_features=num_features).to(device)


trainer = pl.Trainer(max_epochs=50, accelerator="cpu", devices=1, precision=16)
trainer.fit(model, dataloader)


IndexError: tuple index out of range

In [9]:
y_data

array([2, 2, 3, ..., 3, 6, 5])

✅ Preprocessing Complete! Data saved as Parquet.
