In [None]:
import torch

torch.manual_seed(42)
import lightning.pytorch as pl
import numpy as np
import pandas as pd
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch import nn, optim, utils
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm.notebook import tqdm

torch.set_float32_matmul_precision("high")

In [None]:
class TabularDataLoader(pl.LightningDataModule):
    """Lightning data module that turns a pandas DataFrame into train/val/test loaders.

    Numeric feature columns are standardised, object/category feature columns are
    label-encoded, and the target column is label-encoded into class indices.
    Features are laid out as ``num_cols`` followed by ``cat_cols``.

    Args:
        df: Source frame holding the feature columns and the target column.
        target_column: Name of the column to predict.
        batch_size: Batch size used by all three loaders.
        num_workers: Worker processes per loader.
        test_size: Fraction of rows held out for testing.
        val_size: Fraction of rows held out for validation.
        split_seed: Seed of the private generator that draws the split, so the
            partition does not depend on the state of the global RNG.
    """

    def __init__(
        self,
        df,
        target_column,
        batch_size=32,
        num_workers=4,
        test_size=0.2,
        val_size=0.1,
        split_seed=42,
    ):
        super().__init__()

        self.df = df
        self.target_column = target_column
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.test_size = test_size
        self.val_size = val_size
        self.split_seed = split_seed
        self.scaler = StandardScaler()
        self.label_encoders = {}  # Fitted LabelEncoder per categorical feature column
        self.target_encoder = LabelEncoder()

        # Populated by prepare_data() / setup().
        self.num_cols = None
        self.cat_cols = None
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def prepare_data(self):
        """Identify the feature columns and fit the categorical/target encoders.

        Lightning may run this hook on a single process only, so ``setup`` repeats
        the step whenever it has not happened on the current process.
        """
        self._fit_encoders()

    def setup(self, stage=None):
        """Build the train/val/test datasets once.

        Lightning calls this hook again for every ``fit``/``validate``/``test`` run.
        Re-drawing the split on each call would let test rows overlap with rows the
        model was trained on, so the first partition is kept for the lifetime of the
        module and is drawn from a dedicated seeded generator.
        """
        if self.train_dataset is not None:
            return
        if self.num_cols is None:
            self._fit_encoders()

        df = self.df
        n_rows = len(df)
        train_size, val_size, test_size = self._split_sizes(n_rows)

        # Draw the row partition first so the scaler can be fitted on training rows only.
        generator = torch.Generator().manual_seed(self.split_seed)
        index_splits = random_split(
            range(n_rows), [train_size, val_size, test_size], generator=generator
        )
        train_idx, val_idx, test_idx = (
            np.asarray(split.indices, dtype=np.int64) for split in index_splits
        )

        if self.num_cols:
            numeric = df[self.num_cols]
            # Fitting on the training rows keeps val/test statistics out of the scaling.
            self.scaler.fit(numeric.iloc[train_idx])
            num_features = self.scaler.transform(numeric)
        else:
            num_features = np.zeros((n_rows, 0))  # No numeric features

        cat_features = self._encode_categorical(df)

        X = np.hstack([num_features, cat_features])
        y = self.target_encoder.transform(df[self.target_column])

        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y, dtype=torch.long)

        def rows(indices):
            picked = torch.as_tensor(indices, dtype=torch.long)
            return TensorDataset(X_tensor[picked], y_tensor[picked])

        self.train_dataset = rows(train_idx)
        self.val_dataset = rows(val_idx)
        self.test_dataset = rows(test_idx)

    def _fit_encoders(self):
        """Record numeric/categorical feature columns and fit the label encoders."""
        df = self.df

        # "number" covers every numeric dtype (float32, int32, ...), not just 64-bit ones.
        numeric = df.select_dtypes(include="number").columns
        categorical = df.select_dtypes(include=["object", "category"]).columns
        self.num_cols = [col for col in numeric if col != self.target_column]
        self.cat_cols = [col for col in categorical if col != self.target_column]

        self.label_encoders = {col: LabelEncoder().fit(df[col]) for col in self.cat_cols}
        self.target_encoder.fit(df[self.target_column])

    def _encode_categorical(self, df):
        """Return the label-encoded categorical features as an ``(n_rows, n_cat)`` array."""
        encoded = []
        for col in self.cat_cols:
            encoder = self.label_encoders[col]
            try:
                codes = encoder.transform(df[col])
            except ValueError:
                # Unseen categories: keep the codes of known values and send only the
                # unknown ones to the default code 0 instead of zeroing the whole column.
                values = df[col].to_numpy()
                known = np.isin(values, encoder.classes_)
                codes = np.zeros(len(values), dtype=np.int64)
                if known.any():
                    codes[known] = encoder.transform(values[known])
            encoded.append(np.asarray(codes).reshape(-1, 1))

        if not encoded:
            return np.zeros((len(df), 0))
        return np.hstack(encoded)

    def _split_sizes(self, n_rows):
        """Return ``(train, val, test)`` row counts that always sum to ``n_rows``.

        Validation and test get at least one row each when possible; if the requested
        fractions leave no training data, one row is reserved for training, validation
        keeps as much of its share as fits, and test receives the remainder.
        """
        if n_rows < 1:
            raise ValueError("Cannot split an empty DataFrame.")

        test_size = max(1, int(self.test_size * n_rows))
        val_size = max(1, int(self.val_size * n_rows))
        train_size = n_rows - test_size - val_size

        if train_size < 1:
            train_size = 1
            val_size = min(val_size, n_rows - train_size)
            test_size = n_rows - train_size - val_size

        return train_size, val_size, test_size

    def _make_loader(self, dataset, shuffle):
        """Create a DataLoader with the module-wide batch and worker settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=True,
        )

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Sequential loader over the validation split."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Sequential loader over the test split."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [None]:
# Columns kept for modelling: the selected features followed by the target.
selected_columns = [
    "adf_p",
    "approx_entropy",
    "kpss_p",
    "hurst_exponent",
    "dfa",
    "spectral_entropy",
    "acf_3",
    "acf_4",
    "acf_2",
    "cv",
    "acf_5",
    "stl_seasonal_std",
    "acf_6",
    "pacf_1",
    "acf_7",
    "acf_1",
    "acf_8",
    "svd_entropy",
    "longest_positive_run",
    "no_of_datapoints",
    "stl_trend_std",
    "skewness",
    "perm_entropy",
    "acf_9",
    "iqr",
    "num_peaks",
    "mad",
    "std",
    "medad",
    "fft_peak",
    "best_model",
]

df = (
    pd.read_parquet("data/extracted_features.parquet")
    # Treat infinities as missing, then drop every column that has a missing value.
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
    .loc[:, selected_columns]
    # The granularity code is the first element of each index label.
    .assign(granularity=lambda frame: frame.index.map(lambda key: key[0]))
    # Keep only the rows whose granularity is "Y".
    .loc[lambda frame: frame.granularity == "Y"]
)

In [None]:
# Display the filtered feature frame (selected features, target and granularity).
df

Unnamed: 0,adf_p,approx_entropy,kpss_p,hurst_exponent,dfa,spectral_entropy,acf_3,acf_4,acf_2,cv,...,perm_entropy,acf_9,iqr,num_peaks,mad,std,medad,fft_peak,best_model,granularity
Y22913,0.853813,0.173891,0.044816,1.075553,1.712483,2.043767,0.083644,0.029903,0.288571,0.154704,...,2.418961,0.022624,165.099976,4.0,131.534302,182.511734,90.969971,2.181735e+04,AutoETS,Y
Y22914,0.474260,0.212835,0.010610,0.871953,1.713866,2.067558,0.493805,0.361643,0.616590,0.334599,...,1.140068,0.216120,714.828918,2.0,385.583893,463.934845,375.533325,2.564162e+04,AutoTheta,Y
Y22923,0.012405,0.121413,0.061821,0.742512,1.305405,0.950454,0.347210,0.233375,0.566274,0.219326,...,2.307189,0.495337,600.000000,4.0,324.321350,363.265717,320.000000,3.063000e+04,AutoTheta,Y
Y22924,0.944756,0.198430,0.019794,1.089394,1.795032,1.645063,0.501115,0.325602,0.625242,0.325791,...,2.063559,0.314846,3060.000000,4.0,1802.493042,2142.020020,900.000000,1.215900e+05,CES,Y
Y22925,0.988861,0.157908,0.013609,0.735311,1.330696,1.969494,0.485675,0.396006,0.569674,0.271941,...,1.556901,0.266161,1173.000000,2.0,575.623230,656.248779,621.000000,4.462800e+04,AutoMFLES,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y21058,0.006888,0.091336,0.027579,0.939807,1.260311,4.397556,0.897082,0.861514,0.932822,0.151439,...,2.281913,0.681457,1030.000000,137.0,872.500366,1431.285767,450.000000,6.998620e+06,AutoRegressive,Y
Y9954,0.139216,0.069007,0.010000,0.912576,1.402497,2.857009,0.935567,0.914806,0.956925,0.295074,...,1.079133,0.823715,997.750000,60.0,526.130981,627.253235,488.000000,1.627263e+06,AutoARIMA,Y
Y22755,0.031039,0.109345,0.010000,0.934446,1.392672,2.933853,0.924948,0.900637,0.949775,0.205451,...,1.205908,0.790049,3070.000000,76.0,1448.790405,1631.843750,1560.000000,6.358150e+06,AutoRegressive,Y
Y20492,0.991577,0.024429,0.010000,0.914175,1.692145,2.062266,0.991239,0.988084,0.994248,0.617424,...,1.959285,0.972205,5433.891602,122.0,2752.426025,3094.803223,3119.452881,4.142784e+06,CES,Y


In [None]:
# Settings for the data module.
target_column = "best_model"
batch_size = 6 * 2408
num_workers = 24
test_size, val_size = 0.2, 0.1

ds = TabularDataLoader(
    df=df,
    target_column=target_column,
    batch_size=batch_size,
    num_workers=num_workers,
    test_size=test_size,
    val_size=val_size,
)

In [None]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics


class NN(pl.LightningModule):
    """Fully connected classifier for tabular features.

    Five ``Linear -> BatchNorm1d -> GELU -> Dropout`` stages followed by a linear
    head that returns class logits. Training uses cross-entropy, AdamW and a
    ``ReduceLROnPlateau`` schedule driven by ``val_loss``.

    Logged keys: ``{train,val,test}_loss``, ``{train,val,test}_accuracy`` and
    ``{train,val,test}_f1score`` (macro averaged, computed once per epoch).

    Args:
        input_shape: Number of input features per sample.
        output_shape: Number of target classes.
        lr: Initial learning rate for AdamW.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_shape, output_shape, lr=1e-3, dropout=0.1):
        super().__init__()
        # Store the constructor arguments in checkpoints so the model can be rebuilt.
        self.save_hyperparameters()

        self.fc1 = nn.Linear(input_shape, 256)
        self.bn1 = nn.BatchNorm1d(256)

        self.fc2 = nn.Linear(256, 512)
        self.bn2 = nn.BatchNorm1d(512)

        self.fc3 = nn.Linear(512, 512)
        self.bn3 = nn.BatchNorm1d(512)

        self.fc4 = nn.Linear(512, 256)
        self.bn4 = nn.BatchNorm1d(256)

        self.fc5 = nn.Linear(256, 128)
        self.bn5 = nn.BatchNorm1d(128)

        self.fc6 = nn.Linear(128, output_shape)

        self.dropout = nn.Dropout(dropout)  # Dropout for regularization

        # One metric collection per stage: a shared instance would mix the accumulated
        # state of training, validation and test batches.
        metrics = torchmetrics.MetricCollection(
            {
                "accuracy": torchmetrics.Accuracy(
                    task="multiclass", num_classes=output_shape, average="macro"
                ),
                "f1score": torchmetrics.F1Score(
                    task="multiclass", num_classes=output_shape, average="macro"
                ),
            }
        )
        self.train_metrics = metrics.clone(prefix="train_")
        self.val_metrics = metrics.clone(prefix="val_")
        self.test_metrics = metrics.clone(prefix="test_")

        self.lr = lr

        self._init_weights()

    def _init_weights(self):
        """Apply Kaiming-normal weights and zero biases to every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return the class logits for a batch of feature rows."""
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
            (self.fc5, self.bn5),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(F.gelu(norm(linear(x))))

        return self.fc6(x)  # Output logits

    def _common_step(self, batch, batch_idx):
        """Run the forward pass and return ``(loss, logits, targets)``."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        return loss, logits, y

    def _log_epoch_metrics(self, metrics):
        """Log the epoch-level values of ``metrics`` and clear their state."""
        self.log_dict(metrics.compute(), prog_bar=True)
        metrics.reset()

    def training_step(self, batch, batch_idx):
        loss, logits, y = self._common_step(batch, batch_idx)
        self.train_metrics.update(logits, y)
        # Epoch-level logging keeps the key "train_loss" and is independent of the
        # trainer's step logging interval.
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        self._log_epoch_metrics(self.train_metrics)

    def validation_step(self, batch, batch_idx):
        loss, logits, y = self._common_step(batch, batch_idx)
        self.val_metrics.update(logits, y)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        # Macro metrics are computed over the whole epoch instead of averaging
        # per-batch values, which is not equivalent for macro F1.
        self._log_epoch_metrics(self.val_metrics)

    def test_step(self, batch, batch_idx):
        loss, logits, y = self._common_step(batch, batch_idx)
        self.test_metrics.update(logits, y)
        self.log("test_loss", loss, prog_bar=True)
        return loss

    def on_test_epoch_end(self):
        self._log_epoch_metrics(self.test_metrics)

    def configure_optimizers(self):
        """Return AdamW plus a plateau scheduler that halves the LR on stalled ``val_loss``."""
        optimizer = optim.AdamW(self.parameters(), lr=self.lr, weight_decay=1e-4)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.5, patience=5
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
        }

In [None]:
# Hyperparameters
# Take the layer sizes from the data module itself: it only emits the columns it
# recognises as numeric or categorical, so counting DataFrame columns can disagree
# with the width of the tensors the model actually receives.
ds.prepare_data()
input_shape = len(ds.num_cols) + len(ds.cat_cols)
output_shape = len(ds.target_encoder.classes_)

model = NN(input_shape, output_shape)

In [None]:
# NOTE(review): the first cell binds `pl` to `lightning.pytorch` and the model cell
# rebinds it to `pytorch_lightning`; the data module and this Trainer should come from
# the same package — confirm and keep a single import.
logger = pl.loggers.TensorBoardLogger(save_dir="./log/", name="model_classifier", version=0.1)


# Keep the three checkpoints with the highest "val_f1score".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    save_top_k=3,
    monitor="val_f1score",
    mode="max",
    dirpath="checkpoints/",
    filename="model-classifier-{epoch}-{val_f1score}",
)

# Path of a checkpoint to resume from, or None to train from the current weights.
# (A freshly created ModelCheckpoint has an empty `best_model_path`, so it cannot be
# used to decide whether to resume.)
resume_from_checkpoint = None

trainer = pl.Trainer(
    logger=logger,
    accelerator="auto",
    devices=1,  # one device of whichever accelerator is selected
    min_epochs=1,
    max_epochs=1000,
    log_every_n_steps=1,  # an epoch has fewer batches than the default interval of 50
    enable_model_summary=True,
    callbacks=[
        pl.callbacks.EarlyStopping("val_loss", patience=15, verbose=False),
        checkpoint_callback,
    ],
    enable_checkpointing=True,
)
trainer.fit(model, ds, ckpt_path=resume_from_checkpoint)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/pranav-pc/projects/ts/ts/classification/.venv/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/pranav-pc/projects/ts/nbs/src/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name     | Type               | Params | Mode 
---------------------------------------------------------
0  | fc1      | Linear             | 8.2 K  | train
1  | bn1      | BatchNorm1d        | 512    | train
2  | fc2      | Linear             | 131 K  | train
3  | bn2      | BatchNorm1d        | 1.0 K  | train
4  | fc3      | Linear             | 262 K  | train
5  | bn3      | BatchNorm1d        | 1.0 K  | train
6  | fc4      | Linear             | 131 K  | train
7  | bn4      | BatchNorm1d        | 512    | train
8  | fc5      | Linear             | 32.9 K | train
9  | bn5      | BatchNorm1d        

Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

/home/pranav-pc/projects/ts/ts/classification/.venv/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

In [None]:
# Validate the best checkpoint tracked by the ModelCheckpoint callback rather than
# the weights left over from the last training epoch.
trainer.validate(model, datamodule=ds, ckpt_path="best");

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |                                             | 0/? [00:00<?, ?it/s]

In [None]:
# Test the best checkpoint tracked by the ModelCheckpoint callback rather than the
# weights left over from the last training epoch.
trainer.test(model, datamodule=ds, ckpt_path="best");

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                | 0/? [00:00<?, ?it/s]