In [10]:
import pandas as pd
from collections import Counter
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import torchsummary
import torch.optim as optim

# Load the Datasets

In [21]:
# Read the pre-split train / validation / test frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train, val, test = map(pd.read_pickle, ("train.pk", "val.pk", "test.pk"))

In [22]:
# Number of distinct booking codes (the classifier's target classes) in the training split.
len(pd.unique(train["BookingCode"]))

100

In [23]:
def _build_index_maps(column: pd.Series):
    """Enumerate the distinct values of ``column`` as returned by ``Series.unique()``.

    Returns a pair ``(index_to_value, value_to_index)``: the first dict maps a
    running integer index to each distinct value, the second is its inverse.
    """
    index_to_value = dict(enumerate(column.unique().tolist()))
    value_to_index = {value: index for index, value in index_to_value.items()}
    return index_to_value, value_to_index


# BookingCode
label_dict, booking_code_to_label = _build_index_maps(train["BookingCode"])

# PartNumber
part_number_dict, part_number_to_label = _build_index_maps(train["PartNumber"])

# hdbscanCluster
cluster_dict, cluster_to_label = _build_index_maps(train["hdbscanCluster"])

In [29]:
# Convert to pytorch datasets

class CustomDataset(Dataset):
    """Map-style dataset that turns one DataFrame row into a sample dict.

    Every sample contains:

    * ``"text"``       - the ``Description`` column
    * ``"text_emb"``   - the ``descriptionEmbeddings`` column
    * ``"part_num"``   - integer index of the ``PartNumber`` value
    * ``"price"``      - the ``Price`` column
    * ``"price_norm"`` - the ``NormPrice`` column
    * ``"cluster"``    - integer index of the ``hdbscanCluster`` value
    * ``"label"``      - integer index of the ``BookingCode`` value (the target)

    Args:
        dataframe: Frame providing the columns listed above.
        preprocess_transforms: Optional callable applied to the sample dict
            before it is returned.
        part_number_map: Optional ``{part number: index}`` dict. When omitted,
            the notebook-level ``part_number_to_label`` is used.
        cluster_map: Optional ``{cluster id: index}`` dict. When omitted, the
            notebook-level ``cluster_to_label`` is used.
        label_map: Optional ``{booking code: index}`` dict. When omitted, the
            notebook-level ``booking_code_to_label`` is used.

    Raises:
        KeyError: from ``__getitem__`` when a row holds a value that is missing
            from the corresponding mapping (e.g. a category that was absent from
            the frame the mapping was built on).
    """

    def __init__(self, dataframe: pd.DataFrame, preprocess_transforms=None,
                 part_number_map=None, cluster_map=None, label_map=None):
        self.raw_data = dataframe
        self.preprocess_transforms = preprocess_transforms
        self.part_number_map = part_number_map
        self.cluster_map = cluster_map
        self.label_map = label_map

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.raw_data)

    @staticmethod
    def _encode(mapping, value, column):
        """Look ``value`` up in ``mapping``; on a miss raise a KeyError naming the column."""
        try:
            return mapping[value]
        except KeyError:
            raise KeyError(
                f"{column} value {value!r} has no index in the category mapping"
            ) from None

    def __getitem__(self, idx):
        """Return the sample dict for positional row ``idx`` (int or 0-d tensor)."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Resolve the mappings at access time so that, without explicit
        # overrides, the current notebook-level dicts are used.
        part_map = part_number_to_label if self.part_number_map is None else self.part_number_map
        cluster_map = cluster_to_label if self.cluster_map is None else self.cluster_map
        label_map = booking_code_to_label if self.label_map is None else self.label_map

        row = self.raw_data.iloc[idx]

        sample = {
            "text": row["Description"],
            "text_emb": row["descriptionEmbeddings"],
            "part_num": self._encode(part_map, row["PartNumber"], "PartNumber"),
            "price": row["Price"],
            "price_norm": row["NormPrice"],
            "cluster": self._encode(cluster_map, row["hdbscanCluster"], "hdbscanCluster"),
            # Target class; without it no loss can be computed from a batch.
            "label": self._encode(label_map, row["BookingCode"], "BookingCode"),
        }

        if self.preprocess_transforms is not None:
            sample = self.preprocess_transforms(sample)

        return sample


In [30]:
# Wrap each split in the Dataset adapter (no preprocessing transforms passed).
train_ds, val_ds, test_ds = (CustomDataset(frame) for frame in (train, val, test))

In [32]:
# Display the column index of the training frame.
train.columns

Index(['ID', 'PartNumber', 'Description', 'Count', 'SumPrice', 'BookingCode',
       'DocumentId', 'descriptionEmbeddings', 'Price', 'NormPrice',
       'hdbscanCluster'],
      dtype='object')

# Building the Model

In [None]:
class classifierModel(nn.Module):
    """Classifier that fuses text-embedding, part-number, price and cluster features.

    The part number and cluster index are one-hot encoded and projected to 768
    dimensions by bias-free linear layers; the scalar normalised price is
    projected to 768 dimensions as well. The three projections are summed with
    the text embedding and passed through
    ``BatchNorm(768) -> Linear(768, 384) -> BatchNorm(384) -> leaky_relu ->
    Linear(384, n_labels)``.

    Args:
        device: Device on which the layers are created and to which ``forward``
            moves its inputs.
        n_part_number: Number of part-number categories. Defaults to
            ``len(part_number_dict)`` from the notebook.
        n_cluster: Number of cluster categories. Defaults to
            ``len(cluster_dict)`` from the notebook.
        n_labels: Number of output classes. Defaults to ``len(label_dict)``
            from the notebook.
    """

    def __init__(self, device, n_part_number=None, n_cluster=None, n_labels=None):
        super().__init__()

        self.device = device

        self.n_part_number = len(part_number_dict) if n_part_number is None else n_part_number
        self.n_cluster = len(cluster_dict) if n_cluster is None else n_cluster
        self.n_labels = len(label_dict) if n_labels is None else n_labels

        # One-hot -> bias-free Linear acts as an embedding lookup. The Linear
        # layers are kept (rather than nn.Embedding) so parameter names and
        # shapes stay as originally declared.
        self.part_number_emb = nn.Linear(in_features=self.n_part_number, out_features=768, bias=False, device=self.device)
        self.price_adapter = nn.Linear(in_features=1, out_features=768, bias=False, device=self.device)
        self.cluster_emb = nn.Linear(in_features=self.n_cluster, out_features=768, bias=False, device=self.device)

        self.norm1 = nn.BatchNorm1d(768, device=self.device)
        self.linear1 = nn.Linear(in_features=768, out_features=384, device=self.device)

        self.norm2 = nn.BatchNorm1d(384, device=self.device)
        self.linear2 = nn.Linear(in_features=384, out_features=self.n_labels, device=self.device) # Classification Head

        self.relu = F.leaky_relu

    def forward(self, sample: dict):
        """Compute class logits for a batch (or a single sample) given as a dict.

        Args:
            sample: Dict with the keys ``"text_emb"``, ``"part_num"``,
                ``"price_norm"`` and ``"cluster"``. Values may be tensors,
                arrays or plain Python numbers/lists; a single un-batched
                sample is treated as a batch of one.
                Assumes ``text_emb`` has 768 features per row, matching the
                projection width - TODO confirm against the embedding model.

        Returns:
            Float tensor of shape ``(batch, n_labels)`` with unnormalised logits.
        """
        # as_tensor avoids the copy/warning torch.tensor() emits for inputs
        # that are already tensors (e.g. after DataLoader collation).
        text_emb = torch.as_tensor(sample["text_emb"], dtype=torch.float32, device=self.device)
        part_num = torch.as_tensor(sample["part_num"], device=self.device).long()
        price = torch.as_tensor(sample["price_norm"], dtype=torch.float32, device=self.device)
        cluster = torch.as_tensor(sample["cluster"], device=self.device).long()

        # Normalise every input to an explicit leading batch dimension.
        text_emb = text_emb.reshape(-1, text_emb.shape[-1])
        part_num = part_num.reshape(-1)
        price = price.reshape(-1, 1)  # Linear(in_features=1) needs a trailing feature axis
        cluster = cluster.reshape(-1)

        # F.one_hot yields int64; cast to float so it can be multiplied by the weights.
        part_emb = self.part_number_emb(F.one_hot(part_num, num_classes=self.n_part_number).float())
        price_emb = self.price_adapter(price)
        cluster_emb = self.cluster_emb(F.one_hot(cluster, num_classes=self.n_cluster).float())

        # Element-wise sum of the four (batch, 768) feature tensors.
        out = torch.stack((text_emb, part_emb, price_emb, cluster_emb), dim=0).sum(dim=0)

        # NOTE(review): the layer order follows the feature sizes declared in
        # __init__; placing the activation after norm2 is an assumption - confirm.
        out = self.linear1(self.norm1(out))
        out = self.relu(self.norm2(out))
        return self.linear2(out)
        

        
            
    

In [None]:
def train_model(model, train_loader, val_loader, epochs=10, lr=1e-4,
                device=None, input_key="image", label_key="class"):
    """Train ``model`` with Adam + cross-entropy and report validation accuracy.

    Args:
        model: ``nn.Module`` returning class logits of shape ``(batch, n_classes)``.
        train_loader: Iterable of dict batches used for optimisation.
        val_loader: Iterable of dict batches used for the accuracy check.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device to train on. When ``None``: CPU if CUDA is unavailable,
            otherwise ``cuda:1`` if at least two GPUs are visible, else
            ``cuda:0`` (the previous hard-coded ``cuda:1`` failed on
            single-GPU hosts).
        input_key: Batch key holding the model input tensor. Pass ``None`` to
            hand the whole batch dict to the model, for models whose
            ``forward`` takes a dict and moves its inputs to the device itself.
        label_key: Batch key holding the integer class labels.
            NOTE(review): the ``"image"`` / ``"class"`` defaults are kept for
            backward compatibility; they must match the keys the loaders yield.

    Returns:
        A list with one dict per epoch: ``{"epoch", "loss", "val_acc"}``.
        ``loss`` is the sample-weighted mean training loss; either value is
        ``nan`` when the corresponding loader produced no samples.
    """
    if device is None:
        if not torch.cuda.is_available():
            device = "cpu"
        elif torch.cuda.device_count() > 1:
            device = "cuda:1"
        else:
            device = "cuda:0"
    device = torch.device(device)

    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    def _unpack(batch):
        """Split a dict batch into (model input, labels on ``device``)."""
        labels = batch[label_key].to(device).long()
        if input_key is None:
            return batch, labels
        return batch[input_key].to(device).float(), labels

    history = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        n_train = 0
        for batch in train_loader:
            inputs, labels = _unpack(batch)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the mean.
            running_loss += loss.item() * labels.size(0)
            n_train += labels.size(0)

        # Divide by the samples actually seen (robust to drop_last / empty loaders).
        epoch_loss = running_loss / n_train if n_train else float("nan")

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs, labels = _unpack(batch)
                preds = model(inputs).argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        # Guard against an empty validation loader instead of dividing by zero.
        val_acc = correct / total if total else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Val Acc: {val_acc:.4f}")
        history.append({"epoch": epoch + 1, "loss": epoch_loss, "val_acc": val_acc})

    return history