In [1]:
import pandas as pd
from collections import Counter
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import torchsummary
import torch.optim as optim
import wandb 

  from .autonotebook import tqdm as notebook_tqdm


# Load the Datasets

In [2]:
# Read the three pre-split DataFrames from their pickle files.
# NOTE(review): unpickling executes arbitrary code -- only load files you trust.
train, val, test = map(pd.read_pickle, ("train.pk", "val.pk", "test.pk"))

In [3]:
# Number of distinct booking codes in the training split (missing values counted too).
train["BookingCode"].nunique(dropna=False)

100

In [4]:
# Pool every split so that each category seen anywhere receives an integer index.
full_ds = pd.concat([train, val, test])

# BookingCode: index -> code, plus the inverse lookup used to encode targets.
label_dict = dict(enumerate(full_ds["BookingCode"].unique().tolist()))
booking_code_to_label = {code: idx for idx, code in label_dict.items()}

# PartNumber: index -> part number, plus the inverse lookup.
part_number_dict = dict(enumerate(full_ds["PartNumber"].unique().tolist()))
part_number_to_label = {part: idx for idx, part in part_number_dict.items()}

# hdbscanCluster: index -> cluster id, plus the inverse lookup.
cluster_dict = dict(enumerate(full_ds["hdbscanCluster"].unique().tolist()))
cluster_to_label = {cluster: idx for idx, cluster in cluster_dict.items()}

In [5]:
# Convert to pytorch datasets

class CustomDataset(Dataset):
    """Map-style dataset that serves one DataFrame row per item as a feature dict.

    Categorical columns (``PartNumber``, ``hdbscanCluster``, ``BookingCode``)
    are encoded to integer indices through the module-level lookup dicts
    ``part_number_to_label``, ``cluster_to_label`` and ``booking_code_to_label``.

    Args:
        dataframe: Source rows; read by position with ``iloc``.
        preprocess_transforms: Optional callable applied to the sample dict
            before it is returned.
    """

    def __init__(self, dataframe: pd.DataFrame, preprocess_transforms=None):
        self.preprocess_transforms = preprocess_transforms
        self.raw_data = dataframe

    def __len__(self):
        return len(self.raw_data.index)

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before positional lookup.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        record = self.raw_data.iloc[position]

        features = {
            "text": record["Description"],
            "text_emb": record["descriptionEmbeddings"],
            "part_num": part_number_to_label[record["PartNumber"]],
            "price": [record["Price"]],
            "price_norm": record["NormPrice"],
            "cluster": cluster_to_label[record["hdbscanCluster"]],
            "label": booking_code_to_label[record["BookingCode"]],
        }

        transform = self.preprocess_transforms
        return features if transform is None else transform(features)


In [6]:
# Wrap each split in a CustomDataset; no preprocessing transform is attached.
train_ds, val_ds, test_ds = (CustomDataset(split) for split in (train, val, test))

In [7]:
# Dataloaders 

BATCH_SIZE = 100

# Only the training split is shuffled: randomising batch order helps SGD, while
# evaluation metrics do not depend on order and a fixed order keeps
# validation/test passes reproducible.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# Pull a single batch for inspection. Unlike a `for ... break` loop, this raises
# immediately if the loader is empty instead of leaving `batch` undefined.
batch = next(iter(train_loader))

In [9]:
# Shape of the label tensor in the sampled batch.
batch["label"].shape

torch.Size([100])

In [10]:
# Column names available in the training DataFrame.
train.columns

Index(['ID', 'PartNumber', 'Description', 'Count', 'SumPrice', 'BookingCode',
       'DocumentId', 'descriptionEmbeddings', 'Price', 'NormPrice',
       'hdbscanCluster'],
      dtype='object')

# Building the Model

In [11]:
class classifierModel(nn.Module):
    """MLP classifier over the sum of text, part-number, price and cluster features.

    Part number and cluster indices are one-hot encoded and projected to 768
    dimensions by bias-free linear layers; the normalised price is projected
    from a single scalar. The three projections are added element-wise to the
    768-dimensional text embedding, and the result passes through three
    BatchNorm -> Dropout -> Linear stages (768 -> 384 -> 192 -> number of labels)
    with leaky-ReLU activations between them. The output is raw logits.

    Vocabulary sizes are read from the module-level ``part_number_dict``,
    ``cluster_dict`` and ``label_dict``.

    Args:
        device: Device on which the parameters are created and to which the
            inputs are moved in ``forward``.
    """

    def __init__(self, device):
        super().__init__()

        self.device = device

        self.n_part_number = len(part_number_dict)
        self.n_cluster = len(cluster_dict)

        # Bias-free linear layers over one-hot inputs act as embedding lookups.
        self.part_number_emb = nn.Linear(in_features=self.n_part_number, out_features=768, bias=False, device=self.device)
        self.price_adapter = nn.Linear(in_features=1, out_features=768, bias=False, device=self.device)
        self.cluster_emb = nn.Linear(in_features=self.n_cluster, out_features=768, bias=False, device=self.device)

        self.norm1 = nn.BatchNorm1d(768, device=self.device)
        self.linear1 = nn.Linear(in_features=768, out_features=384, device=self.device)
        self.dropout1 = nn.Dropout(p=0.5)

        self.norm2 = nn.BatchNorm1d(384, device=self.device)
        self.linear2 = nn.Linear(in_features=384, out_features=192, device=self.device)
        self.dropout2 = nn.Dropout(p=0.5)

        self.norm3 = nn.BatchNorm1d(192, device=self.device)
        self.linear3 = nn.Linear(in_features=192, out_features=len(label_dict), device=self.device)  # Classification head
        self.dropout3 = nn.Dropout(p=0.5)

        self.relu = F.leaky_relu

    def forward(self, sample: dict):
        """Return class logits for a batch.

        Args:
            sample: Batch dict with keys ``"text_emb"``, ``"part_num"``,
                ``"price_norm"`` and ``"cluster"``. Values may be tensors or
                anything ``torch.as_tensor`` accepts.

        Returns:
            Tensor of logits with one row per batch element and one column per
            entry in ``label_dict``.
        """
        # as_tensor (rather than torch.tensor) avoids a redundant copy and the
        # "copy construct from a tensor" UserWarning when the values are
        # already tensors.
        text_emb = torch.as_tensor(sample["text_emb"], dtype=torch.float32, device=self.device)
        part_num = torch.as_tensor(sample["part_num"], dtype=torch.long, device=self.device)
        price = torch.as_tensor(sample["price_norm"], dtype=torch.float32, device=self.device)
        price = price.reshape(price.shape[0], 1)  # one scalar feature per row
        cluster = torch.as_tensor(sample["cluster"], dtype=torch.long, device=self.device)

        part_emb = self.part_number_emb(F.one_hot(part_num, num_classes=self.n_part_number).float())
        price_emb = self.price_adapter(price)
        cluster_emb = self.cluster_emb(F.one_hot(cluster, num_classes=self.n_cluster).float())

        # Fuse the four 768-d representations by element-wise addition.
        out = torch.sum(torch.stack([text_emb, part_emb, price_emb, cluster_emb], dim=0), dim=0)

        out = self.norm1(out)
        out = self.dropout1(out)
        out = self.linear1(out)
        out = self.relu(out)

        out = self.norm2(out)
        out = self.dropout2(out)
        out = self.linear2(out)
        out = self.relu(out)

        out = self.norm3(out)
        out = self.dropout3(out)
        out = self.linear3(out)

        return out
               
            
    

In [12]:
# Prefer the second GPU when one exists; otherwise fall back to the first GPU,
# then to CPU. Requesting "cuda:1" on a single-GPU machine would fail.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = classifierModel(device=device)

In [13]:

def train_model(model, train_loader, val_loader, epochs=10, lr=1e-4, l1=0, l2=0, run_name=None):
    """Train ``model`` with Adam and cross-entropy, logging each epoch to Weights & Biases.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    validation loss or validation accuracy improves, the weights are written to
    ``best_val_loss{suffix}.pth`` / ``best_val_acc{suffix}.pth`` in the current
    working directory, where ``suffix`` is ``_{run_name}`` or empty.

    Args:
        model: Module exposing a ``device`` attribute and accepting a batch dict.
        train_loader: Loader yielding batch dicts that contain a ``"label"``
            tensor; must expose ``.dataset`` for loss averaging.
        val_loader: Loader with the same contract, used for validation.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        l1: Coefficient of an L1 penalty over all parameters added to the
            training loss; ``0`` disables it.
        l2: Passed to Adam as ``weight_decay``.
        run_name: Optional suffix for checkpoint file names; also recorded in
            the W&B config.

    Returns:
        None.
    """
    # Initialize Weights & Biases
    wandb.init(
        project="Buchungscodes",
        config={
            "epochs": epochs,
            "learning_rate": lr,
            "architecture": str(model),
            "optimizer": "Adam",
            "loss_function": "CrossEntropyLoss",
            "l1": l1,
            "l2": l2,
            "run_name": run_name,
        },
        reinit=True,
    )
    config = wandb.config
    device = model.device

    # Keep a handle on the uncompiled module. The compiled wrapper shares its
    # parameters, but its state_dict keys may carry a wrapper prefix
    # (`_orig_mod.`) that prevents loading the checkpoint into a plain model.
    raw_model = model
    model = torch.compile(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=l2)  # L2 regularization

    wandb.watch(model, log="all", log_freq=10)

    best_val_loss = float('inf')
    best_val_acc = 0.0

    # Checkpoint file-name suffix; constant for the whole run.
    run_suffix = f"_{run_name}" if run_name else ""

    try:
        for epoch in range(1, epochs + 1):
            model.train()
            running_loss = 0.0
            for batch in train_loader:
                labels = batch["label"].to(device)
                optimizer.zero_grad()
                outputs = model(batch)
                loss = criterion(outputs, labels)

                # L1 regularization
                if l1 > 0:
                    l1_loss = sum(p.abs().sum() for p in model.parameters())
                    loss = loss + l1 * l1_loss

                loss.backward()
                optimizer.step()
                # Weighted by batch size so the epoch average is per-sample.
                # The reported training loss includes the L1 term when enabled.
                running_loss += loss.item() * labels.size(0)

            epoch_loss = running_loss / len(train_loader.dataset)

            # Validation
            model.eval()
            val_loss = 0.0
            correct = 0
            total = 0
            with torch.no_grad():
                for batch in val_loader:
                    labels = batch["label"].to(device)
                    outputs = model(batch)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item() * labels.size(0)
                    _, preds = torch.max(outputs, 1)
                    correct += (preds == labels).sum().item()
                    total += labels.size(0)

            epoch_val_loss = val_loss / len(val_loader.dataset)
            val_acc = correct / total

            # Log to wandb
            wandb.log({
                "Epoch": epoch,
                "Training Loss": epoch_loss,
                "Validation Loss": epoch_val_loss,
                "Validation Accuracy": val_acc
            })

            print(f"Epoch {epoch}/{epochs} | "
                  f"Train Loss: {epoch_loss:.4f} | "
                  f"Val Loss: {epoch_val_loss:.4f} | "
                  f"Val Acc: {val_acc:.4f}", end="\r")

            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                torch.save(raw_model.state_dict(), f"best_val_loss{run_suffix}.pth")
                print(f"\nSaved Best Val Loss Model at epoch {epoch}")

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save(raw_model.state_dict(), f"best_val_acc{run_suffix}.pth")
                print(f"--> Saved Best Val Acc Model at epoch {epoch}")
    finally:
        # Close the W&B run even when training is interrupted or raises, so
        # the run is not left open in the kernel.
        wandb.finish()

In [None]:
# Train the full-feature baseline (text + part number + price + cluster).
train_model(
    model,
    train_loader,
    val_loader,
    epochs=2000,
    lr=1e-4,
    l2=0,
    run_name="BaseModel",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlukas-bckrs[0m ([33mlukas-beckers[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return func(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return func(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return func(*args, **kwargs)


Epoch 1/2000 | Train Loss: 4.7723 | Val Loss: 4.3660 | Val Acc: 0.1823
Saved Best Val Loss Model at epoch 1
--> Saved Best Val Acc Model at epoch 1
Epoch 2/2000 | Train Loss: 4.4850 | Val Loss: 4.0486 | Val Acc: 0.2952
Saved Best Val Loss Model at epoch 2
--> Saved Best Val Acc Model at epoch 2
Epoch 3/2000 | Train Loss: 4.2267 | Val Loss: 3.8012 | Val Acc: 0.3469
Saved Best Val Loss Model at epoch 3
--> Saved Best Val Acc Model at epoch 3
Epoch 4/2000 | Train Loss: 4.0314 | Val Loss: 3.5871 | Val Acc: 0.3660
Saved Best Val Loss Model at epoch 4
--> Saved Best Val Acc Model at epoch 4
Epoch 5/2000 | Train Loss: 3.8217 | Val Loss: 3.4225 | Val Acc: 0.3728
Saved Best Val Loss Model at epoch 5
--> Saved Best Val Acc Model at epoch 5
Epoch 6/2000 | Train Loss: 3.6716 | Val Loss: 3.2387 | Val Acc: 0.3878
Saved Best Val Loss Model at epoch 6
--> Saved Best Val Acc Model at epoch 6
Epoch 7/2000 | Train Loss: 3.4976 | Val Loss: 3.1074 | Val Acc: 0.4177
Saved Best Val Loss Model at epoch 7
--> 

# Feature Selection 

In [None]:
class justTextModel(nn.Module):
    """MLP classifier that uses only the 768-dimensional text embedding.

    The embedding passes through three BatchNorm -> Dropout -> Linear stages
    (768 -> 384 -> 192 -> number of labels) with leaky-ReLU activations between
    them. The output is raw logits. The number of labels is read from the
    module-level ``label_dict``.

    Args:
        device: Device on which the parameters are created and to which the
            input is moved in ``forward``.
    """

    def __init__(self, device):
        super().__init__()

        self.device = device

        self.norm1 = nn.BatchNorm1d(768, device=self.device)
        self.linear1 = nn.Linear(in_features=768, out_features=384, device=self.device)
        self.dropout1 = nn.Dropout(p=0.5)

        self.norm2 = nn.BatchNorm1d(384, device=self.device)
        self.linear2 = nn.Linear(in_features=384, out_features=192, device=self.device)
        self.dropout2 = nn.Dropout(p=0.5)

        self.norm3 = nn.BatchNorm1d(192, device=self.device)
        self.linear3 = nn.Linear(in_features=192, out_features=len(label_dict), device=self.device)  # Classification head
        self.dropout3 = nn.Dropout(p=0.5)

        self.relu = F.leaky_relu

    def forward(self, sample: dict):
        """Return class logits for a batch.

        Args:
            sample: Batch dict; only the ``"text_emb"`` entry is read. It may be
                a tensor or anything ``torch.as_tensor`` accepts.

        Returns:
            Tensor of logits with one row per batch element and one column per
            entry in ``label_dict``.
        """
        # as_tensor (rather than torch.tensor) avoids a redundant copy and the
        # "copy construct from a tensor" UserWarning when the value is already
        # a tensor.
        text_emb = torch.as_tensor(sample["text_emb"], dtype=torch.float32, device=self.device)

        out = self.norm1(text_emb)
        out = self.dropout1(out)
        out = self.linear1(out)
        out = self.relu(out)

        out = self.norm2(out)
        out = self.dropout2(out)
        out = self.linear2(out)
        out = self.relu(out)

        out = self.norm3(out)
        out = self.dropout3(out)
        out = self.linear3(out)

        return out

In [None]:
# Prefer the second GPU when one exists; otherwise fall back to the first GPU,
# then to CPU. Requesting "cuda:1" on a single-GPU machine would fail.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

just_text_model = justTextModel(device=device)

train_model(just_text_model, train_loader, val_loader, epochs=2000, lr=1e-4, l2=0, run_name="Just_Text_Base")

In [None]:
class TextPartModel(nn.Module):
    """MLP classifier over the sum of the text embedding and a part-number embedding.

    The part-number index is one-hot encoded and projected to 768 dimensions by
    a bias-free linear layer, then added element-wise to the 768-dimensional
    text embedding. The result passes through three BatchNorm -> Dropout ->
    Linear stages (768 -> 384 -> 192 -> number of labels) with leaky-ReLU
    activations between them. The output is raw logits.

    Vocabulary sizes are read from the module-level ``part_number_dict`` and
    ``label_dict``.

    Args:
        device: Device on which the parameters are created and to which the
            inputs are moved in ``forward``.
    """

    def __init__(self, device):
        super().__init__()

        self.device = device

        self.n_part_number = len(part_number_dict)

        # A bias-free linear layer over one-hot input acts as an embedding lookup.
        self.part_number_emb = nn.Linear(in_features=self.n_part_number, out_features=768, bias=False, device=self.device)

        self.norm1 = nn.BatchNorm1d(768, device=self.device)
        self.linear1 = nn.Linear(in_features=768, out_features=384, device=self.device)
        self.dropout1 = nn.Dropout(p=0.5)

        self.norm2 = nn.BatchNorm1d(384, device=self.device)
        self.linear2 = nn.Linear(in_features=384, out_features=192, device=self.device)
        self.dropout2 = nn.Dropout(p=0.5)

        self.norm3 = nn.BatchNorm1d(192, device=self.device)
        self.linear3 = nn.Linear(in_features=192, out_features=len(label_dict), device=self.device)  # Classification head
        self.dropout3 = nn.Dropout(p=0.5)

        self.relu = F.leaky_relu

    def forward(self, sample: dict):
        """Return class logits for a batch.

        Args:
            sample: Batch dict with keys ``"text_emb"`` and ``"part_num"``.
                Values may be tensors or anything ``torch.as_tensor`` accepts.

        Returns:
            Tensor of logits with one row per batch element and one column per
            entry in ``label_dict``.
        """
        # as_tensor (rather than torch.tensor) avoids a redundant copy and the
        # "copy construct from a tensor" UserWarning when the values are
        # already tensors.
        text_emb = torch.as_tensor(sample["text_emb"], dtype=torch.float32, device=self.device)
        part_num = torch.as_tensor(sample["part_num"], dtype=torch.long, device=self.device)

        part_emb = self.part_number_emb(F.one_hot(part_num, num_classes=self.n_part_number).float())

        # Fuse the two 768-d representations by element-wise addition.
        out = torch.sum(torch.stack([text_emb, part_emb], dim=0), dim=0)

        out = self.norm1(out)
        out = self.dropout1(out)
        out = self.linear1(out)
        out = self.relu(out)

        out = self.norm2(out)
        out = self.dropout2(out)
        out = self.linear2(out)
        out = self.relu(out)

        out = self.norm3(out)
        out = self.dropout3(out)
        out = self.linear3(out)

        return out
               
            
    

In [None]:
# Prefer the second GPU when one exists; otherwise fall back to the first GPU,
# then to CPU. Requesting "cuda:1" on a single-GPU machine would fail.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = TextPartModel(device=device)

# Train the model built just above (the original passed an undefined name, `jmodel`).
train_model(model, train_loader, val_loader, epochs=2000, lr=1e-4, l2=0, run_name="TextPartModel_Base")