In [1]:
import pandas as pd
from collections import Counter
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import torchsummary
import torch.optim as optim
import wandb 
import os

#torch.cuda.set_device(1)

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Available GPUs:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.current_device())
print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

Available GPUs: 1
Current GPU: 0
GPU Name: NVIDIA GeForce RTX 4090


# Load the Datasets

In [3]:
train = pd.read_pickle("train.pk")
val = pd.read_pickle("val.pk")
test = pd.read_pickle("test.pk")

In [4]:
len(train["BookingCode"].unique())

100

In [5]:
train.columns 

Index(['ID', 'PartNumber', 'Description', 'Count', 'SumPrice', 'BookingCode',
       'DocumentId', 'descriptionEmbeddings', 'Price', 'NormPrice',
       'hdbscanCluster'],
      dtype='object')

In [6]:
full_ds = pd.concat([train, val, test])

label_dict = {i:  booking_code for i, booking_code in enumerate(full_ds["BookingCode"].unique().tolist())}
booking_code_to_label = {v: k for k, v in label_dict.items()}

# PartNumber
part_number_dict = {i:  part_number for i, part_number in enumerate(full_ds["PartNumber"].unique().tolist())}
part_number_to_label = {v: k for k, v in part_number_dict.items()}

# hdbscanCluster
cluster_dict = {i:  cluster_number for i, cluster_number in enumerate(full_ds["hdbscanCluster"].unique().tolist())}
cluster_to_label = {v: k for k, v in cluster_dict.items()}

In [7]:

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = f"Description: {row['Description']}, PartNumber: {row['PartNumber']}, Count: {row['Count']}, Sum Price: {row['SumPrice']}, Price: {row['Price']}, DocumentId: {row['DocumentId']}"
        label = booking_code_to_label[row["BookingCode"]]
        enc = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)
        enc['labels'] = torch.tensor(label, dtype=torch.long)
        return {key: torch.tensor(val) for key, val in enc.items()}

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

train_ds = CustomDataset(train, tokenizer=tokenizer)
val_ds = CustomDataset(val, tokenizer=tokenizer)
test_ds = CustomDataset(test, tokenizer=tokenizer)

# Training BERT Classifier

In [9]:
def validate_model(model, dataset, batch_size=20, device=None):
    if device is None:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    total_loss = 0.0
    all_preds = []
    all_labels = []
    
    loss_fn = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item() * input_ids.size(0)

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataset)
    accuracy = accuracy_score(all_labels, all_preds)

    return avg_loss, accuracy

In [10]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=100)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    evaluation_strategy="epoch",
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./trained_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlukas-bckrs[0m ([33mlukas-beckers[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  return {key: torch.tensor(val) for key, val in enc.items()}


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.441144,0.240816
2,No log,2.350554,0.495238
3,No log,1.70351,0.6
4,2.689000,1.415007,0.708844
5,2.689000,1.301673,0.748299


  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}


In [12]:
def validate_model(model, dataset, batch_size=20, device=None):
    if device is None:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    total_loss = 0.0
    all_preds = []
    all_labels = []
    
    loss_fn = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item() * input_ids.size(0)

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataset)
    accuracy = accuracy_score(all_labels, all_preds)

    return avg_loss, accuracy

In [14]:
loss, acc = validate_model(model, test_ds, batch_size=20, device="cuda")

  return {key: torch.tensor(val) for key, val in enc.items()}


In [15]:
print(f"The BERT Model has a loss of {loss :.4f} on the test set and and accuracy of {acc :.4f}")

The BERT Model has a loss of 1.3017 on the test set and and accuracy of 0.7483


# Testing ROBERTA

In [18]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=100)

train_ds = CustomDataset(train, tokenizer=tokenizer)
val_ds = CustomDataset(val, tokenizer=tokenizer)
test_ds = CustomDataset(test, tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    evaluation_strategy="epoch",
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./trained_model")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val) for key, val in enc.items()}


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.124645,0.553741
2,No log,1.243092,0.734694
3,No log,0.943837,0.791837
4,1.843100,0.784188,0.832653
5,1.843100,0.745296,0.835374


  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}


In [19]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=100)

train_ds = CustomDataset(train, tokenizer=tokenizer)
val_ds = CustomDataset(val, tokenizer=tokenizer)
test_ds = CustomDataset(test, tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./trained_model")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val) for key, val in enc.items()}


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.625747,0.478912
2,No log,1.363786,0.684354
3,No log,1.02531,0.746939
4,1.999700,0.79157,0.823129
5,1.999700,0.667333,0.851701
6,1.999700,0.59783,0.854422
7,1.999700,0.524535,0.868027
8,0.536700,0.501989,0.880272
9,0.536700,0.441822,0.881633
10,0.536700,0.448714,0.892517


  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return {key: torch.tensor(val) for key, val in enc.items()}
  return

KeyboardInterrupt: 

In [None]:
torch.cuda.empty_cache()

from transformers import RobertaTokenizer, RobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=100)

train_ds = CustomDataset(train, tokenizer=tokenizer)
val_ds = CustomDataset(val, tokenizer=tokenizer)
test_ds = CustomDataset(test, tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./trained_model")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val) for key, val in enc.items()}


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.040187,0.563265


  return {key: torch.tensor(val) for key, val in enc.items()}
