## Configuration

In [None]:
# Tab-separated input files: token-level tagged training titles and the raw listing titles.
TRAIN_PATH = 'data/ebay/Tagged_Titles_Train.tsv'
LIST_PATH = 'data/ebay/Listing_Titles.tsv'

# Directory that receives the saved encoder and classifier-head state dicts.
SAVE_DIRECTORY = "./saved_models"
# Number of passes over the training loader.
EPOCHS = 3
# Batch size used for both the training and the validation DataLoader.
BATCH_SIZE = 64
FINE_TUNE_BERT = False # Set to True to train the BERT encoder as well as the classifier head
# Learning rate handed to AdamW: small when the encoder is trained too, larger for the head alone.
LR = 2e-5 if FINE_TUNE_BERT else 1e-3
# Model identifier passed to AutoTokenizer / AutoModel.from_pretrained.
MODEL_NAME = "GottBERT/GottBERT_base_last"


SyntaxError: invalid syntax (921157865.py, line 6)

## Initialization

In [None]:
import csv 
import pandas as pd
import numpy as np
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, AutoConfig
import matplotlib.pyplot as plt
import seaborn as sns
import traceback 
import numpy as np
import os
from tqdm import tqdm
# Load model directly
from transformers import AutoTokenizer, AutoModel
# Use the GPU when one is available; DEVICE is an upper-case alias of `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = device
print(DEVICE)

# Pretrained tokenizer and encoder identified by MODEL_NAME; the encoder is
# moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)

# NOTE(review): this read uses pandas' default NA handling, so empty tags
# arrive as NaN here, whereas load_and_prepare_data reads the same file with
# keep_default_na=False. Confirm which frame downstream code should use.
tagged_train = pd.read_csv(TRAIN_PATH, sep='\t')

# Make sure the checkpoint directory exists before anything is saved into it.
os.makedirs(SAVE_DIRECTORY, exist_ok=True)




## MLP Classifier Architecture 

In [None]:
import torch
import torch.nn as nn

class MLPClassifier(nn.Module):
    """Feed-forward classification head.

    Every hidden stage is Linear -> LayerNorm -> ReLU -> Dropout, followed by
    one final Linear projection to ``num_classes`` logits. All layers act on
    the last dimension of the input, so any leading dimensions are preserved.
    """

    def __init__(self, input_dim=768, hidden_dims=(64, 256, 128), num_classes=59, dropout=0.3):
        """
        Args:
            input_dim (int): Size of input features.
            hidden_dims (sequence of int): Hidden layer sizes, in order.
                Pass an empty sequence for a single linear layer. The default
                is an immutable tuple so it cannot be shared and mutated
                between instances.
            num_classes (int): Number of output classes.
            dropout (float): Dropout probability applied after every hidden stage.
        """
        super().__init__()
        layers = []
        current_dim = input_dim

        # Build the hidden stages; the module order inside the Sequential is
        # kept stable so existing state dicts keep loading.
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(current_dim, h_dim),
                nn.LayerNorm(h_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            current_dim = h_dim

        # Final output layer (project to num_classes).
        layers.append(nn.Linear(current_dim, num_classes))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        return self.model(x)

In [None]:
def get_original_classes(df):
    """Print and return the distinct non-empty values of the ``Tag`` column.

    Missing values are treated as empty strings, every value is converted to
    ``str`` and stripped of surrounding whitespace, and the remaining distinct
    non-empty values are returned in sorted order. ``df`` is not modified.
    """
    cleaned = df['Tag'].fillna("").astype(str).str.strip()
    # A set comprehension de-duplicates; the truthiness test drops empty strings.
    sorted_tags = sorted({value for value in cleaned.tolist() if value != ''})

    print("--- Original Class Analysis ---")
    print(f"Total Unique Classes Found: {len(sorted_tags)}")
    print(f"Classes: {sorted_tags}")

    return sorted_tags

## Preprocessing

In [None]:
from torch.utils.data import Dataset

def preprocess_ebay_ner_data(df):
    """Convert token-level tagged rows into per-record token and BIO-label lists.

    In the input, a non-empty tag starts a new entity, an empty tag continues
    the entity started on a previous row of the same record, and ``'O'`` (or
    the digit ``'0'``) marks a token outside any entity.

    Args:
        df: DataFrame with ``'Record Number'``, ``'Token'`` and ``'Tag'``
            columns. It is not modified.

    Returns:
        tuple: ``(grouped_titles, grouped_labels, label2id, id2label)`` where
        ``grouped_titles`` / ``grouped_labels`` hold one list of tokens / BIO
        labels per record, and the two dicts map between label strings and
        integer ids. The label set is ``'O'`` plus ``B-<aspect>`` and
        ``I-<aspect>`` for every aspect in the data, sorted lexicographically.
    """
    outside_markers = {'O', '0'}
    # Missing tags become '' below; a literal 'nan' (a stringified missing
    # value) is treated the same way so the label set and the emitted BIO
    # tags can never disagree.
    continuation_markers = {'', 'nan'}

    # Normalise once, on a private Series: the caller's frame stays untouched
    # and the label set is built from exactly the values the BIO loop sees.
    tags = df['Tag'].fillna('').astype(str).str.strip()

    aspects = sorted(set(tags.unique()) - outside_markers - continuation_markers)

    # Full label set: 'O' plus a B-/I- pair per aspect (59 classes when all
    # 29 aspects occur in the data). Sorted for a deterministic id assignment.
    unique_labels = sorted(
        ['O'] + [f"{prefix}-{aspect}" for aspect in aspects for prefix in ('B', 'I')]
    )
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for i, label in enumerate(unique_labels)}

    print(f"Generated {len(unique_labels)} classes from data.")
    grouped_titles = []
    grouped_labels = []

    # assign() returns a new frame, so swapping in the normalised tags does
    # not write back into `df`.
    for _, group in df.assign(Tag=tags).groupby('Record Number'):
        tokens = group['Token'].astype(str).tolist()
        bio_tags = []
        current_entity_tag = None
        for tag in group['Tag'].tolist():
            if tag in outside_markers:
                bio_tags.append('O')
                current_entity_tag = None
            elif tag in continuation_markers:
                # Continuation row: extend the open entity, if there is one.
                if current_entity_tag:
                    bio_tags.append(f"I-{current_entity_tag}")
                else:
                    bio_tags.append('O')
            else:
                bio_tags.append(f"B-{tag}")
                current_entity_tag = tag

        grouped_titles.append(tokens)
        grouped_labels.append(bio_tags)

    return grouped_titles, grouped_labels, label2id, id2label


class BertDataset(Dataset):
    """Token-classification dataset over pre-split words and their string labels.

    Each item is the tokenizer output for one word list plus a ``labels``
    tensor aligned to the sub-tokens: the first sub-token of every word
    carries that word's label id; special/padding tokens and the remaining
    sub-tokens of a word carry -100.
    """

    def __init__(self, token_lists, label_lists, label2id, max_len=128):
        """
        Args:
            token_lists: One list of word strings per example.
            label_lists: One list of label strings per example, parallel to
                ``token_lists``.
            label2id: Mapping from label string to integer id.
            max_len: Length every example is padded / truncated to.
        """
        self.token_lists = token_lists
        self.label_lists = label_lists
        self.label2id = label2id
        # Uses the module-level `tokenizer`; it must exist before instantiation.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.token_lists)

    def __getitem__(self, idx):
        word_list = self.token_lists[idx]
        label_list = self.label_lists[idx]
        encoding = self.tokenizer(
            word_list,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        word_ids = encoding.word_ids(batch_index=0)

        encoded_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a non-first sub-token of a word.
                encoded_labels.append(-100)
            else:
                encoded_labels.append(self.label2id[label_list[word_idx]])
            previous_word_idx = word_idx

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse a sequence dimension of size 1 and leave the tensors
        # inconsistent with `labels`.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(encoded_labels, dtype=torch.long)

        return item





In [None]:
import pandas as pd
import csv
import os
from sklearn.model_selection import train_test_split

def load_and_prepare_data(train_csv_path, listing_csv_path=None):
    """
    Read the tagged eBay training file and convert it to BIO-labelled sequences.

    Both files are read as tab-separated UTF-8 with ``keep_default_na=False``
    and ``na_values=None`` so that empty "continuation" tags stay empty
    strings instead of becoming NaN.

    Args:
        train_csv_path (str): Path to the tagged training data file.
        listing_csv_path (str, optional): Path to the listing titles file.
            When given and present on disk it is read and its row count is
            printed; it does not influence the returned values.

    Returns:
        tuple: (train_df, token_lists, label_lists, label2id, id2label), or
        ``None`` when reading the training file raises (the error is printed).
    """
    print(f"Loading data from {train_csv_path}...")

    # Reader options common to both files.
    tsv_options = {
        "sep": "\t",
        "keep_default_na": False,
        "na_values": None,
        "encoding": 'utf-8',
    }

    # 1. Training data (the file uses standard CSV-style minimal quoting).
    try:
        train_df = pd.read_csv(train_csv_path, quoting=csv.QUOTE_MINIMAL, **tsv_options)
    except Exception as e:
        print(f"Error loading train data: {e}")
        return None

    # 2. Listing data: optional, only its size is reported.
    listing_available = bool(listing_csv_path) and os.path.exists(listing_csv_path)
    if listing_available:
        listing_df = pd.read_csv(listing_csv_path, **tsv_options)
        print(f"Loaded {len(listing_df)} listing records.")

    print(f"Loaded {len(train_df)} training tokens.")

    # 3. BIO conversion.
    print("Preprocessing data into BIO format...")
    # Called for the class summary it prints; its return value is not needed.
    get_original_classes(train_df)
    token_lists, label_lists, label2id, id2label = preprocess_ebay_ner_data(train_df)

    print(f"Processed {len(token_lists)} sentences.")
    print(f"Found {len(label2id)} unique classes (including 'O' and B-/I- prefixes).")

    return train_df, token_lists, label_lists, label2id, id2label

In [None]:
# Load the tagged training file and convert it into per-record token and BIO-label lists.
# NOTE(review): load_and_prepare_data returns None when the training file cannot be read,
# in which case this tuple unpacking raises TypeError.
train_df, grouped_titles, grouped_labels, label2id, id2label = load_and_prepare_data(TRAIN_PATH, LIST_PATH)

## Dataset Verification

In [None]:
# Wrap every record in a BertDataset and inspect one encoded item by eye.
total_dataset = BertDataset(token_lists = grouped_titles, label_lists = grouped_labels, label2id = label2id, max_len=128)
# Hard-coded index; raises IndexError if the dataset holds fewer than 1827 records.
sample = total_dataset[1826]
print(sample)
# Show the sub-tokens behind the ids so they can be compared with the labels tensor.
print(tokenizer.convert_ids_to_tokens(sample['input_ids']))
print(sample['input_ids'].shape)

In [None]:
# Sample listing title; the following cell splits it on spaces and runs it through the tokenizer and encoder.
example = "MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803"

In [None]:
# Smoke test: push the example title through the tokenizer, the encoder and a
# classifier head, printing the intermediate shapes.
tokens = tokenizer(example.split(" "), return_tensors="pt", padding=True, is_split_into_words=True)
tokens = tokens.to(DEVICE)
print(tokens.word_ids())
print(tokenizer.convert_ids_to_tokens(tokens["input_ids"][0]))
with torch.no_grad():
    outs = model(**tokens)

print(outs.last_hidden_state.shape)

# This `classifier` is the head later handed to train_model, so size it from
# the real encoder output width and the real label set instead of the
# hard-coded 768 / 59, which only hold for one encoder and when all aspects
# occur in the data.
classifier = MLPClassifier(
    input_dim=outs.last_hidden_state.shape[-1],
    hidden_dims=[64],
    num_classes=len(label2id),
).to(DEVICE)
preds = classifier(outs.last_hidden_state)
print(preds.shape)

## Training Functions

In [None]:
import traceback
def validate(bert_model, classifier, dataloader, device, id2label):
    """Evaluate the encoder + head on ``dataloader`` without gradient tracking.

    Both modules are switched to eval mode. Positions whose label is -100 are
    ignored by the loss and left out of the report.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids, attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        classifier: Head mapping the hidden states to per-token logits.
        dataloader: Yields dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.
        id2label: Mapping from label id to label string.

    Returns:
        tuple: ``(avg_loss, report)``: the mean of the per-batch losses and the
        ``classification_report`` dict restricted to the label ids that occur
        in the ground truth.
    """
    bert_model.eval()
    classifier.eval()

    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    total_loss = 0.0
    predicted_ids = []
    gold_ids = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            hidden_states = bert_model(input_ids, attention_mask=attention_masks).last_hidden_state
            logits = classifier(hidden_states)

            # Flatten to (tokens, classes) vs (tokens,) for the loss.
            batch_loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
            total_loss += batch_loss.item()

            # Keep only the positions that carry a real label.
            labelled = labels != -100
            predictions = torch.argmax(logits, dim=2)
            predicted_ids.extend(predictions[labelled].cpu().numpy())
            gold_ids.extend(labels[labelled].cpu().numpy())

    avg_loss = total_loss / len(dataloader)

    present_ids = sorted(set(gold_ids))
    report = classification_report(
        gold_ids,
        predicted_ids,
        labels=present_ids,
        target_names=[id2label[i] for i in present_ids],
        output_dict=True,
        zero_division=0
    )
    return avg_loss, report

def plot_results(train_loss_history, val_loss_history, final_report, epochs):
    """
    Plots training vs validation loss and per-class F1 scores.

    Args:
        train_loss_history: One loss value per training step.
        val_loss_history: One loss value per completed validation pass.
        final_report: ``classification_report(..., output_dict=True)`` style
            dict; may be empty when no validation pass has finished.
        epochs: Epoch count shown in the plot titles.
    """
    plt.figure(figsize=(16, 6))

    # Plot 1: Loss Curves
    plt.subplot(1, 2, 1)
    plt.plot(train_loss_history, label='Train Loss', color='#1f77b4', linewidth=2)

    # Validation loss is recorded once per epoch, *after* that epoch's
    # training steps, so each point belongs at the end of its epoch rather
    # than starting at step 0. Assumes epochs of equal length; if training
    # stopped mid-epoch the positions are approximate.
    n_steps = len(train_loss_history)
    n_vals = len(val_loss_history)
    if n_vals:
        val_steps = np.linspace(n_steps / n_vals, n_steps, n_vals)
        plt.plot(val_steps, val_loss_history, label='Val Loss',
                 color='#d62728', marker='o', linewidth=2)

    plt.title(f"Loss Curve over {epochs} Epochs")
    plt.xlabel("Training Steps")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Plot 2: Top 20 Class F1 Scores
    plt.subplot(1, 2, 2)
    class_metrics = {k: v['f1-score'] for k, v in final_report.items()
                     if isinstance(v, dict) and k not in ['accuracy', 'macro avg', 'weighted avg']}

    # Sort and slice top 20
    sorted_classes = sorted(class_metrics.items(), key=lambda x: x[1], reverse=True)[:20]
    names = [x[0] for x in sorted_classes]
    scores = [x[1] for x in sorted_classes]

    if names:
        sns.barplot(x=scores, y=names, palette='viridis', hue=names, legend=False)
    else:
        # No per-class metrics (e.g. training failed before the first
        # validation pass): show a placeholder instead of handing seaborn
        # empty data.
        plt.text(0.5, 0.5, "No validation report available", ha='center', va='center')
    plt.title(f"Top 20 F1 Scores (Epoch {epochs})")
    plt.xlabel("F1 Score")
    plt.xlim(0, 1.0)
    plt.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

def train_model(bert_model, classifier, train_loader, val_loader, label2id, id2label):
    """Train the classifier head (and optionally the encoder) for EPOCHS epochs.

    Reads the module-level settings FINE_TUNE_BERT, LR, EPOCHS, DEVICE and
    SAVE_DIRECTORY. After every epoch the model is validated on
    ``val_loader``. Whether training finishes or raises, both state dicts are
    written to SAVE_DIRECTORY and the loss / F1 plots are drawn.

    Args:
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        classifier: Head mapping hidden states to per-token logits.
        train_loader: Yields dicts with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` tensors.
        val_loader: Same structure, used for validation.
        label2id: Label-to-id mapping; its size must equal the head's output width.
        id2label: Id-to-label mapping, forwarded to ``validate``.

    Returns:
        tuple: ``(bert_model, classifier, train_loss_history,
        val_loss_history, final_val_report)``.

    Raises:
        ValueError: If the head's output width differs from ``len(label2id)``.
    """
    print(f"Initializing Models... (Fine-tune Bert: {FINE_TUNE_BERT})")
    for param in bert_model.parameters():
        param.requires_grad = FINE_TUNE_BERT

    params_to_optimize = list(classifier.parameters())
    if FINE_TUNE_BERT:
        params_to_optimize += list(bert_model.parameters())
    optimizer = torch.optim.AdamW(params_to_optimize, lr=LR)
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    train_loss_history = []
    val_loss_history = []
    final_val_report = {}
    try:
        print(f"Starting Training for {EPOCHS} Epochs... ")
        for epoch in range(EPOCHS):
            # A frozen encoder stays in eval mode: train mode would keep its
            # dropout active and feed the head noisy features that differ
            # from what validation sees.
            bert_model.train(FINE_TUNE_BERT)
            classifier.train()
            running_train_loss = 0.0
            progress_bar = tqdm(train_loader, desc=f"EPOCH {epoch + 1}/ {EPOCHS}")
            for batch in progress_bar:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                optimizer.zero_grad()

                # Forward pass; autograd is skipped for the encoder when it is frozen.
                with torch.set_grad_enabled(FINE_TUNE_BERT):
                    outputs = bert_model(input_ids, attention_mask=attention_mask)
                logits = classifier(outputs.last_hidden_state)

                num_classes = logits.shape[-1]
                if num_classes != len(label2id):
                    raise ValueError(
                        f"Classifier outputs {num_classes} classes but label2id "
                        f"defines {len(label2id)}."
                    )

                # Loss over flattened (tokens, classes) vs (tokens,), the same
                # reshape that validate uses.
                loss = loss_fn(logits.view(-1, num_classes), labels.view(-1))

                # Backward
                loss.backward()
                optimizer.step()

                # Track loss
                batch_loss = loss.item()
                running_train_loss += batch_loss
                train_loss_history.append(batch_loss)

                # Update progress bar
                progress_bar.set_postfix({'loss': batch_loss})
            print(f"Validating Epoch {epoch+1}... ")
            val_loss, val_report = validate(bert_model, classifier, val_loader, DEVICE, id2label)
            val_loss_history.append(val_loss)
            final_val_report = val_report

            print(f"Epoch {epoch+1} Summary | Train Loss: {running_train_loss/len(train_loader):.4f} | Val Loss: {val_loss:.4f}")
            print(val_report)
    finally:
        # Runs on success and on failure alike, so an interrupted run still
        # leaves a checkpoint and a plot of whatever history was collected.
        torch.save(bert_model.state_dict(), os.path.join(SAVE_DIRECTORY, "bert_model.pth"))
        torch.save(classifier.state_dict(), os.path.join(SAVE_DIRECTORY, "classifier_head.pth"))
        plot_results(train_loss_history, val_loss_history, final_val_report, EPOCHS)

    return bert_model, classifier, train_loss_history, val_loss_history, final_val_report



## Training Loop

In [None]:

# Hold out 10% of the records for validation; the fixed seed keeps the split reproducible.
train_toks, val_toks, train_labs, val_labs = train_test_split(
    grouped_titles, grouped_labels, test_size=0.1, random_state=42
)
train_ds = BertDataset(train_toks, train_labs, label2id)
val_ds = BertDataset(val_toks, val_labs, label2id)

# Only the training loader is shuffled.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
# NOTE(review): `classifier` here is whatever head an earlier cell left in scope
# (the encoder smoke-test cell creates one); confirm that is the intended architecture.
bert_model, classifier, train_loss, val_loss, report = train_model(
    model, classifier, train_loader, val_loader, label2id, id2label
)

