Zelle 1: Install

In [None]:
# Optional: uncomment to install/upgrade the dependencies in the kernel's environment.
# %pip install -q -U transformers datasets accelerate scikit-learn pandas

Zelle 2: Imports

In [None]:
import json
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

Zelle 3: Pfade + Modell

In [None]:
# Input data locations, relative to the notebook's working directory.
DEPT_CSV = "../data/department-v2.csv"
CV_ANN = "../data/linkedin-cvs-annotated.json"

# Model, tokenization and reproducibility settings.
MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_LEN = 32  # texts are truncated to this many tokens when tokenized
SEED = 42

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("Device:", DEVICE)
if DEVICE == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

Zelle 4: Trainingsdaten laden (Lookup CSV)

In [None]:
# Read the lookup CSV, drop rows lacking a text or label, then normalise both
# columns to whitespace-stripped strings.
dept_df = (
    pd.read_csv(DEPT_CSV)
    .dropna(subset=["text", "label"])
    .assign(
        text=lambda frame: frame["text"].astype(str).str.strip(),
        label=lambda frame: frame["label"].astype(str).str.strip(),
    )
)

print("Train rows:", len(dept_df))
display(dept_df.head())

Zelle 5: Eval-Daten laden (annotated JSON) und flatten + ACTIVE

In [None]:
# Load the annotated CVs (per the original note: list[list[dict]], i.e. one
# list of position dicts per CV).
with open(CV_ANN, "r", encoding="utf-8") as f:
    ann = json.load(f)

# Flatten to one row per position.
positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

# Keep only positions whose status is ACTIVE (compared case-insensitively).
eval_df["status"] = eval_df["status"].astype(str).str.upper()
eval_df = eval_df[eval_df["status"] == "ACTIVE"].copy()

# Drop rows with a missing title or label BEFORE casting to str. Casting first
# would turn None/NaN into the literal strings "None"/"nan", which a later
# dropna() can no longer detect.
eval_df = eval_df.dropna(subset=["position", "department"]).copy()

# position -> title, and clean up the label column.
eval_df["title"] = eval_df["position"].astype(str).str.strip()
eval_df["department"] = eval_df["department"].astype(str).str.strip()

eval_df = eval_df[["title", "department"]].copy()

print("Eval ACTIVE rows:", len(eval_df))
display(eval_df.head())

Zelle 6: Klassenverteilung (Train vs Eval)

In [None]:
def show_dist(series, name):
    """Display absolute and percentage frequencies of the values in `series`.

    Prints a header containing `name`, then displays a table indexed by value
    with a "count" column and a "pct" column (share in percent, 2 decimals).
    """
    freq = series.value_counts()
    share = (freq / freq.sum() * 100).round(2)
    table = freq.to_frame("count").assign(pct=share)
    print(f"\n--- {name} ---")
    display(table)

# Compare the label distribution of the lookup (training) data with the
# annotated ACTIVE (evaluation) data.
show_dist(series=dept_df["label"], name="Department TRAIN (lookup)")
show_dist(series=eval_df["department"], name="Department EVAL (annotated ACTIVE)")

Zelle 7: LabelEncoder + Train/Val Split (nur aus Lookup)

In [None]:
# Encode department names as integer class ids in column "y".
le = LabelEncoder().fit(dept_df["label"])
dept_df["y"] = le.transform(dept_df["label"])

# Stratified 80/20 split, drawn from the lookup data only.
train_part, val_part = train_test_split(
    dept_df,
    stratify=dept_df["y"],
    random_state=SEED,
    test_size=0.2,
)

print("Train split:", train_part.shape[0], "Val split:", val_part.shape[0])
print("Num classes:", le.classes_.size)
print("Classes:", le.classes_.tolist())

Zelle 8: Class Weights berechnen + “nach balancing” anzeigen

In [None]:
def compute_class_weights(y_int, num_classes):
    """Compute balanced class weights N / (K * n_c) from integer labels.

    Parameters
    ----------
    y_int : array-like of non-negative ints
        Encoded class id of every sample.
    num_classes : int
        Number of classes K; the returned arrays have at least this length.

    Returns
    -------
    counts : np.ndarray
        Number of samples per class (0 for classes absent from `y_int`).
    weights : np.ndarray
        N / (K * n_c) per class. Classes with no samples are treated as if
        they had one sample, so their weight is finite instead of `inf`.
    """
    counts = np.bincount(y_int, minlength=num_classes)
    total = counts.sum()
    # Guard against division by zero for classes that do not occur in y_int.
    safe_counts = np.maximum(counts, 1)
    weights = total / (num_classes * safe_counts)  # N / (K * n_c)
    return counts, weights

def oversample_to_median(df, label_col="y", random_state=42):
    """Upsample minority classes to the median class size (moderate balancing).

    Every class with fewer rows than the median class count is padded with
    rows drawn with replacement from that class until it reaches the median.
    All original rows are kept; only the extra rows are sampled. Classes at or
    above the median are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing `label_col`.
    label_col : str
        Column holding the class id.
    random_state : int
        Seed for the resampling and for the final shuffle.

    Returns
    -------
    pd.DataFrame
        Shuffled frame with a fresh 0..n-1 index.
    """
    vc = df[label_col].value_counts()
    if vc.empty:
        # Nothing to balance; int(vc.median()) would fail on NaN.
        return df.copy().reset_index(drop=True)

    target = int(vc.median())  # moderate target: median class size

    parts = []
    for cls, n in vc.items():
        df_c = df[df[label_col] == cls]
        if n < target:
            # Keep every original row and add only the missing ones, so no
            # training example is lost to the bootstrap draw.
            extra = df_c.sample(target - n, replace=True, random_state=random_state)
            df_c = pd.concat([df_c, extra])
        parts.append(df_c)

    return pd.concat(parts).sample(frac=1, random_state=random_state).reset_index(drop=True)

# Show the class distribution of the training split before balancing.
print("\n--- Train BEFORE oversampling ---")
display(train_part["label"].value_counts())

# Oversample the training split ONLY.
train_part_os = oversample_to_median(train_part, label_col="y", random_state=SEED)

print("\n--- Train AFTER oversampling (median target) ---")
display(train_part_os["label"].value_counts())

# Class weights are computed on the oversampled training split
# (the original train_part would be the alternative).
counts, weights = compute_class_weights(
    train_part_os["y"].to_numpy(),
    num_classes=len(le.classes_),
)

# Per-class overview: size, share, loss weight and their product.
balance_df = pd.DataFrame({"class": le.classes_, "count": counts})
balance_df["pct"] = (counts / counts.sum() * 100).round(2)
balance_df["weight"] = weights.round(4)
balance_df["count_x_weight"] = (balance_df["count"] * balance_df["weight"]).round(4)

print("\n--- Oversampled train distribution + class weights (Loss-Balancing) ---")
display(balance_df.sort_values(by="count", ascending=False))
print("Sum(count_x_weight):", balance_df["count_x_weight"].sum())

Zelle 9: Weighted Trainer (CrossEntropy mit weights)

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Parameters
    ----------
    class_weights : torch.Tensor or None
        One weight per class, passed as `weight` to `CrossEntropyLoss`.
        With None (the default) the loss is plain, unweighted cross-entropy.
    *args, **kwargs
        Forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` and return the (weighted) cross-entropy loss.

        Extra keyword arguments are accepted and ignored so the override
        tolerates additional arguments passed by the caller.
        Returns `(loss, outputs)` when `return_outputs` is true, else `loss`.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The constructor default is None; calling .to() on it would raise,
        # so fall back to an unweighted loss in that case.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

Zelle 10: Tokenizer + Dataset Builder

In [None]:
# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

def make_ds(df, text_col, y_col=None):
    """Build a tokenized `Dataset` from a DataFrame.

    The values of `text_col` are stored (as str) in a "text" column. When
    `y_col` is given, its values are stored (as int) in a "labels" column.
    Texts are tokenized with the notebook-level `tokenizer`, truncated to
    MAX_LEN tokens; padding is not applied here.
    """
    columns = {"text": df[text_col].astype(str).tolist()}
    if y_col is not None:
        columns["labels"] = df[y_col].astype(int).tolist()

    def _tokenize(batch):
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

    return Dataset.from_dict(columns).map(_tokenize, batched=True)

# Collator used by the Trainer to pad each batch.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training uses the oversampled split; validation uses the untouched split.
train_ds = make_ds(df=train_part_os, text_col="text", y_col="y")
val_ds = make_ds(df=val_part, text_col="text", y_col="y")

Zelle 11: Training-Runner (für LR Sweep) mit Early Stopping

In [None]:
def run_train(lr, max_epochs=20, batch_size=64, patience=3):
    """Fine-tune MODEL_NAME once with the given learning rate and early stopping.

    Parameters
    ----------
    lr : float
        Learning rate passed to `TrainingArguments`; also used in the
        output directory name.
    max_epochs : int
        Upper bound on the number of training epochs.
    batch_size : int
        Per-device batch size for both training and evaluation.
    patience : int
        `early_stopping_patience` of the `EarlyStoppingCallback`.

    Returns
    -------
    (trainer, best_val)
        The `WeightedTrainer` after training and the metrics dict returned by
        `trainer.evaluate()` on `val_ds`.
    """
    import inspect  # local import: only needed for the version checks below

    # Seed BEFORE the model is built. The `seed` in TrainingArguments only
    # reaches the Trainer, which is created after the model, so without this
    # any randomly initialised layers would differ between kernel restarts.
    torch.manual_seed(SEED)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(le.classes_)
    )

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; use whichever name the installed version accepts.
    arg_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in arg_params else "evaluation_strategy"

    args = TrainingArguments(
        output_dir=f"./out_dept_lr{lr}",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,

        learning_rate=lr,
        lr_scheduler_type="linear",
        warmup_ratio=0.06,
        weight_decay=0.01,

        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=max_epochs,

        logging_steps=50,
        seed=SEED,
        report_to="none",

        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
        **{eval_key: "epoch"},
    )

    def compute_metrics(eval_pred):
        """Accuracy plus macro/weighted F1 from (logits, labels)."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1_macro": f1_score(labels, preds, average="macro"),
            "f1_weighted": f1_score(labels, preds, average="weighted"),
        }

    # Class weights come from the notebook-level `weights` array.
    cw = torch.tensor(weights, dtype=torch.float)

    # Newer transformers releases take the tokenizer as `processing_class`;
    # older ones only know `tokenizer`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tok_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = WeightedTrainer(
        class_weights=cw,
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
        **{tok_key: tokenizer},
    )

    trainer.train()
    # load_best_model_at_end=True is set above, so this is intended to score
    # the best checkpoint rather than the last one.
    best_val = trainer.evaluate()
    return trainer, best_val

Zelle 12: LR Sweep (nur auf Val aus Lookup)

In [None]:
# Learning rates to compare; each run is scored on the lookup validation split.
lrs = [3e-5, 2e-5]
results = []
trainers = {}

for lr in lrs:
    print(f"\n### LR = {lr} ###")
    trainer, m = run_train(lr, max_epochs=20, batch_size=64, patience=3)
    trainers[lr] = trainer
    results.append(
        dict(
            lr=lr,
            val_f1_macro=m.get("eval_f1_macro"),
            val_accuracy=m.get("eval_accuracy"),
            val_f1_weighted=m.get("eval_f1_weighted"),
        )
    )

# Rank runs by validation macro-F1, best first.
res_df = pd.DataFrame(results).sort_values(by="val_f1_macro", ascending=False)
display(res_df)

best_lr = res_df["lr"].iloc[0]
print("Best LR:", best_lr)
best_trainer = trainers[best_lr]

Zelle 13: Finale Evaluation auf eval_df (nur messen, kein Training)


In [None]:
# eval_df Labels auf Train-Labelset filtern
eval_use = eval_df[eval_df["department"].isin(set(le.classes_))].copy()
print("Eval after label filter:", len(eval_use))

# encode eval labels
y_eval = le.transform(eval_use["department"].astype(str))

# dataset for eval
eval_use_ds = Dataset.from_dict({"text": eval_use["title"].astype(str).tolist(), "labels": y_eval.tolist()})

def tok(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

eval_use_ds = eval_use_ds.map(tok, batched=True)

pred = best_trainer.predict(eval_use_ds)
pred_ids = np.argmax(pred.predictions, axis=-1)
pred_labels = le.inverse_transform(pred_ids)

y_true = eval_use["department"].astype(str).values
y_pred = pred_labels.astype(str)

print("\n=== FINAL EVAL on eval_df (Department) ===")
print("Accuracy       :", accuracy_score(y_true, y_pred))
print("Macro Precision:", precision_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro Recall   :", recall_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro F1       :", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("Weighted F1    :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# optional: ein paar predictions anschauen
out_preview = eval_use[["title", "department"]].copy()
out_preview["pred"] = y_pred
out_preview["correct"] = out_preview["department"] == out_preview["pred"]
display(out_preview.head(30))