Zelle 1: Install

In [None]:
!pip -q install -U transformers datasets accelerate scikit-learn pandas

Zelle 2: Imports

In [None]:
import inspect
import json

import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

Zelle 3: Pfade + Modell

In [None]:
# --- Reproducibility / model configuration ---
SEED = 42                                          # used for the splits and TrainingArguments
MAX_LEN = 32                                       # tokenizer truncation length (in tokens)
MODEL_NAME = "distilbert-base-multilingual-cased"  # checkpoint for tokenizer and both stages

# --- Input files ---
DEPT_CSV = "data/department-v2.csv"            # training lookup table (text -> label)
CV_ANN = "data/linkedin-cvs-annotated.json"    # annotated CVs, used for evaluation only

# Report the available accelerator so the run log shows what was trained on.
if torch.cuda.is_available():
    print("CUDA:", True)
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA:", False)

Zelle 4: Train Lookup laden

In [None]:
# Load the training lookup table, drop rows that lack a text or a label,
# and normalise both columns to whitespace-stripped strings.
dept_df = (
    pd.read_csv(DEPT_CSV)
    .dropna(subset=["text", "label"])
    .assign(
        text=lambda frame: frame["text"].astype(str).str.strip(),
        label=lambda frame: frame["label"].astype(str).str.strip(),
    )
)

# Quick sanity check: size, first rows, and the label distribution.
print("Train rows:", dept_df.shape[0])
display(dept_df.head())
print(dept_df["label"].value_counts())

Zelle 5: Eval laden (nur messen)

In [None]:
# Load the annotated CVs. The file is a list of CVs, each CV a list of positions.
with open(CV_ANN, "r", encoding="utf-8") as f:
    ann = json.load(f)

# Flatten to one row per position and keep only the currently active ones.
positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)
eval_df["status"] = eval_df["status"].astype(str).str.upper()
eval_df = eval_df[eval_df["status"] == "ACTIVE"].copy()

# Drop missing titles/departments BEFORE casting to str: after `.astype(str)`
# a missing value becomes the literal string "nan"/"None" and a later
# `.dropna()` would no longer remove it.
eval_df = eval_df.dropna(subset=["position", "department"]).copy()

eval_df["title"] = eval_df["position"].astype(str).str.strip()
eval_df["department"] = eval_df["department"].astype(str).str.strip()
eval_df = eval_df[["title", "department"]].copy()

print("Eval rows:", len(eval_df))
print(eval_df["department"].value_counts())
display(eval_df.head())

Zelle 6: Tokenizer + Helper

In [None]:
# Tokenizer matching the chosen checkpoint, plus a collator that pads each
# batch with that tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch, truncating to ``MAX_LEN``.

    No padding is requested here; padding is left to ``collator``.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LEN)

def make_ds(df, text_col, label_col=None):
    """Build a tokenized ``Dataset`` from a DataFrame.

    Args:
        df: Source DataFrame.
        text_col: Name of the column holding the input text (cast to ``str``).
        label_col: Optional name of the column holding the targets (cast to
            ``int``). When given, it is stored under the key ``"labels"``.

    Returns:
        The dataset after mapping ``tokenize`` over it in batched mode.
    """
    texts = df[text_col].astype(str).tolist()
    if label_col is None:
        columns = {"text": texts}
    else:
        columns = {"text": texts, "labels": df[label_col].astype(int).tolist()}
    return Dataset.from_dict(columns).map(tokenize, batched=True)

def compute_metrics(eval_pred):
    """Compute accuracy and macro/weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-element iterable ``(logits, labels)``; the predicted
            class is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1_macro"`` and ``"f1_weighted"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 matches the final evaluation cell: classes that are never
    # predicted score 0 without emitting an undefined-metric warning.
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
    }

def make_trainer(task_name, num_labels, train_ds, val_ds, lr=2e-5, batch_size=512, epochs=20, patience=3):
    """Create a ``Trainer`` that fine-tunes ``MODEL_NAME`` for classification.

    Evaluates and checkpoints once per epoch, reloads the checkpoint with the
    best ``f1_macro`` at the end, and stops early via ``EarlyStoppingCallback``.

    Args:
        task_name: Suffix for the output directory (``./out_<task_name>``).
        num_labels: Number of target classes for the classification head.
        train_ds: Tokenized training dataset.
        val_ds: Tokenized validation dataset.
        lr: Learning rate.
        batch_size: Per-device batch size for both training and evaluation.
        epochs: Maximum number of training epochs.
        patience: Early-stopping patience passed to ``EarlyStoppingCallback``.

    Returns:
        The configured (not yet trained) ``Trainer``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels
    )

    # The install cell upgrades transformers without pinning a version, and newer
    # releases renamed two keywords: `evaluation_strategy` -> `eval_strategy` and
    # `Trainer(tokenizer=...)` -> `Trainer(processing_class=...)`.
    # Use whichever name the installed version actually accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    args = TrainingArguments(
        output_dir=f"./out_{task_name}",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        learning_rate=lr,
        lr_scheduler_type="linear",
        warmup_ratio=0.06,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        logging_steps=50,
        seed=SEED,
        report_to="none",
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        dataloader_num_workers=2,
        **{eval_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
        **{tokenizer_key: tokenizer},
    )
    return trainer

In [None]:
def compute_class_weights(y_int, num_classes):
    """Compute inverse-frequency ("balanced") class weights.

    For every class ``c`` that occurs in ``y_int`` the weight is
    ``len(y_int) / (num_classes * count_c)``.

    Classes without any sample get weight ``0.0`` rather than ``inf``
    (division by zero), so the result is always finite and safe to pass to a
    weighted loss.

    Args:
        y_int: 1-D array-like of non-negative integer class ids.
        num_classes: Number of classes; the result has at least this length.

    Returns:
        ``np.ndarray`` of float weights, indexed by class id.
    """
    counts = np.bincount(y_int, minlength=num_classes)
    total = counts.sum()
    weights = np.zeros(len(counts), dtype=np.float64)
    present = counts > 0
    # Only divide where the class actually occurs; absent classes stay at 0.
    weights[present] = total / (num_classes * counts[present])
    return weights

class WeightedTrainer(Trainer):
    """``Trainer`` variant that applies per-class weights in the cross-entropy loss.

    Args:
        class_weights: Optional 1-D float tensor with one weight per class.
            When ``None``, the stock ``Trainer`` loss is used unchanged.
        *args, **kwargs: Forwarded to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Falls back to the parent implementation when no class weights were
        configured or the batch carries no labels, instead of failing on
        ``None``. Extra keyword arguments are accepted so the override stays
        compatible with whatever the installed ``Trainer`` passes in.
        """
        labels = inputs.get("labels")
        if self.class_weights is None or labels is None:
            return super().compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)

        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The weights must live on the same device as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

STAGE 1: Other vs NotOther

Zelle 7: Stage1 Dataset bauen (Binary)

In [None]:
# Binary target for stage 1: 1 = "Other", 0 = any other department.
stage1_df = dept_df.assign(bin_label=dept_df["label"].eq("Other").astype(int))

# Stratified 80/20 split so both parts keep the Other/NotOther ratio.
train1, val1 = train_test_split(
    stage1_df,
    stratify=stage1_df["bin_label"],
    test_size=0.2,
    random_state=SEED,
)

train1_ds, val1_ds = (make_ds(split, "text", "bin_label") for split in (train1, val1))

print("Stage1 train:", len(train1), "val:", len(val1))
print(train1["bin_label"].value_counts())

Zelle 8: Stage1 train (optional LR Sweep)

In [None]:
# Stage 1 training (Other vs. NotOther) with a class-weighted loss.

# Inverse-frequency weights, computed on the training split only.
w1 = torch.tensor(compute_class_weights(train1["bin_label"].values, 2), dtype=torch.float)
print("Stage1 class weights:", w1.tolist())
print("Stage1 train label counts:\n", train1["bin_label"].value_counts())

model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# The install cell upgrades transformers without pinning a version, and newer
# releases renamed two keywords: `evaluation_strategy` -> `eval_strategy` and
# `Trainer(tokenizer=...)` -> `Trainer(processing_class=...)`.
# Use whichever name the installed version actually accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

args1 = TrainingArguments(
    output_dir="./out_dept_stage1_other_vs_not",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=20,
    logging_steps=100,
    seed=SEED,
    report_to="none",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    dataloader_num_workers=2,
    **{_eval_strategy_key: "epoch"},
)

stage1_trainer = WeightedTrainer(
    class_weights=w1,
    model=model1,
    args=args1,
    train_dataset=train1_ds,
    eval_dataset=val1_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    **{_tokenizer_key: tokenizer},
)

stage1_trainer.train()
print("Stage1 val:", stage1_trainer.evaluate())

STAGE 2: Multi-class für NotOther

Zelle 9: Stage2 Dataset bauen (nur NotOther)

In [None]:
# Stage 2 only sees rows whose label is not "Other".
le2 = LabelEncoder()
stage2_df = dept_df.loc[dept_df["label"].ne("Other")].copy()
stage2_df["y"] = le2.fit_transform(stage2_df["label"])  # integer class id per department

# Stratified 80/20 split on the encoded department.
train2, val2 = train_test_split(
    stage2_df,
    stratify=stage2_df["y"],
    test_size=0.2,
    random_state=SEED,
)

train2_ds, val2_ds = (make_ds(split, "text", "y") for split in (train2, val2))

print("Stage2 train:", len(train2), "val:", len(val2))
print("Stage2 classes:", list(le2.classes_))
print(stage2_df["label"].value_counts())

Zelle 10: Stage2 train

In [None]:
# Stage 2 training (multi-class over the NotOther departments) with a class-weighted loss.

# Inverse-frequency weights, computed on the training split only.
w2 = torch.tensor(compute_class_weights(train2["y"].values, len(le2.classes_)), dtype=torch.float)
print("Stage2 class weights (first 10):", w2[:10].tolist())
print("Stage2 train label counts:\n", train2["label"].value_counts().head(20))

model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le2.classes_))

# The install cell upgrades transformers without pinning a version, and newer
# releases renamed two keywords: `evaluation_strategy` -> `eval_strategy` and
# `Trainer(tokenizer=...)` -> `Trainer(processing_class=...)`.
# Use whichever name the installed version actually accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

args2 = TrainingArguments(
    output_dir="./out_dept_stage2_notother_multiclass",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=20,
    logging_steps=50,
    seed=SEED,
    report_to="none",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    dataloader_num_workers=2,
    **{_eval_strategy_key: "epoch"},
)

stage2_trainer = WeightedTrainer(
    class_weights=w2,
    model=model2,
    args=args2,
    train_dataset=train2_ds,
    eval_dataset=val2_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    **{_tokenizer_key: tokenizer},
)

stage2_trainer.train()
print("Stage2 val:", stage2_trainer.evaluate())

INFERENCE PIPELINE (Stage1 -> Stage2)

Zelle 11: Zwei-Stufen Vorhersage auf eval_df + Metriken

In [None]:
# Keep only eval rows whose gold label can be predicted at all:
# the classes known to stage 2 plus "Other".
valid_labels = set(le2.classes_) | {"Other"}
eval_use = eval_df[eval_df["department"].isin(valid_labels)].copy()
print("Eval used:", len(eval_use))
assert len(eval_use) > 0, "No eval rows left after filtering to known labels"

# Stage 1: probability that a title belongs to "Other" (class index 1).
eval_stage1_ds = Dataset.from_dict({"text": eval_use["title"].astype(str).tolist()}).map(tokenize, batched=True)
p1 = stage1_trainer.predict(eval_stage1_ds).predictions
p1_prob_other = torch.softmax(torch.tensor(p1), dim=-1)[:, 1].numpy()

# Decision threshold for "Other" (0.5 by default; can be tuned later).
TH = 0.5
pred_is_other = p1_prob_other >= TH

# Start from "Other" everywhere; stage 2 overwrites the rows stage 1 let through.
y_pred = np.array(["Other"] * len(eval_use), dtype=object)

# Stage 2: department prediction only for rows predicted as NotOther.
# Skipped when stage 1 routed every row to "Other", because predicting on an
# empty dataset yields no logits to take an argmax over.
eval_notother = eval_use.loc[~pred_is_other].copy()
if len(eval_notother) > 0:
    eval_stage2_ds = Dataset.from_dict({"text": eval_notother["title"].astype(str).tolist()}).map(tokenize, batched=True)
    p2 = stage2_trainer.predict(eval_stage2_ds).predictions
    p2_ids = np.argmax(p2, axis=-1)
    p2_labels = le2.inverse_transform(p2_ids)
    y_pred[~pred_is_other] = p2_labels

y_true = eval_use["department"].astype(str).values

print("\n=== Two-stage FINAL EVAL on eval_df (Department) ===")
print("Accuracy       :", accuracy_score(y_true, y_pred))
print("Macro Precision:", precision_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro Recall   :", recall_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro F1       :", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("Weighted F1    :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Per-row preview: gold label, prediction, stage-1 "Other" probability, and correctness.
preview = eval_use[["title", "department"]].copy()
preview["pred"] = y_pred
preview["p_other"] = p1_prob_other
preview["correct"] = preview["department"] == preview["pred"]
display(preview.head(50))