Zelle 1: Install

In [19]:
# !pip -q install -U transformers datasets accelerate scikit-learn pandas

Zelle 2: Imports

In [20]:
import json
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

Zelle 3: Pfade + Modell

In [21]:
# --- Input files (paths are relative to the notebook's working directory) ---
DEPT_CSV = "../data/department-v2.csv"
CV_ANN = "../data/linkedin-cvs-annotated.json"

# --- Model / tokenisation / reproducibility settings ---
MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_LEN = 32
SEED = 42

# --- Hardware selection: use the GPU when torch can see one ---
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("Device:", DEVICE)
if DEVICE == "cuda":
    # Report which GPU (index 0) is being used.
    print("GPU:", torch.cuda.get_device_name(0))

Device: cuda
GPU: NVIDIA GeForce RTX 3080 Laptop GPU


Zelle 4: Trainingsdaten laden (Lookup CSV)

In [22]:
# Load the lookup table, discard rows with a missing text or label, and
# normalise both columns to whitespace-trimmed strings.
dept_df = (
    pd.read_csv(DEPT_CSV)
    .dropna(subset=["text", "label"])
    .assign(
        text=lambda frame: frame["text"].astype(str).str.strip(),
        label=lambda frame: frame["label"].astype(str).str.strip(),
    )
)

print("Train rows:", len(dept_df))
display(dept_df.head())

Train rows: 10145


Unnamed: 0,text,label
0,Adjoint directeur communication,Marketing
1,Advisor Strategy and Projects,Project Management
2,Beratung & Projekte,Project Management
3,Beratung & Projektmanagement,Project Management
4,Beratung und Projektmanagement kommunale Partner,Project Management


Zelle 5: Eval-Daten laden (annotated JSON) und flatten + ACTIVE

In [23]:
with open(CV_ANN, "r", encoding="utf-8") as f:
    ann = json.load(f)

# `ann` is iterated as a list of CVs, each a list of position dicts -> flatten
# to one row per position.
positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

# Keep only positions whose status is ACTIVE (case-insensitive).
eval_df["status"] = eval_df["status"].astype(str).str.upper()
eval_df = eval_df[eval_df["status"] == "ACTIVE"].copy()

# Drop missing values BEFORE casting to str: astype(str) turns NaN/None into
# the literal strings "nan"/"None", which a later dropna() would never remove.
eval_df = eval_df.dropna(subset=["position", "department"]).copy()

# position -> title (trimmed); department label trimmed the same way.
eval_df["title"] = eval_df["position"].astype(str).str.strip()
eval_df["department"] = eval_df["department"].astype(str).str.strip()

eval_df = eval_df[["title", "department"]].copy()

print("Eval ACTIVE rows:", len(eval_df))
display(eval_df.head())

Eval ACTIVE rows: 623


Unnamed: 0,title,department
0,Prokurist,Other
1,CFO,Other
2,Betriebswirtin,Other
3,Prokuristin,Other
4,CFO,Other


Zelle 6: Klassenverteilung (Train vs Eval)

In [24]:
def show_dist(series, name):
    """Print a header and display the value distribution of `series`.

    The displayed table has one row per distinct value with two columns:
    `count` (occurrences) and `pct` (share of the total in percent, rounded
    to two decimals).
    """
    table = series.value_counts().to_frame("count")
    table["pct"] = (table["count"] / table["count"].sum() * 100).round(2)
    print(f"\n--- {name} ---")
    display(table)

# Compare the label distribution of the lookup (training) data with the
# distribution of the annotated evaluation data.
show_dist(series=dept_df["label"], name="Department TRAIN (lookup)")
show_dist(series=eval_df["department"], name="Department EVAL (annotated ACTIVE)")


--- Department TRAIN (lookup) ---


Unnamed: 0_level_0,count,pct
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Marketing,4295,42.34
Sales,3328,32.8
Information Technology,1305,12.86
Business Development,620,6.11
Project Management,201,1.98
Consulting,167,1.65
Administrative,83,0.82
Other,42,0.41
Purchasing,40,0.39
Customer Support,33,0.33



--- Department EVAL (annotated ACTIVE) ---


Unnamed: 0_level_0,count,pct
department,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,344,55.22
Information Technology,62,9.95
Sales,46,7.38
Consulting,39,6.26
Project Management,39,6.26
Marketing,22,3.53
Business Development,20,3.21
Human Resources,16,2.57
Purchasing,15,2.41
Administrative,14,2.25


Zelle 7: LabelEncoder + Train/Val Split (nur aus Lookup)

In [25]:
# Map department names to integer ids; the encoder is fitted on the lookup labels.
le = LabelEncoder().fit(dept_df["label"])
dept_df["y"] = le.transform(dept_df["label"])

# Stratified 80/20 split, drawn from the lookup data only.
train_part, val_part = train_test_split(
    dept_df, test_size=0.2, random_state=SEED, stratify=dept_df["y"]
)

print("Train split:", len(train_part), "Val split:", len(val_part))
print("Num classes:", len(le.classes_))
print("Classes:", list(le.classes_))

Train split: 8116 Val split: 2029
Num classes: 11
Classes: ['Administrative', 'Business Development', 'Consulting', 'Customer Support', 'Human Resources', 'Information Technology', 'Marketing', 'Other', 'Project Management', 'Purchasing', 'Sales']


Zelle 8: Class Weights berechnen + “nach balancing” anzeigen

In [26]:
def compute_class_weights(y_int, num_classes):
    """Return per-class sample counts and inverse-frequency class weights.

    For every class c that occurs in `y_int` the weight is
    N / (K * n_c), where N is the number of samples, K is `num_classes`
    and n_c is the number of samples of class c.

    Classes that do not occur at all get weight 0.0. The plain formula would
    divide by zero and yield `inf` (with a RuntimeWarning) for them.

    Args:
        y_int: 1-D array-like of non-negative integer class ids.
        num_classes: total number of classes K (minimum length of the result).

    Returns:
        (counts, weights): two 1-D numpy arrays of equal length.
    """
    counts = np.bincount(y_int, minlength=num_classes)
    total = counts.sum()
    weights = np.zeros(len(counts), dtype=np.float64)
    present = counts > 0
    # N / (K * n_c), evaluated only where n_c > 0.
    weights[present] = total / (num_classes * counts[present])
    return counts, weights

# Class counts and loss weights for the training split.
counts, weights = compute_class_weights(train_part["y"].values, len(le.classes_))

# One row per class: raw count, share in percent, weight, and count * weight
# (the effective contribution of each class after weighting).
balance_df = pd.DataFrame({"class": le.classes_, "count": counts})
balance_df["pct"] = (counts / counts.sum() * 100).round(2)
balance_df["weight"] = weights.round(4)
balance_df["count_x_weight"] = (balance_df["count"] * balance_df["weight"]).round(4)

print("\n--- Train distribution + class weights (Loss-Balancing) ---")
display(balance_df.sort_values("count", ascending=False))
print("Sum(count_x_weight):", balance_df["count_x_weight"].sum())


--- Train distribution + class weights (Loss-Balancing) ---


Unnamed: 0,class,count,pct,weight,count_x_weight
6,Marketing,3436,42.34,0.2147,737.7092
10,Sales,2662,32.8,0.2772,737.9064
5,Information Technology,1044,12.86,0.7067,737.7948
1,Business Development,496,6.11,1.4875,737.8
8,Project Management,161,1.98,4.5827,737.8147
2,Consulting,134,1.65,5.5061,737.8174
0,Administrative,66,0.81,11.1791,737.8206
7,Other,34,0.42,21.7005,737.817
9,Purchasing,32,0.39,23.0568,737.8176
3,Customer Support,26,0.32,28.3776,737.8176


Sum(count_x_weight): 8115.9328


Zelle 9: Weighted Trainer (CrossEntropy mit weights)

In [27]:
class WeightedTrainer(Trainer):
    """Trainer that replaces the loss with a class-weighted cross-entropy.

    Args:
        class_weights: optional tensor passed as `weight` to
            `torch.nn.CrossEntropyLoss`. When None, an unweighted
            cross-entropy is used.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute cross-entropy between the model logits and `inputs["labels"]`.

        Returns the loss, or `(loss, outputs)` when `return_outputs` is True.
        Extra keyword arguments are accepted and ignored.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # `class_weights` defaults to None; calling `.to()` on it would raise
        # AttributeError, so fall back to the unweighted loss in that case.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

Zelle 10: Tokenizer + Dataset Builder

In [28]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def make_ds(df, text_col, y_col=None):
    """Build a tokenised `Dataset` from a DataFrame.

    Args:
        df: source DataFrame.
        text_col: name of the column holding the input text (cast to str).
        y_col: optional name of the integer label column; when given, it is
            stored under the key "labels".

    Returns:
        The dataset with the tokenizer output added by a batched `map`;
        texts are truncated to at most MAX_LEN tokens.
    """
    columns = dict(text=df[text_col].astype(str).tolist())
    if y_col is not None:
        columns["labels"] = df[y_col].astype(int).tolist()

    def _encode(batch):
        # No padding here; only truncation to MAX_LEN.
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

    raw_ds = Dataset.from_dict(columns)
    return raw_ds.map(_encode, batched=True)

# Collator handed to the Trainer as `data_collator`.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenised train / validation datasets (both come from the lookup split).
train_ds, val_ds = (make_ds(part, "text", "y") for part in (train_part, val_part))

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Zelle 11: Training-Runner (für LR Sweep) mit Early Stopping

In [29]:
def run_train(lr, max_epochs=20, batch_size=64, patience=3):
    """Fine-tune MODEL_NAME with class-weighted loss and early stopping.

    Args:
        lr: peak learning rate (linear schedule with 6% warmup).
        max_epochs: upper bound on training epochs.
        batch_size: per-device batch size for both training and evaluation.
        patience: number of evaluations without improvement of macro-F1
            before early stopping triggers.

    Returns:
        (trainer, best_val): the trainer and the metrics dict from
        `trainer.evaluate()` on `val_ds`, run after training with
        `load_best_model_at_end=True`.
    """
    import inspect

    # Seed BEFORE creating the model: the classification head is newly
    # (randomly) initialised, and nothing else in this function seeds the RNG
    # before `from_pretrained` runs, so without this the sweep is not
    # reproducible across runs.
    torch.manual_seed(SEED)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(le.classes_)
    )
    args = TrainingArguments(
        output_dir=f"./out_dept_lr{lr}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        learning_rate=lr,
        lr_scheduler_type="linear",
        warmup_ratio=0.06,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=max_epochs,
        logging_steps=50,
        seed=SEED,
        report_to="none",
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
    )

    def compute_metrics(eval_pred):
        # eval_pred unpacks to (logits, labels); predictions are the argmax class.
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, preds),
            # zero_division=0 keeps the numeric result (0 for classes never
            # predicted) and matches the final evaluation cell.
            "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
            "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
        }

    cw = torch.tensor(weights, dtype=torch.float)

    # Newer transformers releases deprecate the `tokenizer` kwarg of Trainer in
    # favour of `processing_class`; use whichever the installed version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tok_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = WeightedTrainer(
        class_weights=cw,
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)],
        **{tok_kwarg: tokenizer},
    )
    trainer.train()
    best_val = trainer.evaluate()
    return trainer, best_val

Zelle 12: LR Sweep (nur auf Val aus Lookup)

In [30]:
# Learning rates to compare; model selection uses the lookup validation split only.
lrs = [3e-5, 2e-5]
results, trainers = [], {}

for lr in lrs:
    print(f"\n### LR = {lr} ###")
    trainer, m = run_train(lr=lr, max_epochs=20, batch_size=64, patience=3)
    trainers[lr] = trainer
    # Keep the three validation metrics, renamed from "eval_*" to "val_*".
    results.append({
        "lr": lr,
        **{f"val_{key}": m.get(f"eval_{key}") for key in ("f1_macro", "accuracy", "f1_weighted")},
    })

# Rank runs by validation macro-F1, best first.
res_df = pd.DataFrame(results).sort_values("val_f1_macro", ascending=False)
display(res_df)

best_lr = res_df["lr"].iloc[0]
print("Best LR:", best_lr)
best_trainer = trainers[best_lr]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



### LR = 3e-05 ###


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,2.2435,1.279612,0.914243,0.695666,0.92185
2,0.3424,0.079221,0.9931,0.968853,0.993208
3,0.0738,0.032554,0.994579,0.988563,0.994624
4,0.0132,0.083487,0.996057,0.984666,0.996058
5,0.011,0.018058,0.997043,0.996802,0.997045
6,0.0051,0.017628,0.997536,0.992982,0.997544
7,0.0045,0.034552,0.99655,0.994189,0.996535
8,0.0029,0.010141,0.998521,0.998677,0.998523
9,0.0086,0.011708,0.998521,0.998677,0.998523
10,0.0026,0.016552,0.998029,0.996916,0.998029



### LR = 2e-05 ###


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,2.3168,1.856921,0.817644,0.475593,0.827041
2,0.6542,0.276822,0.977329,0.922316,0.978644
3,0.1614,0.054738,0.994579,0.984765,0.994583
4,0.0396,0.029502,0.995564,0.987375,0.995585
5,0.0275,0.018285,0.997536,0.998021,0.997542
6,0.0151,0.021225,0.997043,0.994248,0.997041
7,0.0088,0.026569,0.997536,0.995559,0.997531
8,0.0064,0.014868,0.997536,0.994679,0.997532


Unnamed: 0,lr,val_f1_macro,val_accuracy,val_f1_weighted
0,3e-05,0.998677,0.998521,0.998523
1,2e-05,0.998021,0.997536,0.997542


Best LR: 3e-05


Zelle 13: Finale Evaluation auf eval_df (nur messen, kein Training)


In [31]:
# Restrict eval_df to labels the encoder was fitted on.
eval_use = eval_df[eval_df["department"].isin(set(le.classes_))].copy()
print("Eval after label filter:", len(eval_use))

# Encode eval labels with the training label encoder.
y_eval = le.transform(eval_use["department"].astype(str))

# Build the eval dataset with the same helper used for train/val, so all three
# splits share one tokenisation path (no second, duplicated `tok` helper).
eval_use_ds = make_ds(eval_use.assign(y=y_eval), "title", "y")

pred = best_trainer.predict(eval_use_ds)
pred_ids = np.argmax(pred.predictions, axis=-1)
pred_labels = le.inverse_transform(pred_ids)

y_true = eval_use["department"].astype(str).values
y_pred = pred_labels.astype(str)

print("\n=== FINAL EVAL on eval_df (Department) ===")
print("Accuracy       :", accuracy_score(y_true, y_pred))
print("Macro Precision:", precision_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro Recall   :", recall_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro F1       :", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("Weighted F1    :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Optional: inspect a few predictions next to the gold labels.
out_preview = eval_use[["title", "department"]].copy()
out_preview["pred"] = y_pred
out_preview["correct"] = out_preview["department"] == out_preview["pred"]
display(out_preview.head(30))

Eval after label filter: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== FINAL EVAL on eval_df (Department) ===
Accuracy       : 0.27929373996789725
Macro Precision: 0.39619620376479714
Macro Recall   : 0.458734701700878
Macro F1       : 0.339676060412491
Weighted F1    : 0.20003423272856632

Classification report:
                        precision    recall  f1-score   support

        Administrative     0.0377    0.2857    0.0667        14
  Business Development     0.2727    0.3000    0.2857        20
            Consulting     0.1802    0.5128    0.2667        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.2963    0.5000    0.3721        16
Information Technology     0.3006    0.7903    0.4356        62
             Marketing     0.2821    0.5000    0.3607        22
                 Other     0.5714    0.0116    0.0228       344
    Project Management     0.5082    0.7949    0.6200        39
            Purchasing     0.4375    0.4667    0.4516        15
                 Sales     0.4714    0.7174   

Unnamed: 0,title,department,pred,correct
0,Prokurist,Other,Project Management,False
1,CFO,Other,Information Technology,False
2,Betriebswirtin,Other,Administrative,False
3,Prokuristin,Other,Project Management,False
4,CFO,Other,Information Technology,False
6,Solutions Architect,Information Technology,Information Technology,True
14,Medizintechnik Beratung,Consulting,Business Development,False
17,Director expansión de negocio.,Business Development,Marketing,False
18,Gerente comercial,Sales,Marketing,False
19,Administrador Unico,Administrative,Information Technology,False
