Zelle 1: Install

In [23]:
# !pip -q install -U transformers datasets accelerate scikit-learn pandas

Zelle 2: Imports

In [24]:
import json
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

Zelle 3: Pfade + Modell

In [25]:
DEPT_CSV = "../data/department-v2.csv"
CV_ANN   = "../data/linkedin-cvs-annotated.json"

MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_LEN = 32
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Device:", DEVICE)
if DEVICE == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

Device: cuda
GPU: NVIDIA GeForce RTX 3080 Laptop GPU


Zelle 4: Trainingsdaten laden (Lookup CSV)

In [26]:
dept_df = pd.read_csv(DEPT_CSV)
dept_df = dept_df.dropna(subset=["text", "label"]).copy()
dept_df["text"] = dept_df["text"].astype(str).str.strip()
dept_df["label"] = dept_df["label"].astype(str).str.strip()

print("Train rows:", len(dept_df))
display(dept_df.head())

Train rows: 10145


Unnamed: 0,text,label
0,Adjoint directeur communication,Marketing
1,Advisor Strategy and Projects,Project Management
2,Beratung & Projekte,Project Management
3,Beratung & Projektmanagement,Project Management
4,Beratung und Projektmanagement kommunale Partner,Project Management


Zelle 5: Eval-Daten laden (annotated JSON) und flatten + ACTIVE

In [27]:
with open(CV_ANN, "r", encoding="utf-8") as f:
    ann = json.load(f)

# ann: list[list[dict]] -> flatten
positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

# ACTIVE
eval_df["status"] = eval_df["status"].astype(str).str.upper()
eval_df = eval_df[eval_df["status"] == "ACTIVE"].copy()

# position -> title
eval_df["title"] = eval_df["position"].astype(str).str.strip()

# labels clean
eval_df["department"] = eval_df["department"].astype(str).str.strip()

eval_df = eval_df[["title", "department"]].dropna().copy()

print("Eval ACTIVE rows:", len(eval_df))
display(eval_df.head())

Eval ACTIVE rows: 623


Unnamed: 0,title,department
0,Prokurist,Other
1,CFO,Other
2,Betriebswirtin,Other
3,Prokuristin,Other
4,CFO,Other


Zelle 6: Klassenverteilung (Train vs Eval)

In [28]:
def show_dist(series, name):
    vc = series.value_counts()
    df = pd.DataFrame({"count": vc, "pct": (vc / vc.sum() * 100).round(2)})
    print(f"\n--- {name} ---")
    display(df)

show_dist(dept_df["label"], "Department TRAIN (lookup)")
show_dist(eval_df["department"], "Department EVAL (annotated ACTIVE)")


--- Department TRAIN (lookup) ---


Unnamed: 0_level_0,count,pct
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Marketing,4295,42.34
Sales,3328,32.8
Information Technology,1305,12.86
Business Development,620,6.11
Project Management,201,1.98
Consulting,167,1.65
Administrative,83,0.82
Other,42,0.41
Purchasing,40,0.39
Customer Support,33,0.33



--- Department EVAL (annotated ACTIVE) ---


Unnamed: 0_level_0,count,pct
department,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,344,55.22
Information Technology,62,9.95
Sales,46,7.38
Consulting,39,6.26
Project Management,39,6.26
Marketing,22,3.53
Business Development,20,3.21
Human Resources,16,2.57
Purchasing,15,2.41
Administrative,14,2.25


Zelle 7: LabelEncoder + Train/Val Split (nur aus Lookup)

In [29]:
le = LabelEncoder()
dept_df["y"] = le.fit_transform(dept_df["label"])

# stratified split nur aus Lookup
train_part, val_part = train_test_split(
    dept_df,
    test_size=0.2,
    random_state=SEED,
    stratify=dept_df["y"]
)

print("Train split:", len(train_part), "Val split:", len(val_part))
print("Num classes:", len(le.classes_))
print("Classes:", list(le.classes_))

Train split: 8116 Val split: 2029
Num classes: 11
Classes: ['Administrative', 'Business Development', 'Consulting', 'Customer Support', 'Human Resources', 'Information Technology', 'Marketing', 'Other', 'Project Management', 'Purchasing', 'Sales']


Zelle 8: Oversampling + Verteilung zeigen

In [30]:
def oversample_to_max(df, label_col="y", random_state=42):
    vc = df[label_col].value_counts()
    max_n = vc.max()

    parts = []
    for cls, n in vc.items():
        df_c = df[df[label_col] == cls]
        if n < max_n:
            df_c = df_c.sample(max_n, replace=True, random_state=random_state)
        parts.append(df_c)

    return pd.concat(parts).sample(frac=1, random_state=random_state).reset_index(drop=True)

# Oversample NUR den Train-Split (nicht val, nicht eval)
train_part_os = oversample_to_max(train_part, label_col="y", random_state=SEED)

print("\n--- Train BEFORE oversampling ---")
display(train_part["label"].value_counts())

print("\n--- Train AFTER oversampling ---")
display(train_part_os["label"].value_counts())


--- Train BEFORE oversampling ---


label
Marketing                 3436
Sales                     2662
Information Technology    1044
Business Development       496
Project Management         161
Consulting                 134
Administrative              66
Other                       34
Purchasing                  32
Customer Support            26
Human Resources             25
Name: count, dtype: int64


--- Train AFTER oversampling ---


label
Sales                     3436
Information Technology    3436
Consulting                3436
Administrative            3436
Human Resources           3436
Purchasing                3436
Customer Support          3436
Other                     3436
Business Development      3436
Project Management        3436
Marketing                 3436
Name: count, dtype: int64

Zelle 9: Tokenizer + Dataset Builder

In [31]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def make_ds(df, text_col, y_col=None):
    data = {"text": df[text_col].astype(str).tolist()}
    if y_col is not None:
        data["labels"] = df[y_col].astype(int).tolist()
    ds = Dataset.from_dict(data)

    def tok(batch):
        return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

    return ds.map(tok, batched=True)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_ds = make_ds(train_part_os, "text", "y")
val_ds   = make_ds(val_part, "text", "y")

Map:   0%|          | 0/37796 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Zelle 10: Training-Runner (für LR Sweep) mit Early Stopping

In [32]:
def run_train(lr, max_epochs=20, batch_size=64, patience=3):
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(le.classes_)
    )

    args = TrainingArguments(
        output_dir=f"./out_dept_os_lr{lr}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,

        learning_rate=lr,
        lr_scheduler_type="linear",
        warmup_ratio=0.06,
        weight_decay=0.01,

        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=max_epochs,

        logging_steps=50,
        seed=SEED,
        report_to="none",

        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1_macro": f1_score(labels, preds, average="macro"),
            "f1_weighted": f1_score(labels, preds, average="weighted"),
        }

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,      # train_ds nutzt jetzt train_part_os
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
    )

    trainer.train()
    best_val = trainer.evaluate()
    return trainer, best_val

Zelle 11: LR Sweep (nur auf Val aus Lookup)

In [33]:
lrs = [3e-5, 2e-5]
results = []
trainers = {}

for lr in lrs:
    print(f"\n### LR = {lr} ###")
    trainer, m = run_train(lr=lr, max_epochs=20, batch_size=64, patience=3)
    trainers[lr] = trainer
    results.append({
        "lr": lr,
        "val_f1_macro": m.get("eval_f1_macro"),
        "val_accuracy": m.get("eval_accuracy"),
        "val_f1_weighted": m.get("eval_f1_weighted"),
    })

res_df = pd.DataFrame(results).sort_values("val_f1_macro", ascending=False)
display(res_df)

best_lr = res_df.iloc[0]["lr"]
print("Best LR:", best_lr)
best_trainer = trainers[best_lr]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



### LR = 3e-05 ###


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.0266,0.034599,0.993593,0.980718,0.99362
2,0.0075,0.028935,0.994579,0.985954,0.99455
3,0.0006,0.023295,0.997536,0.992126,0.997534
4,0.0003,0.021469,0.997043,0.989809,0.997035
5,0.0004,0.028836,0.997043,0.991233,0.997071
6,0.0002,0.029115,0.99655,0.990802,0.99658



### LR = 2e-05 ###


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.0403,0.044767,0.9931,0.974282,0.993157
2,0.0073,0.029501,0.995071,0.985832,0.995043
3,0.0018,0.033863,0.995071,0.981257,0.995113
4,0.0008,0.023423,0.99655,0.990582,0.996568
5,0.0011,0.018313,0.997536,0.996878,0.997534
6,0.0003,0.020028,0.99655,0.991871,0.996575
7,0.0004,0.028448,0.995071,0.989891,0.995124
8,0.0041,0.027904,0.995564,0.981077,0.995587


Unnamed: 0,lr,val_f1_macro,val_accuracy,val_f1_weighted
1,2e-05,0.996878,0.997536,0.997534
0,3e-05,0.992126,0.997536,0.997534


Best LR: 2e-05


Zelle 12: Finale Evaluation auf eval_df (nur messen, kein Training)


In [34]:
# eval_df Labels auf Train-Labelset filtern
eval_use = eval_df[eval_df["department"].isin(set(le.classes_))].copy()
print("Eval after label filter:", len(eval_use))

# encode eval labels
y_eval = le.transform(eval_use["department"].astype(str))

# dataset for eval
eval_use_ds = Dataset.from_dict({"text": eval_use["title"].astype(str).tolist(), "labels": y_eval.tolist()})

def tok(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

eval_use_ds = eval_use_ds.map(tok, batched=True)

pred = best_trainer.predict(eval_use_ds)
pred_ids = np.argmax(pred.predictions, axis=-1)
pred_labels = le.inverse_transform(pred_ids)

y_true = eval_use["department"].astype(str).values
y_pred = pred_labels.astype(str)

print("\n=== FINAL EVAL on eval_df (Department) ===")
print("Accuracy       :", accuracy_score(y_true, y_pred))
print("Macro Precision:", precision_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro Recall   :", recall_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro F1       :", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("Weighted F1    :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# optional: ein paar predictions anschauen
out_preview = eval_use[["title", "department"]].copy()
out_preview["pred"] = y_pred
out_preview["correct"] = out_preview["department"] == out_preview["pred"]
display(out_preview.head(30))

Eval after label filter: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== FINAL EVAL on eval_df (Department) ===
Accuracy       : 0.2857142857142857
Macro Precision: 0.49103244278239677
Macro Recall   : 0.4734592088889656
Macro F1       : 0.388105814546846
Weighted F1    : 0.2153784453613295

Classification report:
                        precision    recall  f1-score   support

        Administrative     0.0789    0.4286    0.1333        14
  Business Development     0.3158    0.3000    0.3077        20
            Consulting     0.2151    0.5128    0.3030        39
      Customer Support     1.0000    0.3333    0.5000         6
       Human Resources     0.8000    0.5000    0.6154        16
Information Technology     0.2249    0.9032    0.3601        62
             Marketing     0.1636    0.4091    0.2338        22
                 Other     0.6667    0.0174    0.0340       344
    Project Management     0.4697    0.7949    0.5905        39
            Purchasing     0.6667    0.4000    0.5000        15
                 Sales     0.8000    0.6087    

Unnamed: 0,title,department,pred,correct
0,Prokurist,Other,Project Management,False
1,CFO,Other,Information Technology,False
2,Betriebswirtin,Other,Information Technology,False
3,Prokuristin,Other,Information Technology,False
4,CFO,Other,Information Technology,False
6,Solutions Architect,Information Technology,Information Technology,True
14,Medizintechnik Beratung,Consulting,Information Technology,False
17,Director expansión de negocio.,Business Development,Information Technology,False
18,Gerente comercial,Sales,Marketing,False
19,Administrador Unico,Administrative,Information Technology,False
