Zelle 1: Install

In [2]:
#!pip -q install -U transformers datasets accelerate scikit-learn pandas

Zelle 2: Imports

In [3]:
import json
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

Zelle 3: Pfade + Modell

In [4]:
# --- Input files: department lookup table (training) and annotated CVs (evaluation) ---
DEPT_CSV = "../data/department-v2.csv"
CV_ANN   = "../data/linkedin-cvs-annotated.json"

# --- Model, tokenisation and seed settings ---
MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_LEN = 32  # handed to the tokenizer as max_length (with truncation) in tokenize()
SEED = 42

# Report whether CUDA is visible and, if so, the name of device 0.
_cuda_ok = torch.cuda.is_available()
print("CUDA:", _cuda_ok)
if _cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))

CUDA: True
GPU: NVIDIA GeForce RTX 3080 Laptop GPU


Zelle 4: Train Lookup laden

In [5]:
# Read the lookup table, discard rows missing text or label, then normalise
# both columns to whitespace-trimmed strings in a single chained expression.
dept_df = (
    pd.read_csv(DEPT_CSV)
    .dropna(subset=["text", "label"])
    .assign(
        text=lambda frame: frame["text"].astype(str).str.strip(),
        label=lambda frame: frame["label"].astype(str).str.strip(),
    )
)

# Quick sanity view: size, first rows, and the label distribution.
print("Train rows:", len(dept_df))
display(dept_df.head())
print(dept_df["label"].value_counts())

Train rows: 10145


Unnamed: 0,text,label
0,Adjoint directeur communication,Marketing
1,Advisor Strategy and Projects,Project Management
2,Beratung & Projekte,Project Management
3,Beratung & Projektmanagement,Project Management
4,Beratung und Projektmanagement kommunale Partner,Project Management


label
Marketing                 4295
Sales                     3328
Information Technology    1305
Business Development       620
Project Management         201
Consulting                 167
Administrative              83
Other                       42
Purchasing                  40
Customer Support            33
Human Resources             31
Name: count, dtype: int64


Zelle 5: Eval laden (nur messen)

In [6]:
# Load the annotated CVs. The comprehension below treats the JSON as a list of
# CVs, each of which is a list of position records.
with open(CV_ANN, "r", encoding="utf-8") as f:
    ann = json.load(f)

# Flatten all positions of all CVs into one table.
positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

# Keep only positions whose status is ACTIVE (case-insensitive).
eval_df["status"] = eval_df["status"].astype(str).str.upper()
eval_df = eval_df[eval_df["status"] == "ACTIVE"].copy()

# Drop rows with a missing title or department BEFORE casting to str.
# Casting first would turn missing values into the literal strings
# "nan"/"None", which a later dropna() can no longer detect.
eval_df = eval_df.dropna(subset=["position", "department"]).copy()

eval_df["title"] = eval_df["position"].astype(str).str.strip()
eval_df["department"] = eval_df["department"].astype(str).str.strip()
eval_df = eval_df[["title", "department"]].copy()

print("Eval rows:", len(eval_df))
print(eval_df["department"].value_counts())
display(eval_df.head())

Eval rows: 623
department
Other                     344
Information Technology     62
Sales                      46
Consulting                 39
Project Management         39
Marketing                  22
Business Development       20
Human Resources            16
Purchasing                 15
Administrative             14
Customer Support            6
Name: count, dtype: int64


Unnamed: 0,title,department
0,Prokurist,Other
1,CFO,Other
2,Betriebswirtin,Other
3,Prokuristin,Other
4,CFO,Other


Zelle 6: Tokenizer + Helper

In [7]:
# Tokenizer loaded for the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch``.

    Truncation is enabled with ``max_length=MAX_LEN``; no padding is requested
    in this call.

    Args:
        batch: Mapping with a ``"text"`` key; ``make_ds`` passes batches from
            ``Dataset.map(..., batched=True)``.

    Returns:
        The object returned by calling ``tokenizer`` on those texts.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LEN)

# Collator built from the same tokenizer; passed to the trainers as data_collator.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

def make_ds(df, text_col, label_col=None):
    """Turn DataFrame columns into a tokenized ``Dataset``.

    Args:
        df: DataFrame holding the data.
        text_col: Name of the column with the input text (cast to ``str``).
        label_col: Optional name of the label column (cast to ``int``). When
            given, the values are stored under the key ``"labels"``.

    Returns:
        A ``Dataset`` with a ``"text"`` column (plus ``"labels"`` if requested)
        after ``tokenize`` has been mapped over it in batches.
    """
    columns = {"text": df[text_col].astype(str).tolist()}
    if label_col is not None:
        columns["labels"] = df[label_col].astype(int).tolist()
    return Dataset.from_dict(columns).map(tokenize, batched=True)

def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and weighted F1 from an evaluation result.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1_macro"`` and ``"f1_weighted"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average="macro")
    weighted_f1 = f1_score(labels, predicted, average="weighted")
    return {"accuracy": acc, "f1_macro": macro_f1, "f1_weighted": weighted_f1}

def make_trainer(task_name, num_labels, train_ds, val_ds, lr=2e-5, batch_size=512, epochs=20, patience=3):
    """Build a ``Trainer`` around a freshly loaded sequence-classification model.

    Args:
        task_name: Suffix for the output directory (``./out_{task_name}``).
        num_labels: Number of target classes for the classification head.
        train_ds: Tokenized training dataset.
        val_ds: Tokenized validation dataset (evaluated once per epoch).
        lr: Learning rate.
        batch_size: Per-device batch size for both training and evaluation.
        epochs: Maximum number of training epochs.
        patience: Early-stopping patience, in evaluations without improvement
            of ``f1_macro``.

    Returns:
        A configured ``Trainer``; training is not started here.
    """
    # Seed before creating the model: the classification head is newly
    # initialised, and per the Transformers docs the `seed` argument is only
    # applied at the beginning of training, i.e. after the model already exists.
    torch.manual_seed(SEED)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels
    )

    args = TrainingArguments(
        output_dir=f"./out_{task_name}",
        # `eval_strategy` is the keyword the Stage 1 / Stage 2 cells already use;
        # the older `evaluation_strategy` spelling is deprecated.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        # Restore the checkpoint with the best macro F1 when training ends.
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        learning_rate=lr,
        lr_scheduler_type="linear",
        warmup_ratio=0.06,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        logging_steps=50,
        seed=SEED,
        report_to="none",
        # Mixed precision only when a CUDA device is available.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
    )
    return trainer

In [8]:
def compute_class_weights(y_int, num_classes):
    """Return inverse-frequency class weights.

    For each class ``c`` the weight is ``N / (num_classes * count_c)``, where
    ``N`` is the total number of labels and ``count_c`` the number of labels
    equal to ``c``.

    Args:
        y_int: Non-negative integer class labels (as accepted by ``np.bincount``).
        num_classes: Number of classes; also the minimum length of the result.

    Returns:
        NumPy array of per-class weights, indexed by class id.
    """
    per_class = np.bincount(y_int, minlength=num_classes)
    n_samples = per_class.sum()
    return n_samples / (num_classes * per_class)

class WeightedTrainer(Trainer):
    """``Trainer`` that computes its loss with (optionally weighted) cross-entropy.

    Args:
        class_weights: Optional 1-D tensor with one weight per class. When
            ``None`` (the default), plain unweighted cross-entropy is used.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and return the cross-entropy loss on its logits.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: If true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Accepted and ignored, so extra keyword arguments passed by
                the caller do not break this override.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # `class_weights` defaults to None; only move it to the logits' device
        # when it was actually provided.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

STAGE 1: Other vs NotOther

Zelle 7: Stage1 Dataset bauen (Binary)

In [9]:
# Binary target for Stage 1: 1 = "Other", 0 = every other department.
stage1_df = dept_df.assign(bin_label=dept_df["label"].eq("Other").astype(int))

# 80/20 split, stratified on the binary target.
train1, val1 = train_test_split(
    stage1_df,
    stratify=stage1_df["bin_label"],
    random_state=SEED,
    test_size=0.2,
)

# Tokenized datasets for both splits (train first, then validation).
train1_ds, val1_ds = (make_ds(part, "text", "bin_label") for part in (train1, val1))

print("Stage1 train:", len(train1), "val:", len(val1))
print(train1["bin_label"].value_counts())

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Stage1 train: 8116 val: 2029
bin_label
0    8082
1      34
Name: count, dtype: int64


Zelle 8: Stage1 train (optional LR Sweep)

In [10]:
# Stage 1 training (Other vs NotOther) with class weights

# Inverse-frequency weights computed on the training split only.
w1 = torch.tensor(compute_class_weights(train1["bin_label"].values, 2), dtype=torch.float)
print("Stage1 class weights:", w1.tolist())
print("Stage1 train label counts:\n", train1["bin_label"].value_counts())

# Seed before creating the model: the classification head is newly initialised,
# and per the Transformers docs the `seed` argument below is only applied at the
# beginning of training, i.e. after the model already exists.
torch.manual_seed(SEED)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

args1 = TrainingArguments(
    output_dir="./out_dept_stage1_other_vs_not",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Restore the checkpoint with the best macro F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=20,
    logging_steps=100,
    seed=SEED,
    report_to="none",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
)

stage1_trainer = WeightedTrainer(
    class_weights=w1,
    model=model1,
    args=args1,
    train_dataset=train1_ds,
    eval_dataset=val1_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    # Stop after 3 consecutive evaluations without a better macro F1.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

stage1_trainer.train()
print("Stage1 val:", stage1_trainer.evaluate())

Stage1 class weights: [0.5021034479141235, 119.35294342041016]
Stage1 train label counts:
 bin_label
0    8082
1      34
Name: count, dtype: int64


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.656388,0.630853,0.395517,0.769712
2,No log,0.328948,0.991129,0.733058,0.993457
3,No log,0.056958,0.993593,0.774249,0.995019
4,No log,0.309109,0.998029,0.874505,0.998029
5,No log,0.080409,0.997536,0.867802,0.997729
6,No log,0.268624,0.999507,0.966543,0.999491
7,0.228900,0.379241,0.999014,0.928324,0.998944
8,0.228900,0.329879,0.999507,0.966543,0.999491
9,0.228900,0.276964,0.999507,0.966543,0.999491


Stage1 val: {'eval_loss': 0.2686242461204529, 'eval_accuracy': 0.9995071463775259, 'eval_f1_macro': 0.9665429961249897, 'eval_f1_weighted': 0.999490778874918, 'eval_runtime': 14.8816, 'eval_samples_per_second': 136.343, 'eval_steps_per_second': 0.269, 'epoch': 9.0}


STAGE 2: Multi-class für NotOther

Zelle 9: Stage2 Dataset bauen (nur NotOther)

In [11]:
# Stage 2 only sees rows whose label is not "Other".
stage2_df = dept_df.loc[dept_df["label"].ne("Other")].copy()

# Integer-encode the remaining department names; le2 is reused at inference
# time to map predicted ids back to names.
le2 = LabelEncoder()
stage2_df["y"] = le2.fit_transform(stage2_df["label"])

# 80/20 split, stratified on the encoded label.
train2, val2 = train_test_split(
    stage2_df,
    stratify=stage2_df["y"],
    random_state=SEED,
    test_size=0.2,
)

# Tokenized datasets for both splits (train first, then validation).
train2_ds, val2_ds = (make_ds(part, "text", "y") for part in (train2, val2))

print("Stage2 train:", len(train2), "val:", len(val2))
print("Stage2 classes:", list(le2.classes_))
print(stage2_df["label"].value_counts())

Map:   0%|          | 0/8082 [00:00<?, ? examples/s]

Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

Stage2 train: 8082 val: 2021
Stage2 classes: ['Administrative', 'Business Development', 'Consulting', 'Customer Support', 'Human Resources', 'Information Technology', 'Marketing', 'Project Management', 'Purchasing', 'Sales']
label
Marketing                 4295
Sales                     3328
Information Technology    1305
Business Development       620
Project Management         201
Consulting                 167
Administrative              83
Purchasing                  40
Customer Support            33
Human Resources             31
Name: count, dtype: int64


Zelle 10: Stage2 train

In [13]:
# Stage 2 training (NotOther multi-class) with class weights

# Inverse-frequency weights computed on the training split only.
w2 = torch.tensor(compute_class_weights(train2["y"].values, len(le2.classes_)), dtype=torch.float)
print("Stage2 class weights (first 10):", w2[:10].tolist())
print("Stage2 train label counts:\n", train2["label"].value_counts().head(20))

# Seed before creating the model: the classification head is newly initialised,
# and per the Transformers docs the `seed` argument below is only applied at the
# beginning of training, i.e. after the model already exists.
torch.manual_seed(SEED)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le2.classes_))

args2 = TrainingArguments(
    output_dir="./out_dept_stage2_notother_multiclass",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Restore the checkpoint with the best macro F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=20,
    logging_steps=50,
    seed=SEED,
    report_to="none",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
)

stage2_trainer = WeightedTrainer(
    class_weights=w2,
    model=model2,
    args=args2,
    train_dataset=train2_ds,
    eval_dataset=val2_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    # Stop after 3 consecutive evaluations without a better macro F1.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

stage2_trainer.train()
print("Stage2 val:", stage2_trainer.evaluate())

Stage2 class weights (first 10): [12.245454788208008, 1.6294355392456055, 6.031343460083008, 31.08461570739746, 32.327999114990234, 0.7741379141807556, 0.23521536588668823, 5.019876003265381, 25.256250381469727, 0.3036063015460968]
Stage2 train label counts:
 label
Marketing                 3436
Sales                     2662
Information Technology    1044
Business Development       496
Project Management         161
Consulting                 134
Administrative              66
Purchasing                  32
Customer Support            26
Human Resources             25
Name: count, dtype: int64


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,2.247615,0.359228,0.189491,0.367547
2,No log,2.020841,0.68481,0.459926,0.675586
3,No log,1.490459,0.856507,0.635104,0.872767
4,2.069800,0.958623,0.919347,0.761394,0.928613
5,2.069800,0.587033,0.938644,0.819278,0.945468
6,2.069800,0.36793,0.958436,0.869515,0.962858
7,0.806600,0.250872,0.97526,0.91697,0.976602
8,0.806600,0.189458,0.983177,0.936132,0.983714
9,0.806600,0.143939,0.989609,0.947587,0.989818
10,0.195000,0.119299,0.991588,0.950453,0.991719


Stage2 val: {'eval_loss': 0.11349191516637802, 'eval_accuracy': 0.9940623453735774, 'eval_f1_macro': 0.9734120618646955, 'eval_f1_weighted': 0.9940742444685896, 'eval_runtime': 17.0433, 'eval_samples_per_second': 118.58, 'eval_steps_per_second': 0.235, 'epoch': 20.0}


INFERENCE PIPELINE (Stage1 -> Stage2)

Zelle 11: Zwei-Stufen Vorhersage auf eval_df + Metriken

In [14]:
# Keep only eval rows whose label the pipeline can produce:
# the Stage 2 classes plus "Other".
valid_labels = set(le2.classes_) | {"Other"}
eval_use = eval_df[eval_df["department"].isin(valid_labels)].copy()
print("Eval used:", len(eval_use))

# Stage 1: probability of the "Other" class (index 1 of the binary head).
eval_stage1_ds = Dataset.from_dict({"text": eval_use["title"].astype(str).tolist()}).map(tokenize, batched=True)
p1 = stage1_trainer.predict(eval_stage1_ds).predictions
p1_prob_other = torch.softmax(torch.tensor(p1), dim=-1)[:, 1].numpy()

# Decision threshold for "Other" (0.5 by default; can be tuned later).
TH = 0.5
pred_is_other = p1_prob_other >= TH

# Stage 2: classify only the rows Stage 1 did NOT flag as "Other".
eval_notother = eval_use.loc[~pred_is_other].copy()
if len(eval_notother) > 0:
    eval_stage2_ds = Dataset.from_dict({"text": eval_notother["title"].astype(str).tolist()}).map(tokenize, batched=True)
    p2 = stage2_trainer.predict(eval_stage2_ds).predictions
    p2_ids = np.argmax(p2, axis=-1)
    p2_labels = le2.inverse_transform(p2_ids)
else:
    # With a low enough TH every row can be routed to "Other"; skip Stage 2
    # instead of running prediction on an empty dataset.
    p2_labels = np.array([], dtype=object)

# Combine: start with "Other" everywhere, then fill in the Stage 2 labels.
y_pred = np.array(["Other"] * len(eval_use), dtype=object)
y_pred[~pred_is_other] = p2_labels

y_true = eval_use["department"].astype(str).values

print("\n=== Two-stage FINAL EVAL on eval_df (Department) ===")
print("Accuracy       :", accuracy_score(y_true, y_pred))
print("Macro Precision:", precision_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro Recall   :", recall_score(y_true, y_pred, average="macro", zero_division=0))
print("Macro F1       :", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("Weighted F1    :", f1_score(y_true, y_pred, average="weighted", zero_division=0))

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Per-row preview: true label, final prediction, Stage 1 "Other" probability.
preview = eval_use[["title", "department"]].copy()
preview["pred"] = y_pred
preview["p_other"] = p1_prob_other
preview["correct"] = preview["department"] == preview["pred"]
display(preview.head(50))

Eval used: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]


=== Two-stage FINAL EVAL on eval_df (Department) ===
Accuracy       : 0.2841091492776886
Macro Precision: 0.4112368585556103
Macro Recall   : 0.4712076237688619
Macro F1       : 0.33909374950436805
Weighted F1    : 0.19300763595348547

Classification report:
                        precision    recall  f1-score   support

        Administrative     0.0602    0.3571    0.1031        14
  Business Development     0.2857    0.3000    0.2927        20
            Consulting     0.3390    0.5128    0.4082        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.3478    0.5000    0.4103        16
Information Technology     0.2670    0.8871    0.4104        62
             Marketing     0.3235    0.5000    0.3929        22
                 Other     0.7500    0.0087    0.0172       344
    Project Management     0.2613    0.7436    0.3867        39
            Purchasing     0.3636    0.5333    0.4324        15
                 Sales     0.5254  

Unnamed: 0,title,department,pred,p_other,correct
0,Prokurist,Other,Project Management,0.00482,False
1,CFO,Other,Information Technology,0.006241,False
2,Betriebswirtin,Other,Information Technology,0.003531,False
3,Prokuristin,Other,Information Technology,0.008627,False
4,CFO,Other,Information Technology,0.006241,False
6,Solutions Architect,Information Technology,Information Technology,0.004078,True
14,Medizintechnik Beratung,Consulting,Marketing,0.004118,False
17,Director expansión de negocio.,Business Development,Information Technology,0.011984,False
18,Gerente comercial,Sales,Marketing,0.004315,False
19,Administrador Unico,Administrative,Information Technology,0.004207,False
