Zelle 1: Imports + Pfade

In [39]:
import os, json, re
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [48]:
# Data locations. All four files live in one directory; it can be redirected
# through the BUZZWORD_DATA_DIR environment variable so the notebook is not tied
# to a single machine. The default reproduces the original local layout.
DATA_DIR = os.environ.get(
    "BUZZWORD_DATA_DIR",
    "/Users/batuklkn/Desktop/BuzzwordLearner/data",
)
DEPT_CSV = os.path.join(DATA_DIR, "department-v2.csv")
SEN_CSV  = os.path.join(DATA_DIR, "seniority-v2.csv")
CV_ANN   = os.path.join(DATA_DIR, "linkedin-cvs-annotated.json")
CV_RAW   = os.path.join(DATA_DIR, "linkedin-cvs-not-annotated.json")

# Model choice:
# - if the titles are heavily German / multilingual -> use the multilingual model
# - otherwise the uncased English model is sufficient
MODEL_NAME = "distilbert-base-multilingual-cased"
# MODEL_NAME = "distilbert-base-uncased"

Zelle 2: CSVs laden

In [41]:
# Read the two labelled training tables (one row per job title).
dept_df = pd.read_csv(DEPT_CSV)
sen_df = pd.read_csv(SEN_CSV)

# Both tables must provide at least the "text" and "label" columns.
assert {"text", "label"}.issubset(dept_df.columns)
assert {"text", "label"}.issubset(sen_df.columns)

# Preview the first rows of each table.
display(dept_df.head())
display(sen_df.head())

# Number of distinct classes per task.
print(f"Department labels: {dept_df['label'].nunique()}")
print(f"Seniority labels: {sen_df['label'].nunique()}")

Unnamed: 0,text,label
0,Adjoint directeur communication,Marketing
1,Advisor Strategy and Projects,Project Management
2,Beratung & Projekte,Project Management
3,Beratung & Projektmanagement,Project Management
4,Beratung und Projektmanagement kommunale Partner,Project Management


Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


Department labels: 11
Seniority labels: 5


Zelle 3: JSON Loader (annotated + not-annotated)

In [42]:
def load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Parse both CV dumps: the annotated one (with labels) and the not-annotated one.
cvs_annotated = load_json(CV_ANN)
cvs_raw = load_json(CV_RAW)

# Show the top-level container type of each dump.
print(f"annotated type: {type(cvs_annotated)}")
print(f"raw type: {type(cvs_raw)}")

annotated type: <class 'list'>
raw type: <class 'list'>


Zelle 4: Annotated JSON laden und zu eval_df machen

In [43]:
import pprint

# Inspect the structure of the annotated CV file. The file is read through the
# shared load_json helper (UTF-8) instead of repeating the open/json.load logic.
ann = load_json(CV_ANN)

# Print the container type at each of the first three nesting levels.
print("Type ann:", type(ann))
print("Type ann[0]:", type(ann[0]))
print("Type ann[0][0]:", type(ann[0][0]))

# Keys of the first position entry of the first CV.
print("\nKeys of a POSITION dict:")
print(ann[0][0].keys())

# Full dump of that same entry.
print("\nFull example position:")
pprint.pprint(ann[0][0], depth=5)

Type ann: <class 'list'>
Type ann[0]: <class 'list'>
Type ann[0][0]: <class 'dict'>

Keys of a POSITION dict:
dict_keys(['organization', 'linkedin', 'position', 'startDate', 'endDate', 'status', 'department', 'seniority'])

Full example position:
{'department': 'Other',
 'endDate': None,
 'linkedin': 'https://www.linkedin.com/company/depot4design-gmbh',
 'organization': 'Depot4Design GmbH',
 'position': 'Prokurist',
 'seniority': 'Management',
 'startDate': '2019-08',
 'status': 'ACTIVE'}


In [45]:
# Build eval_df: ACTIVE positions only, with clean column names.
# ann is list[list[dict]] (one inner list per CV) -> flatten to one row per position.
positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

# Keep ACTIVE positions only (status is compared in upper case).
eval_df["status"] = eval_df["status"].astype(str).str.upper()
eval_df = eval_df[eval_df["status"] == "ACTIVE"].copy()

# The job-title text is stored under "position"; expose it as "title".
eval_df["title"] = eval_df["position"].astype(str).str.strip()

# Clean the labels but keep missing labels missing. A plain astype(str) would
# turn None/NaN into the strings "None"/"nan", so every notna() check (the
# counts printed below and any later filter) would report all labels as present.
eval_df["department"] = eval_df["department"].where(
    eval_df["department"].isna(),
    eval_df["department"].astype(str).str.strip(),
)
eval_df["seniority"] = eval_df["seniority"].where(
    eval_df["seniority"].isna(),
    eval_df["seniority"].astype(str).str.strip(),
)

# Keep only the columns needed downstream.
eval_df = eval_df[["title", "department", "seniority", "organization", "status"]].copy()

display(eval_df.head(10))
print("Eval ACTIVE records:", len(eval_df))
print("Dept labels present:", eval_df["department"].notna().sum())
print("Sen labels present:", eval_df["seniority"].notna().sum())
print("Unique dept labels in eval:", eval_df["department"].nunique())
print("Unique seniority labels in eval:", eval_df["seniority"].nunique())

Unnamed: 0,title,department,seniority,organization,status
0,Prokurist,Other,Management,Depot4Design GmbH,ACTIVE
1,CFO,Other,Management,Depot4Design GmbH,ACTIVE
2,Betriebswirtin,Other,Professional,Depot4Design GmbH,ACTIVE
3,Prokuristin,Other,Management,Depot4Design GmbH,ACTIVE
4,CFO,Other,Management,Depot4Design GmbH,ACTIVE
6,Solutions Architect,Information Technology,Professional,Computer Solutions,ACTIVE
14,Medizintechnik Beratung,Consulting,Professional,Udo Weber,ACTIVE
17,Director expansión de negocio.,Business Development,Director,Grupo Viajes Kontiki.,ACTIVE
18,Gerente comercial,Sales,Lead,Air & Ground Operations Consultancy,ACTIVE
19,Administrador Unico,Administrative,Professional,Viajes Oceano S.L.,ACTIVE


Eval ACTIVE records: 623
Dept labels present: 623
Sen labels present: 623
Unique dept labels in eval: 11
Unique seniority labels in eval: 6


Zelle 4b: Not-annotated JSON als raw_df für Inference

In [50]:
# Build raw_df for inference from the not-annotated JSON.
# cvs_raw is list[list[dict]] (one inner list per CV) -> flatten to one row per
# position. The previous code iterated over `raw`, a name that is never defined
# in this notebook and only worked with leftover kernel state.
positions = [p for cv in cvs_raw for p in cv]
raw_df = pd.DataFrame(positions)

# If a status column exists, keep ACTIVE positions only.
if "status" in raw_df.columns:
    raw_df["status"] = raw_df["status"].astype(str).str.upper()
    raw_df = raw_df[raw_df["status"] == "ACTIVE"].copy()

# position -> title
raw_df["title"] = raw_df["position"].astype(str).str.strip()

# Minimal set of columns needed for inference.
keep_cols = ["title"]
if "organization" in raw_df.columns:
    keep_cols.append("organization")

raw_df = raw_df[keep_cols].copy()

display(raw_df.head(10))
print("Raw records:", len(raw_df))

Unnamed: 0,title,organization
0,Bookkeeper,"Keeping The Books, Bookkeeping"
1,Co-Owner,Playful Paws
8,Strategy & Investments,Erste Bank und Sparkasse
21,Corporate Auditor,Guido Meyer
22,Corporate Auditor,Guido Meyer
24,Marketing Manager,Tradeware AG
32,Professor,Monash University
33,"Deputy Dean, Faculty of IT",Monash University
36,Research Director,Oceania Cyber Security Centre #OCSC
42,Program Purchasing Leader / Program / Acquisit...,Faurecia Innenraum Systeme GmbH


Raw records: 419


Zelle 5: Train+Eval Funktionen (generisch für Department oder Seniority)

In [None]:
import torch
# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

In [None]:
from transformers import EarlyStoppingCallback

def train_and_eval(
    train_df,
    eval_df,
    train_text_col,
    train_label_col,
    eval_text_col,
    eval_label_col,
    task_name,
    model_name=MODEL_NAME,
    max_epochs=20,
    batch_size=32,
    lr=2e-5,
    seed=42,
    patience=3,
    warmup_ratio=0.06,
    weight_decay=0.01
):
    """Fine-tune a sequence classifier on ``train_df`` and evaluate it on ``eval_df``.

    Works for any single-label task (department or seniority): labels are
    encoded from the training frame, the eval frame is restricted to rows whose
    label is present and known to the encoder, and the checkpoint with the best
    macro-F1 on the eval set is restored at the end of training.

    Note: the same eval set is used for early stopping / checkpoint selection
    and for the final report, so the reported scores are not from held-out data.

    Args:
        train_df: DataFrame with the training texts and labels.
        eval_df: DataFrame with the evaluation texts and labels.
        train_text_col / train_label_col: column names in ``train_df``.
        eval_text_col / eval_label_col: column names in ``eval_df``.
        task_name: used in log output and in the output directory name.
        model_name: Hugging Face model identifier to fine-tune.
        max_epochs: upper bound on training epochs.
        batch_size: per-device batch size for training and evaluation.
        lr: peak learning rate.
        seed: random seed (applied before the model is instantiated).
        patience: early-stopping patience in evaluations without improvement.
        warmup_ratio: fraction of steps used for linear warm-up.
        weight_decay: weight decay passed to the optimizer.

    Returns:
        Tuple ``(trainer, le, metrics, pred_df)``: the fitted ``Trainer``, the
        fitted ``LabelEncoder``, the metrics dict from ``trainer.evaluate()``
        and a DataFrame with eval text, gold label, ``pred`` and ``correct``.

    Raises:
        ValueError: if no eval row has a label that also occurs in training.
    """
    import inspect
    from transformers import set_seed

    # Seed before the model is built: the classification head is randomly
    # initialised inside from_pretrained, which happens before Trainer applies
    # its own seed. Without this, identical calls give different results.
    set_seed(seed)

    # Evaluate only where a label is present.
    eval_task = eval_df[eval_df[eval_label_col].notna()].copy()

    # Fit the label encoding on the training labels.
    le = LabelEncoder()
    y_train = le.fit_transform(train_df[train_label_col].astype(str))
    classes = list(le.classes_)
    label_ids = list(range(len(classes)))

    # Restrict eval to labels that occur in training.
    eval_task = eval_task[eval_task[eval_label_col].astype(str).isin(set(classes))].copy()
    if eval_task.empty:
        raise ValueError(
            f"No evaluation rows left for task '{task_name}': column "
            f"'{eval_label_col}' has no labels that also occur in the training data."
        )
    y_eval = le.transform(eval_task[eval_label_col].astype(str))

    print(f"\n=== {task_name} ===")
    print("Train:", len(train_df), "Eval:", len(eval_task))
    print("Num classes:", len(classes))
    print("LR:", lr, "max_epochs:", max_epochs, "patience:", patience)

    train_ds = Dataset.from_dict({
        "text": train_df[train_text_col].astype(str).tolist(),
        "labels": y_train.tolist()
    })
    eval_ds = Dataset.from_dict({
        "text": eval_task[eval_text_col].astype(str).tolist(),
        "labels": y_eval.tolist()
    })

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tok(batch):
        # No padding here; DataCollatorWithPadding pads each batch dynamically.
        return tokenizer(batch["text"], truncation=True)

    train_ds = train_ds.map(tok, batched=True)
    eval_ds  = eval_ds.map(tok, batched=True)

    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(classes)
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1_macro": f1_score(labels, preds, average="macro"),
            "f1_weighted": f1_score(labels, preds, average="weighted"),
        }

    args = TrainingArguments(
        output_dir=f"./out_{task_name}",
        eval_strategy="epoch",          # older transformers releases call this evaluation_strategy
        save_strategy="epoch",
        # Keep disk usage bounded: without a limit one checkpoint per epoch is kept.
        # The best checkpoint is still retained for load_best_model_at_end.
        save_total_limit=2,
        logging_strategy="steps",
        logging_steps=50,

        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,

        learning_rate=lr,
        lr_scheduler_type="linear",
        warmup_ratio=warmup_ratio,

        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,

        num_train_epochs=max_epochs,
        weight_decay=weight_decay,

        seed=seed,
        report_to="none",

        fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA device is available
    )

    callbacks = [EarlyStoppingCallback(early_stopping_patience=patience)]

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` argument; pick whichever this version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
        **{tokenizer_kwarg: tokenizer}
    )

    trainer.train()

    # The best model is already loaded here (load_best_model_at_end=True).
    metrics = trainer.evaluate()
    print("Best-model metrics:", metrics)

    # Predictions on the eval set.
    pred = trainer.predict(eval_ds)
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_labels = le.inverse_transform(pred_ids)

    pred_df = eval_task[[eval_text_col, eval_label_col]].copy()
    pred_df["pred"] = pred_labels
    pred_df["correct"] = pred_df[eval_label_col].astype(str) == pred_df["pred"].astype(str)

    print("\nClassification report (best model):")
    # Pass the full label set explicitly: without `labels`, a class that is
    # absent from both y_eval and the predictions makes the number of observed
    # classes differ from len(target_names) and the report raises.
    print(classification_report(
        y_true=y_eval,
        y_pred=pred_ids,
        labels=label_ids,
        target_names=classes,
        digits=4,
        zero_division=0
    ))

    return trainer, le, metrics, pred_df

In [None]:
# Learning-rate sweep for the department task: train once per candidate value
# and record the macro-F1 reported on the eval set.
# NOTE(review): the selection uses the same eval set that is reported later, and
# the training cells visible further down pass lr=2e-5 explicitly rather than best_lr.
lrs = [5e-5, 3e-5, 2e-5]
results = []

for lr in lrs:
    # Only the metrics dict is kept; trainer, encoder and predictions are discarded.
    _, _, m, _ = train_and_eval(
        train_df=dept_df,
        eval_df=eval_df,
        train_text_col="text",
        train_label_col="label",
        eval_text_col="title",
        eval_label_col="department",
        task_name=f"department_lr{lr}",
        max_epochs=20,
        batch_size=32,
        lr=lr,
        patience=3
    )
    # Store (learning rate, eval macro-F1) for the comparison below.
    results.append((lr, m["eval_f1_macro"]))

print(results)
# Learning rate with the highest recorded macro-F1.
best_lr = max(results, key=lambda x: x[1])[0]
print("Best LR:", best_lr)

In [None]:
# Shapes of all working frames, plus class counts for the two training tables.
print(f"dept_df: {dept_df.shape} labels: {dept_df['label'].nunique()}")
print(f"sen_df : {sen_df.shape} labels: {sen_df['label'].nunique()}")
print(f"eval_df: {eval_df.shape}")
print(f"raw_df : {raw_df.shape}")

# Label distribution in the eval set (15 most frequent values per task).
print("\nEval dept counts:")
print(eval_df["department"].value_counts().head(15))
print("\nEval seniority counts:")
print(eval_df["seniority"].value_counts().head(15))

In [None]:
# Train the department classifier on dept_df and evaluate it on the annotated
# positions in eval_df (text column "title", gold label column "department").
dept_trainer, dept_le, dept_metrics, dept_pred_df = train_and_eval(
    train_df=dept_df,
    eval_df=eval_df,
    train_text_col="text",
    train_label_col="label",
    eval_text_col="title",
    eval_label_col="department",
    task_name="department",
    max_epochs=20,
    batch_size=32,
    lr=2e-5,
    patience=3
)

display(dept_pred_df.head(20))
# Mean of the boolean "correct" column = share of eval rows predicted correctly.
print("Dept eval correct mean:", dept_pred_df["correct"].mean())

In [None]:
# Train the seniority classifier on sen_df and evaluate it on the annotated
# positions in eval_df (text column "title", gold label column "seniority").
sen_trainer, sen_le, sen_metrics, sen_pred_df = train_and_eval(
    train_df=sen_df,
    eval_df=eval_df,
    train_text_col="text",
    train_label_col="label",
    eval_text_col="title",
    eval_label_col="seniority",
    task_name="seniority",
    max_epochs=20,
    batch_size=32,
    lr=2e-5,
    patience=3
)

display(sen_pred_df.head(20))
# Mean of the boolean "correct" column = share of eval rows predicted correctly.
print("Sen eval correct mean:", sen_pred_df["correct"].mean())

In [None]:
from datasets import Dataset
import numpy as np

def predict_on_raw(trainer, label_encoder, raw_df, text_col="title", out_path="predictions.csv"):
    """Predict a label for every row of ``raw_df`` and optionally save the result.

    Args:
        trainer: fitted ``Trainer`` whose model and tokenizer are used for inference.
        label_encoder: encoder used to map predicted class ids back to label strings.
        raw_df: DataFrame containing the texts to classify; it is not modified.
        text_col: name of the column in ``raw_df`` holding the text.
        out_path: CSV path to write the result to (without the index). Pass
            ``None`` to skip writing.

    Returns:
        A copy of ``raw_df`` with an added ``pred`` column.
    """
    out = raw_df.copy()
    texts = raw_df[text_col].astype(str).tolist()

    if texts:
        # Newer transformers releases expose the tokenizer as `processing_class`
        # and deprecate `trainer.tokenizer`; fall back for older releases.
        tokenizer = getattr(trainer, "processing_class", None)
        if tokenizer is None:
            tokenizer = trainer.tokenizer

        def tok(batch):
            return tokenizer(batch["text"], truncation=True)

        ds = Dataset.from_dict({"text": texts}).map(tok, batched=True)

        pred = trainer.predict(ds)
        pred_ids = np.argmax(pred.predictions, axis=-1)
        out["pred"] = label_encoder.inverse_transform(pred_ids)
    else:
        # Nothing to classify: return the (empty) frame with the expected column
        # instead of running the model on an empty dataset.
        out["pred"] = []

    if out_path is not None:
        out.to_csv(out_path, index=False)
        print("Saved:", out_path)
    return out

In [None]:
# Apply both trained classifiers to the not-annotated positions; each call also
# writes its predictions to the CSV given as out_path.
dept_raw_pred = predict_on_raw(dept_trainer, dept_le, raw_df, out_path="raw_pred_department.csv")
sen_raw_pred  = predict_on_raw(sen_trainer,  sen_le, raw_df, out_path="raw_pred_seniority.csv")

# Preview the first predictions of each task.
display(dept_raw_pred.head(10))
display(sen_raw_pred.head(10))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Gold and predicted department labels from the eval predictions, compared as strings.
y_true = dept_pred_df["department"].astype(str)
y_pred = dept_pred_df["pred"].astype(str)

# Accuracy plus macro-averaged precision / recall / F1 for the department task.
print("=== Department (Eval) ===")
print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="macro"))
print("Recall   :", recall_score(y_true, y_pred, average="macro"))
print("F1       :", f1_score(y_true, y_pred, average="macro"))

In [None]:
# Gold and predicted seniority labels from the eval predictions, compared as strings.
# Note: this rebinds y_true / y_pred from the department cell.
y_true = sen_pred_df["seniority"].astype(str)
y_pred = sen_pred_df["pred"].astype(str)

# Accuracy plus macro-averaged precision / recall / F1 for the seniority task.
print("=== Seniority (Eval) ===")
print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, average="macro"))
print("Recall   :", recall_score(y_true, y_pred, average="macro"))
print("F1       :", f1_score(y_true, y_pred, average="macro"))