# 10 - DistilBERT Improvements

Dieses Notebook setzt die vorgeschlagenen Verbesserungen um:
1) Zwei-stufiges Fine-Tuning (Lookup -> CV-Train)
2) Backbone Vergleich (andere Modelle)
3) Label Smoothing und max_length Tests
4) Input Varianten (title vs text vs title+history)
5) Silver Data Augmentation (optional)

Hinweis: Einige Experimente trainieren auf annotierten CVs.
Das ist nicht mehr strikt zero-shot, sondern supervised domain adaptation.


## 1. Setup

Wir importieren alle benoetigten Bibliotheken, definieren Seeds und Basis-Konfigurationen.


In [None]:
import json
from datetime import datetime
from pathlib import Path
import random
import warnings

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import sys
sys.path.append('../')
from src.data.loader import (
    load_label_lists,
    load_evaluation_dataset,
    load_linkedin_data,
    prepare_dataset,
    load_inference_dataset,
    balance_dataset
)
from src.models.transformer_classifier import TransformerClassifier
from src.models.embedding_classifier import create_domain_classifier, create_seniority_classifier

# Silence all warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Input data folder and the folder that receives this notebook's results.
DATA_DIR = Path('../data')
RESULTS_DIR = Path('./results')
RESULTS_DIR.mkdir(exist_ok=True)

# Seed used for the RNGs, the data splits and the sampling in this notebook.
RANDOM_STATE = 42
BASE_MODEL = 'distilbert-base-multilingual-cased'

# Baseline training hyperparameters.
BASE_CONFIG = dict(
    model_name=BASE_MODEL,
    epochs=3,
    batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    use_class_weights=False,
)

# Switches that enable or skip the individual experiment sections.
RUN_TWO_STAGE = True
RUN_BACKBONE_TESTS = True
RUN_LABEL_SMOOTHING = True
RUN_INPUT_VARIANTS = True
RUN_SILVER = False


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only seeded when a GPU is available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every generator once, before any data handling or training.
set_seed(RANDOM_STATE)

# Report whether a CUDA device is available.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device_name}")


## 2. Daten laden

Wir laden Lookup-Tabellen und die annotierten CVs. Fuer Input-Varianten nutzen wir
auch die CV-History.


In [None]:
# Lookup tables (training data)
dept_df, sen_df = load_label_lists(
    DATA_DIR, fix_encoding=True, deduplicate=True, max_per_class=None
)

# Annotated CVs (evaluation data)
eval_df = load_evaluation_dataset(DATA_DIR)

# Full CVs including history, used by the input experiments
cvs_annotated = load_linkedin_data(DATA_DIR / 'linkedin-cvs-annotated.json')
cv_df_full = prepare_dataset(cvs_annotated, include_history=True)

# Combine title and history; rows with an empty history keep the bare title
history = cv_df_full['history'].fillna('')
title_filled = cv_df_full['title'].fillna('')
has_history = history != ''
cv_df_full['title_history'] = np.where(
    has_history, title_filled + ' | ' + history, title_filled
)

print(f"Department lookup: {len(dept_df):,} examples")
print(f"Seniority lookup:  {len(sen_df):,} examples")
print(f"Annotated CVs:     {len(eval_df):,} positions")


## 3. Hilfsfunktionen

Wir kapseln wiederholte Logik fuer Splits, Mappings, Training und Evaluation.


In [None]:
def build_label_maps(label_series):
    """Build the label -> id and id -> label dictionaries.

    Ids are the positions of the labels in the sorted list of unique values.

    Args:
        label_series: Series holding one label per row.

    Returns:
        Tuple ``(label2id, id2label)``.
    """
    id2label = dict(enumerate(sorted(label_series.unique())))
    label2id = {name: idx for idx, name in id2label.items()}
    return label2id, id2label


def split_by_cv_id(df, train_size=0.7, val_size=0.15):
    """Split a frame into train/val/test partitions that never share a CV.

    The split is drawn over the unique values of the ``cv_id`` column, so all
    rows of one CV end up in the same partition.

    Args:
        df: DataFrame with a ``cv_id`` column.
        train_size: Approximate fraction of CV ids for the training partition.
        val_size: Approximate fraction of CV ids for the validation partition.
            The remainder (``1 - train_size - val_size``) becomes the test
            partition.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` with copies of the matching rows.

    Raises:
        ValueError: If the fractions leave no room for all three partitions.
    """
    if train_size <= 0 or val_size <= 0 or train_size + val_size >= 1:
        raise ValueError(
            "train_size and val_size must be positive and sum to less than 1, "
            f"got train_size={train_size}, val_size={val_size}"
        )

    cv_ids = df['cv_id'].unique()
    holdout_size = 1 - train_size
    train_ids, temp_ids = train_test_split(
        cv_ids, test_size=holdout_size, random_state=RANDOM_STATE
    )

    # Share of the holdout ids that goes to the test partition. Rounded so
    # float noise (the defaults give 0.5000000000000001) cannot push
    # scikit-learn's ceil() of the test count up by one.
    test_share = round(1 - val_size / holdout_size, 6)
    val_ids, test_ids = train_test_split(
        temp_ids, test_size=test_share, random_state=RANDOM_STATE
    )

    train_df = df[df['cv_id'].isin(train_ids)].copy()
    val_df = df[df['cv_id'].isin(val_ids)].copy()
    test_df = df[df['cv_id'].isin(test_ids)].copy()
    return train_df, val_df, test_df


def compute_metrics(true_ids, pred_ids, id2label):
    """Compute accuracy, macro/weighted scores and per-class F1.

    Args:
        true_ids: Ground-truth label ids.
        pred_ids: Predicted label ids.
        id2label: Mapping from label id to label name, used as key for the
            per-class scores.

    Returns:
        Dict with ``accuracy``, macro-averaged ``precision`` / ``recall`` /
        ``f1_macro``, ``f1_weighted`` and ``per_class_f1``. The per-class
        entry covers every id that occurs in ``true_ids`` or ``pred_ids``.
    """
    shared = {'zero_division': 0}

    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        true_ids, pred_ids, average='macro', **shared
    )
    weighted_scores = precision_recall_fscore_support(
        true_ids, pred_ids, average='weighted', **shared
    )

    # Score only the ids that actually occur, in ascending id order.
    seen_ids = sorted(set(true_ids).union(pred_ids))
    class_scores = precision_recall_fscore_support(
        true_ids, pred_ids, labels=seen_ids, average=None, **shared
    )
    per_class_f1 = {
        id2label[class_id]: float(score)
        for class_id, score in zip(seen_ids, class_scores[2])
    }

    return {
        'accuracy': float(accuracy_score(true_ids, pred_ids)),
        'precision': float(macro_precision),
        'recall': float(macro_recall),
        'f1_macro': float(macro_f1),
        'f1_weighted': float(weighted_scores[2]),
        'per_class_f1': per_class_f1
    }


def print_summary(name, results):
    """Print a one-line accuracy / macro-F1 summary for one experiment.

    Args:
        name: Caption placed at the start of the line.
        results: Dict with an ``in_distribution`` metrics dict and optionally
            a ``cv_test`` metrics dict. The CV part is printed only when
            ``cv_test`` is present and non-empty.
    """
    in_dist = results['in_distribution']
    line = f"{name} | in-dist acc {in_dist['accuracy']:.4f} f1 {in_dist['f1_macro']:.4f}"

    cv_test = results.get('cv_test')
    if cv_test:
        line += f" | cv-test acc {cv_test['accuracy']:.4f} f1 {cv_test['f1_macro']:.4f}"

    print(line)


## 4. Zwei-stufiges Fine-Tuning

Ziel: Erst auf Lookup-Tabellen trainieren (breite Pattern-Abdeckung),
danach auf einem CV-Train-Split feinjustieren (Domain Adaptation).


In [None]:
# Hyperparameters per stage. The stages differ only in batch size and
# learning rate: stage 2 uses the smaller value for both.
TWO_STAGE_CONFIG = {
    stage: {
        'epochs': 2,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'warmup_ratio': 0.1,
        'use_class_weights': False
    }
    for stage, batch_size, learning_rate in (
        ('stage1', 16, 2e-5),
        ('stage2', 8, 1e-5),
    )
}


def two_stage_finetune(task_name, lookup_df, cv_df, label_col, config, output_dir, cv_text_col='title'):
    """Train on the lookup table, then train again on an annotated CV split.

    Stage 1 fits a ``TransformerClassifier`` on a stratified 80/20 split of
    ``lookup_df``. Stage 2 calls ``train`` on the same classifier object with
    the CV train split (split by ``cv_id``). The classifier is scored on the
    lookup validation part, the CV validation split and the CV test split.

    NOTE(review): stage 2 presumably continues from the stage-1 weights; this
    depends on how ``TransformerClassifier.train`` behaves on a second call -
    confirm in the classifier implementation.

    Args:
        task_name: Task identifier supplied by callers; not used in the body.
        lookup_df: DataFrame with ``text`` and ``label`` columns.
        cv_df: DataFrame with ``cv_id``, ``label_col`` and ``cv_text_col``.
        label_col: Column of ``cv_df`` holding the gold label.
        config: Dict with ``stage1`` and ``stage2`` hyperparameter dicts.
        output_dir: Base directory; each stage writes to its own subfolder.
        cv_text_col: Column of ``cv_df`` used as model input for the CV data.

    Returns:
        Dict with ``in_distribution``, ``cv_val`` and ``cv_test`` metrics plus
        the row counts of the three CV partitions in ``cv_split_sizes``.
    """
    label2id, id2label = build_label_maps(lookup_df['label'])

    def to_label_ids(names):
        """Translate label names into their integer ids."""
        return [label2id[name] for name in names]

    def texts_and_ids(frame):
        """Return the input texts and label ids of one CV partition."""
        texts = frame[cv_text_col].fillna('').tolist()
        return texts, to_label_ids(frame[label_col].tolist())

    # Stage 1: stratified split of the lookup table
    lookup_train, lookup_val, names_train, names_val = train_test_split(
        lookup_df['text'].tolist(),
        lookup_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=lookup_df['label']
    )
    lookup_train_ids = to_label_ids(names_train)
    lookup_val_ids = to_label_ids(names_val)

    clf = TransformerClassifier(
        model_name=BASE_MODEL,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    stage1_cfg = config['stage1']
    clf.train(
        texts=lookup_train,
        labels=lookup_train_ids,
        val_texts=lookup_val,
        val_labels=lookup_val_ids,
        output_dir=f'{output_dir}/stage1',
        epochs=stage1_cfg['epochs'],
        batch_size=stage1_cfg['batch_size'],
        learning_rate=stage1_cfg['learning_rate'],
        warmup_ratio=stage1_cfg['warmup_ratio'],
        use_class_weights=stage1_cfg['use_class_weights']
    )
    stage1_metrics = compute_metrics(lookup_val_ids, clf.predict(lookup_val), id2label)

    # Stage 2: keep rows whose label is set and known, then split by cv_id
    cv_label_values = cv_df[label_col]
    usable = cv_label_values.notna() & cv_label_values.isin(label2id.keys())
    cv_task = cv_df[usable].copy()

    cv_train, cv_val, cv_test = split_by_cv_id(cv_task)

    cv_train_texts, cv_train_labels = texts_and_ids(cv_train)
    cv_val_texts, cv_val_labels = texts_and_ids(cv_val)

    stage2_cfg = config['stage2']
    clf.train(
        texts=cv_train_texts,
        labels=cv_train_labels,
        val_texts=cv_val_texts,
        val_labels=cv_val_labels,
        output_dir=f'{output_dir}/stage2',
        epochs=stage2_cfg['epochs'],
        batch_size=stage2_cfg['batch_size'],
        learning_rate=stage2_cfg['learning_rate'],
        warmup_ratio=stage2_cfg['warmup_ratio'],
        use_class_weights=stage2_cfg['use_class_weights']
    )
    stage2_metrics = compute_metrics(cv_val_labels, clf.predict(cv_val_texts), id2label)

    # Final evaluation on the CV test partition
    cv_test_texts, cv_test_labels = texts_and_ids(cv_test)
    cv_test_metrics = compute_metrics(cv_test_labels, clf.predict(cv_test_texts), id2label)

    return {
        'in_distribution': stage1_metrics,
        'cv_val': stage2_metrics,
        'cv_test': cv_test_metrics,
        'cv_split_sizes': {
            'train': len(cv_train),
            'val': len(cv_val),
            'test': len(cv_test)
        }
    }


# Run the two-stage fine-tuning once per task, using the CV title as input.
if not RUN_TWO_STAGE:
    two_stage_results = None
else:
    two_stage_results = {}

    # (task / label column, lookup table, output directory, summary caption)
    two_stage_runs = (
        ('department', dept_df, './results/10_distilbert/two_stage/department', 'Two-stage Department'),
        ('seniority', sen_df, './results/10_distilbert/two_stage/seniority', 'Two-stage Seniority'),
    )

    for task, task_lookup, task_dir, task_caption in two_stage_runs:
        two_stage_results[task] = two_stage_finetune(
            task_name=task,
            lookup_df=task_lookup,
            cv_df=cv_df_full,
            label_col=task,
            config=TWO_STAGE_CONFIG,
            output_dir=task_dir,
            cv_text_col='title'
        )
        print_summary(task_caption, two_stage_results[task])


## 5. Backbone Vergleich

Ziel: pruefen, ob ein anderes Modell (z.B. XLM-R) bessere Generalisierung liefert.
Wir trainieren nur auf Lookup-Tabellen (per Zufallsstichprobe auf hoechstens MAX_SAMPLES Beispiele begrenzt) und evaluieren auf einem CV-Testsplit.


In [None]:
# Pretrained checkpoints compared in this section.
BACKBONES = [
    'distilbert-base-multilingual-cased',
    'bert-base-multilingual-cased',
    'xlm-roberta-base',
]

# Training hyperparameters shared by all backbones.
BACKBONE_CONFIG = dict(
    epochs=2,
    batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    use_class_weights=False,
)

# Optional cap on the lookup rows per run; lower it for faster runs.
MAX_SAMPLES = 2500


def sample_df(df, max_samples=None):
    """Return at most ``max_samples`` randomly drawn rows of ``df``.

    Args:
        df: DataFrame to draw from.
        max_samples: Row limit; ``None`` disables sampling.

    Returns:
        ``df`` itself when no limit is given or it already fits the limit,
        otherwise a seeded random sample with a fresh 0..n-1 index.
    """
    needs_sampling = max_samples is not None and len(df) > max_samples
    if not needs_sampling:
        return df

    sampled = df.sample(max_samples, random_state=RANDOM_STATE)
    return sampled.reset_index(drop=True)


def train_lookup_only(task_name, lookup_df, cv_df, label_col, model_name, config, output_dir, max_samples=None, cv_text_col='title'):
    """Train a classifier on the lookup table only and score it on CV data.

    The label maps are built from the full lookup table before the optional
    subsampling, so the number of output classes does not depend on
    ``max_samples``.

    Args:
        task_name: Task identifier supplied by callers; not used in the body.
        lookup_df: DataFrame with ``text`` and ``label`` columns.
        cv_df: DataFrame with ``cv_id``, ``label_col`` and ``cv_text_col``.
        label_col: Column of ``cv_df`` holding the gold label.
        model_name: Pretrained checkpoint passed to ``TransformerClassifier``.
        config: Dict with ``epochs``, ``batch_size``, ``learning_rate``,
            ``warmup_ratio`` and ``use_class_weights``.
        output_dir: Directory handed to ``TransformerClassifier.train``.
        max_samples: Optional cap on the number of lookup rows used.
        cv_text_col: Column of ``cv_df`` used as model input for the CV test
            split. Defaults to ``'title'``, the previously hard-coded column,
            and mirrors the parameter of ``two_stage_finetune``.

    Returns:
        Dict with ``in_distribution`` metrics (lookup validation part) and
        ``cv_test`` metrics (CV test split).
    """
    label2id, id2label = build_label_maps(lookup_df['label'])
    lookup_df = sample_df(lookup_df, max_samples=max_samples)

    X_train, X_val, y_train, y_val = train_test_split(
        lookup_df['text'].tolist(),
        lookup_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=lookup_df['label']
    )

    y_train_ids = [label2id[l] for l in y_train]
    y_val_ids = [label2id[l] for l in y_val]

    clf = TransformerClassifier(
        model_name=model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    clf.train(
        texts=X_train,
        labels=y_train_ids,
        val_texts=X_val,
        val_labels=y_val_ids,
        output_dir=output_dir,
        epochs=config['epochs'],
        batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        warmup_ratio=config['warmup_ratio'],
        use_class_weights=config['use_class_weights']
    )

    in_dist_pred = clf.predict(X_val)
    in_dist_metrics = compute_metrics(y_val_ids, in_dist_pred, id2label)

    # CV test split: keep only rows whose label is set and known to the model
    cv_task = cv_df[cv_df[label_col].notna()].copy()
    cv_task = cv_task[cv_task[label_col].isin(label2id.keys())]
    _, _, cv_test = split_by_cv_id(cv_task)

    cv_test_texts = cv_test[cv_text_col].fillna('').tolist()
    cv_test_labels = [label2id[l] for l in cv_test[label_col].tolist()]

    cv_test_pred = clf.predict(cv_test_texts)
    cv_test_metrics = compute_metrics(cv_test_labels, cv_test_pred, id2label)

    return {
        'in_distribution': in_dist_metrics,
        'cv_test': cv_test_metrics
    }


# Train every backbone on the (subsampled) lookup tables for both tasks and
# collect the metrics per task and model name.
if RUN_BACKBONE_TESTS:
    backbone_results = {
        'department': {},
        'seniority': {}
    }

    for model_name in BACKBONES:
        # The leading newline separates the runs in the output; it is written
        # as an escape because a string literal cannot span two lines.
        print(f"\nBackbone: {model_name}")

        backbone_results['department'][model_name] = train_lookup_only(
            task_name='department',
            lookup_df=dept_df,
            cv_df=cv_df_full,
            label_col='department',
            model_name=model_name,
            config=BACKBONE_CONFIG,
            output_dir=f'./results/10_distilbert/backbone/{model_name}/department',
            max_samples=MAX_SAMPLES
        )
        print_summary(f"Dept {model_name}", backbone_results['department'][model_name])

        backbone_results['seniority'][model_name] = train_lookup_only(
            task_name='seniority',
            lookup_df=sen_df,
            cv_df=cv_df_full,
            label_col='seniority',
            model_name=model_name,
            config=BACKBONE_CONFIG,
            output_dir=f'./results/10_distilbert/backbone/{model_name}/seniority',
            max_samples=MAX_SAMPLES
        )
        print_summary(f"Sen {model_name}", backbone_results['seniority'][model_name])
else:
    backbone_results = None


## 6. Label Smoothing und max_length

Ziel: pruefen, ob Label Smoothing die Generalisierung verbessert, und ob eine
andere Token-Laenge (64/128/256) sinnvoll ist. Dieses Experiment nutzt einen
Custom Trainer, weil der vorhandene TransformerClassifier max_length fix auf 128 setzt.


In [None]:
class SimpleDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with label ids."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample values
        # labels: one label id per sample
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def train_with_label_smoothing(
    lookup_df,
    cv_df,
    label_col,
    model_name,
    output_dir,
    max_length=128,
    label_smoothing=0.1,
    epochs=2,
    batch_size=16
):
    """Fine-tune with a configurable token length and label smoothing factor.

    Trains on a stratified 80/20 split of the lookup table with the Hugging
    Face ``Trainer`` and evaluates on the lookup validation part and on the
    CV test split (split by ``cv_id``, input column ``title``).

    Args:
        lookup_df: DataFrame with ``text`` and ``label`` columns.
        cv_df: DataFrame with ``cv_id``, ``title`` and ``label_col`` columns.
        label_col: Column of ``cv_df`` holding the gold label.
        model_name: Pretrained checkpoint for tokenizer and model.
        output_dir: Directory for the trainer checkpoints.
        max_length: Truncation length passed to the tokenizer.
        label_smoothing: Value for ``label_smoothing_factor``.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        Dict with ``in_distribution`` and ``cv_test`` metrics plus the
        ``max_length`` and ``label_smoothing`` values of the run.
    """
    label2id, id2label = build_label_maps(lookup_df['label'])

    X_train, X_val, y_train, y_val = train_test_split(
        lookup_df['text'].tolist(),
        lookup_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=lookup_df['label']
    )

    y_train_ids = [label2id[l] for l in y_train]
    y_val_ids = [label2id[l] for l in y_val]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    train_enc = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    val_enc = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)

    train_ds = SimpleDataset(train_enc, y_train_ids)
    val_ds = SimpleDataset(val_enc, y_val_ids)

    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=50,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        report_to='none',
        save_total_limit=2,
        label_smoothing_factor=label_smoothing
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds
    )
    trainer.train()

    # In-distribution evaluation on the lookup validation part
    preds = trainer.predict(val_ds).predictions
    val_pred_ids = np.argmax(preds, axis=1).tolist()
    in_dist = compute_metrics(y_val_ids, val_pred_ids, id2label)

    # CV test evaluation: keep only rows whose label is set and known
    cv_task = cv_df[cv_df[label_col].notna()].copy()
    cv_task = cv_task[cv_task[label_col].isin(label2id.keys())]
    _, _, cv_test = split_by_cv_id(cv_task)

    cv_texts = cv_test['title'].fillna('').tolist()
    cv_labels = [label2id[l] for l in cv_test[label_col].tolist()]
    cv_enc = tokenizer(cv_texts, truncation=True, padding=True, max_length=max_length)

    # Predict through the trainer, like the in-distribution evaluation. It
    # batches the inputs and places them on the model's device; calling the
    # model directly with CPU tensors fails once training moved it to a GPU
    # and pushes the whole CV test set through one forward pass.
    cv_ds = SimpleDataset(cv_enc, cv_labels)
    cv_logits = trainer.predict(cv_ds).predictions
    cv_pred_ids = np.argmax(cv_logits, axis=1).tolist()

    cv_test_metrics = compute_metrics(cv_labels, cv_pred_ids, id2label)

    return {
        'in_distribution': in_dist,
        'cv_test': cv_test_metrics,
        'max_length': max_length,
        'label_smoothing': label_smoothing
    }


# Grid over token length and label smoothing, first for department and then
# for seniority.
if not RUN_LABEL_SMOOTHING:
    ls_results = None
else:
    ls_results = {'department': [], 'seniority': []}

    MAX_LENGTHS = [64, 128, 256]
    SMOOTHING = [0.0, 0.1]

    # All (max_length, smoothing) pairs, with the smoothing value varying fastest.
    LS_GRID = [(length, factor) for length in MAX_LENGTHS for factor in SMOOTHING]

    for max_len, ls in LS_GRID:
        print(f"Label smoothing run: max_len={max_len}, ls={ls}")
        res = train_with_label_smoothing(
            lookup_df=dept_df,
            cv_df=cv_df_full,
            label_col='department',
            model_name=BASE_MODEL,
            output_dir=f'./results/10_distilbert/label_smoothing/department/len_{max_len}_ls_{ls}',
            max_length=max_len,
            label_smoothing=ls,
            epochs=2,
            batch_size=16
        )
        ls_results['department'].append(res)

    for max_len, ls in LS_GRID:
        print(f"Label smoothing run: max_len={max_len}, ls={ls}")
        res = train_with_label_smoothing(
            lookup_df=sen_df,
            cv_df=cv_df_full,
            label_col='seniority',
            model_name=BASE_MODEL,
            output_dir=f'./results/10_distilbert/label_smoothing/seniority/len_{max_len}_ls_{ls}',
            max_length=max_len,
            label_smoothing=ls,
            epochs=2,
            batch_size=16
        )
        ls_results['seniority'].append(res)


## 7. Input Varianten (title vs text vs title+history)

Ziel: testen, ob mehr Kontext (company oder history) die Performance verbessert.
Wir nutzen das zwei-stufige Setup und variieren die Eingabe fuer die CV-Phase.


In [None]:
# Run the two-stage setup once per CV input column and task.
if RUN_INPUT_VARIANTS:
    # Variant name -> column of cv_df_full used as input in the CV stage
    input_variants = {
        'title': 'title',
        'text': 'text',
        'title_history': 'title_history'
    }

    input_variant_results = {
        'department': {},
        'seniority': {}
    }

    for name, col in input_variants.items():
        # The leading newline separates the runs in the output; it is written
        # as an escape because a string literal cannot span two lines.
        print(f"\nInput variant: {name}")

        input_variant_results['department'][name] = two_stage_finetune(
            task_name='department',
            lookup_df=dept_df,
            cv_df=cv_df_full,
            label_col='department',
            config=TWO_STAGE_CONFIG,
            output_dir=f'./results/10_distilbert/input_variants/{name}/department',
            cv_text_col=col
        )
        print_summary(f"Department {name}", input_variant_results['department'][name])

        input_variant_results['seniority'][name] = two_stage_finetune(
            task_name='seniority',
            lookup_df=sen_df,
            cv_df=cv_df_full,
            label_col='seniority',
            config=TWO_STAGE_CONFIG,
            output_dir=f'./results/10_distilbert/input_variants/{name}/seniority',
            cv_text_col=col
        )
        print_summary(f"Seniority {name}", input_variant_results['seniority'][name])
else:
    input_variant_results = None


## 8. Silver Data Augmentation (optional)

Ziel: mehr Trainingsdaten durch Pseudo-Labels. Diese Sektion erzeugt optional
Silver Labels mit der Embedding-Baseline und trainiert danach DistilBERT.

Standardmaessig ist RUN_SILVER = False, da es laenger dauert.


In [None]:
# Pseudo-label the unannotated CVs with the embedding classifiers, keep the
# confident predictions and train on lookup table + silver rows.
if not RUN_SILVER:
    silver_results = None
else:
    silver_threshold = 0.85

    # Load the unannotated CVs
    inference_df = load_inference_dataset(DATA_DIR)
    inference_texts = inference_df['text'].tolist()

    # Embedding classifiers
    emb_dept = create_domain_classifier(dept_df, use_examples=True)
    emb_sen = create_seniority_classifier(sen_df, use_examples=True)

    dept_preds = emb_dept.predict_with_confidence(inference_texts)
    sen_preds = emb_sen.predict_with_confidence(inference_texts)

    # Element 0 of a prediction is stored as label, element 1 as confidence.
    inference_df['dept_pseudo'] = [pred[0] for pred in dept_preds]
    inference_df['dept_conf'] = [pred[1] for pred in dept_preds]
    inference_df['sen_pseudo'] = [pred[0] for pred in sen_preds]
    inference_df['sen_conf'] = [pred[1] for pred in sen_preds]

    # Keep only rows whose confidence reaches the threshold.
    dept_confident = inference_df['dept_conf'] >= silver_threshold
    dept_silver = (
        inference_df.loc[dept_confident, ['text', 'dept_pseudo']]
        .copy()
        .rename(columns={'dept_pseudo': 'label'})
    )

    sen_confident = inference_df['sen_conf'] >= silver_threshold
    sen_silver = (
        inference_df.loc[sen_confident, ['text', 'sen_pseudo']]
        .copy()
        .rename(columns={'sen_pseudo': 'label'})
    )

    dept_aug = pd.concat([dept_df[['text', 'label']], dept_silver], ignore_index=True)
    sen_aug = pd.concat([sen_df[['text', 'label']], sen_silver], ignore_index=True)

    silver_results = {}

    # (task / label column, augmented table, output directory, summary caption)
    silver_runs = (
        ('department', dept_aug, './results/10_distilbert/silver/department', 'Silver Department'),
        ('seniority', sen_aug, './results/10_distilbert/silver/seniority', 'Silver Seniority'),
    )

    for task, task_aug, task_dir, task_caption in silver_runs:
        silver_results[task] = train_lookup_only(
            task_name=task,
            lookup_df=task_aug,
            cv_df=cv_df_full,
            label_col=task,
            model_name=BASE_MODEL,
            config=BACKBONE_CONFIG,
            output_dir=task_dir,
            max_samples=None
        )
        print_summary(task_caption, silver_results[task])


## 9. Ergebnisse speichern

Wir speichern alle Resultate in einer JSON-Datei fuer spaetere Vergleiche.


In [None]:
# Collect the results of every section; skipped sections contribute None.
run_metadata = dict(base_model=BASE_MODEL, random_state=RANDOM_STATE)

all_results = dict(
    approach='DistilBERT Improvements',
    two_stage=two_stage_results,
    backbone=backbone_results,
    label_smoothing=ls_results,
    input_variants=input_variant_results,
    silver=silver_results,
    metadata=run_metadata,
    timestamp=datetime.now().isoformat(),
)

# Write everything into one JSON file inside the results folder.
output_path = RESULTS_DIR / 'distilbert_improvements.json'
with open(output_path, 'w') as result_file:
    json.dump(all_results, result_file, indent=2)

print(f'Results saved to: {output_path}')
