# 09 - DistilBERT Experiments

Ziel dieses Notebooks:
- Erst ein plain DistilBERT Baseline-Training.
- Danach ein kleines Hyperparameter Tuning.
- Danach vier weitere Experimente, um systematisch Hypothesen zu testen.

Alle Abschnitte enthalten kurze Erklaerungen, was genau getestet wird und warum.


## 1. Setup

Wir importieren alle benoetigten Bibliotheken, setzen einen Seed und definieren Standard-Parameter.


In [None]:
import json
from datetime import datetime
from pathlib import Path
import random
import warnings

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import sys
sys.path.append('../')
from src.data.loader import load_label_lists, load_evaluation_dataset, balance_dataset
from src.models.transformer_classifier import TransformerClassifier

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed and checkpoint shared by all runs in this notebook.
RANDOM_STATE = 42
BASE_MODEL = 'distilbert-base-multilingual-cased'

# Input data lives one level up; results are written next to the notebook.
DATA_DIR = Path('../data')
RESULTS_DIR = Path('./results')
RESULTS_DIR.mkdir(exist_ok=True)

# Default training configuration; experiments copy it and override single keys.
BASE_CONFIG = dict(
    model_name=BASE_MODEL,
    epochs=3,
    batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    use_class_weights=False,
)


def set_seed(seed=42):
    """Seed the Python, NumPy and torch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

# Seed the Python, NumPy and torch RNGs with the notebook-wide seed.
set_seed(RANDOM_STATE)

# Informational only: report whether torch can see a CUDA device.
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")


## 2. Daten laden

Wir trainieren auf den Lookup-Tabellen und evaluieren auf den annotierten CVs.
Das entspricht dem Zero-Shot/Transfer-Setup der vorherigen Notebooks.


In [None]:
# Lookup tables (training)
# The loader returns two frames, unpacked as department first, seniority second.
# NOTE(review): max_per_class=None presumably disables any per-class cap --
# confirm against src.data.loader.
dept_df, sen_df = load_label_lists(
    DATA_DIR,
    fix_encoding=True,
    deduplicate=True,
    max_per_class=None
)

# Annotated CVs (evaluation)
eval_df = load_evaluation_dataset(DATA_DIR)

# Row counts of the three frames, printed with thousands separators.
print(f"Department lookup: {len(dept_df):,} examples")
print(f"Seniority lookup:  {len(sen_df):,} examples")
print(f"Annotated CVs:     {len(eval_df):,} positions")


## 3. Hilfsfunktionen

Wir kapseln den wiederholten Code fuer Mapping, Training und Evaluation in Funktionen.
Das macht die Experimente konsistent und leichter vergleichbar.


In [None]:
def build_label_maps(label_series):
    """Build label <-> integer id mappings from a column of labels.

    Args:
        label_series: pandas Series holding one label per row.

    Returns:
        Tuple ``(label2id, id2label)``. Ids are assigned 0..n-1 following the
        sorted order of the distinct labels.
    """
    ordered_labels = list(label_series.unique())
    ordered_labels.sort()

    label2id = dict(zip(ordered_labels, range(len(ordered_labels))))
    id2label = dict(enumerate(ordered_labels))
    return label2id, id2label


def prepare_eval_data(eval_df, label_col, label2id, text_col='title'):
    """Extract evaluation texts and label ids for one task.

    Rows whose label is missing, or whose label is not a key of ``label2id``,
    are dropped. Missing texts are replaced by the empty string.

    Args:
        eval_df: DataFrame with the evaluation rows.
        label_col: Name of the column holding the gold label.
        label2id: Mapping from label to integer id.
        text_col: Name of the column holding the input text.

    Returns:
        Tuple ``(texts, labels)`` of two equally long lists.
    """
    gold = eval_df[label_col]
    usable = gold.notna() & gold.isin(label2id.keys())
    rows = eval_df[usable]

    texts = rows[text_col].fillna('').tolist()
    labels = [label2id[name] for name in rows[label_col].tolist()]
    return texts, labels


def compute_metrics(true_ids, pred_ids, id2label):
    """Compute accuracy, averaged precision/recall/F1 and per-class F1.

    Args:
        true_ids: Gold label ids.
        pred_ids: Predicted label ids, aligned with ``true_ids``.
        id2label: Mapping from label id to label name, used to key the
            per-class scores.

    Returns:
        Dict with ``accuracy``, macro-averaged ``precision``, ``recall`` and
        ``f1_macro``, ``f1_weighted``, and ``per_class_f1`` (label name -> F1
        for every id occurring in either ``true_ids`` or ``pred_ids``).
        Undefined scores are reported as 0 (``zero_division=0``).
    """
    accuracy = accuracy_score(true_ids, pred_ids)

    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        true_ids, pred_ids, average='macro', zero_division=0
    )
    weighted_scores = precision_recall_fscore_support(
        true_ids, pred_ids, average='weighted', zero_division=0
    )
    weighted_f1 = weighted_scores[2]

    # Score exactly the ids that occur in the gold or predicted labels.
    seen_ids = sorted(set(true_ids) | set(pred_ids))
    class_f1 = precision_recall_fscore_support(
        true_ids, pred_ids, labels=seen_ids, average=None, zero_division=0
    )[2]
    per_class_f1 = {
        id2label[class_id]: float(score)
        for class_id, score in zip(seen_ids, class_f1)
    }

    metrics = {
        'accuracy': float(accuracy),
        'precision': float(macro_precision),
        'recall': float(macro_recall),
        'f1_macro': float(macro_f1),
        'f1_weighted': float(weighted_f1),
        'per_class_f1': per_class_f1
    }
    return metrics


def freeze_base_layers(clf):
    """Disable gradients for the pretrained encoder inside ``clf.model``.

    The encoder is looked up, in this order, under the attribute named by
    ``clf.model.base_model_prefix``, then ``base_model``, then ``distilbert``.
    The first hit that is a distinct sub-module is frozen by setting
    ``requires_grad = False`` on all of its parameters.

    Args:
        clf: Object exposing the underlying network as ``clf.model``.

    Returns:
        None. Prints whether the encoder was frozen or could not be found.
    """
    model = clf.model
    prefix = getattr(model, 'base_model_prefix', None)

    base = None
    for attr_name in (prefix, 'base_model', 'distilbert'):
        if not attr_name:
            # No prefix declared (missing or empty string): try the next name.
            continue
        candidate = getattr(model, attr_name, None)
        # The transformers ``base_model`` property falls back to the model
        # itself when no prefixed sub-module exists. Freezing that would also
        # freeze the classification head and leave nothing to train, so such a
        # hit is rejected.
        if candidate is not None and candidate is not model:
            base = candidate
            break

    if base is None:
        print('Could not find base model to freeze')
        return

    for param in base.parameters():
        param.requires_grad = False

    print('Base layers frozen')


def train_eval_distilbert(
    task_name,
    train_df,
    eval_df,
    label_col,
    config,
    output_dir,
    text_col='title',
    eval_real_world=True,
    freeze_base=False
):
    """Train one classifier and score it on a held-out split and on real CVs.

    Args:
        task_name: Task identifier. Accepted for call-site readability; it is
            not read inside this function.
        train_df: Training frame with ``text`` and ``label`` columns.
        eval_df: Real-world evaluation frame.
        label_col: Column of ``eval_df`` holding the gold label for this task.
        config: Dict with ``model_name``, ``epochs``, ``batch_size`` and
            ``learning_rate``; ``warmup_ratio`` (default 0.1) and
            ``use_class_weights`` (default False) are optional.
        output_dir: Directory handed to the classifier's ``train`` call.
        text_col: Column of ``eval_df`` holding the input text.
        eval_real_world: When False, the real-world evaluation is skipped.
        freeze_base: When True, the base layers are frozen before training.

    Returns:
        Tuple ``(results, clf)``. ``results`` holds ``in_distribution`` and
        ``real_world`` metric dicts (``real_world`` is None when skipped) plus
        the ``label2id`` / ``id2label`` maps; ``clf`` is the trained classifier.
    """
    label2id, id2label = build_label_maps(train_df['label'])

    # Stratified 80/20 split with the notebook-wide seed.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_df['text'].tolist(),
        train_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=train_df['label']
    )

    train_ids = [label2id[name] for name in train_labels]
    val_ids = [label2id[name] for name in val_labels]

    clf = TransformerClassifier(
        model_name=config['model_name'],
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    if freeze_base:
        freeze_base_layers(clf)

    clf.train(
        texts=train_texts,
        labels=train_ids,
        val_texts=val_texts,
        val_labels=val_ids,
        output_dir=output_dir,
        epochs=config['epochs'],
        batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        warmup_ratio=config.get('warmup_ratio', 0.1),
        use_class_weights=config.get('use_class_weights', False)
    )

    # Held-out part of the training source.
    # NOTE(review): assumes clf.predict returns label ids comparable to
    # label2id values -- confirm against TransformerClassifier.
    in_dist = compute_metrics(val_ids, clf.predict(val_texts), id2label)

    if eval_real_world:
        rw_texts, rw_ids = prepare_eval_data(
            eval_df, label_col, label2id, text_col=text_col
        )
        rw_predictions = clf.predict(rw_texts)
        real_world = compute_metrics(rw_ids, rw_predictions, id2label)
    else:
        real_world = None

    results = {
        'in_distribution': in_dist,
        'real_world': real_world,
        'label2id': label2id,
        'id2label': id2label
    }
    return results, clf


def print_summary(name, results):
    """Print a one-line accuracy / macro-F1 summary for one run.

    Args:
        name: Label printed at the start of the line.
        results: Dict with an ``in_distribution`` metrics dict and a
            ``real_world`` entry; a falsy ``real_world`` is reported as skipped.
    """
    in_dist = results['in_distribution']
    prefix = f"{name} | in-dist acc {in_dist['accuracy']:.4f} f1 {in_dist['f1_macro']:.4f}"

    real_world = results['real_world']
    if not real_world:
        print(f"{prefix} | real-world skipped")
        return

    print(f"{prefix} | real-world acc {real_world['accuracy']:.4f} f1 {real_world['f1_macro']:.4f}")


## 4. Plain DistilBERT Baseline

Wir trainieren DistilBERT mit Standard-Parametern auf den Lookup-Tabellen
und evaluieren auf den annotierten CVs. Das ist der Ausgangspunkt fuer alle
weiteren Vergleiche.


In [None]:
# Collects the baseline result dicts per task; written to JSON at the end of the notebook.
baseline_results = {}

# Department baseline
# Trains with BASE_CONFIG unchanged; the trained classifier is kept as base_dept_clf.
base_dept, base_dept_clf = train_eval_distilbert(
    task_name='department',
    train_df=dept_df,
    eval_df=eval_df,
    label_col='department',
    config=BASE_CONFIG,
    output_dir='./results/09_distilbert/baseline/department'
)
print_summary('Baseline Department', base_dept)

# Seniority baseline
# Same configuration, trained on the seniority lookup table.
base_sen, base_sen_clf = train_eval_distilbert(
    task_name='seniority',
    train_df=sen_df,
    eval_df=eval_df,
    label_col='seniority',
    config=BASE_CONFIG,
    output_dir='./results/09_distilbert/baseline/seniority'
)
print_summary('Baseline Seniority', base_sen)

baseline_results['department'] = base_dept
baseline_results['seniority'] = base_sen


## 5. Hyperparameter Tuning

Wir testen mehrere Kombinationen aus Lernrate und Batch-Size; die Epochenzahl (2) und
die Warmup-Ratio (0.1) bleiben in diesem Grid konstant.
Die Bewertung erfolgt auf dem In-Distribution Validation-Split, damit alle Laeufe auf
denselben Daten verglichen werden. Die Real-World Evaluation wird hier ausgelassen, um
die Laufzeit der Tuning-Laeufe nicht weiter zu vergroessern.


In [None]:
# Four runs: three learning rates at batch size 16, plus 2e-5 at batch size 8.
# Epochs and warmup ratio are the same in every entry.
TUNING_GRID = [
    {'learning_rate': lr, 'batch_size': bs, 'epochs': 2, 'warmup_ratio': 0.1}
    for lr, bs in ((1e-5, 16), (2e-5, 16), (3e-5, 16), (2e-5, 8))
]


def run_tuning(task_name, train_df, label_col):
    """Train one model per TUNING_GRID entry and tabulate validation scores.

    Each run starts from BASE_CONFIG, applies the grid entry on top, and skips
    the real-world evaluation.

    Args:
        task_name: Task identifier; also used in the per-run output directory.
        train_df: Training frame passed through to ``train_eval_distilbert``.
        label_col: Evaluation label column passed through unchanged.

    Returns:
        DataFrame with one row per run: run number (starting at 1), the four
        tuned settings, ``val_accuracy`` and ``val_f1_macro``.
    """
    tracked_settings = ('learning_rate', 'batch_size', 'epochs', 'warmup_ratio')
    records = []

    for run_no, overrides in enumerate(TUNING_GRID, start=1):
        run_config = {**BASE_CONFIG, **overrides}

        outcome, _ = train_eval_distilbert(
            task_name=task_name,
            train_df=train_df,
            eval_df=eval_df,
            label_col=label_col,
            config=run_config,
            output_dir=f'./results/09_distilbert/tuning/{task_name}/run_{run_no}',
            eval_real_world=False
        )

        val_scores = outcome['in_distribution']
        record = {'run': run_no}
        record.update((key, overrides[key]) for key in tracked_settings)
        record['val_accuracy'] = val_scores['accuracy']
        record['val_f1_macro'] = val_scores['f1_macro']
        records.append(record)

    return pd.DataFrame(records)


# Run the grid for both tasks; each call trains one model per grid entry.
tuning_dept = run_tuning('department', dept_df, 'department')
tuning_sen = run_tuning('seniority', sen_df, 'seniority')

# Show the runs ranked by validation macro-F1, best first.
print('Tuning Department:')
print(tuning_dept.sort_values('val_f1_macro', ascending=False).head(5))

# The blank line before the heading is written as an escape sequence; a raw
# line break inside a single-quoted literal is a syntax error.
print('\nTuning Seniority:')
print(tuning_sen.sort_values('val_f1_macro', ascending=False).head(5))


## 6. Weitere Experimente (4 Stueck)

Jedes Experiment testet eine konkrete Hypothese. So kann man gezielt sehen,
welche Massnahme wirklich hilft.


### Experiment 1: Class Weights

Hypothese: Class-Weighted Loss hilft bei unbalancierten Klassen und verbessert
F1 fuer Minoritaeten.


In [None]:
# Same settings as the baseline, except that class weights are switched on.
exp1_config = BASE_CONFIG.copy()
exp1_config['use_class_weights'] = True

# Department run; the trained classifier is discarded, only the metrics are kept.
exp1_dept, _ = train_eval_distilbert(
    task_name='department',
    train_df=dept_df,
    eval_df=eval_df,
    label_col='department',
    config=exp1_config,
    output_dir='./results/09_distilbert/exp1_class_weights/department'
)
print_summary('Exp1 Department', exp1_dept)

# Seniority run with the identical configuration.
exp1_sen, _ = train_eval_distilbert(
    task_name='seniority',
    train_df=sen_df,
    eval_df=eval_df,
    label_col='seniority',
    config=exp1_config,
    output_dir='./results/09_distilbert/exp1_class_weights/seniority'
)
print_summary('Exp1 Seniority', exp1_sen)


### Experiment 2: Balanced Training Data

Hypothese: Balancing (Over- und Undersampling) reduziert die Dominanz grosser Klassen
und verbessert den Macro-F1.


In [None]:
# Balance department and seniority with min/max per class
balanced_dept, _ = balance_dataset(dept_df, min_samples=500, max_samples=2000)
balanced_sen, _ = balance_dataset(sen_df, min_samples=500, max_samples=2000)

exp2_dept, _ = train_eval_distilbert(
    task_name='department',
    train_df=balanced_dept,
    eval_df=eval_df,
    label_col='department',
    config=BASE_CONFIG,
    output_dir='./results/09_distilbert/exp2_balanced/department'
)
print_summary('Exp2 Department', exp2_dept)

exp2_sen, _ = train_eval_distilbert(
    task_name='seniority',
    train_df=balanced_sen,
    eval_df=eval_df,
    label_col='seniority',
    config=BASE_CONFIG,
    output_dir='./results/09_distilbert/exp2_balanced/seniority'
)
print_summary('Exp2 Seniority', exp2_sen)


### Experiment 3: Silver Data Augmentation

Hypothese: Pseudo-Labels aus unannotierten CVs vergroessern die Trainingsmenge
und verbessern die Generalisierung.

Falls die Datei nicht existiert, wird das Experiment uebersprungen.


In [None]:
def load_silver_data(data_dir):
    """Load pseudo-labelled rows for the department and seniority tasks.

    Reads ``processed/unannotated_pseudo_labeled.csv`` below ``data_dir``. When
    the file has no ``text`` column but a ``title`` column, the title is used
    as text.

    Args:
        data_dir: Base data directory.

    Returns:
        ``(None, None)`` when the file does not exist. Otherwise a tuple
        ``(dept_silver, sen_silver)`` of frames with columns ``text`` and
        ``label``, built from the rows whose ``dept_pseudo`` / ``sen_pseudo``
        value is present.
    """
    csv_file = Path(data_dir).joinpath('processed', 'unannotated_pseudo_labeled.csv')
    if not csv_file.exists():
        return None, None

    frame = pd.read_csv(csv_file)
    columns = frame.columns
    if 'title' in columns and 'text' not in columns:
        frame['text'] = frame['title']

    def _labelled_rows(pseudo_col):
        # Keep only rows that carry a pseudo label for this task.
        picked = frame.loc[frame[pseudo_col].notna(), ['text', pseudo_col]].copy()
        return picked.rename(columns={pseudo_col: 'label'})

    dept_silver = _labelled_rows('dept_pseudo')
    sen_silver = _labelled_rows('sen_pseudo')
    return dept_silver, sen_silver


# Both values are None when the pseudo-label CSV does not exist.
dep_silver, sen_silver = load_silver_data(DATA_DIR)

# Each task is skipped on its own when it has no silver rows. The result name
# is still bound (to None) because the results cell at the end references it.
if dep_silver is None or dep_silver.empty:
    exp3_dept = None
    print('No department silver data found. Skipping exp3 department.')
else:
    # Append the silver rows to the lookup table. The validation split is drawn
    # from this combined frame, so in-distribution scores include pseudo labels.
    dept_aug = pd.concat([dept_df[['text', 'label']], dep_silver], ignore_index=True)
    exp3_dept, _ = train_eval_distilbert(
        task_name='department',
        train_df=dept_aug,
        eval_df=eval_df,
        label_col='department',
        config=BASE_CONFIG,
        output_dir='./results/09_distilbert/exp3_silver/department'
    )
    print_summary('Exp3 Department', exp3_dept)

if sen_silver is None or sen_silver.empty:
    exp3_sen = None
    print('No seniority silver data found. Skipping exp3 seniority.')
else:
    # Same augmentation for the seniority task.
    sen_aug = pd.concat([sen_df[['text', 'label']], sen_silver], ignore_index=True)
    exp3_sen, _ = train_eval_distilbert(
        task_name='seniority',
        train_df=sen_aug,
        eval_df=eval_df,
        label_col='seniority',
        config=BASE_CONFIG,
        output_dir='./results/09_distilbert/exp3_silver/seniority'
    )
    print_summary('Exp3 Seniority', exp3_sen)


### Experiment 4: Freeze Base Layers

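Hypothese: Wenn wir nur den Klassifikationskopf trainieren, reduzieren wir Overfitting
und die Trainingszeit. Das kann sinnvoll sein, wenn der Trainingsdatensatz klein ist.
Hinweis: Dieses Experiment laeuft mit 2 statt 3 Epochen; der Vergleich mit der Baseline
ist daher nicht ganz direkt.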


In [None]:
# Baseline settings with the epoch count lowered from 3 to 2. The runs below
# therefore differ from the baseline in two ways: fewer epochs and frozen base layers.
exp4_config = BASE_CONFIG.copy()
exp4_config['epochs'] = 2

# freeze_base=True freezes the base layers before training starts.
exp4_dept, _ = train_eval_distilbert(
    task_name='department',
    train_df=dept_df,
    eval_df=eval_df,
    label_col='department',
    config=exp4_config,
    output_dir='./results/09_distilbert/exp4_frozen/department',
    freeze_base=True
)
print_summary('Exp4 Department', exp4_dept)

# Same setup for the seniority task.
exp4_sen, _ = train_eval_distilbert(
    task_name='seniority',
    train_df=sen_df,
    eval_df=eval_df,
    label_col='seniority',
    config=exp4_config,
    output_dir='./results/09_distilbert/exp4_frozen/seniority',
    freeze_base=True
)
print_summary('Exp4 Seniority', exp4_sen)


## 7. Ergebnisse speichern

Wir sammeln alle Ergebnisse in einem JSON-File, damit spaetere Vergleiche
mit anderen Notebooks leichter sind.


In [None]:
# One payload with every run of this notebook: baseline, tuning tables (as
# lists of row dicts), the four experiments, and run metadata.
all_results = {
    'approach': 'DistilBERT Experiments',
    'baseline': baseline_results,
    'tuning': {
        'department': tuning_dept.to_dict(orient='records'),
        'seniority': tuning_sen.to_dict(orient='records')
    },
    'experiments': {
        # Skipped experiment-3 tasks are stored as None.
        exp_name: {'department': dept_result, 'seniority': sen_result}
        for exp_name, dept_result, sen_result in (
            ('exp1_class_weights', exp1_dept, exp1_sen),
            ('exp2_balanced', exp2_dept, exp2_sen),
            ('exp3_silver', exp3_dept, exp3_sen),
            ('exp4_frozen', exp4_dept, exp4_sen),
        )
    },
    'metadata': {
        'base_model': BASE_MODEL,
        'train_source': 'lookup tables',
        'eval_source': 'annotated CVs',
        'random_state': RANDOM_STATE
    },
    'timestamp': datetime.now().isoformat()
}

output_path = RESULTS_DIR / 'distilbert_experiments.json'
with output_path.open('w') as f:
    json.dump(all_results, f, indent=2)

print(f'Results saved to: {output_path}')
