# 09 - DistilBERT Experiments

Ziel dieses Notebooks:
- Erst ein plain DistilBERT Baseline-Training.
- Danach ein kleines Hyperparameter Tuning.
- Danach vier weitere Experimente, um systematisch Hypothesen zu testen.

Alle Abschnitte enthalten kurze Erklaerungen, was genau getestet wird und warum.


## 1. Setup

Wir importieren alle benoetigten Bibliotheken, setzen einen Seed und definieren Standard-Parameter.


In [None]:
import json
from datetime import datetime
from pathlib import Path
import random
import warnings
import tempfile

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Silence every warning category for the whole notebook session.
# NOTE(review): this also hides deprecation warnings from pandas/transformers.
warnings.filterwarnings('ignore')

# Root folder of the input files read by the loaders below.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DATA_DIR = Path('/Users/batuklkn/Desktop/GustAbgabe/BuzzwordLearner/data')

# Seed passed to set_seed() and to the train/validation split.
RANDOM_STATE = 42
# Hugging Face checkpoint name used for tokenizer and model.
BASE_MODEL = 'distilbert-base-multilingual-cased'

# Default hyperparameters; experiments copy this dict and override single keys.
# The keys are read by train_eval_distilbert() and forwarded to
# TransformerClassifier.train().
BASE_CONFIG = {
    'model_name': BASE_MODEL,
    'epochs': 3,
    'batch_size': 16,
    'learning_rate': 2e-5,
    'warmup_ratio': 0.1,
    'use_class_weights': False
}


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``; when CUDA is available it is also passed
            to ``torch.cuda.manual_seed_all``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only seeded when a GPU is actually visible.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed all random number generators once, before any data or model work.
set_seed(RANDOM_STATE)

# Report whether a CUDA device is visible to PyTorch.
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")


def _fix_encoding(text):
    if not isinstance(text, str):
        return text
    try:
        if 'Ãƒ' in text:
            return text.encode('latin-1').decode('utf-8', errors='ignore')
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    return text


def deduplicate_label_df(label_df, max_per_class=500, random_state=42):
    """Drop duplicate (text, label) rows and optionally cap each class.

    Duplicates are detected on the lower-cased, whitespace-stripped text
    together with the label; the first occurrence is kept. The input frame
    is not modified.

    Args:
        label_df: DataFrame with at least the columns ``'text'`` and
            ``'label'``.
        max_per_class: Upper bound of rows kept per label. ``None`` disables
            capping.
        random_state: Seed for the per-class sampling used when capping.

    Returns:
        A new DataFrame without the helper column. When capping is active
        the rows are ordered by label and the index is reset.
    """
    deduped = label_df.copy()
    deduped['text_normalized'] = deduped['text'].str.lower().str.strip()
    original_count = len(deduped)
    deduped = deduped.drop_duplicates(subset=['text_normalized', 'label'])
    dedup_count = len(deduped)

    if max_per_class is not None:
        # Iterate the groups explicitly instead of groupby(...).apply(...):
        # apply() on the grouping column is deprecated since pandas 2.2 and
        # would drop 'label' once grouping columns are excluded.
        capped_groups = [
            group.sample(min(len(group), max_per_class), random_state=random_state)
            for _, group in deduped.groupby('label')
        ]
        # pd.concat raises on an empty list, so an empty frame is passed through.
        if capped_groups:
            deduped = pd.concat(capped_groups)
        deduped = deduped.reset_index(drop=True)

    final_count = len(deduped)
    deduped = deduped.drop(columns=['text_normalized'])

    print(f"  Deduplication: {original_count} -> {dedup_count} (removed {original_count - dedup_count} duplicates)")
    if max_per_class is not None:
        print(f"  Capping: {dedup_count} -> {final_count} (max {max_per_class} per class)")

    return deduped


def balance_dataset(df, label_col='label', min_samples=500, max_samples=2000, return_weights=False):
    """Over- and undersample classes into the range [min_samples, max_samples].

    Classes with fewer than ``min_samples`` rows are repeated (plus a random
    remainder) until they reach ``min_samples``. Classes with more than
    ``max_samples`` rows are randomly reduced to ``max_samples``. All other
    classes are kept as they are.

    Args:
        df: Input DataFrame.
        label_col: Name of the class column.
        min_samples: Target size for classes that are too small.
        max_samples: Target size for classes that are too large.
        return_weights: Whether to return the per-row weight list.

    Returns:
        ``(balanced_df, weights)``. ``weights`` holds 0.8 for rows of
        oversampled classes and 1.0 for all others, in row order; it is
        ``None`` when ``return_weights`` is false.
    """
    parts = []
    weights = []

    for label, count in df[label_col].value_counts().items():
        members = df[df[label_col] == label]

        if count < min_samples:
            # Whole copies first, then a random remainder to hit the target.
            full_copies, extra = divmod(min_samples, count)
            part = pd.concat([members] * full_copies, ignore_index=True)
            if extra > 0:
                part = pd.concat([part, members.sample(extra, random_state=42)], ignore_index=True)
            row_weight = 0.8
        elif count > max_samples:
            part = members.sample(max_samples, random_state=42)
            row_weight = 1.0
        else:
            part = members
            row_weight = 1.0

        parts.append(part)
        weights.extend([row_weight] * len(part))

    balanced_df = pd.concat(parts, ignore_index=True)

    print(f"Balancing: {len(df)} -> {len(balanced_df)} samples")
    print(f"  Class distribution: {balanced_df[label_col].value_counts().to_dict()}")

    return balanced_df, (weights if return_weights else None)


def load_linkedin_data(filepath):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        filepath: Path of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        return json.load(handle)


def _position_title(position):
    """Return a position's job title ('position' key, else 'title', else '')."""
    return position.get('position', position.get('title', ''))


def prepare_dataset(cvs, include_history=False):
    """Build one row per CV from its first position with status 'ACTIVE'.

    Accepted CV shapes:
      * a list of position dicts,
      * a dict holding that list under ``'positions'``,
      * a dict that is itself a single position (no ``'positions'`` list).
    Anything else, and any non-dict entry inside a position list, is
    ignored. CVs without an active position produce no row.

    Args:
        cvs: Iterable of CVs as described above.
        include_history: When true, add a ``'history'`` column with the
            titles of all non-active positions joined by ``' | '``.

    Returns:
        DataFrame with ``cv_id`` (index of the CV in ``cvs``), ``title``,
        ``company`` and ``text``. ``department`` and ``seniority`` are
        present for rows whose active position carries those keys.
    """
    records = []

    for cv_idx, cv in enumerate(cvs):
        if isinstance(cv, list):
            positions = cv
        elif isinstance(cv, dict):
            nested = cv.get('positions')
            # Iterating the dict itself would yield its string keys and
            # crash on `.get`, so a bare dict counts as a single position.
            positions = nested if isinstance(nested, (list, tuple)) else [cv]
        else:
            positions = []

        # Malformed entries (None, strings, ...) have no `.get`.
        positions = [p for p in positions if isinstance(p, dict)]
        active_positions = [p for p in positions if p.get('status') == 'ACTIVE']

        if not active_positions:
            continue

        active = active_positions[0]

        title = _position_title(active)
        company = active.get('organization', active.get('companyName', ''))

        # An explicit None title must not end up as the string "None".
        title_text = '' if title is None else title

        record = {
            'cv_id': cv_idx,
            'title': title,
            'company': company,
            'text': f"{title_text} at {company}".strip() if company else title_text,
        }

        for optional_key in ('department', 'seniority'):
            if optional_key in active:
                record[optional_key] = active[optional_key]

        if include_history:
            past_titles = (
                _position_title(p) for p in positions if p.get('status') != 'ACTIVE'
            )
            # str.join rejects None, so missing titles become empty strings.
            record['history'] = ' | '.join(
                '' if past_title is None else str(past_title) for past_title in past_titles
            )

        records.append(record)

    return pd.DataFrame(records)


def load_label_lists(data_dir, fix_encoding=True, deduplicate=True, max_per_class=500):
    """Load the department and seniority lookup tables from CSV.

    Reads ``department-v2.csv`` and ``seniority-v2.csv`` from ``data_dir``.

    Args:
        data_dir: Folder containing both CSV files.
        fix_encoding: Apply ``_fix_encoding`` to the ``'text'`` column.
        deduplicate: Run ``deduplicate_label_df`` on both tables.
        max_per_class: Forwarded to ``deduplicate_label_df``.

    Returns:
        ``(department_df, seniority_df)``.
    """
    root = Path(data_dir)
    department_df, seniority_df = (
        pd.read_csv(root / filename, encoding='utf-8')
        for filename in ('department-v2.csv', 'seniority-v2.csv')
    )

    if fix_encoding:
        print('Applying encoding fix...')
        for frame in (department_df, seniority_df):
            frame['text'] = frame['text'].apply(_fix_encoding)

    if deduplicate:
        print('Deduplicating department labels...')
        department_df = deduplicate_label_df(department_df, max_per_class)
        print('Deduplicating seniority labels...')
        seniority_df = deduplicate_label_df(seniority_df, max_per_class)

    return department_df, seniority_df


def load_evaluation_dataset(data_dir):
    """Load the annotated CVs and turn them into an evaluation DataFrame.

    Args:
        data_dir: Folder containing ``linkedin-cvs-annotated.json``.

    Returns:
        The DataFrame produced by ``prepare_dataset`` for the loaded CVs.
    """
    annotated_path = Path(data_dir) / 'linkedin-cvs-annotated.json'
    return prepare_dataset(load_linkedin_data(str(annotated_path)))


class JobTitleDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence.

        Args:
            encodings: Mapping from field name to a per-example sequence.
            labels: Sequence of labels, one per example.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples (length of the label sequence)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with a 'labels' entry."""
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item


class WeightedTrainer(Trainer):
    """Trainer whose loss is a cross-entropy with optional class weights."""

    def __init__(self, class_weights=None, *args, **kwargs):
        """Forward all other arguments to ``Trainer`` and keep the weights.

        Args:
            class_weights: Per-class weights for the loss, or ``None`` for an
                unweighted loss.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss.

        The 'labels' entry is removed from ``inputs`` before the forward
        pass, so the model only produces logits.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        # weight=None is CrossEntropyLoss's default, i.e. the unweighted loss.
        weight = None
        if self.class_weights is not None:
            weight = torch.tensor(self.class_weights, device=logits.device, dtype=logits.dtype)

        loss = CrossEntropyLoss(weight=weight)(logits, targets)
        if return_outputs:
            return loss, outputs
        return loss


class TransformerClassifier:
    """Wrapper around DistilBERT for single-label text classification.

    Bundles tokenizer, model and device placement and exposes ``train`` and
    ``predict`` on plain sequences of texts and integer label ids.
    """

    def __init__(self, model_name='distilbert-base-multilingual-cased', num_labels=2, id2label=None, label2id=None):
        """Load tokenizer and model and move the model to the GPU if present.

        Args:
            model_name: Checkpoint name or path for ``from_pretrained``.
            num_labels: Number of output classes.
            id2label: Optional mapping from class id to label name.
            label2id: Optional mapping from label name to class id.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.id2label = id2label or {}
        self.label2id = label2id or {}

        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    @staticmethod
    def _balanced_class_weights(labels, num_labels):
        """Return one 'balanced' weight per class id in ``range(num_labels)``.

        Classes that occur in ``labels`` get
        ``n_samples / (n_present_classes * count)``, the same formula
        scikit-learn uses for ``class_weight='balanced'``. Classes that do
        not occur get the neutral weight 1.0, so the vector always has
        length ``num_labels``, as ``CrossEntropyLoss`` requires.

        Raises:
            ValueError: If a label id lies outside ``range(num_labels)``.
        """
        label_ids = np.asarray(labels, dtype=np.int64)
        weights = np.ones(num_labels, dtype=np.float64)
        if label_ids.size == 0:
            return weights
        if label_ids.min() < 0 or label_ids.max() >= num_labels:
            raise ValueError(
                f"Label ids must lie in [0, {num_labels - 1}], "
                f"got range [{label_ids.min()}, {label_ids.max()}]"
            )

        counts = np.bincount(label_ids, minlength=num_labels).astype(np.float64)
        present = counts > 0
        weights[present] = label_ids.size / (present.sum() * counts[present])
        return weights

    def train(self, texts, labels, val_texts=None, val_labels=None, epochs=3, batch_size=16, learning_rate=1e-5, warmup_ratio=0.1, use_class_weights=False):
        """Fine-tune the model on ``texts`` / ``labels``.

        Args:
            texts: Training texts.
            labels: Integer class ids, aligned with ``texts``.
            val_texts: Optional validation texts; evaluation runs once per
                epoch only when ``val_labels`` is given as well.
            val_labels: Optional validation class ids.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Learning rate handed to ``TrainingArguments``.
            warmup_ratio: Warm-up ratio handed to ``TrainingArguments``.
            use_class_weights: Train with a class-weighted cross-entropy.
        """
        import inspect

        # The tokenizer expects plain lists, not e.g. a pandas Series.
        texts = list(texts)
        print(f"Training on {len(texts)} examples...")

        train_encodings = self.tokenizer(texts, truncation=True, padding=True, max_length=128)
        train_dataset = JobTitleDataset(train_encodings, labels)

        if val_texts is not None and val_labels is not None:
            val_encodings = self.tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)
            val_dataset = JobTitleDataset(val_encodings, val_labels)
            eval_strategy = 'epoch'
        else:
            val_dataset = None
            eval_strategy = 'no'

        class_weights = None
        if use_class_weights:
            # Full-length vector: weights computed only for the classes seen
            # in `labels` would have the wrong size for the loss whenever a
            # class is missing from the training split.
            class_weights = self._balanced_class_weights(labels, self.num_labels)
            print(f"Using class weights: {dict(enumerate(class_weights.tolist()))}")

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever name the installed version accepts.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        strategy_key = 'eval_strategy' if 'eval_strategy' in accepted else 'evaluation_strategy'

        # Nothing is checkpointed (save_strategy='no'), so a throw-away
        # output directory is sufficient.
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = TrainingArguments(
                output_dir=tmp_dir,
                num_train_epochs=epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                warmup_ratio=warmup_ratio,
                weight_decay=0.01,
                save_strategy='no',
                logging_strategy='no',
                learning_rate=learning_rate,
                report_to='none',
                disable_tqdm=True,
                **{strategy_key: eval_strategy}
            )

            if class_weights is not None:
                trainer = WeightedTrainer(
                    class_weights=class_weights,
                    model=self.model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=val_dataset
                )
            else:
                trainer = Trainer(
                    model=self.model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=val_dataset
                )

            trainer.train()

        print('Training complete!')

    def predict(self, texts, batch_size=32):
        """Return the predicted class id for every text.

        Args:
            texts: Texts to classify.
            batch_size: Number of texts per forward pass.

        Returns:
            List of integer class ids, one per input text (empty for empty
            input).
        """
        self.model.eval()
        texts = list(texts)
        all_predictions = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = self.tokenizer(
                batch_texts, padding=True, truncation=True, max_length=128, return_tensors='pt'
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.argmax(outputs.logits, dim=-1)
                all_predictions.extend(predictions.cpu().tolist())

        return all_predictions


## 2. Daten laden

Wir trainieren auf den Lookup-Tabellen und evaluieren auf den annotierten CVs.
Das entspricht dem Zero-Shot/Transfer-Setup der vorherigen Notebooks.


In [None]:
# Lookup tables (training data). max_per_class=None disables the per-class
# cap, so only exact (normalized text, label) duplicates are removed.
dept_df, sen_df = load_label_lists(
    DATA_DIR,
    fix_encoding=True,
    deduplicate=True,
    max_per_class=None
)

# Annotated CVs (evaluation data): one row per CV with an active position.
eval_df = load_evaluation_dataset(DATA_DIR)

# Quick size check of all three tables.
print(f"Department lookup: {len(dept_df):,} examples")
print(f"Seniority lookup:  {len(sen_df):,} examples")
print(f"Annotated CVs:     {len(eval_df):,} positions")


## 3. Hilfsfunktionen

Wir kapseln den wiederholten Code fuer Mapping, Training und Evaluation in Funktionen.
Das macht die Experimente konsistent und leichter vergleichbar.


In [None]:
def build_label_maps(label_series):
    """Create label <-> id mappings from the distinct values of a Series.

    Ids are assigned in sorted label order, starting at 0.

    Args:
        label_series: pandas Series of labels.

    Returns:
        ``(label2id, id2label)`` as two dicts.
    """
    ordered_labels = sorted(label_series.unique())
    label2id = dict(zip(ordered_labels, range(len(ordered_labels))))
    id2label = dict(enumerate(ordered_labels))
    return label2id, id2label


def prepare_eval_data(eval_df, label_col, label2id, text_col='title'):
    """Extract texts and label ids for rows whose label is known.

    Rows with a missing label or a label that is not a key of ``label2id``
    are dropped. Missing texts become empty strings.

    Args:
        eval_df: Evaluation DataFrame.
        label_col: Column holding the gold labels.
        label2id: Mapping from label name to class id.
        text_col: Column holding the input texts.

    Returns:
        ``(texts, labels)`` as two aligned lists.
    """
    gold = eval_df[label_col]
    usable = eval_df[gold.notna() & gold.isin(label2id.keys())]

    texts = usable[text_col].fillna('').tolist()
    labels = [label2id[name] for name in usable[label_col].tolist()]
    return texts, labels


def compute_metrics(true_ids, pred_ids, id2label):
    """Compute accuracy, macro/weighted scores and per-class F1.

    Args:
        true_ids: Gold class ids.
        pred_ids: Predicted class ids.
        id2label: Mapping from class id to label name, used to name the
            per-class F1 entries.

    Returns:
        Dict with ``accuracy``, macro ``precision`` / ``recall`` /
        ``f1_macro``, ``f1_weighted`` and ``per_class_f1`` (label name to
        F1, for every id that occurs in ``true_ids`` or ``pred_ids``).
    """
    accuracy = accuracy_score(true_ids, pred_ids)

    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        true_ids, pred_ids, average='macro', zero_division=0
    )
    _, _, weighted_f1, _ = precision_recall_fscore_support(
        true_ids, pred_ids, average='weighted', zero_division=0
    )

    # Per-class scores are requested for every id seen on either side.
    observed_ids = sorted(set(true_ids) | set(pred_ids))
    class_f1_scores = precision_recall_fscore_support(
        true_ids, pred_ids, labels=observed_ids, average=None, zero_division=0
    )[2]
    per_class_f1 = {
        id2label[class_id]: float(score)
        for class_id, score in zip(observed_ids, class_f1_scores)
    }

    return {
        'accuracy': float(accuracy),
        'precision': float(macro_precision),
        'recall': float(macro_recall),
        'f1_macro': float(macro_f1),
        'f1_weighted': float(weighted_f1),
        'per_class_f1': per_class_f1
    }


def freeze_base_layers(clf):
    """Disable gradients for the base (encoder) part of ``clf.model``.

    The base module is looked up, in this order, under the attribute named
    by ``model.base_model_prefix``, then ``base_model``, then
    ``distilbert``. If none of them yields a module, a message is printed
    and nothing is changed.

    Args:
        clf: Object exposing the model as ``clf.model``.
    """
    model = clf.model

    candidate_attrs = []
    if hasattr(model, 'base_model_prefix'):
        candidate_attrs.append(model.base_model_prefix)
    candidate_attrs.extend(['base_model', 'distilbert'])

    base = None
    for attr_name in candidate_attrs:
        base = getattr(model, attr_name, None)
        if base is not None:
            break

    if base is None:
        print('Could not find base model to freeze')
        return

    for param in base.parameters():
        param.requires_grad = False

    print('Base layers frozen')


def train_eval_distilbert(
    task_name,
    train_df,
    eval_df,
    label_col,
    config,
    text_col='title',
    eval_real_world=True,
    freeze_base=False
):
    """Train DistilBERT on a lookup table and evaluate it.

    The training frame is split 80/20 (stratified by label). The held-out
    20 % give the in-distribution metrics; the annotated CVs optionally
    give the real-world metrics.

    Args:
        task_name: Task name, used in log messages.
        train_df: DataFrame with the columns ``'text'`` and ``'label'``.
        eval_df: Annotated evaluation DataFrame.
        label_col: Column of ``eval_df`` holding the gold labels.
        config: Dict with ``model_name``, ``epochs``, ``batch_size``,
            ``learning_rate`` and optionally ``warmup_ratio`` and
            ``use_class_weights``.
        text_col: Column of ``eval_df`` used as model input.
        eval_real_world: Whether to evaluate on ``eval_df``.
        freeze_base: Train only the layers outside the base model.

    Returns:
        ``(results, clf)``. ``results`` has the keys ``in_distribution``,
        ``real_world`` (``None`` when skipped), ``label2id`` and
        ``id2label``; ``clf`` is the trained classifier.
    """
    # Re-seed per run so random state consumed by earlier experiments
    # (sampling, model initialisation) does not leak into this one and the
    # result does not depend on cell execution order.
    set_seed(RANDOM_STATE)

    label2id, id2label = build_label_maps(train_df['label'])

    X_train, X_val, y_train, y_val = train_test_split(
        train_df['text'].tolist(),
        train_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=train_df['label']
    )

    y_train_ids = [label2id[l] for l in y_train]
    y_val_ids = [label2id[l] for l in y_val]

    clf = TransformerClassifier(
        model_name=config['model_name'],
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    if freeze_base:
        freeze_base_layers(clf)

    clf.train(
        texts=X_train,
        labels=y_train_ids,
        val_texts=X_val,
        val_labels=y_val_ids,
        epochs=config['epochs'],
        batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        warmup_ratio=config.get('warmup_ratio', 0.1),
        use_class_weights=config.get('use_class_weights', False)
    )

    val_pred_ids = clf.predict(X_val)
    in_dist = compute_metrics(y_val_ids, val_pred_ids, id2label)

    real_world = None
    if eval_real_world:
        # prepare_dataset only creates the label column when at least one
        # active position carries it, so the column may be absent.
        if label_col not in eval_df.columns:
            print(f"[{task_name}] Column '{label_col}' not found in eval_df. Skipping real-world evaluation.")
        else:
            eval_texts, eval_labels = prepare_eval_data(
                eval_df, label_col, label2id, text_col=text_col
            )
            if eval_texts:
                eval_pred_ids = clf.predict(eval_texts)
                real_world = compute_metrics(eval_labels, eval_pred_ids, id2label)
            else:
                # Metrics on zero examples are undefined; report as skipped.
                print(f"[{task_name}] No evaluation rows with a known '{label_col}' label. Skipping real-world evaluation.")

    return {
        'in_distribution': in_dist,
        'real_world': real_world,
        'label2id': label2id,
        'id2label': id2label
    }, clf


def print_summary(name, results):
    """Print a one-line accuracy / macro-F1 summary for one run.

    Args:
        name: Label printed at the start of the line.
        results: Result dict with ``'in_distribution'`` and ``'real_world'``
            entries; a falsy ``'real_world'`` is reported as skipped.
    """
    in_dist = results['in_distribution']
    in_acc, in_f1 = in_dist['accuracy'], in_dist['f1_macro']

    real_world = results['real_world']
    if not real_world:
        print(f"{name} | in-dist acc {in_acc:.4f} f1 {in_f1:.4f} | real-world skipped")
        return

    rw_acc, rw_f1 = real_world['accuracy'], real_world['f1_macro']
    print(f"{name} | in-dist acc {in_acc:.4f} f1 {in_f1:.4f} | real-world acc {rw_acc:.4f} f1 {rw_f1:.4f}")


## 4. Plain DistilBERT Baseline

Wir trainieren DistilBERT mit Standard-Parametern auf den Lookup-Tabellen
und evaluieren auf den annotierten CVs. Das ist der Ausgangspunkt fuer alle
weiteren Vergleiche.


In [None]:
# Collects the baseline result dicts per task for the final summary.
baseline_results = {}

# Department baseline: train on the department lookup table with the
# default configuration and evaluate on the annotated CVs.
base_dept, base_dept_clf = train_eval_distilbert(
    task_name='department',
    train_df=dept_df,
    eval_df=eval_df,
    label_col='department',
    config=BASE_CONFIG
)
print_summary('Baseline Department', base_dept)

# Seniority baseline: same setup on the seniority lookup table.
base_sen, base_sen_clf = train_eval_distilbert(
    task_name='seniority',
    train_df=sen_df,
    eval_df=eval_df,
    label_col='seniority',
    config=BASE_CONFIG
)
print_summary('Baseline Seniority', base_sen)

baseline_results['department'] = base_dept
baseline_results['seniority'] = base_sen


## 5. Hyperparameter Tuning

Wir testen mehrere Kombinationen aus Lernrate, Batch-Size und Epochen.
Die Bewertung erfolgt auf dem In-Distribution Validation-Split, um fair zu
vergleichen. Real-World Evaluation wird hier ausgelassen, um die Anzahl der
Trainingslaeufe nicht weiter zu vergroessern.


In [None]:
# Hand-picked configurations: three learning rates at batch size 16, plus
# one run with batch size 8. Each entry overrides the matching BASE_CONFIG
# keys; all runs use 2 epochs (BASE_CONFIG uses 3).
TUNING_GRID = [
    {'learning_rate': 1e-5, 'batch_size': 16, 'epochs': 2, 'warmup_ratio': 0.1},
    {'learning_rate': 2e-5, 'batch_size': 16, 'epochs': 2, 'warmup_ratio': 0.1},
    {'learning_rate': 3e-5, 'batch_size': 16, 'epochs': 2, 'warmup_ratio': 0.1},
    {'learning_rate': 2e-5, 'batch_size': 8,  'epochs': 2, 'warmup_ratio': 0.1},
]


def run_tuning(task_name, train_df, label_col):
    """Train one model per ``TUNING_GRID`` entry and collect validation scores.

    Every grid entry is merged over ``BASE_CONFIG``. Real-world evaluation
    is switched off, so only in-distribution metrics are recorded.

    Args:
        task_name: Task name passed on to ``train_eval_distilbert``.
        train_df: Training DataFrame for the task.
        label_col: Label column name passed on to ``train_eval_distilbert``.

    Returns:
        DataFrame with one row per run: run number, the four tuned
        hyperparameters, validation accuracy and validation macro-F1.
    """
    tuned_keys = ('learning_rate', 'batch_size', 'epochs', 'warmup_ratio')
    rows = []

    for run_number, overrides in enumerate(TUNING_GRID, start=1):
        config = {**BASE_CONFIG, **overrides}

        result, _ = train_eval_distilbert(
            task_name=task_name,
            train_df=train_df,
            eval_df=eval_df,
            label_col=label_col,
            config=config,
            eval_real_world=False
        )
        val_metrics = result['in_distribution']

        row = {'run': run_number}
        row.update((key, overrides[key]) for key in tuned_keys)
        row['val_accuracy'] = val_metrics['accuracy']
        row['val_f1_macro'] = val_metrics['f1_macro']
        rows.append(row)

    return pd.DataFrame(rows)


# Run the full grid once per task (one training run per grid entry).
tuning_dept = run_tuning('department', dept_df, 'department')
tuning_sen = run_tuning('seniority', sen_df, 'seniority')

# Show the runs ranked by validation macro-F1, best first.
print('Tuning Department:')
print(tuning_dept.sort_values('val_f1_macro', ascending=False).head(5))

print('Tuning Seniority:')
print(tuning_sen.sort_values('val_f1_macro', ascending=False).head(5))


## 6. Weitere Experimente (4 Stueck)

Jedes Experiment testet eine konkrete Hypothese. So kann man gezielt sehen,
welche Massnahme wirklich hilft.


### Experiment 1: Class Weights

Hypothese: Class-Weighted Loss hilft bei unbalancierten Klassen und verbessert
F1 fuer Minoritaeten.


In [None]:
# Same as the baseline configuration, but with the class-weighted loss on.
exp1_config = BASE_CONFIG.copy()
exp1_config['use_class_weights'] = True

# Department run; the trained classifier itself is discarded.
exp1_dept, _ = train_eval_distilbert(
    task_name='department',
    train_df=dept_df,
    eval_df=eval_df,
    label_col='department',
    config=exp1_config
)
print_summary('Exp1 Department', exp1_dept)

# Seniority run with the same configuration.
exp1_sen, _ = train_eval_distilbert(
    task_name='seniority',
    train_df=sen_df,
    eval_df=eval_df,
    label_col='seniority',
    config=exp1_config
)
print_summary('Exp1 Seniority', exp1_sen)


### Experiment 2: Balanced Training Data

Hypothese: Balancing (Over- und Undersampling) reduziert die Dominanz grosser Klassen
und verbessert den Macro-F1.


In [None]:
# Bring every class into the range [500, 2000] rows: small classes are
# repeated, large classes are randomly reduced. The weight list is not used.
# NOTE(review): balancing happens before train_eval_distilbert() makes its
# train/validation split, so repeated rows of small classes can land on both
# sides. The in-distribution scores of this experiment are probably
# optimistic; the real-world scores are not affected by this.
balanced_dept, _ = balance_dataset(dept_df, min_samples=500, max_samples=2000)
balanced_sen, _ = balance_dataset(sen_df, min_samples=500, max_samples=2000)

# Department run on the balanced table with the default configuration.
exp2_dept, _ = train_eval_distilbert(
    task_name='department',
    train_df=balanced_dept,
    eval_df=eval_df,
    label_col='department',
    config=BASE_CONFIG
)
print_summary('Exp2 Department', exp2_dept)

# Seniority run on the balanced table with the default configuration.
exp2_sen, _ = train_eval_distilbert(
    task_name='seniority',
    train_df=balanced_sen,
    eval_df=eval_df,
    label_col='seniority',
    config=BASE_CONFIG
)
print_summary('Exp2 Seniority', exp2_sen)


### Experiment 3: Silver Data Augmentation

Hypothese: Pseudo-Labels aus unannotierten CVs vergroessern die Trainingsmenge
und verbessern die Generalisierung.

Falls die Datei nicht existiert, wird das Experiment uebersprungen.


In [None]:
def load_silver_data(data_dir):
    """Load pseudo-labeled ("silver") training rows, if the file exists.

    Reads ``processed/unannotated_pseudo_labeled.csv`` below ``data_dir``.
    When the file has no ``'text'`` column but a ``'title'`` column, the
    title is used as text.

    Args:
        data_dir: Root data folder.

    Returns:
        ``(dept_silver, sen_silver)``: two DataFrames with the columns
        ``'text'`` and ``'label'``, holding the rows with a non-missing
        ``dept_pseudo`` / ``sen_pseudo`` value. ``(None, None)`` when the
        file does not exist.
    """
    silver_path = Path(data_dir) / 'processed' / 'unannotated_pseudo_labeled.csv'
    if not silver_path.exists():
        return None, None

    df = pd.read_csv(silver_path)
    if 'text' not in df.columns and 'title' in df.columns:
        df['text'] = df['title']

    def _rows_with_label(pseudo_col):
        # Keep only rows that carry a pseudo-label and expose it as 'label'.
        has_label = df[pseudo_col].notna()
        return df.loc[has_label, ['text', pseudo_col]].rename(columns={pseudo_col: 'label'})

    return _rows_with_label('dept_pseudo'), _rows_with_label('sen_pseudo')


# Both values are None when the pseudo-label file does not exist.
dep_silver, sen_silver = load_silver_data(DATA_DIR)

# Department: append the silver rows to the lookup table, or skip the run.
# NOTE(review): the silver rows also take part in the internal 80/20 split,
# so the in-distribution scores are measured partly on pseudo-labels.
if dep_silver is None or dep_silver.empty:
    exp3_dept = None
    print('No department silver data found. Skipping exp3 department.')
else:
    dept_aug = pd.concat([dept_df[['text', 'label']], dep_silver], ignore_index=True)
    exp3_dept, _ = train_eval_distilbert(
        task_name='department',
        train_df=dept_aug,
        eval_df=eval_df,
        label_col='department',
        config=BASE_CONFIG
    )
    print_summary('Exp3 Department', exp3_dept)

# Seniority: same procedure with the seniority pseudo-labels.
if sen_silver is None or sen_silver.empty:
    exp3_sen = None
    print('No seniority silver data found. Skipping exp3 seniority.')
else:
    sen_aug = pd.concat([sen_df[['text', 'label']], sen_silver], ignore_index=True)
    exp3_sen, _ = train_eval_distilbert(
        task_name='seniority',
        train_df=sen_aug,
        eval_df=eval_df,
        label_col='seniority',
        config=BASE_CONFIG
    )
    print_summary('Exp3 Seniority', exp3_sen)


### Experiment 4: Freeze Base Layers

Hypothese: Wenn wir nur den Klassifikationskopf trainieren, reduzieren wir Overfitting
und die Trainingszeit. Das kann sinnvoll sein, wenn die Daten klein sind.


In [None]:
# Baseline configuration with 2 instead of 3 epochs.
# NOTE(review): the baseline trains for 3 epochs, so differences to the
# baseline mix the effect of freezing with the effect of shorter training.
exp4_config = BASE_CONFIG.copy()
exp4_config['epochs'] = 2

# Department run; freeze_base=True disables gradients for the base model.
exp4_dept, _ = train_eval_distilbert(
    task_name='department',
    train_df=dept_df,
    eval_df=eval_df,
    label_col='department',
    config=exp4_config,
    freeze_base=True
)
print_summary('Exp4 Department', exp4_dept)

# Seniority run with the same settings.
exp4_sen, _ = train_eval_distilbert(
    task_name='seniority',
    train_df=sen_df,
    eval_df=eval_df,
    label_col='seniority',
    config=exp4_config,
    freeze_base=True
)
print_summary('Exp4 Seniority', exp4_sen)


## 7. Ergebnisse speichern

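Wir sammeln alle Ergebnisse in einem Dictionary (`all_results`), damit spaetere Vergleiche
mit anderen Notebooks leichter sind. Das Schreiben in ein JSON-File ist in dieser Zelle noch
nicht umgesetzt; dafuer muss `all_results` zusaetzlich exportiert werden (z. B. mit `json.dump`).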


In [None]:
# Gather every result of this notebook in one nested dict.
# NOTE(review): nothing is written to disk here; the dict only lives in the
# notebook session. The exp3 entries are None when no silver data was found.
all_results = {
    'approach': 'DistilBERT Experiments',
    'baseline': baseline_results,
    # Tuning tables as lists of row dicts (one dict per run).
    'tuning': {
        'department': tuning_dept.to_dict(orient='records'),
        'seniority': tuning_sen.to_dict(orient='records')
    },
    'experiments': {
        'exp1_class_weights': {
            'department': exp1_dept,
            'seniority': exp1_sen
        },
        'exp2_balanced': {
            'department': exp2_dept,
            'seniority': exp2_sen
        },
        'exp3_silver': {
            'department': exp3_dept,
            'seniority': exp3_sen
        },
        'exp4_frozen': {
            'department': exp4_dept,
            'seniority': exp4_sen
        }
    },
    'metadata': {
        'base_model': BASE_MODEL,
        'train_source': 'lookup tables',
        'eval_source': 'annotated CVs',
        'random_state': RANDOM_STATE
    },
    # Local wall-clock time at which the summary was assembled.
    'timestamp': datetime.now().isoformat()
}

print('All results available in all_results.')
