# 10 - DistilBERT Improvements

Dieses Notebook setzt die vorgeschlagenen Verbesserungen um:
1) Zwei-stufiges Fine-Tuning (Lookup -> CV-Train)
2) Backbone Vergleich (andere Modelle)
3) Label Smoothing und max_length Tests
4) Input Varianten (title vs text vs title+history)
5) Silver Data Augmentation (optional)

Hinweis: Einige Experimente trainieren auf annotierten CVs.
Das ist nicht mehr strikt zero-shot, sondern supervised domain adaptation.


## 1. Setup

Wir importieren alle benoetigten Bibliotheken, definieren Seeds und Basis-Konfigurationen.


In [None]:
import json
from datetime import datetime
from pathlib import Path
import random
import warnings
import tempfile

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Suppress every warning category so the notebook output stays compact.
warnings.filterwarnings(action='ignore')

# Folder that holds the lookup CSV files and the annotated CV JSON.
DATA_DIR = Path('/Users/batuklkn/Desktop/GustAbgabe/BuzzwordLearner/data')

# Shared seed and default checkpoint for all experiments.
RANDOM_STATE = 42
BASE_MODEL = 'distilbert-base-multilingual-cased'

# Default hyperparameters of a single fine-tuning run.
BASE_CONFIG = dict(
    model_name=BASE_MODEL,
    epochs=3,
    batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    use_class_weights=False,
)

# Feature switches: one flag per experiment section of the notebook.
RUN_TWO_STAGE = True
RUN_BACKBONE_TESTS = True
RUN_LABEL_SMOOTHING = True
RUN_INPUT_VARIANTS = True
RUN_SILVER = False


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # The CUDA generators are only touched when a CUDA device is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Seed all random number generators once with the notebook-wide seed.
set_seed(RANDOM_STATE)

# Report whether CUDA is available; this line only prints and does not select a device.
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")


def _fix_encoding(text):
    if not isinstance(text, str):
        return text
    try:
        if 'Ãƒ' in text:
            return text.encode('latin-1').decode('utf-8', errors='ignore')
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass
    return text


def deduplicate_label_df(label_df, max_per_class=500, random_state=42):
    """Drop duplicate (text, label) rows and optionally cap the rows per label.

    Texts are compared case-insensitively and with surrounding whitespace
    stripped. The input frame is not modified.

    Args:
        label_df: DataFrame with at least the columns ``text`` and ``label``.
        max_per_class: Maximum number of rows kept per label; ``None`` disables
            the cap.
        random_state: Seed used when a label has to be down-sampled.

    Returns:
        A new DataFrame without the helper column. When a cap is applied the
        index is reset and rows whose label is missing are dropped (they form
        no group).
    """
    deduped = label_df.copy()
    deduped['text_normalized'] = deduped['text'].str.lower().str.strip()
    original_count = len(deduped)
    deduped = deduped.drop_duplicates(subset=['text_normalized', 'label'])
    dedup_count = len(deduped)

    if max_per_class is not None:
        # Sample each label group explicitly instead of groupby.apply, whose
        # handling of the grouping column is deprecated in recent pandas.
        capped = [
            group.sample(min(len(group), max_per_class), random_state=random_state)
            for _, group in deduped.groupby('label')
        ]
        if capped:
            deduped = pd.concat(capped).reset_index(drop=True)
        else:
            # No label groups at all: keep the columns, return no rows.
            deduped = deduped.iloc[0:0]

    final_count = len(deduped)
    deduped = deduped.drop(columns=['text_normalized'])

    print(f"  Deduplication: {original_count} -> {dedup_count} (removed {original_count - dedup_count} duplicates)")
    if max_per_class is not None:
        print(f"  Capping: {dedup_count} -> {final_count} (max {max_per_class} per class)")

    return deduped


def load_linkedin_data(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed content."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)


def _first_nonempty(mapping, *keys):
    """Return the first truthy value stored under ``keys`` in ``mapping``, else ''."""
    for key in keys:
        value = mapping.get(key)
        if value:
            return value
    return ''


def prepare_dataset(cvs, include_history=False):
    """Build one row per CV from its first position with status ``'ACTIVE'``.

    Each CV may be a list of position dicts, a dict with a ``'positions'``
    list, or a single position dict. CVs without an active position are
    skipped. Missing or ``None`` titles and companies are treated as empty
    strings so they never end up as the text ``"None"``.

    Args:
        cvs: Iterable of CVs in one of the shapes described above.
        include_history: When true, add a ``history`` column with the titles
            of all non-active positions joined by ``' | '`` (empty titles are
            left out).

    Returns:
        DataFrame with the columns ``cv_id`` (position of the CV in ``cvs``),
        ``title``, ``company`` and ``text``; ``department`` / ``seniority``
        are copied when the active position carries them.
    """
    records = []

    for cv_idx, cv in enumerate(cvs):
        if isinstance(cv, list):
            positions = cv
        elif isinstance(cv, dict):
            positions = cv.get('positions')
            if positions is None:
                # A dict without 'positions' is treated as a single position.
                positions = [cv]
        else:
            positions = []

        # Ignore malformed entries instead of failing on ``.get``.
        positions = [p for p in positions if isinstance(p, dict)]

        active = next((p for p in positions if p.get('status') == 'ACTIVE'), None)
        if active is None:
            continue

        title = _first_nonempty(active, 'position', 'title')
        company = _first_nonempty(active, 'organization', 'companyName')

        record = {
            'cv_id': cv_idx,
            'title': title,
            'company': company,
            'text': f"{title} at {company}".strip() if company else title,
        }

        for label_key in ('department', 'seniority'):
            if label_key in active:
                record[label_key] = active[label_key]

        if include_history:
            past_titles = (
                str(_first_nonempty(p, 'position', 'title'))
                for p in positions
                if p.get('status') != 'ACTIVE'
            )
            record['history'] = ' | '.join(t for t in past_titles if t)

        records.append(record)

    return pd.DataFrame(records)


def load_label_lists(data_dir, fix_encoding=True, deduplicate=True, max_per_class=500):
    """Load the department and seniority lookup tables from ``data_dir``.

    Reads ``department-v2.csv`` and ``seniority-v2.csv``. Optionally runs
    ``_fix_encoding`` over the ``text`` column and ``deduplicate_label_df``
    over each table, passing ``max_per_class`` through.

    Returns:
        Tuple ``(department_df, seniority_df)``.
    """
    root = Path(data_dir)
    department_df, seniority_df = (
        pd.read_csv(root / filename, encoding='utf-8')
        for filename in ('department-v2.csv', 'seniority-v2.csv')
    )

    if fix_encoding:
        print('Applying encoding fix...')
        for frame in (department_df, seniority_df):
            frame['text'] = frame['text'].apply(_fix_encoding)

    if not deduplicate:
        return department_df, seniority_df

    print('Deduplicating department labels...')
    department_df = deduplicate_label_df(department_df, max_per_class)
    print('Deduplicating seniority labels...')
    seniority_df = deduplicate_label_df(seniority_df, max_per_class)
    return department_df, seniority_df


def load_evaluation_dataset(data_dir):
    """Load ``linkedin-cvs-annotated.json`` from ``data_dir`` and return it as a dataset frame.

    The file is parsed with ``load_linkedin_data`` and converted with
    ``prepare_dataset`` using its default arguments (no history column).
    """
    annotated_path = Path(data_dir) / 'linkedin-cvs-annotated.json'
    return prepare_dataset(load_linkedin_data(str(annotated_path)))


class JobTitleDataset(Dataset):
    """Dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy, optionally weighted per class."""

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Sequence of per-class weights, or None for the unweighted loss.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute cross-entropy on the model logits; extra keyword arguments are ignored."""
        # The labels are removed from ``inputs`` before the forward pass.
        targets = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        loss_options = {}
        if self.class_weights is not None:
            # Put the weights on the device and dtype of the logits.
            loss_options['weight'] = torch.tensor(
                self.class_weights, device=logits.device, dtype=logits.dtype
            )

        loss = CrossEntropyLoss(**loss_options)(logits, targets)
        if return_outputs:
            return loss, outputs
        return loss


class TransformerClassifier:
    """DistilBERT sequence classifier with small ``train`` / ``predict`` helpers.

    Tokenizer and model are loaded with the DistilBERT-specific classes, so
    ``model_name`` has to be a checkpoint those classes can load.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of output classes.
        id2label: Optional mapping from class id to label name.
        label2id: Optional mapping from label name to class id.
        max_length: Maximum token length used for truncation in ``train`` and
            ``predict``.
    """

    def __init__(self, model_name='distilbert-base-multilingual-cased', num_labels=2, id2label=None, label2id=None, max_length=128):
        self.model_name = model_name
        self.num_labels = num_labels
        self.id2label = id2label or {}
        self.label2id = label2id or {}
        self.max_length = max_length

        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def _encode(self, texts, labels):
        """Tokenize ``texts`` and wrap them with ``labels`` in a JobTitleDataset."""
        encodings = self.tokenizer(texts, truncation=True, padding=True, max_length=self.max_length)
        return JobTitleDataset(encodings, labels)

    def _balanced_class_weights(self, labels):
        """Return one 'balanced' weight per class id in ``range(num_labels)``.

        Classes that do not occur in ``labels`` get weight 1.0 so the vector
        always matches the size of the classification head.
        """
        from sklearn.utils.class_weight import compute_class_weight

        label_array = np.asarray(labels)
        present = np.unique(label_array)
        present_weights = compute_class_weight('balanced', classes=present, y=label_array)
        print(f"Using class weights: {dict(zip(present, present_weights))}")

        weights = np.ones(self.num_labels, dtype=float)
        weights[present] = present_weights
        return weights

    @staticmethod
    def _eval_strategy_key():
        """Return the TrainingArguments keyword for the evaluation strategy.

        Newer transformers releases call it ``eval_strategy``; older ones
        only know ``evaluation_strategy``.
        """
        import inspect

        params = inspect.signature(TrainingArguments.__init__).parameters
        return 'eval_strategy' if 'eval_strategy' in params else 'evaluation_strategy'

    def train(self, texts, labels, val_texts=None, val_labels=None, epochs=3, batch_size=16, learning_rate=1e-5, warmup_ratio=0.1, use_class_weights=False):
        """Fine-tune the model on ``texts`` / ``labels``.

        Args:
            texts: Training texts.
            labels: Integer class ids, one per text.
            val_texts: Optional validation texts; evaluated once per epoch
                when given together with ``val_labels``.
            val_labels: Optional validation class ids.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Learning rate passed to TrainingArguments.
            warmup_ratio: Warmup ratio passed to TrainingArguments.
            use_class_weights: Use a class-weighted cross-entropy loss.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )

        print(f"Training on {len(texts)} examples...")

        train_dataset = self._encode(texts, labels)

        if val_texts is not None and val_labels is not None:
            val_dataset = self._encode(list(val_texts), list(val_labels))
            eval_strategy = 'epoch'
        else:
            val_dataset = None
            eval_strategy = 'no'

        class_weights = self._balanced_class_weights(labels) if use_class_weights else None

        # Nothing is checkpointed, so a throw-away output directory is enough.
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = TrainingArguments(
                output_dir=tmp_dir,
                num_train_epochs=epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                warmup_ratio=warmup_ratio,
                weight_decay=0.01,
                save_strategy='no',
                logging_strategy='no',
                learning_rate=learning_rate,
                report_to='none',
                disable_tqdm=True,
                **{self._eval_strategy_key(): eval_strategy}
            )

            if class_weights is not None:
                trainer = WeightedTrainer(
                    class_weights=class_weights,
                    model=self.model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=val_dataset
                )
            else:
                trainer = Trainer(
                    model=self.model,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=val_dataset
                )

            trainer.train()

        print('Training complete!')

    def predict(self, texts, batch_size=32):
        """Return the predicted class id for every text, processed in batches."""
        texts = list(texts)
        self.model.eval()
        all_predictions = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = self.tokenizer(
                batch_texts, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt'
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.argmax(outputs.logits, dim=-1)
                all_predictions.extend(predictions.cpu().tolist())

        return all_predictions


## 2. Daten laden

Wir laden Lookup-Tabellen und die annotierten CVs. Fuer Input-Varianten nutzen wir
auch die CV-History.


In [None]:
# Lookup tables (training data); no per-class cap is applied here.
dept_df, sen_df = load_label_lists(
    DATA_DIR,
    fix_encoding=True,
    deduplicate=True,
    max_per_class=None
)

# Annotated CVs (evaluation data)
eval_df = load_evaluation_dataset(DATA_DIR)

# Full CVs including the position history for the input-variant experiments
cvs_annotated = load_linkedin_data(DATA_DIR / 'linkedin-cvs-annotated.json')
cv_df_full = prepare_dataset(cvs_annotated, include_history=True)

# Combine title and history; rows without history keep the bare title
history = cv_df_full['history'].fillna('')
cv_df_full['title_history'] = np.where(
    history != '',
    cv_df_full['title'].fillna('') + ' | ' + history,
    cv_df_full['title'].fillna('')
)

print(f"Department lookup: {len(dept_df):,} examples")
print(f"Seniority lookup:  {len(sen_df):,} examples")
print(f"Annotated CVs:     {len(eval_df):,} positions")


## 3. Hilfsfunktionen

Wir kapseln wiederholte Logik fuer Splits, Mappings, Training und Evaluation.


In [None]:
def build_label_maps(label_series):
    """Return ``(label2id, id2label)``; ids follow the sorted unique labels of ``label_series``."""
    ordered_labels = sorted(label_series.unique())
    id2label = dict(enumerate(ordered_labels))
    label2id = {label: idx for idx, label in id2label.items()}
    return label2id, id2label


def split_by_cv_id(df, train_size=0.7, val_size=0.15):
    """Split ``df`` into train/val/test frames so that no ``cv_id`` spans two splits.

    Args:
        df: DataFrame with a ``cv_id`` column.
        train_size: Fraction of the unique CV ids assigned to the train split.
        val_size: Fraction of the unique CV ids assigned to the validation
            split; the remainder becomes the test split.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of copies of the matching rows.

    Raises:
        ValueError: If the fractions are not in (0, 1) or leave nothing for
            the test split.
    """
    if not (0 < train_size < 1) or not (0 < val_size < 1) or train_size + val_size >= 1:
        raise ValueError(
            "train_size and val_size must be in (0, 1) and sum to less than 1, "
            f"got train_size={train_size}, val_size={val_size}"
        )

    cv_ids = df['cv_id'].unique()
    holdout_share = 1 - train_size
    train_ids, temp_ids = train_test_split(
        cv_ids, test_size=holdout_share, random_state=RANDOM_STATE
    )

    # Share of the hold-out ids that goes to the test split. Rounded so the
    # defaults give exactly 0.5 despite floating point noise.
    test_share = round(1 - val_size / holdout_share, 10)
    val_ids, test_ids = train_test_split(
        temp_ids, test_size=test_share, random_state=RANDOM_STATE
    )

    train_df = df[df['cv_id'].isin(train_ids)].copy()
    val_df = df[df['cv_id'].isin(val_ids)].copy()
    test_df = df[df['cv_id'].isin(test_ids)].copy()
    return train_df, val_df, test_df


def compute_metrics(true_ids, pred_ids, id2label):
    """Compute accuracy, macro precision/recall/F1, weighted F1 and per-class F1.

    Args:
        true_ids: Ground-truth class ids.
        pred_ids: Predicted class ids.
        id2label: Mapping from class id to label name, used as the key of
            the per-class F1 dict.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``f1_macro``, ``f1_weighted`` and ``per_class_f1``. Per-class scores
        cover the ids that occur in ``true_ids`` or ``pred_ids``.
    """
    def _prf(**options):
        # All calls share the inputs and map undefined scores to 0.
        return precision_recall_fscore_support(true_ids, pred_ids, zero_division=0, **options)

    macro_precision, macro_recall, macro_f1, _ = _prf(average='macro')
    weighted_f1 = _prf(average='weighted')[2]

    observed_ids = sorted(set(true_ids) | set(pred_ids))
    class_f1 = _prf(labels=observed_ids, average=None)[2]

    return {
        'accuracy': float(accuracy_score(true_ids, pred_ids)),
        'precision': float(macro_precision),
        'recall': float(macro_recall),
        'f1_macro': float(macro_f1),
        'f1_weighted': float(weighted_f1),
        'per_class_f1': {
            id2label[class_id]: float(score)
            for class_id, score in zip(observed_ids, class_f1)
        }
    }


def print_summary(name, results):
    """Print one summary line: in-distribution metrics and, when present, CV-test metrics."""
    in_dist = results['in_distribution']
    in_acc, in_f1 = in_dist['accuracy'], in_dist['f1_macro']

    cv_test = results.get('cv_test')
    if not cv_test:
        print(f"{name} | in-dist acc {in_acc:.4f} f1 {in_f1:.4f}")
        return

    rw_acc, rw_f1 = cv_test['accuracy'], cv_test['f1_macro']
    print(f"{name} | in-dist acc {in_acc:.4f} f1 {in_f1:.4f} | cv-test acc {rw_acc:.4f} f1 {rw_f1:.4f}")


## 4. Zwei-stufiges Fine-Tuning

Ziel: Erst auf Lookup-Tabellen trainieren (breite Pattern-Abdeckung),
danach auf einem CV-Train-Split feinjustieren (Domain Adaptation).


In [None]:
# Hyperparameters per stage: stage1 trains on the lookup tables, stage2
# continues on the CV train split with a smaller batch size and learning rate.
TWO_STAGE_CONFIG = dict(
    stage1=dict(
        epochs=2,
        batch_size=16,
        learning_rate=2e-5,
        warmup_ratio=0.1,
        use_class_weights=False,
    ),
    stage2=dict(
        epochs=2,
        batch_size=8,
        learning_rate=1e-5,
        warmup_ratio=0.1,
        use_class_weights=False,
    ),
)


def two_stage_finetune(task_name, lookup_df, cv_df, label_col, config, cv_text_col='title'):
    """Train on the lookup table first, then continue training on a CV train split.

    Args:
        task_name: Accepted for call-site readability; not used in the body.
        lookup_df: Lookup table with the columns ``text`` and ``label``.
        cv_df: CV frame with ``cv_id``, ``label_col`` and ``cv_text_col``.
        label_col: Name of the label column in ``cv_df``.
        config: Dict with the keys ``stage1`` and ``stage2``, each holding
            ``epochs``, ``batch_size``, ``learning_rate``, ``warmup_ratio``
            and ``use_class_weights``.
        cv_text_col: Column of ``cv_df`` used as model input in stage 2 and
            for the CV evaluation.

    Returns:
        Dict with ``in_distribution`` (lookup validation metrics after
        stage 1), ``cv_val`` and ``cv_test`` (metrics after stage 2) and
        ``cv_split_sizes``.
    """
    label2id, id2label = build_label_maps(lookup_df['label'])

    def to_ids(label_values):
        return [label2id[value] for value in label_values]

    def stage_kwargs(stage):
        # Pick exactly the hyperparameters that ``train`` is called with.
        settings = config[stage]
        return {
            key: settings[key]
            for key in ('epochs', 'batch_size', 'learning_rate', 'warmup_ratio', 'use_class_weights')
        }

    def texts_and_ids(frame):
        return frame[cv_text_col].fillna('').tolist(), to_ids(frame[label_col].tolist())

    # ---- Stage 1: lookup table with an 80/20 stratified split ----
    lookup_train_texts, lookup_val_texts, lookup_train_labels, lookup_val_labels = train_test_split(
        lookup_df['text'].tolist(),
        lookup_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=lookup_df['label']
    )
    lookup_train_ids = to_ids(lookup_train_labels)
    lookup_val_ids = to_ids(lookup_val_labels)

    clf = TransformerClassifier(
        model_name=BASE_MODEL,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )
    clf.train(
        texts=lookup_train_texts,
        labels=lookup_train_ids,
        val_texts=lookup_val_texts,
        val_labels=lookup_val_ids,
        **stage_kwargs('stage1')
    )
    stage1_metrics = compute_metrics(lookup_val_ids, clf.predict(lookup_val_texts), id2label)

    # ---- Stage 2: CV rows whose label is known to the lookup table ----
    labelled = cv_df[cv_df[label_col].notna()].copy()
    labelled = labelled[labelled[label_col].isin(label2id.keys())]
    cv_train, cv_val, cv_test = split_by_cv_id(labelled)

    cv_train_texts, cv_train_labels = texts_and_ids(cv_train)
    cv_val_texts, cv_val_labels = texts_and_ids(cv_val)

    clf.train(
        texts=cv_train_texts,
        labels=cv_train_labels,
        val_texts=cv_val_texts,
        val_labels=cv_val_labels,
        **stage_kwargs('stage2')
    )
    stage2_metrics = compute_metrics(cv_val_labels, clf.predict(cv_val_texts), id2label)

    # ---- Final evaluation on the held-out CV test split ----
    cv_test_texts, cv_test_labels = texts_and_ids(cv_test)
    cv_test_metrics = compute_metrics(cv_test_labels, clf.predict(cv_test_texts), id2label)

    return {
        'in_distribution': stage1_metrics,
        'cv_val': stage2_metrics,
        'cv_test': cv_test_metrics,
        'cv_split_sizes': {
            'train': len(cv_train),
            'val': len(cv_val),
            'test': len(cv_test)
        }
    }


# Run the two-stage experiment for both tasks unless it is switched off.
if not RUN_TWO_STAGE:
    two_stage_results = None
else:
    two_stage_results = {}

    # Department task, CV input = title only
    two_stage_results['department'] = two_stage_finetune(
        task_name='department', lookup_df=dept_df, cv_df=cv_df_full,
        label_col='department', config=TWO_STAGE_CONFIG, cv_text_col='title'
    )
    print_summary('Two-stage Department', two_stage_results['department'])

    # Seniority task, CV input = title only
    two_stage_results['seniority'] = two_stage_finetune(
        task_name='seniority', lookup_df=sen_df, cv_df=cv_df_full,
        label_col='seniority', config=TWO_STAGE_CONFIG, cv_text_col='title'
    )
    print_summary('Two-stage Seniority', two_stage_results['seniority'])


## 5. Backbone Vergleich

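Ziel: pruefen, ob ein anderes Modell (z.B. XLM-R) bessere Generalisierung liefert.
Wir trainieren nur auf Lookup-Tabellen und evaluieren auf einem CV-Testsplit.
Hinweis: TransformerClassifier laedt Tokenizer und Modell ueber die DistilBERT-Klassen.
In BACKBONES koennen daher aktuell nur DistilBERT-kompatible Checkpoints stehen; fuer
XLM-R muesste der Classifier zuerst auf modellunabhaengige Klassen umgestellt werden.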


In [None]:
# Checkpoints to compare. TransformerClassifier loads them with the
# DistilBERT classes, so entries must be loadable by those classes.
BACKBONES = [BASE_MODEL]

# Hyperparameters shared by all backbone runs.
BACKBONE_CONFIG = dict(
    epochs=2,
    batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    use_class_weights=False,
)

MAX_SAMPLES = 2500  # optional: reduce for faster runs


def sample_df(df, max_samples=None):
    """Return ``df`` itself if it has at most ``max_samples`` rows, else a seeded random subset.

    With ``max_samples=None`` no sampling happens. A sampled frame has a
    fresh 0..n-1 index.
    """
    needs_sampling = max_samples is not None and len(df) > max_samples
    if not needs_sampling:
        return df
    subset = df.sample(max_samples, random_state=RANDOM_STATE)
    return subset.reset_index(drop=True)


def train_lookup_only(task_name, lookup_df, cv_df, label_col, model_name, config, max_samples=None, cv_text_col='title'):
    """Train on the lookup table only and evaluate on the CV test split.

    Args:
        task_name: Accepted for call-site readability; not used in the body.
        lookup_df: Lookup table with the columns ``text`` and ``label``.
        cv_df: CV frame with ``cv_id``, ``label_col`` and ``cv_text_col``.
        label_col: Name of the label column in ``cv_df``.
        model_name: Checkpoint passed to ``TransformerClassifier``.
        config: Dict with ``epochs``, ``batch_size``, ``learning_rate``,
            ``warmup_ratio`` and ``use_class_weights``.
        max_samples: Optional upper bound on the lookup rows used.
        cv_text_col: Column of ``cv_df`` used as model input for the CV
            evaluation.

    Returns:
        Dict with ``in_distribution`` (lookup validation) and ``cv_test``
        metrics.
    """
    # Build the label maps before sampling so that ids stay stable even if
    # the sample happens to miss a rare class.
    label2id, id2label = build_label_maps(lookup_df['label'])
    lookup_df = sample_df(lookup_df, max_samples=max_samples)

    # A stratified split needs at least two rows per class; random
    # subsampling can violate that, so fall back to a plain split then.
    label_counts = lookup_df['label'].value_counts()
    can_stratify = len(label_counts) > 0 and label_counts.min() >= 2
    if not can_stratify:
        print('Warning: a class has fewer than 2 examples; using a non-stratified split.')

    X_train, X_val, y_train, y_val = train_test_split(
        lookup_df['text'].tolist(),
        lookup_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=lookup_df['label'] if can_stratify else None
    )

    y_train_ids = [label2id[l] for l in y_train]
    y_val_ids = [label2id[l] for l in y_val]

    clf = TransformerClassifier(
        model_name=model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    clf.train(
        texts=X_train,
        labels=y_train_ids,
        val_texts=X_val,
        val_labels=y_val_ids,
        epochs=config['epochs'],
        batch_size=config['batch_size'],
        learning_rate=config['learning_rate'],
        warmup_ratio=config['warmup_ratio'],
        use_class_weights=config['use_class_weights']
    )

    in_dist_pred = clf.predict(X_val)
    in_dist_metrics = compute_metrics(y_val_ids, in_dist_pred, id2label)

    # Only CV rows whose label is known to the lookup table can be scored.
    cv_task = cv_df[cv_df[label_col].notna()].copy()
    cv_task = cv_task[cv_task[label_col].isin(label2id.keys())]
    _, _, cv_test = split_by_cv_id(cv_task)

    cv_test_texts = cv_test[cv_text_col].fillna('').tolist()
    cv_test_labels = [label2id[l] for l in cv_test[label_col].tolist()]

    cv_test_pred = clf.predict(cv_test_texts)
    cv_test_metrics = compute_metrics(cv_test_labels, cv_test_pred, id2label)

    return {
        'in_distribution': in_dist_metrics,
        'cv_test': cv_test_metrics
    }


# Train and evaluate every configured backbone on both tasks.
if RUN_BACKBONE_TESTS:
    backbone_results = {
        'department': {},
        'seniority': {}
    }

    for model_name in BACKBONES:
        # The leading newline is written as an escape; a raw line break
        # inside the string literal is a syntax error.
        print(f"\nBackbone: {model_name}")

        backbone_results['department'][model_name] = train_lookup_only(
            task_name='department',
            lookup_df=dept_df,
            cv_df=cv_df_full,
            label_col='department',
            model_name=model_name,
            config=BACKBONE_CONFIG,
            max_samples=MAX_SAMPLES
        )
        print_summary(f"Dept {model_name}", backbone_results['department'][model_name])

        backbone_results['seniority'][model_name] = train_lookup_only(
            task_name='seniority',
            lookup_df=sen_df,
            cv_df=cv_df_full,
            label_col='seniority',
            model_name=model_name,
            config=BACKBONE_CONFIG,
            max_samples=MAX_SAMPLES
        )
        print_summary(f"Sen {model_name}", backbone_results['seniority'][model_name])
else:
    backbone_results = None


## 6. Label Smoothing und max_length

Ziel: pruefen, ob Label Smoothing die Generalisierung verbessert, und ob eine
andere Token-Laenge (64/128/256) sinnvoll ist. Dieses Experiment nutzt eine eigene
Trainingsfunktion (mit dem Standard-Trainer von transformers), weil
TransformerClassifier.train() keinen max_length-Parameter hat und standardmaessig
mit 128 Tokens arbeitet.


In [None]:
class SimpleDataset(Dataset):
    """Dataset returning the tokenizer fields of one example plus its label."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example ``idx`` to a tensor.
        fields = {name: torch.tensor(column[idx]) for name, column in self.encodings.items()}
        return {**fields, 'labels': torch.tensor(self.labels[idx])}


def train_with_label_smoothing(
    lookup_df,
    cv_df,
    label_col,
    model_name,
    max_length=128,
    label_smoothing=0.1,
    epochs=2,
    batch_size=16
):
    """Train on the lookup table with label smoothing and a chosen token length.

    Args:
        lookup_df: Lookup table with the columns ``text`` and ``label``.
        cv_df: CV frame with ``cv_id``, ``title`` and ``label_col``.
        label_col: Name of the label column in ``cv_df``.
        model_name: DistilBERT checkpoint to load.
        max_length: Maximum token length used for truncation.
        label_smoothing: Value passed as ``label_smoothing_factor``.
        epochs: Number of training epochs.
        batch_size: Batch size for training, evaluation and CV inference.

    Returns:
        Dict with ``in_distribution`` and ``cv_test`` metrics plus the
        ``max_length`` and ``label_smoothing`` used. Learning rate and the
        other optimiser settings are left at the TrainingArguments defaults.
    """
    import inspect

    label2id, id2label = build_label_maps(lookup_df['label'])

    X_train, X_val, y_train, y_val = train_test_split(
        lookup_df['text'].tolist(),
        lookup_df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=lookup_df['label']
    )

    y_train_ids = [label2id[l] for l in y_train]
    y_val_ids = [label2id[l] for l in y_val]

    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
    model = DistilBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    train_enc = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    val_enc = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)

    train_ds = SimpleDataset(train_enc, y_train_ids)
    val_ds = SimpleDataset(val_enc, y_val_ids)

    # Newer transformers releases name the keyword ``eval_strategy``; older
    # ones only accept ``evaluation_strategy``.
    ta_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = 'eval_strategy' if 'eval_strategy' in ta_params else 'evaluation_strategy'

    with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
            output_dir=tmp_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            save_strategy='no',
            logging_strategy='no',
            report_to='none',
            disable_tqdm=True,
            label_smoothing_factor=label_smoothing,
            **{strategy_key: 'epoch'}
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds
        )
        trainer.train()

        preds = trainer.predict(val_ds).predictions

    val_pred_ids = np.argmax(preds, axis=1).tolist()
    in_dist = compute_metrics(y_val_ids, val_pred_ids, id2label)

    cv_task = cv_df[cv_df[label_col].notna()].copy()
    cv_task = cv_task[cv_task[label_col].isin(label2id.keys())]
    _, _, cv_test = split_by_cv_id(cv_task)

    cv_texts = cv_test['title'].fillna('').tolist()
    cv_labels = [label2id[l] for l in cv_test[label_col].tolist()]

    # The Trainer may have moved the model to an accelerator, so the inputs
    # must go to the same device. Batching keeps memory use bounded.
    device = next(model.parameters()).device
    model.eval()
    cv_pred_ids = []
    with torch.no_grad():
        for start in range(0, len(cv_texts), batch_size):
            batch_texts = cv_texts[start:start + batch_size]
            cv_enc = tokenizer(
                batch_texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt'
            ).to(device)
            outputs = model(**cv_enc)
            cv_pred_ids.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())

    cv_test_metrics = compute_metrics(cv_labels, cv_pred_ids, id2label)

    return {
        'in_distribution': in_dist,
        'cv_test': cv_test_metrics,
        'max_length': max_length,
        'label_smoothing': label_smoothing
    }


# Grid over token length and label smoothing, run once per task.
if not RUN_LABEL_SMOOTHING:
    ls_results = None
else:
    ls_results = {'department': [], 'seniority': []}

    MAX_LENGTHS = [64, 128, 256]
    SMOOTHING = [0.0, 0.1]

    # Department grid
    for max_len in MAX_LENGTHS:
        for ls in SMOOTHING:
            print(f"Label smoothing run: max_len={max_len}, ls={ls}")
            res = train_with_label_smoothing(
                lookup_df=dept_df, cv_df=cv_df_full, label_col='department',
                model_name=BASE_MODEL, max_length=max_len, label_smoothing=ls,
                epochs=2, batch_size=16
            )
            ls_results['department'].append(res)

    # Seniority grid
    for max_len in MAX_LENGTHS:
        for ls in SMOOTHING:
            print(f"Label smoothing run: max_len={max_len}, ls={ls}")
            res = train_with_label_smoothing(
                lookup_df=sen_df, cv_df=cv_df_full, label_col='seniority',
                model_name=BASE_MODEL, max_length=max_len, label_smoothing=ls,
                epochs=2, batch_size=16
            )
            ls_results['seniority'].append(res)


## 7. Input Varianten (title vs text vs title+history)

Ziel: testen, ob mehr Kontext (company oder history) die Performance verbessert.
Wir nutzen das zwei-stufige Setup und variieren die Eingabe fuer die CV-Phase.


In [None]:
# Repeat the two-stage setup once per CV input column and task.
if RUN_INPUT_VARIANTS:
    # Variant name -> column of cv_df_full used as CV input.
    input_variants = {
        'title': 'title',
        'text': 'text',
        'title_history': 'title_history'
    }

    input_variant_results = {
        'department': {},
        'seniority': {}
    }

    for name, col in input_variants.items():
        # The leading newline is written as an escape; a raw line break
        # inside the string literal is a syntax error.
        print(f"\nInput variant: {name}")

        input_variant_results['department'][name] = two_stage_finetune(
            task_name='department',
            lookup_df=dept_df,
            cv_df=cv_df_full,
            label_col='department',
            config=TWO_STAGE_CONFIG,
            cv_text_col=col
        )
        print_summary(f"Department {name}", input_variant_results['department'][name])

        input_variant_results['seniority'][name] = two_stage_finetune(
            task_name='seniority',
            lookup_df=sen_df,
            cv_df=cv_df_full,
            label_col='seniority',
            config=TWO_STAGE_CONFIG,
            cv_text_col=col
        )
        print_summary(f"Seniority {name}", input_variant_results['seniority'][name])
else:
    input_variant_results = None


## 8. Silver Data Augmentation (optional)

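Ziel: mehr Trainingsdaten durch Pseudo-Labels. Diese Sektion ist dafuer vorgesehen, optional
Silver Labels mit der Embedding-Baseline zu erzeugen und danach DistilBERT zu trainieren.
Aktuell ist sie nur ein Platzhalter: die Zelle setzt lediglich silver_results = None.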

Standardmaessig ist RUN_SILVER = False, da es laenger dauert.


In [None]:
# Placeholder: no silver-label experiment is run in this cell and RUN_SILVER
# is not consulted here; the name only exists so the results dict can refer to it.
silver_results = None


## 9. Ergebnisse speichern

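Wir sammeln alle Resultate im Dictionary all_results fuer spaetere Vergleiche.
Hinweis: Die Zelle schreibt noch keine JSON-Datei; zum Persistieren muss all_results
anschliessend z.B. mit json.dump gespeichert werden.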


In [None]:
# Collect the results of all experiment sections in one dict. Sections that
# were switched off contribute None. Nothing is written to disk here.
all_results = dict(
    approach='DistilBERT Improvements',
    two_stage=two_stage_results,
    backbone=backbone_results,
    label_smoothing=ls_results,
    input_variants=input_variant_results,
    silver=silver_results,
    metadata=dict(
        base_model=BASE_MODEL,
        random_state=RANDOM_STATE,
    ),
    timestamp=datetime.now().isoformat(),
)

print('All results available in all_results.')
