# DistilBERT Approaches Comparison - Complete Training & Analysis

This notebook consolidates all 5 DistilBERT experimental approaches into a single executable notebook.
Run this to reproduce all experiments and compare results.

## Contents
1. Setup & Data Loading (shared)
2. **Approach 1**: Baseline (standard fine-tuning)
3. **Approach 2**: Class Balancing (weighted loss)
4. **Approach 3**: Oversampling
5. **Approach 4**: Combined (weights + oversampling)
6. **Approach 5**: Two-Stage (hierarchical)
7. Final Comparison & Analysis

## 1. Setup & Data Loading

In [15]:
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
_cuda_available = torch.cuda.is_available()
print(f"CUDA available: {_cuda_available}")
if _cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA GeForce RTX 3080 Laptop GPU


In [16]:
# Custom oversampling (no imblearn dependency needed)
def oversample_to_median(texts, labels, random_state=42):
    """Oversample minority classes up to the median class size.

    Every class with fewer samples than the median class count gets extra
    samples drawn (with replacement) from its own members until it reaches
    the median. Classes at or above the median are left unchanged. The
    combined result is shuffled before being returned.

    Args:
        texts: Sequence of input texts.
        labels: Sequence of class labels, aligned one-to-one with ``texts``.
        random_state: Seed for the private random generator used for both
            the resampling and the final shuffle.

    Returns:
        A ``(texts, labels)`` tuple of two plain Python lists of equal length.
        Two empty lists are returned for empty input.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """
    # A private generator keeps the call reproducible without reseeding
    # NumPy's global RNG, which would silently affect later cells.
    rng = np.random.RandomState(random_state)
    texts = np.asarray(texts)
    labels = np.asarray(labels)

    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )
    if len(labels) == 0:
        # Nothing to resample; the median of zero classes is undefined.
        return [], []

    unique_classes, counts = np.unique(labels, return_counts=True)
    median_count = int(np.median(counts))
    print(f"Target median count: {median_count}")

    texts_resampled, labels_resampled = [], []
    for cls in unique_classes:
        cls_indices = np.flatnonzero(labels == cls)
        n_to_add = median_count - len(cls_indices)
        if n_to_add > 0:
            # Duplicate randomly chosen members of this class to fill the gap.
            additional_indices = rng.choice(cls_indices, size=n_to_add, replace=True)
            cls_indices = np.concatenate([cls_indices, additional_indices])
        texts_resampled.extend(texts[cls_indices].tolist())
        labels_resampled.extend(labels[cls_indices].tolist())

    # Shuffle text/label pairs together so the classes are interleaved.
    combined = list(zip(texts_resampled, labels_resampled))
    rng.shuffle(combined)
    texts_resampled, labels_resampled = zip(*combined)
    print(f"After oversampling: {len(labels_resampled)} samples")
    return list(texts_resampled), list(labels_resampled)

In [17]:
# Paths - update these to match your setup
# DEPT_CSV / SEN_CSV are the training tables read with pd.read_csv (their
# 'text' and 'label' columns are used below); CV_ANN is the annotated-CV JSON
# used for the final evaluation.
DEPT_CSV = "../data/department-v2.csv"
SEN_CSV = "../data/seniority-v2.csv"
CV_ANN = "../data/linkedin-cvs-annotated.json"

# Training output directory (keeps notebooks folder clean)
# Each training run writes its checkpoints to a sub-directory of this path.
TRAINING_OUTPUT_DIR = "./results/distilbert_training"
os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)

# Checkpoint name passed to AutoTokenizer / AutoModelForSequenceClassification.
MODEL_NAME = "distilbert-base-multilingual-cased"
# Passed as max_length to the tokenizer (with truncation=True).
MAX_LEN = 64
# Shared seed for the train/val split, oversampling and TrainingArguments.
SEED = 42

In [18]:
# Load training data (lookup tables)
dept_df, sen_df = (pd.read_csv(csv_path) for csv_path in (DEPT_CSV, SEN_CSV))

# Row count and number of distinct labels per task.
print(f"Department training data: {len(dept_df)} rows, {dept_df['label'].nunique()} classes")
print(f"Seniority training data: {len(sen_df)} rows, {sen_df['label'].nunique()} classes")

# Per-class row counts, to make the class imbalance visible.
print("\nDepartment class distribution:", dept_df['label'].value_counts(), sep="\n")
print("\nSeniority class distribution:", sen_df['label'].value_counts(), sep="\n")

Department training data: 10145 rows, 11 classes
Seniority training data: 9428 rows, 5 classes

Department class distribution:
label
Marketing                 4295
Sales                     3328
Information Technology    1305
Business Development       620
Project Management         201
Consulting                 167
Administrative              83
Other                       42
Purchasing                  40
Customer Support            33
Human Resources             31
Name: count, dtype: int64

Seniority class distribution:
label
Senior        3733
Lead          3546
Director       984
Management     756
Junior         409
Name: count, dtype: int64


In [19]:
# Load evaluation data (annotated CVs)
with open(CV_ANN, 'r', encoding='utf-8') as f:
    ann = json.load(f)

# Flatten the per-CV lists into one list of position records.
positions = []
for cv in ann:
    positions.extend(cv)

# Keep only positions whose (upper-cased) status is ACTIVE.
eval_df = (
    pd.DataFrame(positions)
    .assign(status=lambda frame: frame['status'].astype(str).str.upper())
    .loc[lambda frame: frame['status'] == 'ACTIVE']
    .copy()
)

# Whitespace-stripped string versions of the text and label columns.
eval_df = eval_df.assign(
    title=eval_df['position'].astype(str).str.strip(),
    department=eval_df['department'].astype(str).str.strip(),
    seniority=eval_df['seniority'].astype(str).str.strip(),
)

print(f"Eval data: {len(eval_df)} active positions")

Eval data: 623 active positions


## Helper Functions

In [20]:
def compute_metrics(eval_pred):
    """Compute validation metrics for the Hugging Face Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with 'accuracy', 'f1_macro' and 'f1_weighted'.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 matches evaluate_model: a class with no predictions
    # scores 0 (the same value as the default) without emitting a warning.
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
        'f1_weighted': f1_score(labels, preds, average='weighted', zero_division=0)
    }

def evaluate_model(trainer, eval_df, label_col, text_col, label_encoder, task_name):
    """Evaluate a trained model on the rows of ``eval_df`` with known labels.

    Rows whose ``label_col`` value is not one of ``label_encoder.classes_``
    are dropped before evaluation. The remaining texts are tokenized
    (truncated to MAX_LEN), predicted with ``trainer.predict``, and scored
    against the string labels.

    Args:
        trainer: Trained Trainer; its tokenizer is reused for the eval texts.
        eval_df: DataFrame holding the evaluation rows.
        label_col: Name of the column with the gold labels.
        text_col: Name of the column with the input text.
        label_encoder: Fitted LabelEncoder mapping labels to class ids.
        task_name: Title used in the printed report.

    Returns:
        Dict with 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro'
        and 'f1_weighted'.
    """
    eval_use = eval_df[eval_df[label_col].isin(set(label_encoder.classes_))].copy()
    print(f"Eval samples after filtering: {len(eval_use)}")

    y_eval = label_encoder.transform(eval_use[label_col].astype(str))
    # Trainer.tokenizer is deprecated in favour of Trainer.processing_class;
    # prefer the new attribute and fall back for versions that lack it.
    tokenizer = getattr(trainer, 'processing_class', None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer
    eval_ds = Dataset.from_dict({'text': eval_use[text_col].astype(str).tolist(), 'labels': y_eval.tolist()})

    def tok(batch):
        return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)
    eval_ds = eval_ds.map(tok, batched=True)

    pred = trainer.predict(eval_ds)
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_labels = label_encoder.inverse_transform(pred_ids)

    # Score on string labels so the classification report shows class names.
    y_true = eval_use[label_col].astype(str).values
    y_pred = pred_labels.astype(str)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_m = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_w = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"\n=== {task_name} ===")
    print(f"Accuracy       : {acc:.4f}")
    print(f"Macro Precision: {prec:.4f}")
    print(f"Macro Recall   : {rec:.4f}")
    print(f"Macro F1       : {f1_m:.4f}")
    print(f"Weighted F1    : {f1_w:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    return {'accuracy': acc, 'precision_macro': prec, 'recall_macro': rec, 'f1_macro': f1_m, 'f1_weighted': f1_w}

In [21]:
# Weighted Trainer for class balancing
class WeightedTrainer(Trainer):
    """Trainer that computes cross-entropy loss with optional per-class weights."""

    def __init__(self, class_weights=None, *args, **kwargs):
        # Keep the weights on the instance; everything else goes to Trainer.
        self.class_weights = class_weights
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the (optionally class-weighted) cross-entropy loss for a batch."""
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Move the weights to the logits' device; None means unweighted loss.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss = nn.functional.cross_entropy(logits, labels, weight=weight)

        if return_outputs:
            return loss, outputs
        return loss

def compute_class_weights(y_int, num_classes):
    """Return per-class weights ``total / (num_classes * count)``.

    Classes with a zero count are treated as having count 1 in the
    denominator so the division is always defined.
    """
    per_class = np.bincount(y_int, minlength=num_classes)
    safe_counts = np.maximum(per_class, 1)
    return per_class.sum() / (num_classes * safe_counts)

In [22]:
# Initialize tokenizer & results storage
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize the 'text' field of a batch, truncating to MAX_LEN tokens."""
    texts = batch['text']
    return tokenizer(texts, max_length=MAX_LEN, truncation=True)


# One dict per (approach, task) evaluation is appended here by later cells.
all_results = list()

---
## Approach 1: Baseline (Standard Fine-Tuning)

No class balancing - just standard DistilBERT fine-tuning.

In [23]:
print("=" * 60, "APPROACH 1: BASELINE", "=" * 60, sep="\n")

# Integer-encode the string labels; one encoder per task.
le_dept, le_sen = LabelEncoder(), LabelEncoder()
dept_df['y'] = le_dept.fit_transform(dept_df['label'].astype(str))
sen_df['y'] = le_sen.fit_transform(sen_df['label'].astype(str))

# 80/20 train/validation split, stratified on the encoded label.
train_dept, val_dept = train_test_split(
    dept_df,
    test_size=0.2,
    stratify=dept_df['y'],
    random_state=SEED,
)
train_sen, val_sen = train_test_split(
    sen_df,
    test_size=0.2,
    stratify=sen_df['y'],
    random_state=SEED,
)

print(f"Department: {len(train_dept)} train, {len(val_dept)} val")
print(f"Seniority: {len(train_sen)} train, {len(val_sen)} val")

APPROACH 1: BASELINE
Department: 8116 train, 2029 val
Seniority: 7542 train, 1886 val


In [24]:
# Department - Baseline
# Tokenized train/validation datasets built from the stratified split.
train_ds = Dataset.from_dict({'text': train_dept['text'].tolist(), 'labels': train_dept['y'].tolist()}).map(tokenize, batched=True)
val_ds = Dataset.from_dict({'text': val_dept['text'].tolist(), 'labels': val_dept['y'].tolist()}).map(tokenize, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/baseline_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the old keyword emits a FutureWarning on the version used for this run).
trainer_baseline_dept = Trainer(
    model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Baseline...")
trainer_baseline_dept.train()
# Score on the annotated CV positions and record the result for the comparison.
results = evaluate_model(trainer_baseline_dept, eval_df, 'department', 'title', le_dept, "Department - Baseline")
all_results.append({'approach': 'Baseline', 'task': 'Department', **results})

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_baseline_dept = Trainer(


Training Department - Baseline...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.136889,0.976343,0.63351,0.971069
2,No log,0.062879,0.987679,0.875362,0.98703
3,No log,0.032589,0.994579,0.958778,0.994688
4,0.217300,0.021749,0.996057,0.980562,0.99605
5,0.217300,0.020988,0.99655,0.974363,0.996707
6,0.217300,0.019153,0.996057,0.983336,0.996123
7,0.217300,0.02063,0.995071,0.979358,0.995139
8,0.006500,0.018428,0.99655,0.974296,0.996788
9,0.006500,0.01675,0.99655,0.978834,0.996637


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Baseline ===
Accuracy       : 0.2761
Macro Precision: 0.4220
Macro Recall   : 0.4629
Macro F1       : 0.3236
Weighted F1    : 0.2183

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0521    0.3571    0.0909        14
  Business Development     0.3158    0.3000    0.3077        20
            Consulting     0.5000    0.5128    0.5063        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.2083    0.6250    0.3125        16
Information Technology     0.3364    0.5968    0.4302        62
             Marketing     0.1923    0.4545    0.2703        22
                 Other     0.8333    0.0291    0.0562       344
    Project Management     0.2317    0.9744    0.3744        39
            Purchasing     0.1489    0.4667    0.2258        15
                 Sales     0.8235    0.6087    0.7000        46

              accuracy                         0.2761  

In [25]:
# Seniority - Baseline
# Tokenized train/validation datasets built from the stratified split.
train_ds_sen = Dataset.from_dict({'text': train_sen['text'].tolist(), 'labels': train_sen['y'].tolist()}).map(tokenize, batched=True)
val_ds_sen = Dataset.from_dict({'text': val_sen['text'].tolist(), 'labels': val_sen['y'].tolist()}).map(tokenize, batched=True)

model_sen = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_sen.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_sen = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/baseline_sen",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the old keyword emits a FutureWarning on the version used for this run).
trainer_baseline_sen = Trainer(
    model=model_sen, args=args_sen, train_dataset=train_ds_sen, eval_dataset=val_ds_sen,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Seniority - Baseline...")
trainer_baseline_sen.train()
# Score on the annotated CV positions and record the result for the comparison.
results = evaluate_model(trainer_baseline_sen, eval_df, 'seniority', 'title', le_sen, "Seniority - Baseline")
all_results.append({'approach': 'Baseline', 'task': 'Seniority', **results})

Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_baseline_sen = Trainer(


Training Seniority - Baseline...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.105367,0.970308,0.914531,0.968503
2,No log,0.030951,0.992047,0.986548,0.992023
3,No log,0.023537,0.994168,0.989964,0.994141
4,No log,0.021124,0.996288,0.992713,0.996265
5,0.146200,0.022638,0.995228,0.991796,0.99521
6,0.146200,0.025926,0.994698,0.991521,0.99468
7,0.146200,0.019121,0.996288,0.99271,0.996269


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 407


Map:   0%|          | 0/407 [00:00<?, ? examples/s]


=== Seniority - Baseline ===
Accuracy       : 0.6953
Macro Precision: 0.6005
Macro Recall   : 0.7095
Macro F1       : 0.6053
Weighted F1    : 0.7182

Classification Report:
              precision    recall  f1-score   support

    Director     0.5455    0.8824    0.6742        34
      Junior     0.2000    0.5000    0.2857        12
        Lead     0.9459    0.5600    0.7035       125
  Management     0.8961    0.7188    0.7977       192
      Senior     0.4149    0.8864    0.5652        44

    accuracy                         0.6953       407
   macro avg     0.6005    0.7095    0.6053       407
weighted avg     0.8096    0.6953    0.7182       407



---
## Approach 2: Class Balancing (Weighted Loss)

In [26]:
print("=" * 60, "APPROACH 2: CLASS BALANCING", "=" * 60, sep="\n")

# Per-class loss weights derived from the department training split.
weights_dept = compute_class_weights(
    train_dept['y'].values,
    len(le_dept.classes_),
)
weights_dept_tensor = torch.as_tensor(weights_dept, dtype=torch.float)

print("Department class weights:")
for cls, w in zip(le_dept.classes_, weights_dept):
    print(f"  {cls}: {w:.3f}")

APPROACH 2: CLASS BALANCING
Department class weights:
  Administrative: 11.179
  Business Development: 1.488
  Consulting: 5.506
  Customer Support: 28.378
  Human Resources: 29.513
  Information Technology: 0.707
  Marketing: 0.215
  Other: 21.701
  Project Management: 4.583
  Purchasing: 23.057
  Sales: 0.277


In [27]:
# Department - Class Balancing
model_weighted = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_weighted = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/weighted_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# Same data as the baseline; only the loss changes (class-weighted cross-entropy).
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_weighted_dept = WeightedTrainer(
    class_weights=weights_dept_tensor,
    model=model_weighted, args=args_weighted, train_dataset=train_ds, eval_dataset=val_ds,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Class Balancing...")
trainer_weighted_dept.train()
# Score on the annotated CV positions and record the result for the comparison.
results = evaluate_model(trainer_weighted_dept, eval_df, 'department', 'title', le_dept, "Department - Class Balancing")
all_results.append({'approach': 'Class Balancing', 'task': 'Department', **results})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Training Department - Class Balancing...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.819153,0.936422,0.703519,0.943837
2,No log,0.121944,0.986693,0.952275,0.987137


KeyboardInterrupt: 

---
## Approach 3: Oversampling (BEST)

In [None]:
print("=" * 60, "APPROACH 3: OVERSAMPLING", "=" * 60, sep="\n")

# Oversample only the training split; validation data is left untouched.
texts_os, labels_os = oversample_to_median(
    train_dept['text'].tolist(),
    train_dept['y'].values,
    random_state=SEED,
)
train_ds_os = Dataset.from_dict(dict(text=texts_os, labels=labels_os)).map(tokenize, batched=True)

In [None]:
# Department - Oversampling
model_os = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_os = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/oversampling_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# Trains on the oversampled set but validates on the original validation split.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_os_dept = Trainer(
    model=model_os, args=args_os, train_dataset=train_ds_os, eval_dataset=val_ds,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Oversampling...")
trainer_os_dept.train()
# Score on the annotated CV positions and record the result for the comparison.
results = evaluate_model(trainer_os_dept, eval_df, 'department', 'title', le_dept, "Department - Oversampling")
all_results.append({'approach': 'Oversampling', 'task': 'Department', **results})

In [None]:
# Seniority - Oversampling
# Oversample only the training split; validation data is left untouched.
texts_os_sen, labels_os_sen = oversample_to_median(train_sen['text'].tolist(), train_sen['y'].values, random_state=SEED)
train_ds_os_sen = Dataset.from_dict({'text': texts_os_sen, 'labels': labels_os_sen}).map(tokenize, batched=True)

model_os_sen = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_sen.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_os_sen = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/oversampling_sen",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_os_sen = Trainer(
    model=model_os_sen, args=args_os_sen, train_dataset=train_ds_os_sen, eval_dataset=val_ds_sen,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Seniority - Oversampling...")
trainer_os_sen.train()
# Score on the annotated CV positions and record the result for the comparison.
results = evaluate_model(trainer_os_sen, eval_df, 'seniority', 'title', le_sen, "Seniority - Oversampling")
all_results.append({'approach': 'Oversampling', 'task': 'Seniority', **results})

---
## Approach 4: Combined (Weights + Oversampling)

In [None]:
print("=" * 60)
print("APPROACH 4: COMBINED (Weights + Oversampling)")
print("=" * 60)

# Class weights are recomputed from the oversampled label distribution.
weights_combined = compute_class_weights(np.array(labels_os), len(le_dept.classes_))
weights_combined_tensor = torch.tensor(weights_combined, dtype=torch.float)

model_combined = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_combined = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/combined_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# Weighted loss on top of the oversampled training set.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_combined_dept = WeightedTrainer(
    class_weights=weights_combined_tensor,
    model=model_combined, args=args_combined, train_dataset=train_ds_os, eval_dataset=val_ds,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Combined...")
trainer_combined_dept.train()
# Score on the annotated CV positions and record the result for the comparison.
results = evaluate_model(trainer_combined_dept, eval_df, 'department', 'title', le_dept, "Department - Combined")
all_results.append({'approach': 'Combined', 'task': 'Department', **results})

---
## Approach 5: Two-Stage Classification

In [None]:
print("=" * 60, "APPROACH 5: TWO-STAGE", "=" * 60, sep="\n")

# Stage 1: Binary (Other vs Not-Other)
# is_other is 1 for rows labelled 'Other' and 0 for every other department.
train_dept_stage1 = train_dept.assign(is_other=train_dept['label'].eq('Other').astype(int))
val_dept_stage1 = val_dept.assign(is_other=val_dept['label'].eq('Other').astype(int))

train_ds_stage1 = Dataset.from_dict(dict(
    text=train_dept_stage1['text'].tolist(),
    labels=train_dept_stage1['is_other'].tolist(),
)).map(tokenize, batched=True)

val_ds_stage1 = Dataset.from_dict(dict(
    text=val_dept_stage1['text'].tolist(),
    labels=val_dept_stage1['is_other'].tolist(),
)).map(tokenize, batched=True)

print(f"Stage 1 - Other: {train_dept_stage1['is_other'].sum()}, Not-Other: {len(train_dept_stage1) - train_dept_stage1['is_other'].sum()}")

In [None]:
# Train Stage 1 (Binary)
model_stage1 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_stage1 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/stage1",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_stage1 = Trainer(
    model=model_stage1, args=args_stage1, train_dataset=train_ds_stage1, eval_dataset=val_ds_stage1,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Stage 1 (Other vs Not-Other)...")
trainer_stage1.train()

In [None]:
# Stage 2: Multi-class on Not-Other only
train_notother = train_dept.loc[train_dept['label'].ne('Other')].copy()
val_notother = val_dept.loc[val_dept['label'].ne('Other')].copy()

# Dedicated encoder fitted on the training labels that remain after removing 'Other'.
le_notother = LabelEncoder().fit(train_notother['label'].astype(str))
train_notother['y'] = le_notother.transform(train_notother['label'].astype(str))
val_notother['y'] = le_notother.transform(val_notother['label'].astype(str))

train_ds_stage2 = Dataset.from_dict(dict(
    text=train_notother['text'].tolist(),
    labels=train_notother['y'].tolist(),
)).map(tokenize, batched=True)
val_ds_stage2 = Dataset.from_dict(dict(
    text=val_notother['text'].tolist(),
    labels=val_notother['y'].tolist(),
)).map(tokenize, batched=True)

print(f"Stage 2 classes: {le_notother.classes_}")
print(f"Stage 2 samples: {len(train_notother)} train, {len(val_notother)} val")

In [None]:
# Train Stage 2
model_stage2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_notother.classes_))

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# validation macro-F1 is reloaded when training ends.
args_stage2 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/stage2",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_stage2 = Trainer(
    model=model_stage2, args=args_stage2, train_dataset=train_ds_stage2, eval_dataset=val_ds_stage2,
    processing_class=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Stage 2 (Not-Other multi-class)...")
trainer_stage2.train()

In [None]:
# Two-Stage Evaluation
def predict_two_stage(texts, trainer_s1, trainer_s2, le_notother):
    """Predict department labels with the two-stage (Other / Not-Other) pipeline.

    Both stages are run on every text. Where stage 1 predicts class 1 the
    final label is 'Other'; elsewhere it is the stage-2 prediction decoded
    with ``le_notother``.

    Returns:
        Array of predicted label strings, one per input text.
    """
    # Placeholder labels of 0 are attached to the prediction dataset.
    placeholder_labels = [0] * len(texts)
    ds = Dataset.from_dict({'text': texts, 'labels': placeholder_labels}).map(tokenize, batched=True)

    stage1_logits = trainer_s1.predict(ds).predictions
    stage2_logits = trainer_s2.predict(ds).predictions

    other_mask = np.argmax(stage1_logits, axis=-1) == 1
    stage2_labels = le_notother.inverse_transform(np.argmax(stage2_logits, axis=-1))
    return np.where(other_mask, 'Other', stage2_labels)

# Evaluate only positions whose department is a known training class.
eval_use = eval_df[eval_df['department'].isin(set(le_dept.classes_))].copy()
preds_twostage = predict_two_stage(eval_use['title'].tolist(), trainer_stage1, trainer_stage2, le_notother)

y_true = eval_use['department'].values
y_pred = preds_twostage

# Compute each metric once; the same values feed the printout and the results table.
results = dict(
    accuracy=accuracy_score(y_true, y_pred),
    precision_macro=precision_score(y_true, y_pred, average='macro', zero_division=0),
    recall_macro=recall_score(y_true, y_pred, average='macro', zero_division=0),
    f1_macro=f1_score(y_true, y_pred, average='macro', zero_division=0),
    f1_weighted=f1_score(y_true, y_pred, average='weighted', zero_division=0),
)

print("\n=== Department - Two-Stage ===")
print(f"Accuracy       : {results['accuracy']:.4f}")
print(f"Macro F1       : {results['f1_macro']:.4f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

all_results.append(dict(approach='Two-Stage', task='Department', **results))

---
## Final Comparison

In [None]:
# Fix the column set up front so the filters below also work when
# `all_results` is empty (e.g. no training cell finished); without it the
# 'task' lookup raises KeyError on an empty frame.
RESULT_COLUMNS = ['approach', 'task', 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'f1_weighted']
results_df = pd.DataFrame(all_results, columns=RESULT_COLUMNS)

# Per-task tables, best macro-F1 first.
print("\n" + "=" * 80)
print("FINAL COMPARISON - DEPARTMENT")
print("=" * 80)
dept_results = results_df[results_df['task'] == 'Department'].sort_values('f1_macro', ascending=False)
print(dept_results[['approach', 'accuracy', 'f1_macro', 'f1_weighted']].to_string(index=False))

print("\n" + "=" * 80)
print("FINAL COMPARISON - SENIORITY")
print("=" * 80)
sen_results = results_df[results_df['task'] == 'Seniority'].sort_values('f1_macro', ascending=False)
print(sen_results[['approach', 'accuracy', 'f1_macro', 'f1_weighted']].to_string(index=False))

# The first row of each sorted table is the winner; skip tasks with no results.
print("\n" + "=" * 80)
print("WINNERS")
print("=" * 80)
if len(dept_results) > 0:
    print(f"Best Department: {dept_results.iloc[0]['approach']} (F1={dept_results.iloc[0]['f1_macro']:.4f})")
if len(sen_results) > 0:
    print(f"Best Seniority:  {sen_results.iloc[0]['approach']} (F1={sen_results.iloc[0]['f1_macro']:.4f})")

In [None]:
# Save results
# Create the target directory explicitly instead of relying on it existing
# as a side effect of TRAINING_OUTPUT_DIR living under ./results.
os.makedirs('./results', exist_ok=True)
results_df.to_csv('./results/distilbert_comparison_results.csv', index=False)
print("Results saved to results/distilbert_comparison_results.csv")

## Conclusions

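**Key Findings:**
- **Oversampling** typically works best for both department and seniority
- Class weighting alone can hurt generalization
- Two-stage is competitive for department but adds complexity

> **Note:** the outputs saved in this notebook cover only the Approach 1 (baseline) runs; the Approach 2 run was interrupted and Approaches 3–5 have not been executed here. Re-run those cells to reproduce the comparison behind these findings.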

**Recommendations:**
- Use the oversampling models for production
- Save the winning models for deployment