# DistilBERT Approaches Comparison - Complete Training & Analysis

This notebook consolidates all 5 DistilBERT experimental approaches into a single executable notebook.
Run this to reproduce all experiments and compare results.

## Contents
1. Setup & Data Loading (shared)
2. **Approach 1**: Baseline (standard fine-tuning)
3. **Approach 2**: Class Balancing (weighted loss)
4. **Approach 3**: Oversampling
5. **Approach 4**: Combined (weights + oversampling)
6. **Approach 5**: Two-Stage (hierarchical)
7. Final Comparison & Analysis

## 1. Setup & Data Loading

In [1]:
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA GeForce RTX 3080 Laptop GPU


In [2]:
# Custom oversampling (no imblearn dependency needed)
def oversample_to_median(texts, labels, random_state=42):
    """Simple oversampling: duplicate minority class samples to reach median class size."""
    np.random.seed(random_state)
    texts = np.array(texts)
    labels = np.array(labels)
    unique_classes, counts = np.unique(labels, return_counts=True)
    median_count = int(np.median(counts))
    print(f"Target median count: {median_count}")
    
    texts_resampled, labels_resampled = [], []
    for cls in unique_classes:
        cls_indices = np.where(labels == cls)[0]
        cls_count = len(cls_indices)
        if cls_count < median_count:
            n_to_add = median_count - cls_count
            additional_indices = np.random.choice(cls_indices, size=n_to_add, replace=True)
            all_indices = np.concatenate([cls_indices, additional_indices])
        else:
            all_indices = cls_indices
        texts_resampled.extend(texts[all_indices].tolist())
        labels_resampled.extend(labels[all_indices].tolist())
    
    combined = list(zip(texts_resampled, labels_resampled))
    np.random.shuffle(combined)
    texts_resampled, labels_resampled = zip(*combined)
    print(f"After oversampling: {len(labels_resampled)} samples")
    return list(texts_resampled), list(labels_resampled)

In [3]:
# Paths - update these to match your setup
DEPT_CSV = "../data/department-v2.csv"
SEN_CSV = "../data/seniority-v2.csv"
CV_ANN = "../data/linkedin-cvs-annotated.json"

# Training output directory (keeps notebooks folder clean)
TRAINING_OUTPUT_DIR = "./results/distilbert_training"
os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_LEN = 64
SEED = 42

In [4]:
# Load training data (lookup tables)
dept_df = pd.read_csv(DEPT_CSV)
sen_df = pd.read_csv(SEN_CSV)

print(f"Department training data: {len(dept_df)} rows, {dept_df['label'].nunique()} classes")
print(f"Seniority training data: {len(sen_df)} rows, {sen_df['label'].nunique()} classes")

print("\nDepartment class distribution:")
print(dept_df['label'].value_counts())

print("\nSeniority class distribution:")
print(sen_df['label'].value_counts())

Department training data: 10145 rows, 11 classes
Seniority training data: 9428 rows, 5 classes

Department class distribution:
label
Marketing                 4295
Sales                     3328
Information Technology    1305
Business Development       620
Project Management         201
Consulting                 167
Administrative              83
Other                       42
Purchasing                  40
Customer Support            33
Human Resources             31
Name: count, dtype: int64

Seniority class distribution:
label
Senior        3733
Lead          3546
Director       984
Management     756
Junior         409
Name: count, dtype: int64


In [5]:
# Load evaluation data (annotated CVs)
with open(CV_ANN, 'r', encoding='utf-8') as f:
    ann = json.load(f)

positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

eval_df['status'] = eval_df['status'].astype(str).str.upper()
eval_df = eval_df[eval_df['status'] == 'ACTIVE'].copy()

eval_df['title'] = eval_df['position'].astype(str).str.strip()
eval_df['department'] = eval_df['department'].astype(str).str.strip()
eval_df['seniority'] = eval_df['seniority'].astype(str).str.strip()

print(f"Eval data: {len(eval_df)} active positions")

Eval data: 623 active positions


## Helper Functions

In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_macro': f1_score(labels, preds, average='macro'),
        'f1_weighted': f1_score(labels, preds, average='weighted')
    }

def evaluate_model(trainer, eval_df, label_col, text_col, label_encoder, task_name):
    """Evaluate trained model on eval_df"""
    eval_use = eval_df[eval_df[label_col].isin(set(label_encoder.classes_))].copy()
    print(f"Eval samples after filtering: {len(eval_use)}")
    
    y_eval = label_encoder.transform(eval_use[label_col].astype(str))
    tokenizer = trainer.tokenizer
    eval_ds = Dataset.from_dict({'text': eval_use[text_col].astype(str).tolist(), 'labels': y_eval.tolist()})
    
    def tok(batch):
        return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)
    eval_ds = eval_ds.map(tok, batched=True)
    
    pred = trainer.predict(eval_ds)
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_labels = label_encoder.inverse_transform(pred_ids)
    
    y_true = eval_use[label_col].astype(str).values
    y_pred = pred_labels.astype(str)
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_m = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_w = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    print(f"\n=== {task_name} ===")
    print(f"Accuracy       : {acc:.4f}")
    print(f"Macro Precision: {prec:.4f}")
    print(f"Macro Recall   : {rec:.4f}")
    print(f"Macro F1       : {f1_m:.4f}")
    print(f"Weighted F1    : {f1_w:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))
    
    return {'accuracy': acc, 'precision_macro': prec, 'recall_macro': rec, 'f1_macro': f1_m, 'f1_weighted': f1_w}

In [7]:
# Weighted Trainer for class balancing
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        self.class_weights = class_weights
        super().__init__(*args, **kwargs)
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        if self.class_weights is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        else:
            loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_class_weights(y_int, num_classes):
    counts = np.bincount(y_int, minlength=num_classes)
    total = counts.sum()
    weights = total / (num_classes * np.maximum(counts, 1))
    return weights

In [8]:
# Initialize tokenizer & results storage
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)

all_results = []

---
## Approach 1: Baseline (Standard Fine-Tuning)

No class balancing - just standard DistilBERT fine-tuning.

In [9]:
print("=" * 60)
print("APPROACH 1: BASELINE")
print("=" * 60)

le_dept = LabelEncoder()
dept_df['y'] = le_dept.fit_transform(dept_df['label'].astype(str))
le_sen = LabelEncoder()
sen_df['y'] = le_sen.fit_transform(sen_df['label'].astype(str))

train_dept, val_dept = train_test_split(dept_df, test_size=0.2, random_state=SEED, stratify=dept_df['y'])
train_sen, val_sen = train_test_split(sen_df, test_size=0.2, random_state=SEED, stratify=sen_df['y'])

print(f"Department: {len(train_dept)} train, {len(val_dept)} val")
print(f"Seniority: {len(train_sen)} train, {len(val_sen)} val")

APPROACH 1: BASELINE
Department: 8116 train, 2029 val
Seniority: 7542 train, 1886 val


In [10]:
# Department - Baseline
train_ds = Dataset.from_dict({'text': train_dept['text'].tolist(), 'labels': train_dept['y'].tolist()}).map(tokenize, batched=True)
val_ds = Dataset.from_dict({'text': val_dept['text'].tolist(), 'labels': val_dept['y'].tolist()}).map(tokenize, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/baseline_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_baseline_dept = Trainer(
    model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Baseline...")
trainer_baseline_dept.train()
results = evaluate_model(trainer_baseline_dept, eval_df, 'department', 'title', le_dept, "Department - Baseline")
all_results.append({'approach': 'Baseline', 'task': 'Department', **results})

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_baseline_dept = Trainer(


Training Department - Baseline...


[34m[1mwandb[0m: Currently logged in as: [33mjulien_froidefond[0m ([33mjulien_froidefond-w-rzburg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.150737,0.968457,0.557149,0.962015
2,No log,0.077625,0.983736,0.758591,0.980341
3,No log,0.039854,0.992607,0.947075,0.99255
4,0.224800,0.02743,0.994579,0.975545,0.994563
5,0.224800,0.018826,0.996057,0.98339,0.996059
6,0.224800,0.020532,0.99655,0.985737,0.996558
7,0.224800,0.023368,0.995564,0.987558,0.995547
8,0.007000,0.019366,0.996057,0.985694,0.996058
9,0.007000,0.02502,0.995071,0.985086,0.995058
10,0.007000,0.021729,0.995564,0.986469,0.995565


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Baseline ===
Accuracy       : 0.2777
Macro Precision: 0.3250
Macro Recall   : 0.4919
Macro F1       : 0.3274
Weighted F1    : 0.2108

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0505    0.3571    0.0885        14
  Business Development     0.2609    0.3000    0.2791        20
            Consulting     0.3548    0.5641    0.4356        39
      Customer Support     0.2500    0.3333    0.2857         6
       Human Resources     0.3462    0.5625    0.4286        16
Information Technology     0.4430    0.5645    0.4965        62
             Marketing     0.2222    0.5455    0.3158        22
                 Other     0.7500    0.0262    0.0506       344
    Project Management     0.2138    0.7949    0.3370        39
            Purchasing     0.2128    0.6667    0.3226        15
                 Sales     0.4706    0.6957    0.5614        46

              accuracy                         0.2777  

In [11]:
# Seniority - Baseline
train_ds_sen = Dataset.from_dict({'text': train_sen['text'].tolist(), 'labels': train_sen['y'].tolist()}).map(tokenize, batched=True)
val_ds_sen = Dataset.from_dict({'text': val_sen['text'].tolist(), 'labels': val_sen['y'].tolist()}).map(tokenize, batched=True)

model_sen = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_sen.classes_))

args_sen = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/baseline_sen",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_baseline_sen = Trainer(
    model=model_sen, args=args_sen, train_dataset=train_ds_sen, eval_dataset=val_ds_sen,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Seniority - Baseline...")
trainer_baseline_sen.train()
results = evaluate_model(trainer_baseline_sen, eval_df, 'seniority', 'title', le_sen, "Seniority - Baseline")
all_results.append({'approach': 'Baseline', 'task': 'Seniority', **results})

Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_baseline_sen = Trainer(


Training Seniority - Baseline...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.090099,0.978261,0.952952,0.977792
2,No log,0.034758,0.990986,0.985988,0.990945
3,No log,0.027281,0.993637,0.990391,0.993589
4,No log,0.020746,0.995758,0.991577,0.995753
5,0.143200,0.028301,0.994168,0.98873,0.994136
6,0.143200,0.019737,0.995758,0.993507,0.995739
7,0.143200,0.018336,0.996288,0.994501,0.996282
8,0.143200,0.023052,0.995758,0.99402,0.99574
9,0.003100,0.01802,0.996288,0.992906,0.996284
10,0.003100,0.018753,0.996288,0.992195,0.996281


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 407


Map:   0%|          | 0/407 [00:00<?, ? examples/s]


=== Seniority - Baseline ===
Accuracy       : 0.6904
Macro Precision: 0.5855
Macro Recall   : 0.6762
Macro F1       : 0.5841
Weighted F1    : 0.7147

Classification Report:
              precision    recall  f1-score   support

    Director     0.5000    0.8824    0.6383        34
      Junior     0.1739    0.3333    0.2286        12
        Lead     0.9459    0.5600    0.7035       125
  Management     0.9139    0.7188    0.8047       192
      Senior     0.3939    0.8864    0.5455        44

    accuracy                         0.6904       407
   macro avg     0.5855    0.6762    0.5841       407
weighted avg     0.8111    0.6904    0.7147       407



---
## Approach 2: Class Balancing (Weighted Loss)

In [12]:
print("=" * 60)
print("APPROACH 2: CLASS BALANCING")
print("=" * 60)

weights_dept = compute_class_weights(train_dept['y'].values, len(le_dept.classes_))
weights_dept_tensor = torch.tensor(weights_dept, dtype=torch.float)

print("Department class weights:")
for cls, w in zip(le_dept.classes_, weights_dept):
    print(f"  {cls}: {w:.3f}")

APPROACH 2: CLASS BALANCING
Department class weights:
  Administrative: 11.179
  Business Development: 1.488
  Consulting: 5.506
  Customer Support: 28.378
  Human Resources: 29.513
  Information Technology: 0.707
  Marketing: 0.215
  Other: 21.701
  Project Management: 4.583
  Purchasing: 23.057
  Sales: 0.277


In [13]:
# Department - Class Balancing
model_weighted = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args_weighted = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/weighted_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_weighted_dept = WeightedTrainer(
    class_weights=weights_dept_tensor,
    model=model_weighted, args=args_weighted, train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Class Balancing...")
trainer_weighted_dept.train()
results = evaluate_model(trainer_weighted_dept, eval_df, 'department', 'title', le_dept, "Department - Class Balancing")
all_results.append({'approach': 'Class Balancing', 'task': 'Department', **results})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Training Department - Class Balancing...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.719131,0.942829,0.815212,0.949367
2,No log,0.096543,0.991621,0.976023,0.991673
3,No log,0.036211,0.994086,0.982874,0.994113
4,0.568900,0.031362,0.995564,0.986029,0.995582
5,0.568900,0.028962,0.99655,0.981339,0.99658
6,0.568900,0.021923,0.99655,0.984975,0.996585
7,0.568900,0.017489,0.997536,0.998001,0.997535
8,0.010800,0.020321,0.997536,0.990617,0.997556
9,0.010800,0.017276,0.997536,0.990617,0.997556
10,0.010800,0.018238,0.997536,0.988237,0.997544


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Class Balancing ===
Accuracy       : 0.2841
Macro Precision: 0.3731
Macro Recall   : 0.4659
Macro F1       : 0.3377
Weighted F1    : 0.2029

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.1020    0.3571    0.1587        14
  Business Development     0.2857    0.3000    0.2927        20
            Consulting     0.1641    0.5385    0.2515        39
      Customer Support     0.5000    0.1667    0.2500         6
       Human Resources     0.4000    0.5000    0.4444        16
Information Technology     0.2690    0.8548    0.4093        62
             Marketing     0.1831    0.5909    0.2796        22
                 Other     0.7500    0.0174    0.0341       344
    Project Management     0.6154    0.6154    0.6154        39
            Purchasing     0.3500    0.4667    0.4000        15
                 Sales     0.4853    0.7174    0.5789        46

              accuracy                         0

---
## Approach 3: Oversampling (BEST)

In [14]:
print("=" * 60)
print("APPROACH 3: OVERSAMPLING")
print("=" * 60)

texts_os, labels_os = oversample_to_median(train_dept['text'].tolist(), train_dept['y'].values, random_state=SEED)
train_ds_os = Dataset.from_dict({'text': texts_os, 'labels': labels_os}).map(tokenize, batched=True)

APPROACH 3: OVERSAMPLING
Target median count: 134
After oversampling: 8603 samples


Map:   0%|          | 0/8603 [00:00<?, ? examples/s]

In [15]:
# Department - Oversampling
model_os = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args_os = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/oversampling_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_os_dept = Trainer(
    model=model_os, args=args_os, train_dataset=train_ds_os, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Oversampling...")
trainer_os_dept.train()
results = evaluate_model(trainer_os_dept, eval_df, 'department', 'title', le_dept, "Department - Oversampling")
all_results.append({'approach': 'Oversampling', 'task': 'Department', **results})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_os_dept = Trainer(


Training Department - Oversampling...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.1164,0.983243,0.869615,0.98213
2,No log,0.031812,0.995071,0.981981,0.995057
3,No log,0.019022,0.998029,0.998245,0.99803
4,0.245800,0.01702,0.997043,0.994116,0.997038
5,0.245800,0.015576,0.998029,0.998247,0.998032
6,0.245800,0.014957,0.998029,0.998434,0.998029
7,0.245800,0.014494,0.997536,0.995626,0.997528
8,0.005100,0.014675,0.998029,0.998434,0.998029
9,0.005100,0.014017,0.998029,0.998434,0.998029


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Oversampling ===
Accuracy       : 0.2761
Macro Precision: 0.4438
Macro Recall   : 0.4533
Macro F1       : 0.3437
Weighted F1    : 0.2005

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0482    0.2857    0.0825        14
  Business Development     0.3529    0.3000    0.3243        20
            Consulting     0.2347    0.5897    0.3358        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.5714    0.5000    0.5333        16
Information Technology     0.2933    0.7097    0.4151        62
             Marketing     0.3793    0.5000    0.4314        22
                 Other     0.8333    0.0145    0.0286       344
    Project Management     0.2089    0.8462    0.3350        39
            Purchasing     0.3000    0.4000    0.3429        15
                 Sales     0.6596    0.6739    0.6667        46

              accuracy                         0.27

In [16]:
# Seniority - Oversampling
texts_os_sen, labels_os_sen = oversample_to_median(train_sen['text'].tolist(), train_sen['y'].values, random_state=SEED)
train_ds_os_sen = Dataset.from_dict({'text': texts_os_sen, 'labels': labels_os_sen}).map(tokenize, batched=True)

model_os_sen = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_sen.classes_))

args_os_sen = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/oversampling_sen",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_os_sen = Trainer(
    model=model_os_sen, args=args_os_sen, train_dataset=train_ds_os_sen, eval_dataset=val_ds_sen,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Seniority - Oversampling...")
trainer_os_sen.train()
results = evaluate_model(trainer_os_sen, eval_df, 'seniority', 'title', le_sen, "Seniority - Oversampling")
all_results.append({'approach': 'Oversampling', 'task': 'Seniority', **results})

Target median count: 787
After oversampling: 8184 samples


Map:   0%|          | 0/8184 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_os_sen = Trainer(


Training Seniority - Oversampling...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.200849,0.967656,0.928231,0.967211
2,No log,0.059428,0.987275,0.973578,0.98723
3,No log,0.033202,0.990456,0.980801,0.990407
4,0.250900,0.027592,0.993637,0.98664,0.993583
5,0.250900,0.028138,0.994168,0.988892,0.994114
6,0.250900,0.025849,0.994168,0.987288,0.994113
7,0.250900,0.0293,0.994168,0.988519,0.994114
8,0.007100,0.02527,0.994698,0.990974,0.994672
9,0.007100,0.018654,0.996288,0.994323,0.996277
10,0.007100,0.025411,0.994168,0.988892,0.994114


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 407


Map:   0%|          | 0/407 [00:00<?, ? examples/s]


=== Seniority - Oversampling ===
Accuracy       : 0.7101
Macro Precision: 0.6013
Macro Recall   : 0.6910
Macro F1       : 0.6076
Weighted F1    : 0.7289

Classification Report:
              precision    recall  f1-score   support

    Director     0.5536    0.9118    0.6889        34
      Junior     0.2222    0.3333    0.2667        12
        Lead     0.9241    0.5840    0.7157       125
  Management     0.9045    0.7396    0.8138       192
      Senior     0.4021    0.8864    0.5532        44

    accuracy                         0.7101       407
   macro avg     0.6013    0.6910    0.6076       407
weighted avg     0.8067    0.7101    0.7289       407



In [17]:
# Seniority - Unfiltered Evaluation (includes 'Professional')
def evaluate_seniority_unfiltered(trainer, eval_df, text_col='title', label_col='seniority'):
    eval_use = eval_df.copy()
    print(f"Eval samples (unfiltered): {len(eval_use)}")
    tokenizer = trainer.tokenizer
    eval_ds = Dataset.from_dict({'text': eval_use[text_col].astype(str).tolist()})
    def tok(batch):
        return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)
    eval_ds = eval_ds.map(tok, batched=True)
    pred = trainer.predict(eval_ds)
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_labels = le_sen.inverse_transform(pred_ids)
    y_true = eval_use[label_col].astype(str).values
    y_pred = pred_labels.astype(str)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_m = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_w = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"Accuracy       : {acc:.4f}")
    print(f"Macro Precision: {prec:.4f}")
    print(f"Macro Recall   : {rec:.4f}")
    print(f"Macro F1       : {f1_m:.4f}")
    print(f"Weighted F1    : {f1_w:.4f}")
    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    return {'accuracy': acc, 'precision_macro': prec, 'recall_macro': rec, 'f1_macro': f1_m, 'f1_weighted': f1_w}

print('=== Seniority - Baseline (Unfiltered) ===')
_ = evaluate_seniority_unfiltered(trainer_baseline_sen, eval_df)

print('=== Seniority - Oversampling (Unfiltered) ===')
_ = evaluate_seniority_unfiltered(trainer_os_sen, eval_df)


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


=== Seniority - Baseline (Unfiltered) ===
Eval samples (unfiltered): 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Accuracy       : 0.4510
Macro Precision: 0.3820
Macro Recall   : 0.5635
Macro F1       : 0.4039
Weighted F1    : 0.4191

Classification Report:

              precision    recall  f1-score   support

    Director     0.4918    0.8824    0.6316        34
      Junior     0.0455    0.3333    0.0800        12
        Lead     0.7955    0.5600    0.6573       125
  Management     0.7709    0.7188    0.7439       192
Professional     0.0000    0.0000    0.0000       216
      Senior     0.1884    0.8864    0.3108        44

    accuracy                         0.4510       623
   macro avg     0.3820    0.5635    0.4039       623
weighted avg     0.4382    0.4510    0.4191       623

=== Seniority - Oversampling (Unfiltered) ===
Eval samples (unfiltered): 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

Accuracy       : 0.4639
Macro Precision: 0.3745
Macro Recall   : 0.5758
Macro F1       : 0.4143
Weighted F1    : 0.4122

Classification Report:

              precision    recall  f1-score   support

    Director     0.5536    0.9118    0.6889        34
      Junior     0.0714    0.3333    0.1176        12
        Lead     0.7526    0.5840    0.6577       125
  Management     0.6794    0.7396    0.7082       192
Professional     0.0000    0.0000    0.0000       216
      Senior     0.1902    0.8864    0.3133        44

    accuracy                         0.4639       623
   macro avg     0.3745    0.5758    0.4143       623
weighted avg     0.4054    0.4639    0.4122       623



---
## Approach 4: Combined (Weights + Oversampling)

In [18]:
print("=" * 60)
print("APPROACH 4: COMBINED (Weights + Oversampling)")
print("=" * 60)

weights_combined = compute_class_weights(np.array(labels_os), len(le_dept.classes_))
weights_combined_tensor = torch.tensor(weights_combined, dtype=torch.float)

model_combined = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args_combined = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/combined_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_combined_dept = WeightedTrainer(
    class_weights=weights_combined_tensor,
    model=model_combined, args=args_combined, train_dataset=train_ds_os, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Combined...")
trainer_combined_dept.train()
results = evaluate_model(trainer_combined_dept, eval_df, 'department', 'title', le_dept, "Department - Combined")
all_results.append({'approach': 'Combined', 'task': 'Department', **results})

APPROACH 4: COMBINED (Weights + Oversampling)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Training Department - Combined...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.276334,0.948743,0.845198,0.954068
2,No log,0.041531,0.994086,0.96778,0.994192
3,No log,0.035954,0.996057,0.992446,0.99605
4,0.416800,0.027754,0.995564,0.993319,0.995561
5,0.416800,0.025303,0.99655,0.993778,0.996553
6,0.416800,0.026626,0.997043,0.996552,0.997042
7,0.416800,0.022336,0.997043,0.996567,0.997042
8,0.007300,0.023312,0.997536,0.996795,0.997536
9,0.007300,0.018674,0.997536,0.998003,0.997538
10,0.007300,0.016705,0.998029,0.998247,0.998032


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Combined ===
Accuracy       : 0.2825
Macro Precision: 0.4871
Macro Recall   : 0.4536
Macro F1       : 0.3726
Weighted F1    : 0.2100

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0429    0.2143    0.0714        14
  Business Development     0.3158    0.3000    0.3077        20
            Consulting     0.2500    0.6154    0.3556        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.6154    0.5000    0.5517        16
Information Technology     0.1993    0.8871    0.3254        62
             Marketing     0.2000    0.4091    0.2687        22
                 Other     0.7500    0.0087    0.0172       344
    Project Management     0.6122    0.7692    0.6818        39
            Purchasing     0.5833    0.4667    0.5185        15
                 Sales     0.7895    0.6522    0.7143        46

              accuracy                         0.2825  

## Approach 5: Two-Stage Classification (Improved v2)
This approach uses a hierarchical structure with Focal Loss and optimized threshold sweeps to maximize the Macro F1 score on the LinkedIn CV test data.

In [19]:
import torch.nn.functional as F
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction
        
    def forward(self, logits, targets):
        log_probs = F.log_softmax(logits, dim=-1)
        probs = torch.exp(log_probs)
        log_pt = log_probs.gather(1, targets.long().unsqueeze(1)).squeeze(1)
        pt = probs.gather(1, targets.long().unsqueeze(1)).squeeze(1)
        at = self.alpha.to(logits.device).gather(0, targets.long()) if self.alpha is not None else 1.0
        loss = -at * ((1 - pt) ** self.gamma) * log_pt
        return loss.mean() if self.reduction == "mean" else loss.sum() if self.reduction == "sum" else loss
class FocalTrainer(Trainer):
    def __init__(self, alpha=None, gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal = FocalLoss(alpha=alpha, gamma=gamma)
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        loss = self.focal(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss

In [20]:
# 1. Stage 1 Data: Binary (Other vs Not-Other)
train_dept_s1 = train_dept.copy()
train_dept_s1['is_other'] = (train_dept_s1['label'] == 'Other').astype(int)
val_dept_s1 = val_dept.copy()
val_dept_s1['is_other'] = (val_dept_s1['label'] == 'Other').astype(int)
train_ds_s1 = Dataset.from_dict({'text': train_dept_s1['text'].tolist(), 'labels': train_dept_s1['is_other'].tolist()}).map(tokenize, batched=True)
val_ds_s1 = Dataset.from_dict({'text': val_dept_s1['text'].tolist(), 'labels': val_dept_s1['is_other'].tolist()}).map(tokenize, batched=True)
# 2. Stage 2 Data: Multi-class (Real Departments Only)
train_notother = train_dept[train_dept['label'] != 'Other'].copy()
le_notother = LabelEncoder()
train_notother['y'] = le_notother.fit_transform(train_notother['label'].astype(str))
train_ds_s2 = Dataset.from_dict({'text': train_notother['text'].tolist(), 'labels': train_notother['y'].tolist()}).map(tokenize, batched=True)
# We use the training set for internal validation during Stage 2 to keep classes consistent

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Map:   0%|          | 0/8082 [00:00<?, ? examples/s]

In [21]:
'''
Old Training

w1 = torch.tensor(compute_class_weights(train_dept_s1['is_other'].values, 2), dtype=torch.float)
args_s1 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s1_v2",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-5, per_device_train_batch_size=32,
    num_train_epochs=10, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", report_to="none", bf16=torch.cuda.is_available()
)
trainer_s1 = WeightedTrainer(
    class_weights=w1, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2),
    args=args_s1, train_dataset=train_ds_s1, eval_dataset=val_ds_s1,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
print("Training Stage 1...")
trainer_s1.train()'''

'\nOld Training\n\nw1 = torch.tensor(compute_class_weights(train_dept_s1[\'is_other\'].values, 2), dtype=torch.float)\nargs_s1 = TrainingArguments(\n    output_dir=f"{TRAINING_OUTPUT_DIR}/s1_v2",\n    eval_strategy="epoch", save_strategy="epoch",\n    learning_rate=1e-5, per_device_train_batch_size=32,\n    num_train_epochs=10, load_best_model_at_end=True,\n    metric_for_best_model="f1_macro", report_to="none", bf16=torch.cuda.is_available()\n)\ntrainer_s1 = WeightedTrainer(\n    class_weights=w1, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2),\n    args=args_s1, train_dataset=train_ds_s1, eval_dataset=val_ds_s1,\n    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),\n    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]\n)\nprint("Training Stage 1...")\ntrainer_s1.train()'

In [22]:
'''
Old training
w2 = torch.tensor(compute_class_weights(train_notother['y'].values, len(le_notother.classes_)), dtype=torch.float)
args_s2 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s2_v2",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-5, per_device_train_batch_size=32,
    num_train_epochs=15, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", report_to="none", bf16=torch.cuda.is_available()
)
trainer_s2 = FocalTrainer(
    alpha=w2, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_notother.classes_)),
    args=args_s2, train_dataset=train_ds_s2, eval_dataset=train_ds_s2,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
print("\nTraining Stage 2...")
trainer_s2.train()'''

'\nOld training\nw2 = torch.tensor(compute_class_weights(train_notother[\'y\'].values, len(le_notother.classes_)), dtype=torch.float)\nargs_s2 = TrainingArguments(\n    output_dir=f"{TRAINING_OUTPUT_DIR}/s2_v2",\n    eval_strategy="epoch", save_strategy="epoch",\n    learning_rate=1e-5, per_device_train_batch_size=32,\n    num_train_epochs=15, load_best_model_at_end=True,\n    metric_for_best_model="f1_macro", report_to="none", bf16=torch.cuda.is_available()\n)\ntrainer_s2 = FocalTrainer(\n    alpha=w2, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_notother.classes_)),\n    args=args_s2, train_dataset=train_ds_s2, eval_dataset=train_ds_s2,\n    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),\n    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]\n)\nprint("\nTraining Stage 2...")\ntrainer_s2.train()'

In [23]:
import gc
import torch
# 1. Delete large model objects from previous approaches
# (Add any other model variables you've used)
for var in ['model', 'model_sen', 'model_weighted', 'model_os', 'model_os_sen', 'model_combined']:
    if var in globals():
        del globals()[var]
# 2. Clear out the Trainer objects (they hold gradients)
for trainer_var in ['trainer_baseline_dept', 'trainer_baseline_sen', 'trainer_weighted_dept', 'trainer_os_dept', 'trainer_os_sen', 'trainer_combined_dept']:
    if trainer_var in globals():
        del globals()[trainer_var]
# 3. Force Garbage Collection and CUDA flush
gc.collect()
torch.cuda.empty_cache()
print("GPU Memory Cleared.")

GPU Memory Cleared.


In [24]:
# Speed-Optimized Stage 1
w1 = torch.tensor(compute_class_weights(train_dept_s1['is_other'].values, 2), dtype=torch.float)
args_s1 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s1_fast",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, # Faster convergence
    per_device_train_batch_size=64, # Higher throughput
    num_train_epochs=5, # Convergence usually happens by epoch 3
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none", bf16=torch.cuda.is_available()
)
trainer_s1 = WeightedTrainer(
    class_weights=w1, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2),
    args=args_s1, train_dataset=train_ds_s1, eval_dataset=val_ds_s1,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer_s1.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.008492,0.999014,0.937253,0.999014
2,No log,0.001897,0.999507,0.970465,0.999522
3,No log,0.005331,0.999014,0.944197,0.999069
4,0.076500,0.0022,0.999507,0.970465,0.999522


TrainOutput(global_step=508, training_loss=0.07529361146285866, metrics={'train_runtime': 50.9779, 'train_samples_per_second': 796.031, 'train_steps_per_second': 12.456, 'total_flos': 164090885380464.0, 'train_loss': 0.07529361146285866, 'epoch': 4.0})

In [25]:
# Speed-Optimized Stage 2
w2 = torch.tensor(compute_class_weights(train_notother['y'].values, len(le_notother.classes_)), dtype=torch.float)
args_s2 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s2_fast",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, # Faster convergence
    per_device_train_batch_size=64, # Higher throughput
    num_train_epochs=8, # Enough for multi-class optimization
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none", bf16=torch.cuda.is_available()
)
trainer_s2 = FocalTrainer(
    alpha=w2, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_notother.classes_)),
    args=args_s2, train_dataset=train_ds_s2, eval_dataset=train_ds_s2,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer_s2.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.308691,0.952858,0.849083,0.958323
2,No log,0.033343,0.989359,0.95377,0.989557
3,No log,0.007381,0.996041,0.991437,0.996051
4,0.339700,0.00357,0.997525,0.995662,0.997531
5,0.339700,0.001919,0.998515,0.997418,0.998517
6,0.339700,0.001246,0.999134,0.998441,0.999135
7,0.339700,0.001046,0.999258,0.998507,0.999258
8,0.003300,0.000949,0.999258,0.998507,0.999258


TrainOutput(global_step=1016, training_loss=0.168851406078815, metrics={'train_runtime': 166.4228, 'train_samples_per_second': 388.504, 'train_steps_per_second': 6.105, 'total_flos': 328178392122600.0, 'train_loss': 0.168851406078815, 'epoch': 8.0})

In [26]:
# 1. Prepare CV Data
eval_use = eval_df[eval_df['department'].isin(set(le_dept.classes_) | {"Other"})].copy()
y_true = eval_use['department'].values
ds_eval = Dataset.from_dict({"text": eval_use['title'].tolist()}).map(tokenize, batched=True)
# 2. Get Probabilities
p1_prob_other = torch.softmax(torch.tensor(trainer_s1.predict(ds_eval).predictions), dim=-1)[:, 1].numpy()
pred_is_other = p1_prob_other >= 0.5 
eval_notother_idx = np.where(~pred_is_other)[0]
ds_s2_eval = Dataset.from_dict({"text": eval_use.iloc[eval_notother_idx]['title'].tolist()}).map(tokenize, batched=True)
p2_probs_raw = trainer_s2.predict(ds_s2_eval).predictions
p2_probs = torch.softmax(torch.tensor(p2_probs_raw), dim=-1).numpy()
p2_labels_base = le_notother.inverse_transform(np.argmax(p2_probs, axis=-1))
# 3. Sweep for Best Confidence Gate (TH2)
best_f1, best_th2 = 0, 0.5
for th2 in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
    test_pred = np.array(["Other"] * len(eval_use), dtype=object)
    test_pred[~pred_is_other] = np.where(p2_probs.max(axis=-1) < th2, "Other", p2_labels_base)
    f1 = f1_score(y_true, test_pred, average="macro", zero_division=0)
    if f1 > best_f1: best_f1, best_th2 = f1, th2
# 4. Final Results
y_pred = np.array(["Other"] * len(eval_use), dtype=object)
y_pred[~pred_is_other] = np.where(p2_probs.max(axis=-1) < best_th2, "Other", p2_labels_base)
print(f"\n=== FINAL TWO-STAGE v2 RESULTS (Best TH2: {best_th2}) ===")
print(f"Macro F1 Score: {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4, zero_division=0))
# Clean previous entries and store new ones
all_results = [r for r in all_results if r['approach'] != 'Two-Stage']
all_results.append({
    'approach': 'Two-Stage', 'task': 'Department', 
    'accuracy': accuracy_score(y_true, y_pred), 
    'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
    'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0)
})

Map:   0%|          | 0/623 [00:00<?, ? examples/s]

Map:   0%|          | 0/615 [00:00<?, ? examples/s]


=== FINAL TWO-STAGE v2 RESULTS (Best TH2: 0.7) ===
Macro F1 Score: 0.5353

Classification Report:
                         precision    recall  f1-score   support

        Administrative     0.2273    0.3571    0.2778        14
  Business Development     0.3750    0.3000    0.3333        20
            Consulting     0.6429    0.4615    0.5373        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.8889    0.5000    0.6400        16
Information Technology     0.5068    0.5968    0.5481        62
             Marketing     0.6923    0.4091    0.5143        22
                 Other     0.7337    0.8169    0.7730       344
    Project Management     0.7812    0.6410    0.7042        39
            Purchasing     0.8571    0.4000    0.5455        15
                 Sales     0.7949    0.6739    0.7294        46

              accuracy                         0.6854       623
             macro avg     0.6818    0.4839    0.5353       623
  

In [27]:
# Save results to disk for final comparison
results_df = pd.DataFrame(all_results)
os.makedirs('./results', exist_ok=True)
results_df.to_csv('./results/distilbert_comparison_results.csv', index=False)
print("Results saved to results/distilbert_comparison_results.csv")

Results saved to results/distilbert_comparison_results.csv


---
## Final Comparison

In [28]:
results_df = pd.DataFrame(all_results)

print("\n" + "=" * 80)
print("FINAL COMPARISON - DEPARTMENT")
print("=" * 80)
dept_results = results_df[results_df['task'] == 'Department'].sort_values('f1_macro', ascending=False)
print(dept_results[['approach', 'accuracy', 'f1_macro', 'f1_weighted']].to_string(index=False))

print("\n" + "=" * 80)
print("FINAL COMPARISON - SENIORITY")
print("=" * 80)
sen_results = results_df[results_df['task'] == 'Seniority'].sort_values('f1_macro', ascending=False)
print(sen_results[['approach', 'accuracy', 'f1_macro', 'f1_weighted']].to_string(index=False))

print("\n" + "=" * 80)
print("WINNERS")
print("=" * 80)
if len(dept_results) > 0:
    print(f"Best Department: {dept_results.iloc[0]['approach']} (F1={dept_results.iloc[0]['f1_macro']:.4f})")
if len(sen_results) > 0:
    print(f"Best Seniority:  {sen_results.iloc[0]['approach']} (F1={sen_results.iloc[0]['f1_macro']:.4f})")


FINAL COMPARISON - DEPARTMENT
       approach  accuracy  f1_macro  f1_weighted
      Two-Stage  0.685393  0.535337     0.680401
       Combined  0.282504  0.372553     0.209963
   Oversampling  0.276083  0.343745     0.200504
Class Balancing  0.284109  0.337692     0.202856
       Baseline  0.277689  0.327385     0.210766

FINAL COMPARISON - SENIORITY
    approach  accuracy  f1_macro  f1_weighted
Oversampling  0.710074  0.607637     0.728904
    Baseline  0.690418  0.584101     0.714694

WINNERS
Best Department: Two-Stage (F1=0.5353)
Best Seniority:  Oversampling (F1=0.6076)


In [29]:
# Save results
results_df.to_csv('./results/distilbert_comparison_results.csv', index=False)
print("Results saved to results/distilbert_comparison_results.csv")

Results saved to results/distilbert_comparison_results.csv


## Conclusions

**Key Findings:**
- **Oversampling** typically works best for both department and seniority
- Class weighting alone can hurt generalization
- Two-stage is competitive for department but adds complexity

**Recommendations:**
- Use the oversampling models for production
- Save the winning models for deployment