# DistilBERT Approaches Comparison - Complete Training & Analysis

This notebook consolidates all 5 DistilBERT experimental approaches into a single executable notebook.
Run this to reproduce all experiments and compare results.

## Contents
1. Setup & Data Loading (shared)
2. **Approach 1**: Baseline (standard fine-tuning)
3. **Approach 2**: Class Balancing (weighted loss)
4. **Approach 3**: Oversampling
5. **Approach 4**: Combined (weights + oversampling)
6. **Approach 5**: Two-Stage (hierarchical)
7. Final Comparison & Analysis

## 1. Setup & Data Loading

In [1]:
import os, json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA GeForce RTX 3080 Laptop GPU


In [2]:
# Custom oversampling (no imblearn dependency needed)
def oversample_to_median(texts, labels, random_state=42):
    """Simple oversampling: duplicate minority class samples to reach median class size."""
    np.random.seed(random_state)
    texts = np.array(texts)
    labels = np.array(labels)
    unique_classes, counts = np.unique(labels, return_counts=True)
    median_count = int(np.median(counts))
    print(f"Target median count: {median_count}")
    
    texts_resampled, labels_resampled = [], []
    for cls in unique_classes:
        cls_indices = np.where(labels == cls)[0]
        cls_count = len(cls_indices)
        if cls_count < median_count:
            n_to_add = median_count - cls_count
            additional_indices = np.random.choice(cls_indices, size=n_to_add, replace=True)
            all_indices = np.concatenate([cls_indices, additional_indices])
        else:
            all_indices = cls_indices
        texts_resampled.extend(texts[all_indices].tolist())
        labels_resampled.extend(labels[all_indices].tolist())
    
    combined = list(zip(texts_resampled, labels_resampled))
    np.random.shuffle(combined)
    texts_resampled, labels_resampled = zip(*combined)
    print(f"After oversampling: {len(labels_resampled)} samples")
    return list(texts_resampled), list(labels_resampled)

In [3]:
# Paths - update these to match your setup
DEPT_CSV = "../data/department-v2.csv"
SEN_CSV = "../data/seniority-v2.csv"
CV_ANN = "../data/linkedin-cvs-annotated.json"

# Training output directory (keeps notebooks folder clean)
TRAINING_OUTPUT_DIR = "./results/distilbert_training"
os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "distilbert-base-multilingual-cased"
MAX_LEN = 64
SEED = 42

In [4]:
# Load training data (lookup tables)
dept_df = pd.read_csv(DEPT_CSV)
sen_df = pd.read_csv(SEN_CSV)

print(f"Department training data: {len(dept_df)} rows, {dept_df['label'].nunique()} classes")
print(f"Seniority training data: {len(sen_df)} rows, {sen_df['label'].nunique()} classes")

print("\nDepartment class distribution:")
print(dept_df['label'].value_counts())

print("\nSeniority class distribution:")
print(sen_df['label'].value_counts())

Department training data: 10145 rows, 11 classes
Seniority training data: 9428 rows, 5 classes

Department class distribution:
label
Marketing                 4295
Sales                     3328
Information Technology    1305
Business Development       620
Project Management         201
Consulting                 167
Administrative              83
Other                       42
Purchasing                  40
Customer Support            33
Human Resources             31
Name: count, dtype: int64

Seniority class distribution:
label
Senior        3733
Lead          3546
Director       984
Management     756
Junior         409
Name: count, dtype: int64


In [5]:
# Load evaluation data (annotated CVs)
with open(CV_ANN, 'r', encoding='utf-8') as f:
    ann = json.load(f)

positions = [p for cv in ann for p in cv]
eval_df = pd.DataFrame(positions)

eval_df['status'] = eval_df['status'].astype(str).str.upper()
eval_df = eval_df[eval_df['status'] == 'ACTIVE'].copy()

eval_df['title'] = eval_df['position'].astype(str).str.strip()
eval_df['department'] = eval_df['department'].astype(str).str.strip()
eval_df['seniority'] = eval_df['seniority'].astype(str).str.strip()

print(f"Eval data: {len(eval_df)} active positions")

Eval data: 623 active positions


## Helper Functions

In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_macro': f1_score(labels, preds, average='macro'),
        'f1_weighted': f1_score(labels, preds, average='weighted')
    }

def evaluate_model(trainer, eval_df, label_col, text_col, label_encoder, task_name):
    """Evaluate trained model on eval_df"""
    eval_use = eval_df[eval_df[label_col].isin(set(label_encoder.classes_))].copy()
    print(f"Eval samples after filtering: {len(eval_use)}")
    
    y_eval = label_encoder.transform(eval_use[label_col].astype(str))
    tokenizer = trainer.tokenizer
    eval_ds = Dataset.from_dict({'text': eval_use[text_col].astype(str).tolist(), 'labels': y_eval.tolist()})
    
    def tok(batch):
        return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)
    eval_ds = eval_ds.map(tok, batched=True)
    
    pred = trainer.predict(eval_ds)
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_labels = label_encoder.inverse_transform(pred_ids)
    
    y_true = eval_use[label_col].astype(str).values
    y_pred = pred_labels.astype(str)
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_m = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_w = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    print(f"\n=== {task_name} ===")
    print(f"Accuracy       : {acc:.4f}")
    print(f"Macro Precision: {prec:.4f}")
    print(f"Macro Recall   : {rec:.4f}")
    print(f"Macro F1       : {f1_m:.4f}")
    print(f"Weighted F1    : {f1_w:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))
    
    return {'accuracy': acc, 'precision_macro': prec, 'recall_macro': rec, 'f1_macro': f1_m, 'f1_weighted': f1_w}

In [7]:
# Weighted Trainer for class balancing
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        self.class_weights = class_weights
        super().__init__(*args, **kwargs)
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        if self.class_weights is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        else:
            loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

def compute_class_weights(y_int, num_classes):
    counts = np.bincount(y_int, minlength=num_classes)
    total = counts.sum()
    weights = total / (num_classes * np.maximum(counts, 1))
    return weights

In [8]:
# Initialize tokenizer & results storage
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)

all_results = []

---
## Approach 1: Baseline (Standard Fine-Tuning)

No class balancing - just standard DistilBERT fine-tuning.

In [9]:
print("=" * 60)
print("APPROACH 1: BASELINE")
print("=" * 60)

le_dept = LabelEncoder()
dept_df['y'] = le_dept.fit_transform(dept_df['label'].astype(str))
le_sen = LabelEncoder()
sen_df['y'] = le_sen.fit_transform(sen_df['label'].astype(str))

train_dept, val_dept = train_test_split(dept_df, test_size=0.2, random_state=SEED, stratify=dept_df['y'])
train_sen, val_sen = train_test_split(sen_df, test_size=0.2, random_state=SEED, stratify=sen_df['y'])

print(f"Department: {len(train_dept)} train, {len(val_dept)} val")
print(f"Seniority: {len(train_sen)} train, {len(val_sen)} val")

APPROACH 1: BASELINE
Department: 8116 train, 2029 val
Seniority: 7542 train, 1886 val


In [10]:
# Department - Baseline
train_ds = Dataset.from_dict({'text': train_dept['text'].tolist(), 'labels': train_dept['y'].tolist()}).map(tokenize, batched=True)
val_ds = Dataset.from_dict({'text': val_dept['text'].tolist(), 'labels': val_dept['y'].tolist()}).map(tokenize, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/baseline_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_baseline_dept = Trainer(
    model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Baseline...")
trainer_baseline_dept.train()
results = evaluate_model(trainer_baseline_dept, eval_df, 'department', 'title', le_dept, "Department - Baseline")
all_results.append({'approach': 'Baseline', 'task': 'Department', **results})

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_baseline_dept = Trainer(


Training Department - Baseline...


[34m[1mwandb[0m: Currently logged in as: [33mjulien_froidefond[0m ([33mjulien_froidefond-w-rzburg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.143991,0.968457,0.497816,0.960288
2,No log,0.057582,0.990636,0.867014,0.98937
3,No log,0.029423,0.995071,0.97899,0.995036
4,0.218400,0.018343,0.997536,0.995999,0.997526
5,0.218400,0.015601,0.997043,0.988457,0.997052
6,0.218400,0.01411,0.997536,0.994546,0.997529
7,0.218400,0.012757,0.998521,0.996111,0.998514
8,0.006100,0.013181,0.998029,0.994787,0.99802
9,0.006100,0.012722,0.998521,0.996111,0.998514
10,0.006100,0.010692,0.999014,0.998919,0.999014


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Baseline ===
Accuracy       : 0.2825
Macro Precision: 0.3786
Macro Recall   : 0.4719
Macro F1       : 0.3432
Weighted F1    : 0.2091

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0522    0.4286    0.0930        14
  Business Development     0.2500    0.3000    0.2727        20
            Consulting     0.2838    0.5385    0.3717        39
      Customer Support     0.5000    0.1667    0.2500         6
       Human Resources     0.2759    0.5000    0.3556        16
Information Technology     0.2442    0.6774    0.3590        62
             Marketing     0.3333    0.5000    0.4000        22
                 Other     0.7778    0.0203    0.0397       344
    Project Management     0.3535    0.8974    0.5072        39
            Purchasing     0.4667    0.4667    0.4667        15
                 Sales     0.6275    0.6957    0.6598        46

              accuracy                         0.2825  

In [11]:
# Seniority - Baseline
train_ds_sen = Dataset.from_dict({'text': train_sen['text'].tolist(), 'labels': train_sen['y'].tolist()}).map(tokenize, batched=True)
val_ds_sen = Dataset.from_dict({'text': val_sen['text'].tolist(), 'labels': val_sen['y'].tolist()}).map(tokenize, batched=True)

model_sen = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_sen.classes_))

args_sen = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/baseline_sen",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_baseline_sen = Trainer(
    model=model_sen, args=args_sen, train_dataset=train_ds_sen, eval_dataset=val_ds_sen,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Seniority - Baseline...")
trainer_baseline_sen.train()
results = evaluate_model(trainer_baseline_sen, eval_df, 'seniority', 'title', le_sen, "Seniority - Baseline")
all_results.append({'approach': 'Baseline', 'task': 'Seniority', **results})

Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_baseline_sen = Trainer(


Training Seniority - Baseline...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.079473,0.983033,0.961642,0.982803
2,No log,0.037103,0.991516,0.985695,0.991466
3,No log,0.027666,0.994168,0.988573,0.994129
4,No log,0.025726,0.995758,0.992097,0.995745
5,0.136500,0.023733,0.994698,0.989567,0.994685
6,0.136500,0.021388,0.995758,0.992098,0.995756
7,0.136500,0.023729,0.993637,0.992033,0.99363
8,0.136500,0.025221,0.996819,0.995682,0.996814
9,0.002900,0.023718,0.996819,0.994627,0.996814
10,0.002900,0.027937,0.996288,0.993259,0.996277


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 407


Map:   0%|          | 0/407 [00:00<?, ? examples/s]


=== Seniority - Baseline ===
Accuracy       : 0.7052
Macro Precision: 0.6119
Macro Recall   : 0.7301
Macro F1       : 0.6158
Weighted F1    : 0.7313

Classification Report:
              precision    recall  f1-score   support

    Director     0.5484    1.0000    0.7083        34
      Junior     0.2222    0.5000    0.3077        12
        Lead     0.9467    0.5680    0.7100       125
  Management     0.9583    0.7188    0.8214       192
      Senior     0.3838    0.8636    0.5315        44

    accuracy                         0.7052       407
   macro avg     0.6119    0.7301    0.6158       407
weighted avg     0.8367    0.7052    0.7313       407



---
## Approach 2: Class Balancing (Weighted Loss)

In [12]:
print("=" * 60)
print("APPROACH 2: CLASS BALANCING")
print("=" * 60)

weights_dept = compute_class_weights(train_dept['y'].values, len(le_dept.classes_))
weights_dept_tensor = torch.tensor(weights_dept, dtype=torch.float)

print("Department class weights:")
for cls, w in zip(le_dept.classes_, weights_dept):
    print(f"  {cls}: {w:.3f}")

APPROACH 2: CLASS BALANCING
Department class weights:
  Administrative: 11.179
  Business Development: 1.488
  Consulting: 5.506
  Customer Support: 28.378
  Human Resources: 29.513
  Information Technology: 0.707
  Marketing: 0.215
  Other: 21.701
  Project Management: 4.583
  Purchasing: 23.057
  Sales: 0.277


In [13]:
# Department - Class Balancing
model_weighted = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args_weighted = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/weighted_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_weighted_dept = WeightedTrainer(
    class_weights=weights_dept_tensor,
    model=model_weighted, args=args_weighted, train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Class Balancing...")
trainer_weighted_dept.train()
results = evaluate_model(trainer_weighted_dept, eval_df, 'department', 'title', le_dept, "Department - Class Balancing")
all_results.append({'approach': 'Class Balancing', 'task': 'Department', **results})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Training Department - Class Balancing...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.852619,0.927551,0.688619,0.936147
2,No log,0.130467,0.988172,0.953299,0.988544
3,No log,0.058404,0.994579,0.98435,0.994606
4,0.610300,0.041068,0.996057,0.984434,0.99608
5,0.610300,0.021253,0.99655,0.991602,0.996556
6,0.610300,0.035757,0.995564,0.985523,0.99562
7,0.610300,0.026865,0.996057,0.989373,0.996113
8,0.012800,0.018438,0.99655,0.986975,0.996594


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Class Balancing ===
Accuracy       : 0.2745
Macro Precision: 0.4429
Macro Recall   : 0.4341
Macro F1       : 0.3426
Weighted F1    : 0.2147

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0465    0.2857    0.0800        14
  Business Development     0.3333    0.3000    0.3158        20
            Consulting     0.3860    0.5641    0.4583        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.3810    0.5000    0.4324        16
Information Technology     0.1913    0.8548    0.3127        62
             Marketing     0.2432    0.4091    0.3051        22
                 Other     0.8000    0.0233    0.0452       344
    Project Management     0.5000    0.6410    0.5618        39
            Purchasing     0.2069    0.4000    0.2727        15
                 Sales     0.7838    0.6304    0.6988        46

              accuracy                         0

---
## Approach 3: Oversampling (BEST)

In [14]:
print("=" * 60)
print("APPROACH 3: OVERSAMPLING")
print("=" * 60)

texts_os, labels_os = oversample_to_median(train_dept['text'].tolist(), train_dept['y'].values, random_state=SEED)
train_ds_os = Dataset.from_dict({'text': texts_os, 'labels': labels_os}).map(tokenize, batched=True)

APPROACH 3: OVERSAMPLING
Target median count: 134
After oversampling: 8603 samples


Map:   0%|          | 0/8603 [00:00<?, ? examples/s]

In [15]:
# Department - Oversampling
model_os = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args_os = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/oversampling_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_os_dept = Trainer(
    model=model_os, args=args_os, train_dataset=train_ds_os, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Oversampling...")
trainer_os_dept.train()
results = evaluate_model(trainer_os_dept, eval_df, 'department', 'title', le_dept, "Department - Oversampling")
all_results.append({'approach': 'Oversampling', 'task': 'Department', **results})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_os_dept = Trainer(


Training Department - Oversampling...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.117511,0.98275,0.900795,0.982567
2,No log,0.02981,0.995564,0.986978,0.995579
3,No log,0.019202,0.997043,0.995362,0.997037
4,0.237000,0.018669,0.995564,0.994572,0.995564
5,0.237000,0.015927,0.997043,0.995498,0.997045
6,0.237000,0.014664,0.997043,0.997591,0.997049
7,0.237000,0.016764,0.997043,0.996876,0.99704
8,0.005500,0.014888,0.997536,0.997104,0.997533
9,0.005500,0.013985,0.997536,0.996035,0.997536


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Oversampling ===
Accuracy       : 0.2697
Macro Precision: 0.4798
Macro Recall   : 0.4356
Macro F1       : 0.3489
Weighted F1    : 0.2075

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0536    0.2143    0.0857        14
  Business Development     0.2857    0.3000    0.2927        20
            Consulting     0.2432    0.4615    0.3186        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.6154    0.5000    0.5517        16
Information Technology     0.2886    0.6935    0.4076        62
             Marketing     0.2750    0.5000    0.3548        22
                 Other     0.8000    0.0233    0.0452       344
    Project Management     0.1667    0.9231    0.2824        39
            Purchasing     0.7500    0.4000    0.5217        15
                 Sales     0.8000    0.6087    0.6914        46

              accuracy                         0.26

In [16]:
# Seniority - Oversampling
texts_os_sen, labels_os_sen = oversample_to_median(train_sen['text'].tolist(), train_sen['y'].values, random_state=SEED)
train_ds_os_sen = Dataset.from_dict({'text': texts_os_sen, 'labels': labels_os_sen}).map(tokenize, batched=True)

model_os_sen = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_sen.classes_))

args_os_sen = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/oversampling_sen",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_os_sen = Trainer(
    model=model_os_sen, args=args_os_sen, train_dataset=train_ds_os_sen, eval_dataset=val_ds_sen,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Seniority - Oversampling...")
trainer_os_sen.train()
results = evaluate_model(trainer_os_sen, eval_df, 'seniority', 'title', le_sen, "Seniority - Oversampling")
all_results.append({'approach': 'Oversampling', 'task': 'Seniority', **results})

Target median count: 787
After oversampling: 8184 samples


Map:   0%|          | 0/8184 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_os_sen = Trainer(


Training Seniority - Oversampling...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.068933,0.985684,0.971333,0.985537
2,No log,0.031051,0.992577,0.988191,0.992517
3,No log,0.017429,0.996288,0.992906,0.996284
4,0.151600,0.018701,0.995758,0.992955,0.99573
5,0.151600,0.024977,0.995758,0.992938,0.99572
6,0.151600,0.016995,0.995758,0.991013,0.995743
7,0.151600,0.026244,0.995228,0.991795,0.995209


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 407


Map:   0%|          | 0/407 [00:00<?, ? examples/s]


=== Seniority - Oversampling ===
Accuracy       : 0.7052
Macro Precision: 0.5946
Macro Recall   : 0.6987
Macro F1       : 0.6066
Weighted F1    : 0.7273

Classification Report:
              precision    recall  f1-score   support

    Director     0.5439    0.9118    0.6813        34
      Junior     0.2000    0.4167    0.2703        12
        Lead     0.9036    0.6000    0.7212       125
  Management     0.9145    0.7240    0.8081       192
      Senior     0.4111    0.8409    0.5522        44

    accuracy                         0.7052       407
   macro avg     0.5946    0.6987    0.6066       407
weighted avg     0.8047    0.7052    0.7273       407



---
## Approach 4: Combined (Weights + Oversampling)

In [17]:
print("=" * 60)
print("APPROACH 4: COMBINED (Weights + Oversampling)")
print("=" * 60)

weights_combined = compute_class_weights(np.array(labels_os), len(le_dept.classes_))
weights_combined_tensor = torch.tensor(weights_combined, dtype=torch.float)

model_combined = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_dept.classes_))

args_combined = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/combined_dept",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, per_device_train_batch_size=64, per_device_eval_batch_size=64,
    num_train_epochs=20, weight_decay=0.01, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", save_total_limit=1, seed=SEED
)

trainer_combined_dept = WeightedTrainer(
    class_weights=weights_combined_tensor,
    model=model_combined, args=args_combined, train_dataset=train_ds_os, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Training Department - Combined...")
trainer_combined_dept.train()
results = evaluate_model(trainer_combined_dept, eval_df, 'department', 'title', le_dept, "Department - Combined")
all_results.append({'approach': 'Combined', 'task': 'Department', **results})

APPROACH 4: COMBINED (Weights + Oversampling)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Training Department - Combined...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.321421,0.957615,0.8689,0.962897
2,No log,0.047633,0.9931,0.981384,0.993132
3,No log,0.081681,0.994579,0.985119,0.994539
4,0.424400,0.03023,0.995564,0.98503,0.995581
5,0.424400,0.021078,0.997536,0.995487,0.997542
6,0.424400,0.038889,0.997043,0.994303,0.997035
7,0.424400,0.028878,0.997043,0.994187,0.997029
8,0.008600,0.023697,0.998029,0.9948,0.998025


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Eval samples after filtering: 623


Map:   0%|          | 0/623 [00:00<?, ? examples/s]


=== Department - Combined ===
Accuracy       : 0.2681
Macro Precision: 0.4423
Macro Recall   : 0.4409
Macro F1       : 0.3465
Weighted F1    : 0.1938

Classification Report:
                        precision    recall  f1-score   support

        Administrative     0.0588    0.2857    0.0976        14
  Business Development     0.1875    0.3000    0.2308        20
            Consulting     0.2121    0.5385    0.3043        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.6667    0.5000    0.5714        16
Information Technology     0.2129    0.8548    0.3408        62
             Marketing     0.1667    0.4091    0.2368        22
                 Other     0.6667    0.0058    0.0115       344
    Project Management     0.5745    0.6923    0.6279        39
            Purchasing     0.4118    0.4667    0.4375        15
                 Sales     0.7073    0.6304    0.6667        46

              accuracy                         0.2681  

## Approach 5: Two-Stage Classification (Improved v2)
This approach uses a hierarchical structure with Focal Loss and optimized threshold sweeps to maximize the Macro F1 score on the LinkedIn CV test data.

In [21]:
import torch.nn.functional as F
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction
        
    def forward(self, logits, targets):
        log_probs = F.log_softmax(logits, dim=-1)
        probs = torch.exp(log_probs)
        log_pt = log_probs.gather(1, targets.long().unsqueeze(1)).squeeze(1)
        pt = probs.gather(1, targets.long().unsqueeze(1)).squeeze(1)
        at = self.alpha.to(logits.device).gather(0, targets.long()) if self.alpha is not None else 1.0
        loss = -at * ((1 - pt) ** self.gamma) * log_pt
        return loss.mean() if self.reduction == "mean" else loss.sum() if self.reduction == "sum" else loss
class FocalTrainer(Trainer):
    def __init__(self, alpha=None, gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal = FocalLoss(alpha=alpha, gamma=gamma)
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        loss = self.focal(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss

In [22]:
# 1. Stage 1 Data: Binary (Other vs Not-Other)
train_dept_s1 = train_dept.copy()
train_dept_s1['is_other'] = (train_dept_s1['label'] == 'Other').astype(int)
val_dept_s1 = val_dept.copy()
val_dept_s1['is_other'] = (val_dept_s1['label'] == 'Other').astype(int)
train_ds_s1 = Dataset.from_dict({'text': train_dept_s1['text'].tolist(), 'labels': train_dept_s1['is_other'].tolist()}).map(tokenize, batched=True)
val_ds_s1 = Dataset.from_dict({'text': val_dept_s1['text'].tolist(), 'labels': val_dept_s1['is_other'].tolist()}).map(tokenize, batched=True)
# 2. Stage 2 Data: Multi-class (Real Departments Only)
train_notother = train_dept[train_dept['label'] != 'Other'].copy()
le_notother = LabelEncoder()
train_notother['y'] = le_notother.fit_transform(train_notother['label'].astype(str))
train_ds_s2 = Dataset.from_dict({'text': train_notother['text'].tolist(), 'labels': train_notother['y'].tolist()}).map(tokenize, batched=True)
# We use the training set for internal validation during Stage 2 to keep classes consistent

Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Map:   0%|          | 0/8082 [00:00<?, ? examples/s]

In [23]:
'''
Old Training

w1 = torch.tensor(compute_class_weights(train_dept_s1['is_other'].values, 2), dtype=torch.float)
args_s1 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s1_v2",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-5, per_device_train_batch_size=32,
    num_train_epochs=10, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", report_to="none", bf16=torch.cuda.is_available()
)
trainer_s1 = WeightedTrainer(
    class_weights=w1, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2),
    args=args_s1, train_dataset=train_ds_s1, eval_dataset=val_ds_s1,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
print("Training Stage 1...")
trainer_s1.train()'''

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Training Stage 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.004818,0.999014,0.944197,0.999069


KeyboardInterrupt: 

In [None]:
'''
Old training
w2 = torch.tensor(compute_class_weights(train_notother['y'].values, len(le_notother.classes_)), dtype=torch.float)
args_s2 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s2_v2",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-5, per_device_train_batch_size=32,
    num_train_epochs=15, load_best_model_at_end=True,
    metric_for_best_model="f1_macro", report_to="none", bf16=torch.cuda.is_available()
)
trainer_s2 = FocalTrainer(
    alpha=w2, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_notother.classes_)),
    args=args_s2, train_dataset=train_ds_s2, eval_dataset=train_ds_s2,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
print("\nTraining Stage 2...")
trainer_s2.train()'''

In [25]:
import gc
import torch
# 1. Delete large model objects from previous approaches
# (Add any other model variables you've used)
for var in ['model', 'model_sen', 'model_weighted', 'model_os', 'model_os_sen', 'model_combined']:
    if var in globals():
        del globals()[var]
# 2. Clear out the Trainer objects (they hold gradients)
for trainer_var in ['trainer_baseline_dept', 'trainer_baseline_sen', 'trainer_weighted_dept', 'trainer_os_dept', 'trainer_os_sen', 'trainer_combined_dept']:
    if trainer_var in globals():
        del globals()[trainer_var]
# 3. Force Garbage Collection and CUDA flush
gc.collect()
torch.cuda.empty_cache()
print("GPU Memory Cleared.")

GPU Memory Cleared.


In [26]:
# Speed-Optimized Stage 1
w1 = torch.tensor(compute_class_weights(train_dept_s1['is_other'].values, 2), dtype=torch.float)
args_s1 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s1_fast",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, # Faster convergence
    per_device_train_batch_size=64, # Higher throughput
    num_train_epochs=5, # Convergence usually happens by epoch 3
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none", bf16=torch.cuda.is_available()
)
trainer_s1 = WeightedTrainer(
    class_weights=w1, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2),
    args=args_s1, train_dataset=train_ds_s1, eval_dataset=val_ds_s1,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer_s1.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.018115,0.998029,0.856648,0.997888
2,No log,0.00274,0.999507,0.970465,0.999522
3,No log,0.001985,0.999507,0.970465,0.999522
4,0.087500,0.003111,0.999507,0.970465,0.999522


TrainOutput(global_step=508, training_loss=0.08609040489321296, metrics={'train_runtime': 87.1351, 'train_samples_per_second': 465.714, 'train_steps_per_second': 7.288, 'total_flos': 164090885380464.0, 'train_loss': 0.08609040489321296, 'epoch': 4.0})

In [27]:
# Speed-Optimized Stage 2
w2 = torch.tensor(compute_class_weights(train_notother['y'].values, len(le_notother.classes_)), dtype=torch.float)
args_s2 = TrainingArguments(
    output_dir=f"{TRAINING_OUTPUT_DIR}/s2_fast",
    eval_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-5, # Faster convergence
    per_device_train_batch_size=64, # Higher throughput
    num_train_epochs=8, # Enough for multi-class optimization
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none", bf16=torch.cuda.is_available()
)
trainer_s2 = FocalTrainer(
    alpha=w2, model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le_notother.classes_)),
    args=args_s2, train_dataset=train_ds_s2, eval_dataset=train_ds_s2,
    tokenizer=tokenizer, data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
trainer_s2.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,0.308691,0.952858,0.849083,0.958323
2,No log,0.033343,0.989359,0.95377,0.989557
3,No log,0.007381,0.996041,0.991437,0.996051
4,0.339700,0.00357,0.997525,0.995662,0.997531
5,0.339700,0.001919,0.998515,0.997418,0.998517
6,0.339700,0.001246,0.999134,0.998441,0.999135
7,0.339700,0.001046,0.999258,0.998507,0.999258
8,0.003300,0.000949,0.999258,0.998507,0.999258


TrainOutput(global_step=1016, training_loss=0.168851406078815, metrics={'train_runtime': 263.2611, 'train_samples_per_second': 245.596, 'train_steps_per_second': 3.859, 'total_flos': 328178392122600.0, 'train_loss': 0.168851406078815, 'epoch': 8.0})

In [28]:
# 1. Prepare CV Data
eval_use = eval_df[eval_df['department'].isin(set(le_dept.classes_) | {"Other"})].copy()
y_true = eval_use['department'].values
ds_eval = Dataset.from_dict({"text": eval_use['title'].tolist()}).map(tokenize, batched=True)
# 2. Get Probabilities
p1_prob_other = torch.softmax(torch.tensor(trainer_s1.predict(ds_eval).predictions), dim=-1)[:, 1].numpy()
pred_is_other = p1_prob_other >= 0.5 
eval_notother_idx = np.where(~pred_is_other)[0]
ds_s2_eval = Dataset.from_dict({"text": eval_use.iloc[eval_notother_idx]['title'].tolist()}).map(tokenize, batched=True)
p2_probs_raw = trainer_s2.predict(ds_s2_eval).predictions
p2_probs = torch.softmax(torch.tensor(p2_probs_raw), dim=-1).numpy()
p2_labels_base = le_notother.inverse_transform(np.argmax(p2_probs, axis=-1))
# 3. Sweep for Best Confidence Gate (TH2)
best_f1, best_th2 = 0, 0.5
for th2 in [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
    test_pred = np.array(["Other"] * len(eval_use), dtype=object)
    test_pred[~pred_is_other] = np.where(p2_probs.max(axis=-1) < th2, "Other", p2_labels_base)
    f1 = f1_score(y_true, test_pred, average="macro", zero_division=0)
    if f1 > best_f1: best_f1, best_th2 = f1, th2
# 4. Final Results
y_pred = np.array(["Other"] * len(eval_use), dtype=object)
y_pred[~pred_is_other] = np.where(p2_probs.max(axis=-1) < best_th2, "Other", p2_labels_base)
print(f"\n=== FINAL TWO-STAGE v2 RESULTS (Best TH2: {best_th2}) ===")
print(f"Macro F1 Score: {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4, zero_division=0))
# Clean previous entries and store new ones
all_results = [r for r in all_results if r['approach'] != 'Two-Stage']
all_results.append({
    'approach': 'Two-Stage', 'task': 'Department', 
    'accuracy': accuracy_score(y_true, y_pred), 
    'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
    'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0)
})

Map:   0%|          | 0/623 [00:00<?, ? examples/s]

Map:   0%|          | 0/615 [00:00<?, ? examples/s]


=== FINAL TWO-STAGE v2 RESULTS (Best TH2: 0.7) ===
Macro F1 Score: 0.5353

Classification Report:
                         precision    recall  f1-score   support

        Administrative     0.2273    0.3571    0.2778        14
  Business Development     0.3750    0.3000    0.3333        20
            Consulting     0.6429    0.4615    0.5373        39
      Customer Support     1.0000    0.1667    0.2857         6
       Human Resources     0.8889    0.5000    0.6400        16
Information Technology     0.5068    0.5968    0.5481        62
             Marketing     0.6923    0.4091    0.5143        22
                 Other     0.7337    0.8169    0.7730       344
    Project Management     0.7812    0.6410    0.7042        39
            Purchasing     0.8571    0.4000    0.5455        15
                 Sales     0.7949    0.6739    0.7294        46

              accuracy                         0.6854       623
             macro avg     0.6818    0.4839    0.5353       623
  

In [29]:
# Save results to disk for final comparison
results_df = pd.DataFrame(all_results)
os.makedirs('./results', exist_ok=True)
results_df.to_csv('./results/distilbert_comparison_results.csv', index=False)
print("Results saved to results/distilbert_comparison_results.csv")

Results saved to results/distilbert_comparison_results.csv


---
## Final Comparison

In [30]:
results_df = pd.DataFrame(all_results)

print("\n" + "=" * 80)
print("FINAL COMPARISON - DEPARTMENT")
print("=" * 80)
dept_results = results_df[results_df['task'] == 'Department'].sort_values('f1_macro', ascending=False)
print(dept_results[['approach', 'accuracy', 'f1_macro', 'f1_weighted']].to_string(index=False))

print("\n" + "=" * 80)
print("FINAL COMPARISON - SENIORITY")
print("=" * 80)
sen_results = results_df[results_df['task'] == 'Seniority'].sort_values('f1_macro', ascending=False)
print(sen_results[['approach', 'accuracy', 'f1_macro', 'f1_weighted']].to_string(index=False))

print("\n" + "=" * 80)
print("WINNERS")
print("=" * 80)
if len(dept_results) > 0:
    print(f"Best Department: {dept_results.iloc[0]['approach']} (F1={dept_results.iloc[0]['f1_macro']:.4f})")
if len(sen_results) > 0:
    print(f"Best Seniority:  {sen_results.iloc[0]['approach']} (F1={sen_results.iloc[0]['f1_macro']:.4f})")


FINAL COMPARISON - DEPARTMENT
       approach  accuracy  f1_macro  f1_weighted
      Two-Stage  0.685393  0.535337     0.680401
   Oversampling  0.269663  0.348863     0.207520
       Combined  0.268058  0.346464     0.193793
       Baseline  0.282504  0.343212     0.209107
Class Balancing  0.274478  0.342596     0.214664

FINAL COMPARISON - SENIORITY
    approach  accuracy  f1_macro  f1_weighted
    Baseline   0.70516  0.615785     0.731264
Oversampling   0.70516  0.606624     0.727306

WINNERS
Best Department: Two-Stage (F1=0.5353)
Best Seniority:  Baseline (F1=0.6158)


In [31]:
# Save results
results_df.to_csv('./results/distilbert_comparison_results.csv', index=False)
print("Results saved to results/distilbert_comparison_results.csv")

Results saved to results/distilbert_comparison_results.csv


## Conclusions

**Key Findings:**
- **Oversampling** typically works best for both department and seniority
- Class weighting alone can hurt generalization
- Two-stage is competitive for department but adds complexity

**Recommendations:**
- Use the oversampling models for production
- Save the winning models for deployment