# 08.1 DistilBERT Unfiltered Evaluation (Compatibility Mode)

This notebook replicates the structure of `08_distilbert_comparison.ipynb` but performs an **unfiltered evaluation** on the annotated dataset: every position whose status is `ACTIVE` is scored, with no filtering by label.

**Key differences from 08:**
1.  **Skip Training**: No model is trained here. Each approach loads the latest existing checkpoint of its run; if no checkpoint is found, that approach is skipped.
2.  **Unfiltered Metrics**: Evaluation includes all samples (e.g., 'Professional' class in Seniority) to reflect real-world accuracy.
3.  **Compatibility**: Generates `results/distilbert_comparison_results.csv` in the same format so `99_final_comparison.ipynb` can use these "hard" metrics.

## 1. Setup & Data Loading

In [None]:
import os, json, glob
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

# Turn off the Weights & Biases logging integration. (The Trainer output below
# reports this variable as deprecated in favour of `report_to`.)
os.environ["WANDB_DISABLED"] = "true"

# Report the available accelerator once, up front.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: False


In [None]:
def oversample_to_median(texts, labels, random_state=42):
    """Oversample minority classes up to the median class size.

    Used for replication, though we skip training.

    Every class with fewer samples than the median class count is topped up by
    drawing (with replacement) from its own samples; classes at or above the
    median are left unchanged. The combined result is shuffled.

    Args:
        texts: Sequence of input texts.
        labels: Sequence of class labels aligned with ``texts``.
        random_state: Seed for the resampling and the final shuffle.

    Returns:
        Tuple ``(texts_resampled, labels_resampled)`` of two aligned lists.
        Two empty lists are returned for empty input.
    """
    texts = np.array(texts)
    labels = np.array(labels)

    # Empty input has no median class size: int(np.median([])) would raise.
    if len(labels) == 0:
        return [], []

    # A private legacy generator produces the same draws as np.random.seed()
    # followed by the module-level functions, but leaves the global RNG state
    # of the notebook untouched.
    rng = np.random.RandomState(random_state)

    unique_classes, counts = np.unique(labels, return_counts=True)
    median_count = int(np.median(counts))

    texts_resampled, labels_resampled = [], []
    for cls in unique_classes:
        cls_indices = np.where(labels == cls)[0]
        n_to_add = median_count - len(cls_indices)
        if n_to_add > 0:
            additional_indices = rng.choice(cls_indices, size=n_to_add, replace=True)
            all_indices = np.concatenate([cls_indices, additional_indices])
        else:
            all_indices = cls_indices
        texts_resampled.extend(texts[all_indices].tolist())
        labels_resampled.extend(labels[all_indices].tolist())

    # Shuffle text/label pairs together so they stay aligned.
    combined = list(zip(texts_resampled, labels_resampled))
    rng.shuffle(combined)
    texts_resampled, labels_resampled = zip(*combined)
    return list(texts_resampled), list(labels_resampled)

In [None]:
# Paths
# Training CSVs; in this notebook their 'label' columns are only used to fit label encoders.
DEPT_CSV = "../data/department-v2.csv"
SEN_CSV = "../data/seniority-v2.csv"
# Annotated CV JSON used as the evaluation set.
CV_ANN = "../data/linkedin-cvs-annotated.json"
# Root directory searched for `<run>/checkpoint-*` folders by get_latest_checkpoint().
TRAINING_OUTPUT_DIR = "./results/distilbert_training"

# Hub model id used to load the tokenizer.
MODEL_NAME = "distilbert-base-multilingual-cased"
# Tokenizer truncation length (passed as max_length).
MAX_LEN = 64
# NOTE(review): SEED is not referenced in the visible cells — presumably kept for parity with notebook 08.
SEED = 42

In [None]:
# Training CSVs: only needed to fit the label encoders.
dept_df, sen_df = pd.read_csv(DEPT_CSV), pd.read_csv(SEN_CSV)

# Annotated CVs, loaded without any label-based filtering.
with open(CV_ANN, 'r', encoding='utf-8') as f:
    ann = json.load(f)

# Flatten the per-CV entries into one flat list of position records.
positions = []
for cv in ann:
    positions.extend(cv)

# Normalise the status text, then keep only the rows marked ACTIVE.
eval_df = pd.DataFrame(positions)
eval_df = eval_df.assign(status=eval_df['status'].astype(str).str.upper())
eval_df = eval_df.loc[eval_df['status'] == 'ACTIVE'].copy()

# Whitespace-stripped string versions of the text and label columns.
eval_df = eval_df.assign(
    title=eval_df['position'].astype(str).str.strip(),
    department=eval_df['department'].astype(str).str.strip(),
    seniority=eval_df['seniority'].astype(str).str.strip(),
)

print(f"Total unfiltered eval samples: {len(eval_df)}")

Total unfiltered eval samples: 623


## Helper Functions

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro/weighted F1 from a ``(logits, labels)`` pair.

    The predicted class is the arg-max over the last axis of the logits.
    F1 scores use ``zero_division=0``.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1_macro'`` and ``'f1_weighted'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average='macro', zero_division=0)
    weighted_f1 = f1_score(labels, predicted, average='weighted', zero_division=0)

    return {'accuracy': accuracy, 'f1_macro': macro_f1, 'f1_weighted': weighted_f1}

def get_latest_checkpoint(folder, base_dir=None):
    """Return the checkpoint directory with the highest step number for a run.

    Entries that are not directories, or whose name does not end in a numeric
    step (for example ``checkpoint-best``), are ignored instead of raising.

    Args:
        folder: Run sub-directory name (e.g. ``"baseline_dept"``).
        base_dir: Directory containing the run folders. Defaults to
            ``TRAINING_OUTPUT_DIR``.

    Returns:
        Path of the ``checkpoint-<step>`` directory with the largest ``<step>``,
        or ``None`` if the run folder holds no such directory.
    """
    if base_dir is None:
        base_dir = TRAINING_OUTPUT_DIR
    run_dir = os.path.join(base_dir, folder)

    def step_of(path):
        # "checkpoint-889" -> 889; None when the suffix is not a number.
        suffix = os.path.basename(path).rsplit('-', 1)[-1]
        return int(suffix) if suffix.isdecimal() else None

    candidates = [
        path
        for path in glob.glob(os.path.join(run_dir, "checkpoint-*"))
        if os.path.isdir(path) and step_of(path) is not None
    ]
    if not candidates:
        return None
    # Numeric ordering, so checkpoint-100 ranks above checkpoint-9.
    return sorted(candidates, key=step_of)[-1]

def evaluate_model_unfiltered(model_path, eval_df, label_col, text_col, label_encoder, task_name):
    """Score a saved classifier on every row of ``eval_df`` (no label filtering).

    The model is loaded from ``model_path`` and the tokenizer from ``MODEL_NAME``.
    Predicted class ids are mapped back to label strings with
    ``label_encoder.inverse_transform`` and compared against the string values
    of ``eval_df[label_col]``.

    Args:
        model_path: Checkpoint directory passed to ``from_pretrained``.
        eval_df: Evaluation frame containing ``text_col`` and ``label_col``.
        label_col: Name of the ground-truth label column.
        text_col: Name of the input text column.
        label_encoder: Fitted encoder used to decode predicted ids.
        task_name: Label used in the printed report.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1_macro'`` and ``'f1_weighted'``.
    """
    print(f"Evaluating {task_name} UNFILTERED (n={len(eval_df)})")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    gold_labels = eval_df[label_col].astype(str).values

    def encode(batch):
        return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)

    input_texts = eval_df[text_col].astype(str).tolist()
    encoded_ds = Dataset.from_dict({'text': input_texts}).map(encode, batched=True)

    # A default Trainer is used purely as a batched inference runner.
    prediction_output = Trainer(model=model, tokenizer=tokenizer).predict(encoded_ds)
    predicted_ids = np.argmax(prediction_output.predictions, axis=-1)
    predicted_labels = label_encoder.inverse_transform(predicted_ids)

    acc = accuracy_score(gold_labels, predicted_labels)
    f1_m = f1_score(gold_labels, predicted_labels, average='macro', zero_division=0)
    f1_w = f1_score(gold_labels, predicted_labels, average='weighted', zero_division=0)

    print(f"\n=== {task_name} (UNFILTERED) ===")
    print(f"Accuracy       : {acc:.4f}")
    print(f"Macro F1       : {f1_m:.4f}")
    print(f"Weighted F1    : {f1_w:.4f}")

    return {'accuracy': acc, 'f1_macro': f1_m, 'f1_weighted': f1_w}

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, class_weights=None, *args, **kwargs):
        # The weights are stored before the base initialiser runs.
        self.class_weights = class_weights
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the (optionally weighted) cross-entropy loss for one batch.

        ``"labels"`` is removed from ``inputs`` before the forward pass, so the
        model itself is called without labels. Extra keyword arguments are
        accepted and ignored.
        """
        targets = inputs.pop("labels")
        model_out = model(**inputs)
        scores = model_out.logits

        # Move the class weights to the logits' device; None means unweighted.
        weight = None if self.class_weights is None else self.class_weights.to(scores.device)
        loss = F.cross_entropy(scores, targets, weight=weight)

        if return_outputs:
            return loss, model_out
        return loss

def compute_class_weights(y_int, num_classes):
    """Return balanced class weights ``total / (num_classes * count)``.

    Classes with no samples are treated as having a count of 1 in the
    denominator, which avoids division by zero.

    Args:
        y_int: Integer class ids.
        num_classes: Minimum number of classes to produce a weight for.

    Returns:
        NumPy array with one weight per class.
    """
    per_class = np.bincount(y_int, minlength=num_classes)
    safe_counts = np.maximum(per_class, 1)
    return per_class.sum() / (num_classes * safe_counts)

class FocalLoss(torch.nn.Module):
    """Multi-class focal loss computed from raw logits.

    The per-sample loss is ``-alpha[target] * (1 - p_t) ** gamma * log(p_t)``,
    where ``p_t`` is the softmax probability of the target class. When
    ``alpha`` is None the alpha factor is 1.0. ``reduction`` selects
    ``"mean"``, ``"sum"``, or (any other value) the unreduced per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        target_idx = targets.long()
        picker = target_idx.unsqueeze(1)

        log_probs = F.log_softmax(logits, dim=-1)
        log_pt = log_probs.gather(1, picker).squeeze(1)
        pt = torch.exp(log_probs).gather(1, picker).squeeze(1)

        # Per-sample alpha, looked up by target class.
        if self.alpha is None:
            at = 1.0
        else:
            at = self.alpha.to(logits.device).gather(0, target_idx)

        per_sample = -at * ((1 - pt) ** self.gamma) * log_pt

        if self.reduction == "mean":
            return per_sample.mean()
        if self.reduction == "sum":
            return per_sample.sum()
        return per_sample

class FocalTrainer(Trainer):
    """Trainer that optimises a ``FocalLoss`` instead of the model's own loss."""

    def __init__(self, alpha=None, gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal = FocalLoss(alpha=alpha, gamma=gamma)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the focal loss for one batch.

        ``"labels"`` is read from ``inputs`` but left in place, so the model is
        still called with the labels. Extra keyword arguments are accepted and
        ignored.
        """
        model_out = model(**inputs)
        focal_loss = self.focal(model_out.get("logits"), inputs.get("labels"))
        if return_outputs:
            return focal_loss, model_out
        return focal_loss

In [None]:
# Shared tokenizer for the base model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize the ``'text'`` field of a batch, truncating to ``MAX_LEN`` tokens."""
    return tokenizer(batch['text'], truncation=True, max_length=MAX_LEN)


# One metrics dict per (approach, task); filled in by the approach cells below.
all_results = []

---
## Approach 1: Baseline (Standard Fine-Tuning)

In [None]:
print("=" * 60)
print("APPROACH 1: BASELINE")
print("=" * 60)

# Label encoders are fitted on the training labels and used to decode predicted ids.
le_dept = LabelEncoder().fit(dept_df['label'].astype(str))
le_sen = LabelEncoder().fit(sen_df['label'].astype(str))

# (checkpoint folder, label column, label encoder, task)
baseline_runs = [
    ("baseline_dept", 'department', le_dept, 'Department'),
    ("baseline_sen", 'seniority', le_sen, 'Seniority'),
]
for folder, label_col, encoder, task in baseline_runs:
    checkpoint = get_latest_checkpoint(folder)
    if not checkpoint:
        print(f"Checkpoint {folder} NOT FOUND. Skipping...")
        continue
    print(f"Loading {folder} from {checkpoint}")
    res = evaluate_model_unfiltered(checkpoint, eval_df, label_col, 'title', encoder, f"{task} - Baseline")
    all_results.append({'approach': 'Baseline', 'task': task, **res})

APPROACH 1: BASELINE
Loading baseline_dept from ./results/distilbert_training\baseline_dept\checkpoint-889
Evaluating Department - Baseline UNFILTERED (n=623)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



=== Department - Baseline (UNFILTERED) ===
Accuracy       : 0.2777
Macro F1       : 0.3274
Weighted F1    : 0.2108
Loading baseline_sen from ./results/distilbert_training\baseline_sen\checkpoint-826
Evaluating Seniority - Baseline UNFILTERED (n=623)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



=== Seniority - Baseline (UNFILTERED) ===
Accuracy       : 0.4510
Macro F1       : 0.4039
Weighted F1    : 0.4191


---
## Approach 2: Class Balancing (Weighted Loss)

In [None]:
print("=" * 60)
print("APPROACH 2: CLASS BALANCING")
print("=" * 60)

# Department - Class Balancing
checkpoint = get_latest_checkpoint("weighted_dept")
if checkpoint:
    print(f"Loading weighted_dept from {checkpoint}")
    res = evaluate_model_unfiltered(checkpoint, eval_df, 'department', 'title', le_dept, "Department - Class Balancing")
    all_results.append({'approach': 'Class Balancing', 'task': 'Department', **res})
else:
    # Same message as the baseline cell, so a missing run shows up in the output.
    print("Checkpoint weighted_dept NOT FOUND. Skipping...")

# Seniority - Class Balancing
# No class-balanced seniority checkpoint is evaluated in this cell. The Baseline
# seniority metrics are copied so the results table keeps one row per
# approach/task (note from the original cell: Notebook 08 usually uses the same
# baseline for the seniority result if not specifically trained). The copy is
# announced so the identical numbers are not mistaken for a separate model.
baseline_sen = next(
    (r for r in all_results if r['approach'] == 'Baseline' and r['task'] == 'Seniority'),
    None,
)
if baseline_sen is not None:
    print("Seniority - Class Balancing: reusing Baseline seniority metrics (no separate model evaluated).")
    all_results.append({
        'approach': 'Class Balancing', 'task': 'Seniority',
        'accuracy': baseline_sen['accuracy'],
        'f1_macro': baseline_sen['f1_macro'],
        'f1_weighted': baseline_sen['f1_weighted'],
    })
else:
    print("Baseline seniority metrics NOT FOUND. Skipping Seniority - Class Balancing...")

APPROACH 2: CLASS BALANCING
Loading weighted_dept from ./results/distilbert_training\weighted_dept\checkpoint-889
Evaluating Department - Class Balancing UNFILTERED (n=623)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



=== Department - Class Balancing (UNFILTERED) ===
Accuracy       : 0.2841
Macro F1       : 0.3377
Weighted F1    : 0.2029


---
## Approach 3: Oversampling

In [None]:
print("=" * 60)
print("APPROACH 3: OVERSAMPLING")
print("=" * 60)

# (checkpoint folder, label column, label encoder, task)
oversampling_runs = [
    ("oversampling_dept", 'department', le_dept, 'Department'),
    ("oversampling_sen", 'seniority', le_sen, 'Seniority'),
]
for folder, label_col, encoder, task in oversampling_runs:
    checkpoint = get_latest_checkpoint(folder)
    if not checkpoint:
        # Same message as the baseline cell, so a missing run shows up in the output.
        print(f"Checkpoint {folder} NOT FOUND. Skipping...")
        continue
    print(f"Loading {folder} from {checkpoint}")
    res = evaluate_model_unfiltered(checkpoint, eval_df, label_col, 'title', encoder, f"{task} - Oversampling")
    all_results.append({'approach': 'Oversampling', 'task': task, **res})

APPROACH 3: OVERSAMPLING
Loading oversampling_dept from ./results/distilbert_training\oversampling_dept\checkpoint-810
Evaluating Department - Oversampling UNFILTERED (n=623)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



=== Department - Oversampling (UNFILTERED) ===
Accuracy       : 0.2761
Macro F1       : 0.3437
Weighted F1    : 0.2005
Loading oversampling_sen from ./results/distilbert_training\oversampling_sen\checkpoint-1152
Evaluating Seniority - Oversampling UNFILTERED (n=623)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



=== Seniority - Oversampling (UNFILTERED) ===
Accuracy       : 0.4639
Macro F1       : 0.4143
Weighted F1    : 0.4122


---
## Approach 4: Combined (Weights + Oversampling)

In [None]:
print("=" * 60)
print("APPROACH 4: COMBINED")
print("=" * 60)

# Department - Combined
checkpoint = get_latest_checkpoint("combined_dept")
if checkpoint:
    print(f"Loading combined_dept from {checkpoint}")
    res = evaluate_model_unfiltered(checkpoint, eval_df, 'department', 'title', le_dept, "Department - Combined")
    all_results.append({'approach': 'Combined', 'task': 'Department', **res})
else:
    # Same message as the baseline cell, so a missing run shows up in the output.
    print("Checkpoint combined_dept NOT FOUND. Skipping...")

# Seniority - Combined
# No combined seniority checkpoint is evaluated in this cell. The Oversampling
# seniority metrics are copied so the results table keeps one row per
# approach/task. The copy is announced so the identical numbers are not
# mistaken for a separate model.
oversampling_sen = next(
    (r for r in all_results if r['approach'] == 'Oversampling' and r['task'] == 'Seniority'),
    None,
)
if oversampling_sen is not None:
    print("Seniority - Combined: reusing Oversampling seniority metrics (no separate model evaluated).")
    all_results.append({
        'approach': 'Combined', 'task': 'Seniority',
        'accuracy': oversampling_sen['accuracy'],
        'f1_macro': oversampling_sen['f1_macro'],
        'f1_weighted': oversampling_sen['f1_weighted'],
    })
else:
    print("Oversampling seniority metrics NOT FOUND. Skipping Seniority - Combined...")

APPROACH 4: COMBINED
Loading combined_dept from ./results/distilbert_training\combined_dept\checkpoint-1350
Evaluating Department - Combined UNFILTERED (n=623)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



=== Department - Combined (UNFILTERED) ===
Accuracy       : 0.2825
Macro F1       : 0.3726
Weighted F1    : 0.2100


---
## Approach 5: Two-Stage Classification (Department Only)

In [None]:
# Two-stage decision thresholds.
TH1 = 0.5  # stage 1: probability of class index 1 at or above this routes the sample to "Other"
TH2 = 0.7  # stage 2: minimum top-class probability required to accept a department

# Prefer the "v2" runs and fall back to the "fast" runs.
checkpoint_s1 = get_latest_checkpoint("s1_v2") or get_latest_checkpoint("s1_fast")
checkpoint_s2 = get_latest_checkpoint("s2_v2") or get_latest_checkpoint("s2_fast")

if checkpoint_s1 and checkpoint_s2:
    print("\nEvaluating Two-Stage (UNFILTERED)")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    ds = Dataset.from_dict({'text': eval_df['title'].tolist()})
    def tok(batch): return tokenizer(batch['text'], truncation=True, padding=True, max_length=MAX_LEN)
    ds = ds.map(tok, batched=True, remove_columns=['text'])

    # S1: binary gate.
    # NOTE(review): assumes class index 1 of the stage-1 model means "Other" — confirm against the training notebook.
    model_s1 = AutoModelForSequenceClassification.from_pretrained(checkpoint_s1)
    trainer_s1 = Trainer(model=model_s1, tokenizer=tokenizer)
    p1_probs = torch.softmax(torch.tensor(trainer_s1.predict(ds).predictions), dim=-1)[:, 1].numpy()
    is_other = p1_probs >= TH1

    # S2: class probabilities over the departments.
    model_s2 = AutoModelForSequenceClassification.from_pretrained(checkpoint_s2)
    trainer_s2 = Trainer(model=model_s2, tokenizer=tokenizer)
    p2_probs = torch.softmax(torch.tensor(trainer_s2.predict(ds).predictions), dim=-1).numpy()

    # Encoders
    # NOTE(review): assumes stage-2 label ids follow a LabelEncoder fitted on the non-"Other" training labels — confirm.
    dept_no_other = dept_df[dept_df['label'] != 'Other']
    le_s2 = LabelEncoder()
    le_s2.fit(dept_no_other['label'].astype(str))

    # Logic: accept the stage-2 department only when stage 1 did not flag the
    # sample as "Other" and stage 2 is confident enough; otherwise predict "Other".
    # Decoded once for all rows instead of one inverse_transform call per row.
    s2_confidence = p2_probs.max(axis=1)
    s2_labels = le_s2.inverse_transform(p2_probs.argmax(axis=1))
    accept_s2 = ~is_other & (s2_confidence >= TH2)
    final_preds = np.where(accept_s2, s2_labels, "Other").astype(object)

    y_true = eval_df['department'].tolist()
    acc = accuracy_score(y_true, final_preds)
    f1_m = f1_score(y_true, final_preds, average='macro', zero_division=0)
    f1_w = f1_score(y_true, final_preds, average='weighted', zero_division=0)

    all_results.append({
        'approach': 'Two-Stage', 'task': 'Department',
        'accuracy': acc, 'f1_macro': f1_m, 'f1_weighted': f1_w
    })
    print(f"Two-Stage Accuracy (UNFILTERED): {acc:.4f}")
else:
    # Report which stage is missing instead of skipping silently.
    missing_stages = [
        stage for stage, ckpt in (
            ("stage 1 (s1_v2 / s1_fast)", checkpoint_s1),
            ("stage 2 (s2_v2 / s2_fast)", checkpoint_s2),
        ) if not ckpt
    ]
    print(f"Two-Stage checkpoint(s) NOT FOUND: {', '.join(missing_stages)}. Skipping...")


Evaluating Two-Stage (UNFILTERED)


Map:   0%|          | 0/623 [00:00<?, ? examples/s]

  trainer_s1 = Trainer(model=model_s1, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  trainer_s2 = Trainer(model=model_s2, tokenizer=tokenizer)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Two-Stage Accuracy (UNFILTERED): 0.6806


## Final Comparison & Analysis

In [None]:
# Save results for Notebook 99 compatibility
RESULT_COLUMNS = ['approach', 'task', 'accuracy', 'f1_macro', 'f1_weighted']
RESULTS_CSV = "./results/distilbert_comparison_results.csv"

res_df = (
    # Explicit columns keep the CSV header stable even if no approach produced results.
    pd.DataFrame(all_results, columns=RESULT_COLUMNS)
    # Re-running an approach cell appends its rows again. Keep only the latest
    # row per (approach, task) so the CSV has no duplicates and pivot() does
    # not fail on duplicate index/column pairs.
    .drop_duplicates(subset=['approach', 'task'], keep='last')
    .reset_index(drop=True)
)

# Create the output directory if it is missing, so to_csv does not fail.
os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)
res_df.to_csv(RESULTS_CSV, index=False)
print("Results saved to results/distilbert_comparison_results.csv")

display(res_df.pivot(index='approach', columns='task', values='accuracy'))

Results saved to results/distilbert_comparison_results.csv


task,Department,Seniority
approach,Unnamed: 1_level_1,Unnamed: 2_level_1
Baseline,0.277689,0.451043
Class Balancing,0.284109,0.451043
Combined,0.282504,0.463884
Oversampling,0.276083,0.463884
Two-Stage,0.680578,
