# 05. Pseudo-Labeling with Transformers

This notebook covers:
1. Generating pseudo-labels for unannotated CVs using the embedding baseline.
2. Filtering for high-confidence predictions (Silver Data).
3. Combining Gold (lookups) + Silver (pseudo-labels) data.
4. Fine-tuning Transformer classifiers on the combined dataset.
5. Comprehensive evaluation on annotated CVs.

In [1]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt

import sys
import os
sys.path.append(os.path.abspath("../"))

from src.data.loader import load_label_lists, load_inference_dataset, load_evaluation_dataset, balance_dataset, prepare_dataset
from src.models.embedding_classifier import EmbeddingClassifier, create_domain_classifier, create_seniority_classifier
from src.models.transformer_classifier import TransformerClassifier

DATA_DIR = "../data"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


## 1. Load Data

In [2]:
# Load lookup tables (gold labels)
dept_df, sen_df = load_label_lists(DATA_DIR)

print(f"Gold-labeled data (lookup tables):\n  Department: {len(dept_df):,} examples\n  Seniority:  {len(sen_df):,} examples")

Applying encoding fix...
  Deduplication: 10145 -> 10145 (removed 0 duplicates)
  Deduplication: 9428 -> 9428 (removed 0 duplicates)
Gold-labeled data (lookup tables):
  Department: 10,145 examples
  Seniority:  9,428 examples


In [3]:
# Load unannotated CVs for pseudo-labeling
inference_df = load_inference_dataset(DATA_DIR)

print(f"\nUnannotated LinkedIn CVs: {len(inference_df)} positions")
print("These will be pseudo-labeled using the embedding baseline")


Unannotated LinkedIn CVs: 314 positions
These will be pseudo-labeled using the embedding baseline


## 2. Generate Pseudo-Labels Using Embedding Baseline


In [None]:
# Create embedding classifiers using factory functions (same as notebook 03)
print("Creating embedding classifiers for pseudo-labeling...")
dept_emb = create_domain_classifier(dept_df, use_examples=True)
sen_emb = create_seniority_classifier(sen_df, use_examples=True)
print("Embedding classifiers ready")

Creating embedding classifiers for pseudo-labeling...
Loading model 'paraphrase-multilingual-MiniLM-L12-v2' on cuda...
Model loaded successfully!
Fitted from examples: 11 labels, shape (11, 384)
Loading model 'paraphrase-multilingual-MiniLM-L12-v2' on cuda...
Model loaded successfully!
Fitted from examples: 5 labels, shape (5, 384)
‚úÖ Embedding classifiers ready


In [None]:
# Generate pseudo-labels with confidence scores
print("\nGenerating pseudo-labels for unannotated CVs...")
inference_texts = inference_df['text'].tolist()

dept_results = dept_emb.predict_with_confidence(inference_texts)
dept_pseudo_labels = [res[0] for res in dept_results]
dept_conf = [res[1] for res in dept_results]
sen_results = sen_emb.predict_with_confidence(inference_texts)
sen_pseudo_labels = [res[0] for res in sen_results]
sen_conf = [res[1] for res in sen_results]

inference_df['pseudo_dept'] = dept_pseudo_labels
inference_df['dept_conf'] = dept_conf
inference_df['pseudo_sen'] = sen_pseudo_labels
inference_df['sen_conf'] = sen_conf

print(f"Generated {len(inference_df)} pseudo-labels")
print(f"    Confidence statistics:\n  Dept - Mean: {np.mean(dept_conf):.3f}, Median: {np.median(dept_conf):.3f}")
print(f"    Sen  - Mean: {np.mean(sen_conf):.3f}, Median: {np.median(sen_conf):.3f}")


Generating pseudo-labels for unannotated CVs...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

‚úÖ Generated 314 pseudo-labels

Confidence statistics:
  Dept - Mean: 0.575, Median: 0.592
  Sen  - Mean: 0.568, Median: 0.587


## 3. Filter for High-Confidence Predictions


In [None]:
# Filter department pseudo-labels
CONFIDENCE_THRESHOLD = 0.85

dept_silver = inference_df[inference_df['dept_conf'] >= CONFIDENCE_THRESHOLD][['text', 'pseudo_dept']].copy()
dept_silver.columns = ['title', 'label']

sen_silver = inference_df[inference_df['sen_conf'] >= CONFIDENCE_THRESHOLD][['text', 'pseudo_sen']].copy()
sen_silver.columns = ['title', 'label']

print(f"High-confidence pseudo-labels (>{CONFIDENCE_THRESHOLD}):")
print(f"    Department: {len(dept_silver)} / {len(inference_df)} ({len(dept_silver)/len(inference_df):.1%})")
print(f"  Seniority:  {len(sen_silver)} / {len(inference_df)} ({len(sen_silver)/len(inference_df):.1%})")

High-confidence pseudo-labels (>0.85):
  Department: 2 / 314 (0.6%)
  Seniority:  0 / 314 (0.0%)


## 4. Combine Gold + Silver Data

In [None]:
# Combine lookup tables (gold) with pseudo-labeled CVs (silver)
dept_gold_df = dept_df[['text', 'label']].rename(columns={'text': 'title'})
dept_combined = pd.concat([dept_gold_df, dept_silver], ignore_index=True)

sen_gold_df = sen_df[['text', 'label']].rename(columns={'text': 'title'})
sen_combined = pd.concat([sen_gold_df, sen_silver], ignore_index=True)

print(f"Combined datasets:\n")
print(f"Department:\n  Gold:   {len(dept_gold_df):,}\n  Silver: {len(dept_silver):,}\n  Total:  {len(dept_combined):,}")
print(f"Seniority:\n  Gold:   {len(sen_gold_df):,}\n  Silver: {len(sen_silver):,}\n  Total:  {len(sen_combined):,}")

Combined datasets:

Department:
  Gold:   10,145
  Silver: 2
  Total:  10,147

Seniority:
  Gold:   9,428
  Silver: 0
  Total:  9,428


In [None]:
# Apply Data Balancing
print("Balancing combined department data...")
dept_balanced, _ = balance_dataset(dept_combined, min_samples=500, max_samples=1000)

print("\nBalancing combined seniority data...")
sen_balanced, _ = balance_dataset(sen_combined, min_samples=500, max_samples=1000)

print(f"Balanced Department Total: {len(dept_balanced):,}")
print(f"Balanced Seniority Total:  {len(sen_balanced):,}")

Balancing combined department data...
Balancing: 10147 -> 7120 samples
  Class distribution: {'Marketing': 1000, 'Sales': 1000, 'Information Technology': 1000, 'Business Development': 620, 'Project Management': 500, 'Consulting': 500, 'Administrative': 500, 'Other': 500, 'Purchasing': 500, 'Customer Support': 500, 'Human Resources': 500}

Balancing combined seniority data...
Balancing: 9428 -> 4240 samples
  Class distribution: {'Senior': 1000, 'Lead': 1000, 'Director': 984, 'Management': 756, 'Junior': 500}

Balanced Department Total: 7,120
Balanced Seniority Total:  4,240


## 5. Train/Val Split on Combined Data

In [None]:
# Split combined data (80/20)
dept_train_texts, dept_val_texts, dept_train_labels, dept_val_labels = train_test_split(
    dept_balanced['title'].tolist(), 
    dept_balanced['label'].tolist(), 
    test_size=0.2, 
    random_state=42
)

sen_train_texts, sen_val_texts, sen_train_labels, sen_val_labels = train_test_split(
    sen_balanced['title'].tolist(), 
    sen_balanced['label'].tolist(), 
    test_size=0.2, 
    random_state=42
)

print(f"Department split:\n  Train: {len(dept_train_texts):,}\n  Val:   {len(dept_val_texts):,}")
print(f"Seniority split:\n  Train: {len(sen_train_texts):,}\n  Val:   {len(sen_val_texts):,}")

Department split:
  Train: 5,696
  Val:   1,424

Seniority split:
  Train: 3,392
  Val:   848


## 6. Create Label Mappings

In [10]:
# Department label mappings
dept_unique_labels = sorted(dept_combined['label'].unique())
dept_label2id = {label: i for i, label in enumerate(dept_unique_labels)}
dept_id2label = {i: label for label, i in dept_label2id.items()}

# Seniority label mappings
sen_unique_labels = sorted(sen_combined['label'].unique())
sen_label2id = {label: i for i, label in enumerate(sen_unique_labels)}
sen_id2label = {i: label for label, i in sen_label2id.items()}

print(f"Department: {len(dept_label2id)} classes")
print(f"Seniority: {len(sen_label2id)} classes")

Department: 11 classes
Seniority: 5 classes


## 7. Train Department Classifier

In [None]:
# Initialize department classifier
dept_clf = TransformerClassifier(
    num_labels=len(dept_label2id),
    label2id=dept_label2id, 
    id2label=dept_id2label
)

print(f"Training department classifier on combined data...")
dept_clf.train(
    texts=dept_train_texts, 
    labels=[dept_label2id[l] for l in dept_train_labels], 
    val_texts=dept_val_texts,
    val_labels=[dept_label2id[l] for l in dept_val_labels],
    output_dir="./models/dept_transformer_pseudo",
    epochs=3,
    learning_rate=1e-5
)
print("Department classifier (pseudo) training complete!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cuda
üöÄ Training department classifier on combined data...
Training on 5696 examples...


Epoch,Training Loss,Validation Loss
1,0.3191,0.190561
2,0.0564,0.055764
3,0.0426,0.049578


Training complete!

‚úÖ Department classifier (pseudo) training complete!


## 8. Train Seniority Classifier

In [None]:
# Initialize seniority classifier
sen_clf = TransformerClassifier(
    num_labels=len(sen_label2id),
    label2id=sen_label2id, 
    id2label=sen_id2label
)

print(f"Training seniority classifier on combined data...")
sen_clf.train(
    texts=sen_train_texts, 
    labels=[sen_label2id[l] for l in sen_train_labels], 
    val_texts=sen_val_texts,
    val_labels=[sen_label2id[l] for l in sen_val_labels],
    output_dir="./models/sen_transformer_pseudo",
    epochs=3,
    learning_rate=1e-5
)
print("Seniority classifier (pseudo) training complete!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cuda
üöÄ Training seniority classifier on combined data...
Training on 3392 examples...


Epoch,Training Loss,Validation Loss
1,0.3976,0.285258
2,0.1129,0.095368
3,0.0558,0.08065


Training complete!

‚úÖ Seniority classifier (pseudo) training complete!


## 9. Evaluation

### 9.1 In-Distribution Evaluation (Validation Set)

In [13]:
# Department In-Distribution Evaluation
dept_val_preds = dept_clf.predict_labels(dept_val_texts)
dept_id_acc = accuracy_score(dept_val_labels, dept_val_preds)
sen_val_preds = sen_clf.predict_labels(sen_val_texts)
sen_id_acc = accuracy_score(sen_val_labels, sen_val_preds)

print("=" * 60)
print("IN-DISTRIBUTION EVALUATION (Validation Set)")
print("=" * 60)
print(f"Department Accuracy (Val): {dept_id_acc:.4f}")
print(f"Seniority Accuracy (Val):  {sen_id_acc:.4f}")
print("=" * 60)

IN-DISTRIBUTION EVALUATION (Validation Set)
Department Accuracy (Val): 0.9937
Seniority Accuracy (Val):  0.9800


### 9.2 Real-World Evaluation (Annotated CVs)

‚ö†Ô∏è **LOADING ANNOTATED DATA FOR EVALUATION**

In [None]:
# Load annotated dataset for evaluation
eval_df = load_evaluation_dataset(DATA_DIR)

print(f"Loaded {len(eval_df)} annotated CV positions for evaluation")

üìä Loaded 478 annotated CV positions for evaluation


#### Department Evaluation

In [15]:
# Predict on evaluation set
eval_titles = eval_df['title'].tolist()
dept_predictions = dept_clf.predict_labels(eval_titles)

# Ground truth
dept_true = eval_df['department'].tolist()

# Calculate metrics
dept_accuracy = accuracy_score(dept_true, dept_predictions)
dept_precision, dept_recall, dept_f1, _ = precision_recall_fscore_support(
    dept_true, dept_predictions, average='macro', zero_division=0
)
dept_weighted_f1 = precision_recall_fscore_support(
    dept_true, dept_predictions, average='weighted', zero_division=0
)[2]

print("\n" + "="*60)
print("DEPARTMENT CLASSIFICATION RESULTS (Real-World)")
print("="*60)
print(f"Accuracy:          {dept_accuracy:.4f}")
print(f"Precision (macro): {dept_precision:.4f}")
print(f"Recall (macro):    {dept_recall:.4f}")
print(f"F1-score (macro):  {dept_f1:.4f}")
print(f"F1-score (wtd):    {dept_weighted_f1:.4f}")
print("="*60)

# Per-class F1 scores
dept_report = classification_report(dept_true, dept_predictions, output_dict=True, zero_division=0)

print("\nPer-Class F1 Scores (Department):")
dept_f1_scores = {label: metrics['f1-score'] for label, metrics in dept_report.items() 
                  if label not in ['accuracy', 'macro avg', 'weighted avg']}
for label, f1 in sorted(dept_f1_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"  {label:<30}: {f1:.4f}")


DEPARTMENT CLASSIFICATION RESULTS (Real-World)
Accuracy:          0.3159
Precision (macro): 0.4979
Recall (macro):    0.4856
F1-score (macro):  0.4009
F1-score (wtd):    0.2392

Per-Class F1 Scores (Department):
  Sales                         : 0.7429
  Project Management            : 0.7119
  Human Resources               : 0.5333
  Purchasing                    : 0.4800
  Marketing                     : 0.4762
  Consulting                    : 0.4390
  Information Technology        : 0.3434
  Customer Support              : 0.2857
  Business Development          : 0.2703
  Administrative                : 0.1111
  Other                         : 0.0158


#### Seniority Evaluation

In [16]:
# Predict on evaluation set
sen_predictions = sen_clf.predict_labels(eval_titles)

# Ground truth
sen_true = eval_df['seniority'].tolist()

# Calculate metrics
sen_accuracy = accuracy_score(sen_true, sen_predictions)
sen_precision, sen_recall, sen_f1, _ = precision_recall_fscore_support(
    sen_true, sen_predictions, average='macro', zero_division=0
)
sen_weighted_f1 = precision_recall_fscore_support(
    sen_true, sen_predictions, average='weighted', zero_division=0
)[2]

print("\n" + "="*60)
print("SENIORITY CLASSIFICATION RESULTS (Real-World)")
print("="*60)
print(f"Accuracy:          {sen_accuracy:.4f}")
print(f"Precision (macro): {sen_precision:.4f}")
print(f"Recall (macro):    {sen_recall:.4f}")
print(f"F1-score (macro):  {sen_f1:.4f}")
print(f"F1-score (wtd):    {sen_weighted_f1:.4f}")
print("="*60)

# Per-class F1 scores
sen_report = classification_report(sen_true, sen_predictions, output_dict=True, zero_division=0)

print("\nPer-Class F1 Scores (Seniority):")
sen_f1_scores = {label: metrics['f1-score'] for label, metrics in sen_report.items() 
                 if label not in ['accuracy', 'macro avg', 'weighted avg']}
for label, f1 in sorted(sen_f1_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"  {label:<30}: {f1:.4f}")


SENIORITY CLASSIFICATION RESULTS (Real-World)
Accuracy:          0.4749
Precision (macro): 0.4123
Recall (macro):    0.6183
F1-score (macro):  0.4388
F1-score (wtd):    0.4363

Per-Class F1 Scores (Seniority):
  Management                    : 0.7823
  Director                      : 0.7143
  Lead                          : 0.6474
  Senior                        : 0.3366
  Junior                        : 0.1519
  Professional                  : 0.0000
