In [None]:
!pip install -q evaluate seqeval
!pip install -q optimum[onnxruntime] onnx onnxruntime

In [None]:
# Cache locations are exported BEFORE any Hugging Face library is imported.
# huggingface_hub / datasets / transformers resolve their cache directories from
# the environment when they are first imported, so setting these variables after
# the imports (as this cell originally did) comes too late to redirect the caches.
import os

os.environ["HF_HOME"] = "/kaggle/working/hf"
os.environ["TRANSFORMERS_CACHE"] = "/kaggle/working/hf"
os.environ["HF_DATASETS_CACHE"] = "/kaggle/working/hf"

# Standard library
import copy
import gc
import json
import shutil
from collections import Counter

# Third-party
import numpy as np
import onnxruntime as ort
import torch
import evaluate
from datasets import load_dataset
from IPython.display import FileLink
from optimum.onnxruntime import ORTModelForTokenClassification
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)

In [None]:
# Report which CUDA device (if any) is visible to this kernel.
if not torch.cuda.is_available():
    print("❌ GPU NOT found. Check your drivers or Secure Boot.")
else:
    device_name = torch.cuda.get_device_name(0)
    total_mem = torch.cuda.get_device_properties(0).total_memory

    # total_memory is reported in bytes; 1024**3 converts it for display.
    print(f"✅ GPU Found: {device_name}")
    print(f"Memory: {total_mem / 1024**3:.2f} GB")

# PII Masking Model Pipeline (Browser Optimized)

This notebook implements a pipeline to create a small, efficient PII masking model using Quantization.

**Key Steps:**
1. **Load Data**: `ai4privacy/pii-masking-300k` (Filter for **English** only).
2. **Preprocessing**: Robust tokenization using character offsets to handle dataset quirks.
3. **Fine-tune** DistilBERT for token classification -> export to ONNX -> **Quantize** to INT8.


In [None]:
# Configuration: dataset id, base checkpoint, and where training artifacts go.
DATASET_NAME = "ai4privacy/pii-masking-300k"
MODEL_CHECKPOINT = "distilbert-base-uncased"
OUTPUT_DIR = "/kaggle/working/pii_model_output"

# Make sure the artifact directory exists before anything tries to write to it.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

## 1. Data Loading & Splits
We load the dataset and filter for **English** (`language == 'English'`).
Since the dataset only provides `train` and `validation` splits, we split `validation` into disjoint `validation` and `test` sets (50/50).

In [None]:
# Load every split from the Hub, then keep only the English rows.
# NOTE(review): trust_remote_code=True lets the dataset repo run its own loading
# code on this machine — confirm it is still required for this dataset.
dataset = load_dataset(DATASET_NAME, trust_remote_code=True).filter(
    lambda row: row["language"] == "English"
)

# When no test split is present, halve the validation split (fixed seed) into
# disjoint validation / test parts.
if "test" not in dataset:
    print("Creating Test split from Validation...")
    val_test_split = dataset["validation"].train_test_split(test_size=0.5, seed=42)
    dataset["validation"], dataset["test"] = (
        val_test_split["train"],
        val_test_split["test"],
    )

print(dataset)


## 2. Parsing & Label Extraction
The dataset stores complex fields (`privacy_mask`) as JSON strings (in some versions) or list objects. We ensure they are parsed, then collapse every entity type into a single `PII` class, which gives the BIO label set `O`, `B-PII`, `I-PII`.

In [None]:
def parse_dataset_row(example):
    """Decode ``privacy_mask`` if it is a JSON string and relabel every span as ``'PII'``.

    The row is modified in place and the same object is returned: a string mask
    is replaced by its parsed list, and each entity dict gets ``label = 'PII'``.
    """
    spans = example['privacy_mask']
    if isinstance(spans, str):
        spans = json.loads(spans)
        example['privacy_mask'] = spans

    # Collapse all fine-grained entity types into one generic class.
    for span in spans:
        span['label'] = 'PII'

    return example

# Parse / relabel every row once; HF datasets caches the mapped result, so
# re-running this cell is fast.
dataset = dataset.map(parse_dataset_row)

print("Mapping labels to O, B-PII, I-PII...")
label_list = ["O", "B-PII", "I-PII"]

# Build the id -> label table from list positions, then invert it.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
print(f"Unique labels: {len(label_list)}")


## 3. Robust Tokenization & Alignment
We use `return_offsets_mapping=True` to map directly from character spans in `privacy_mask` to tokenizer tokens.

In [None]:
# Tokenizer for the base checkpoint; tokenize_and_align_labels reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_and_align_labels(examples):
    """Tokenize a batch into overlapping windows and attach BIO label ids.

    Each ``source_text`` is split into windows of at most 512 tokens with a
    64-token stride, so one input row may yield several output rows. For every
    window, the character spans in the row's ``privacy_mask`` are projected onto
    the tokens through the tokenizer's offset mapping.

    Args:
        examples: Batched mapping with ``source_text`` (list of str) and
            ``privacy_mask`` (list of lists of dicts with ``label``, ``start``
            and ``end`` keys, offsets being character positions in the text).

    Returns:
        The tokenizer output (without ``offset_mapping`` and
        ``overflow_to_sample_mapping``) plus a ``labels`` list aligned with
        ``input_ids``. Tokens with a ``(0, 0)`` offset get ``-100``.

    Reads the module-level ``tokenizer`` and ``label2id``.
    """
    tokenized_inputs = tokenizer(
        examples["source_text"],
        truncation=True,
        max_length=512,
        stride=64,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=False  # Dynamic padding in DataCollator
    )

    sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_inputs.pop("offset_mapping")
    outside_id = label2id["O"]

    labels = []
    for i, offsets in enumerate(offset_mapping):
        # Map the window back to the row it was cut from.
        sample_idx = sample_mapping[i]
        text = examples["source_text"][sample_idx]
        mask_list = examples["privacy_mask"][sample_idx]

        # Every token starts as O.
        chunk_labels = [outside_id] * len(tokenized_inputs["input_ids"][i])

        for entity in mask_list:
            label_type = entity["label"]
            start_char = entity["start"]
            end_char = entity["end"]

            b_id = label2id.get(f"B-{label_type}")
            i_id = label2id.get(f"I-{label_type}")
            # Skip entity types without a complete B-/I- pair, so a missing
            # I- tag can never write None into the labels.
            if b_id is None or i_id is None:
                continue

            # Tokens of this window whose character range intersects the entity;
            # (0, 0) offsets are skipped here and masked further down.
            overlapping_indices = [
                idx
                for idx, (t_start, t_end) in enumerate(offsets)
                if not (t_start == 0 and t_end == 0)
                and t_end > start_char
                and t_start < end_char
            ]

            for k, idx in enumerate(overlapping_indices):
                # Keep a label already assigned by an earlier entity.
                if chunk_labels[idx] != outside_id:
                    continue

                if k == 0:
                    t_start = offsets[idx][0]
                    # The first overlapping token opens the entity (B) unless
                    # the entity really began earlier, i.e. in a previous
                    # window. Comparing offsets alone mislabels spans that
                    # start on whitespace (token offsets exclude whitespace),
                    # so also accept a gap that contains only whitespace.
                    begins_here = (
                        start_char >= t_start
                        or not text[start_char:t_start].strip()
                    )
                    chunk_labels[idx] = b_id if begins_here else i_id
                else:
                    chunk_labels[idx] = i_id

        # Mask special tokens so the loss and the metrics ignore them.
        for idx, (t_start, t_end) in enumerate(offsets):
            if t_start == 0 and t_end == 0:
                chunk_labels[idx] = -100

        labels.append(chunk_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the aligner over every split in batches. The original columns are dropped
# because the function may return more rows (windows) than it received.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)


In [None]:
# Load the base checkpoint with a token-classification head sized to the label
# set, and store both label tables in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)
print(f"Model initialized with {len(label_list)} labels and sliding window configuration.")


In [None]:
# Evaluation Metrics
# seqeval scorer; compute_metrics reads it through the global `metric`.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Score a ``(predictions, labels)`` pair with seqeval.

    Predictions are arg-maxed over axis 2, positions whose gold label is
    ``-100`` are dropped, and the remaining ids are mapped to tag strings via
    the global ``id2label`` before being passed to the global ``metric``.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the metric's ``overall_*`` entries.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


In [None]:
def evaluate_accuracy(model, dataset, batch_size=16):
    """Evaluate ``model`` on ``dataset`` with a throwaway ``Trainer``.

    A separate Trainer is built so evaluation reuses the token-classification
    collator and ``compute_metrics``. The metrics dict returned by
    ``Trainer.evaluate()`` is printed and returned.
    """
    print("Evaluating model accuracy...")

    eval_args = TrainingArguments(
        output_dir="/tmp/eval",
        per_device_eval_batch_size=batch_size,
        report_to="none",
    )
    collator = DataCollatorForTokenClassification(tokenizer)
    eval_trainer = Trainer(
        model=model,
        args=eval_args,
        data_collator=collator,
        eval_dataset=dataset,
        compute_metrics=compute_metrics,
    )

    metrics = eval_trainer.evaluate()
    print("Evaluation Results:", metrics)
    return metrics


In [None]:
# 6. Class Imbalance Handling
from torch.nn import CrossEntropyLoss

def compute_class_weights(dataset, label2id):
    """Compute smoothed inverse-frequency loss weights, one per label id.

    Args:
        dataset: Iterable of examples, each exposing a ``'labels'`` sequence of
            integer label ids. Entries equal to ``-100`` are not counted.
        label2id: Mapping from label name to id; must contain ``"O"``. Its size
            fixes the number of classes.

    Returns:
        float32 tensor of shape ``(len(label2id),)``. Weights are
        ``1 / (count + 100)``, rescaled to sum to the number of classes,
        clamped to ``[0.1, 10.0]``, and the ``"O"`` weight is then set to 1.0.
    """
    print("Computing class weights...")
    num_labels = len(label2id)

    # Count with integers, one bincount per example. The previous per-token
    # `+= 1` on a float32 tensor was slow and stops incrementing at 2**24.
    counts = torch.zeros(num_labels, dtype=torch.long)
    for example in dataset:
        ids = torch.as_tensor(example['labels'], dtype=torch.long).reshape(-1)
        ids = ids[ids != -100]
        counts += torch.bincount(ids, minlength=num_labels)

    label_counts = counts.to(torch.float32)
    print(f"Label counts: {label_counts}")

    # Inverse frequency with smoothing (+100 keeps rare classes from exploding).
    weights = 1.0 / (label_counts + 100)
    weights = weights / weights.sum() * len(label2id)  # Normalize

    # Clamp to avoid extreme weights, then pin the majority "O" class to 1.0.
    weights = torch.clamp(weights, min=0.1, max=10.0)
    weights[label2id["O"]] = 1.0

    return weights

# Derive the per-class loss weights from the tokenized training split.
class_weights = compute_class_weights(dataset=tokenized_datasets['train'], label2id=label2id)
print("Class weights calculated.", class_weights)


In [None]:
# 7. Custom Weighted Trainer
# Collator built from the tokenizer; the training cell passes it to the trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer)
class WeightedTrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted token-classification loss.

        Args:
            model: Model to run; may be a wrapper around the underlying module.
            inputs: Batch dict. ``labels`` is popped from it before the forward
                pass, so the model does not compute its own unweighted loss.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Take the device and class count from the logits, not from `model`:
        # on multi-GPU runs the Trainer hands over a wrapped model (e.g.
        # nn.DataParallel), and such wrappers do not expose `.device`.
        loss_fct = CrossEntropyLoss(
            weight=class_weights.to(logits.device),
            ignore_index=-100,  # positions labelled -100 do not contribute
        )
        loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

        return (loss, outputs) if return_outputs else loss


In [None]:
# 8. Training Configuration & Execution
args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,

    # Batching: 4 sequences x 4 accumulation steps per optimizer update (per device)
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    dataloader_num_workers=4,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present

    # Evaluate and checkpoint on the same step cadence so the best-F1
    # checkpoint can be reloaded when training ends.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Logging / reproducibility
    logging_steps=100,
    report_to="none",
    seed=42,
)

# Assemble the weighted trainer. Early stopping is configured with
# early_stopping_patience=3; it tracks the metric configured in `args`.
trainer = WeightedTrainer(
    args=args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Run training. Whatever happens, try to leave a usable checkpoint on disk:
#   success   -> OUTPUT_DIR/final_model
#   Ctrl-C    -> OUTPUT_DIR/interrupted_checkpoint
#   exception -> OUTPUT_DIR/failed_checkpoint (best effort), then re-raise
print("Starting Training...")
try:
    trainer.train()
    print("✅ Training completed successfully.")
    trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
except KeyboardInterrupt:
    print("\n🛑 Training interrupted by user. Saving checkpoint...")
    trainer.save_model(os.path.join(OUTPUT_DIR, "interrupted_checkpoint"))
    print("Checkpoint saved.")
except Exception as e:
    print(f"\n❌ info: Training failed with error: {e}")
    # Attempt to save despite error
    try:
        trainer.save_model(os.path.join(OUTPUT_DIR, "failed_checkpoint"))
        print("Crash checkpoint saved.")
    except Exception as save_error:
        # Catch Exception rather than everything, so a second Ctrl-C or a
        # SystemExit during the save is not swallowed, and report the reason.
        print(f"Could not save crash checkpoint: {save_error}")
    # Bare raise re-raises the original training error with its traceback.
    raise


# 9. Quantization

## Export Model and Metadata to ONNX

In [None]:
# Configuration
import os
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

FINAL_MODEL_PATH = "/kaggle/working/pii_model_output/final_model"
EXPORT_DIR = "/kaggle/working/browser_ready_pack"
os.makedirs(EXPORT_DIR, exist_ok=True)

# 1. Convert the fine-tuned checkpoint to ONNX with Optimum and reload its tokenizer.
print("🔄 Exporting model to ONNX via Optimum...")
ort_model = ORTModelForTokenClassification.from_pretrained(FINAL_MODEL_PATH, export=True)
tokenizer = AutoTokenizer.from_pretrained(FINAL_MODEL_PATH)

# 2. Write the FP32 model (with its config) and the tokenizer files side by side in EXPORT_DIR.
for artifact in (ort_model, tokenizer):
    artifact.save_pretrained(EXPORT_DIR)
print(f"✅ Exported FP32 model and metadata to {EXPORT_DIR}")


## Quantize the ONNX Model (INT8)

In [None]:
print("📉 Quantizing ONNX model to INT8 via Optimum...")

EXPORT_DIR = "/kaggle/working/browser_ready_pack"

# Dynamic (is_static=False), per-tensor (per_channel=False) quantization config.
dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

# Quantizer bound to the exported FP32 graph.
quantizer = ORTQuantizer.from_pretrained(EXPORT_DIR, file_name="model.onnx")

# Writes the quantized graph into EXPORT_DIR (expected name: model_quantized.onnx).
quantizer.quantize(save_dir=EXPORT_DIR, quantization_config=dqconfig)

# The FP32 graph is not needed once the quantized one exists; remove it to
# shrink the bundle.
original_model_path = os.path.join(EXPORT_DIR, "model.onnx")
fp32_still_present = os.path.exists(original_model_path)
if fp32_still_present:
    os.remove(original_model_path)
    print("🗑️ Deleted original FP32 model.onnx to reduce bundle size.")

print(f"🎉 Done! The optimized INT8 model is saved in {EXPORT_DIR}")


In [None]:
# Load the metric
# seqeval scorer; evaluate_onnx reads it through the global `metric`.
metric = evaluate.load("seqeval")

def evaluate_onnx(model_path, dataset, label_list):
    """Score an ONNX token-classification model with seqeval on CPU.

    Args:
        model_path: Path to the ``.onnx`` file.
        dataset: Iterable of tokenized examples. Each must provide ``labels``
            and one entry per input the ONNX graph declares (for example
            ``input_ids`` and ``attention_mask``).
        label_list: Tag strings; list position is the label id.

    Returns:
        The dict produced by the global ``metric`` (seqeval) over all examples.
        Positions whose label is ``-100`` are excluded.
    """
    print(f"🕵️‍♀️ Evaluating ONNX model: {model_path}")

    # Create ONNX Runtime session
    # providers=['CUDAExecutionProvider', 'CPUExecutionProvider'] if you have GPU, else just CPU
    session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])

    # Feed exactly the inputs the graph declares instead of a hard-coded pair,
    # so the function follows the exported model's actual signature.
    input_names = [graph_input.name for graph_input in session.get_inputs()]
    label_map = dict(enumerate(label_list))

    predictions = []
    references = []

    print("Running inference...")
    for batch in tqdm(dataset):
        # One example per run: add a leading batch axis; ONNX expects int64 arrays.
        inputs = {
            name: np.array([batch[name]], dtype=np.int64)
            for name in input_names
        }

        # Run inference
        outputs = session.run(None, inputs)[0]  # [batch, seq_len, num_labels]
        preds = np.argmax(outputs, axis=2)[0]   # [seq_len]

        # Align predictions with labels (filtering out -100)
        true_labels = [label_map[l] for l in batch["labels"] if l != -100]
        true_preds = [label_map[p] for (p, l) in zip(preds, batch["labels"]) if l != -100]

        predictions.append(true_preds)
        references.append(true_labels)

    results = metric.compute(predictions=predictions, references=references)
    return results

print("Evaluating INT8 Model...")
label_list = ["O", "B-PII", "I-PII"]

# QUANTIZED_ONNX_PATH was never defined, so this cell raised NameError.
# The quantization cell saves into EXPORT_DIR as `model_quantized.onnx`.
QUANTIZED_ONNX_PATH = os.path.join(EXPORT_DIR, "model_quantized.onnx")

onnx_results = evaluate_onnx(QUANTIZED_ONNX_PATH, tokenized_datasets["test"], label_list)
print("ONNX Results:", onnx_results)


In [None]:
import shutil
import os

EXPORT_DIR = "/kaggle/working/browser_ready_pack"

# By now EXPORT_DIR holds config.json, the tokenizer files and
# model_quantized.onnx; the FP32 model.onnx was removed by the quantization cell.

# Bundle the whole directory into /kaggle/working/pii_browser_pack.zip.
print("\n📦 Zipping the browser-ready pack...")
shutil.make_archive(
    base_name="/kaggle/working/pii_browser_pack",
    format='zip',
    root_dir=EXPORT_DIR,
)
print("🎉 Done! Download 'pii_browser_pack.zip' from the Output tab.")


In [None]:
# Show a download link for the browser pack archive.
# NOTE(review): the path is relative — presumably the kernel runs in /kaggle/working; confirm.
FileLink(r'pii_browser_pack.zip')

In [None]:
# Archive the fine-tuned (pre-ONNX) model directory as /kaggle/working/pii_model.zip.
print("Zipping model for download...")
shutil.make_archive(
    base_name="/kaggle/working/pii_model",
    format='zip',
    root_dir=os.path.join(OUTPUT_DIR, "final_model"),
)
print("Done! You can now download pii_model.zip from the Output tab.")

In [None]:
# Show a download link for the fine-tuned model archive.
# NOTE(review): the path is relative — presumably the kernel runs in /kaggle/working; confirm.
FileLink(r'pii_model.zip')