In [None]:
!pip install -q datasets transformers seqeval accelerate


In [None]:

from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import DataLoader
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import logging, os, json, re

# Logging: root logger at INFO, the "datasets" logger limited to WARNING,
# and the "transformers.trainer" logger explicitly set to INFO.
logging.basicConfig(level=logging.INFO)
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("transformers.trainer").setLevel(logging.INFO)

# Checkpoint name shared by the tokenizer (here) and the model (built later).
MODEL_CHECKPOINT = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

# MAX_LEN: cap on the encoded sequence length, including the two special tokens
#          that encode_example adds around each row.
# IGNORE_INDEX: label value for positions that are skipped when metrics are computed.
# BATCH_SIZE: per-device batch size used for both training and evaluation.
# NUM_PROC: number of worker processes passed to Dataset.map.
# OUTPUT_DIR: where the model, tokenizer and JSON artifacts are written.
MAX_LEN = 512
IGNORE_INDEX = -100
BATCH_SIZE = 16
NUM_PROC = 2
OUTPUT_DIR = "/content/ner-mbert-500k"
os.makedirs(OUTPUT_DIR, exist_ok=True)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Load the PII-masking dataset, then show its splits and the columns of each.
ds_raw = load_dataset("ai4privacy/open-pii-masking-500k-ai4privacy")

print(ds_raw)
for split_name, split_ds in ds_raw.items():
    print(split_name, split_ds.column_names)


DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 464150
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 116077
    })
})
train ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes']
validation ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes']


In [None]:

def make_splits_from_column(ds_dict, split_col="split"):
    """Return a DatasetDict with exactly a "train" and a "validation" split.

    All splits of ``ds_dict`` are pooled and re-partitioned according to the
    values found in ``split_col``:

    * validation rows are those tagged "validation", else "dev", else "test";
    * training rows are those tagged "train"; if no row is tagged "train",
      every row that is not a validation row is used;
    * if no validation-like value exists, 1% of the training rows is held
      out (seed 42).

    If ``split_col`` is not a column, the named splits of ``ds_dict`` are
    reused instead ("validation", else "test", else a 1% hold-out).

    Args:
        ds_dict: mapping of split name -> Dataset.
        split_col: name of the column holding each row's split assignment.

    Returns:
        DatasetDict with keys "train" and "validation".
    """
    parts = list(ds_dict.values())
    all_ds = concatenate_datasets(parts) if len(parts) > 1 else parts[0]

    if split_col not in all_ds.column_names:
        # No per-row split information: fall back to the named splits.
        if "train" in ds_dict:
            if "validation" in ds_dict:
                return DatasetDict(train=ds_dict["train"], validation=ds_dict["validation"])
            if "test" in ds_dict:
                return DatasetDict(train=ds_dict["train"], validation=ds_dict["test"])
            held_out = ds_dict["train"].train_test_split(test_size=0.01, seed=42)
        else:
            held_out = all_ds.train_test_split(test_size=0.01, seed=42)
        return DatasetDict(train=held_out["train"], validation=held_out["test"])

    def rows_where(keep):
        # `input_columns` hands only the split column's value to the predicate.
        return all_ds.filter(keep, input_columns=[split_col])

    split_values = set(all_ds[split_col])

    # Pick the validation source first so the training filter can exclude it.
    # Previously the fallback training filter only excluded "validation", so
    # rows tagged "dev"/"test" ended up in both train and validation.
    val_value = next(
        (v for v in ("validation", "dev", "test") if v in split_values), None
    )

    if "train" in split_values:
        train_ds = rows_where(lambda v: v == "train")
    elif val_value is not None:
        train_ds = rows_where(lambda v: v != val_value)
    else:
        train_ds = all_ds

    if val_value is not None:
        val_ds = rows_where(lambda v: v == val_value)
    else:
        held_out = train_ds.train_test_split(test_size=0.01, seed=42)
        train_ds, val_ds = held_out["train"], held_out["test"]

    return DatasetDict(train=train_ds, validation=val_ds)

# Re-derive train/validation from the per-row "split" column and show the result.
ds = make_splits_from_column(ds_raw, split_col="split")
print(ds)


Filter:   0%|          | 0/580227 [00:00<?, ? examples/s]

Filter:   0%|          | 0/580227 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 464150
    })
    validation: Dataset({
        features: ['source_text', 'masked_text', 'privacy_mask', 'split', 'uid', 'language', 'region', 'script', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 116077
    })
})


In [None]:

# Names of the columns holding the provided mBERT tokens and their per-token classes.
TOKENS_COL = "mbert_tokens"
TAGS_COL = "mbert_token_classes"

# Fail fast if either column is missing from any split.
for split in ds.keys():
    available_columns = ds[split].column_names
    for required_col in (TOKENS_COL, TAGS_COL):
        assert required_col in available_columns, f"{required_col} missing in {split}"


In [None]:
# 6) Build label set from training (keep 'O' first, then entities grouped)
# Iterate over a tag-only view of the training split: looping over full rows
# would also decode the text and mask columns of every example just to read the tags.
unique_labels = set()
for ex in ds["train"].select_columns([TAGS_COL]):
    unique_labels.update(ex[TAGS_COL])

def label_sort_key(lbl):
    """Sort key that puts "O" first, then B-/I- tags grouped by entity, then anything else.

    Returns:
        ``(0, "")`` for "O"; ``(1, entity, prefix)`` for tags starting with
        "B-" or "I-" followed by an entity name; ``(2, lbl)`` otherwise.
    """
    if lbl == "O":
        return (0, "")
    parsed = re.match(r"([BI])-(.+)", lbl)
    if parsed is None:
        return (2, lbl)
    prefix, entity = parsed.groups()
    # Entity name before prefix so B-X and I-X sort next to each other, B first.
    return (1, entity, prefix)

# Sorted label order and the two lookup tables derived from it.
label_list = sorted(unique_labels, key=label_sort_key)
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

# Persist the ordered label list next to the model outputs.
labels_path = os.path.join(OUTPUT_DIR, "labels.json")
with open(labels_path, "w") as labels_file:
    json.dump({"label_list": label_list}, labels_file, indent=2)

print(f"#labels = {len(label_list)}")
print("First labels:", label_list[:20])


#labels = 40
First labels: ['O', 'B-AGE', 'B-BUILDINGNUM', 'I-BUILDINGNUM', 'B-CITY', 'I-CITY', 'B-CREDITCARDNUMBER', 'I-CREDITCARDNUMBER', 'B-DATE', 'I-DATE', 'B-DRIVERLICENSENUM', 'I-DRIVERLICENSENUM', 'B-EMAIL', 'I-EMAIL', 'B-GENDER', 'I-GENDER', 'B-GIVENNAME', 'I-GIVENNAME', 'B-IDCARDNUM', 'I-IDCARDNUM']


In [None]:
# Encode examples using provided mBERT tokens/classes (NO text reconstruction)
# Special-token ids are read from the tokenizer once; 100 is the fallback
# unknown-token id when the tokenizer does not define one.
CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id
UNK_ID = 100 if tokenizer.unk_token_id is None else tokenizer.unk_token_id

def safe_convert_token_to_id(tok: str):
    """Look up ``tok`` in the tokenizer vocabulary.

    Returns ``UNK_ID`` when the lookup yields ``None`` or 0, otherwise the id found.
    """
    token_id = tokenizer.convert_tokens_to_ids(tok)
    # some tokenizers can return 0 for unknown
    if token_id is not None and token_id != 0:
        return token_id
    return UNK_ID

def encode_example(ex):
    """Convert one dataset row into ``input_ids`` / ``attention_mask`` / ``labels``.

    The row's provided tokens are mapped to vocabulary ids, truncated to
    ``MAX_LEN - 2`` and wrapped in CLS/SEP. Labels are mapped through
    ``label_to_id`` (tags missing from that table fall back to "O") and are
    always aligned one-to-one with ``input_ids``; the two special-token
    positions get ``IGNORE_INDEX``.

    Args:
        ex: a row containing ``TOKENS_COL`` (list of token strings) and
            ``TAGS_COL`` (list of tag strings).

    Returns:
        dict with equally long lists ``input_ids``, ``attention_mask`` (all
        ones) and ``labels``.
    """
    tokens = ex[TOKENS_COL]
    labels_str = ex[TAGS_COL]

    # Two positions are reserved for the CLS and SEP ids added below.
    avail = MAX_LEN - 2
    # Truncate before converting so discarded tokens are never looked up.
    token_ids = [safe_convert_token_to_id(t) for t in tokens[:avail]]

    outside_id = label_to_id["O"]
    # Labels follow the token count, not their own length: surplus tags are
    # dropped and any untagged tail is ignored. Truncating the two lists
    # independently could produce `labels` and `input_ids` of different
    # lengths for a row whose token and tag lists disagree.
    labels_enc = [label_to_id.get(l, outside_id) for l in labels_str[:len(token_ids)]]
    labels_enc.extend([IGNORE_INDEX] * (len(token_ids) - len(labels_enc)))

    input_ids = [CLS_ID] + token_ids + [SEP_ID]
    labels = [IGNORE_INDEX] + labels_enc + [IGNORE_INDEX]
    attention_mask = [1] * len(input_ids)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Encode both splits. Every original column is dropped, so only input_ids,
# attention_mask and labels remain.
# NOTE(review): input_ids can be mapped back to tokens through the tokenizer
# vocabulary, so the encoded data should still be handled as sensitive.
processed = {
    split_name: ds[split_name].map(
        encode_example,
        remove_columns=ds[split_name].column_names,
        desc=f"Encoding {split_name}",
        num_proc=NUM_PROC,
    )
    for split_name in ("train", "validation")
}

print(processed["train"])


Encoding train (num_proc=2):   0%|          | 0/464150 [00:00<?, ? examples/s]

Encoding validation (num_proc=2):   0%|          | 0/116077 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 464150
})


In [None]:
# Build model, collator, metrics (seqeval)
# The classification head is sized to the label set, and both label maps are
# handed to the model so they are stored with it.
num_labels = len(label_list)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)

# Collator used by the Trainer to assemble batches from the encoded examples.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    """Compute seqeval precision, recall and F1 for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (scores whose last axis is argmax-ed
           to get a label id) and ``label_ids`` (gold ids, with
           ``IGNORE_INDEX`` marking positions to skip).

    Returns:
        dict with keys "precision", "recall" and "f1".
    """
    predicted_ids = np.argmax(p.predictions, axis=-1)
    gold_ids = p.label_ids

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        kept = [(pr, go) for pr, go in zip(pred_row, gold_row) if go != IGNORE_INDEX]
        true_preds.append([id_to_label[pr] for pr, _ in kept])
        true_labels.append([id_to_label[go] for _, go in kept])

    scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}
    return {name: scorer(true_labels, true_preds) for name, scorer in scorers.items()}


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 9) TrainingArguments & Trainer
import inspect

# Recent transformers releases renamed `evaluation_strategy` -> `eval_strategy`
# (TrainingArguments) and `tokenizer` -> `processing_class` (Trainer). This cell
# used to fail with "unexpected keyword argument 'evaluation_strategy'", so use
# whichever spelling the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_strategy="epoch",    # must match the evaluation strategy for load_best_model_at_end
    learning_rate=5e-5,
    num_train_epochs=2,       # adjust
    weight_decay=0.01,
    logging_steps=100,
    report_to=[],             # avoid external logging for PII safety
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    bf16=False,               # set True if your GPU supports BF16
    fp16=True,                # mixed precision on Colab T4
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed["train"],
    eval_dataset=processed["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# 10) Train (no raw text logged)
train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)

# Store the metrics reported by the training run alongside the saved model.
train_results_path = os.path.join(OUTPUT_DIR, "train_results.json")
with open(train_results_path, "w") as results_file:
    json.dump(train_result.metrics, results_file, indent=2)

print("Training done. Best model saved to:", OUTPUT_DIR)


In [None]:
# 11) Evaluate & show seqeval classification report (safe)
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

# Second pass over the validation split to obtain per-token scores and gold
# ids for the detailed report (label strings only).
prediction_output = trainer.predict(processed["validation"])
preds_logits = prediction_output.predictions
labels_ids = prediction_output.label_ids
preds = np.argmax(preds_logits, axis=-1)

def to_label_lists(preds, labels):
    """Convert id matrices into per-sequence lists of label strings.

    Positions whose gold id equals ``IGNORE_INDEX`` are dropped from both outputs.

    Args:
        preds: iterable of rows of predicted label ids.
        labels: iterable of rows of gold label ids, aligned with ``preds``.

    Returns:
        ``(all_preds, all_labels)``: two lists with one list of label strings
        per input row.
    """
    all_preds, all_labels = [], []
    for pred_row, gold_row in zip(preds, labels):
        pairs = [
            (id_to_label[pred_id], id_to_label[gold_id])
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != IGNORE_INDEX
        ]
        all_preds.append([pred for pred, _ in pairs])
        all_labels.append([gold for _, gold in pairs])
    return all_preds, all_labels

# Per-entity report; seqeval takes the gold labels first, then the predictions.
pred_lists, label_lists = to_label_lists(preds, labels_ids)
print(classification_report(label_lists, pred_lists, digits=4))


In [None]:
# 12) Save tokenizer + label mapping (already saved model)
tokenizer.save_pretrained(OUTPUT_DIR)

# Rewrites labels.json with the same ordered label list written when the label set was built.
labels_path = os.path.join(OUTPUT_DIR, "labels.json")
with open(labels_path, "w") as labels_file:
    json.dump({"label_list": label_list}, labels_file, indent=2)

print("Artifacts saved to:", OUTPUT_DIR)


In [None]:
# 13) (Optional) Save processed dataset (token ids, attention masks and labels; no text columns)
# NOTE: input_ids can be mapped back to tokens through the tokenizer vocabulary,
# so the saved files should still be treated as sensitive even though no raw
# text column is stored.
from datasets import DatasetDict
processed_dict = DatasetDict(processed)
save_path = "/content/secure_500k_tokenized_ids"
processed_dict.save_to_disk(save_path)
print("Processed dataset saved to:", save_path)

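# 14) (Optional) Encrypt saved dataset files with GPG
# NOTE(review): DatasetDict.save_to_disk normally writes one sub-directory per
# split containing data-*.arrow shards, so the two paths below may not exist as
# written. List the directory first (e.g. `!ls -R /content/secure_500k_tokenized_ids`)
# and adjust the paths before running.
# !gpg --symmetric --cipher-algo AES256 /content/secure_500k_tokenized_ids/train.arrow
# !gpg --symmetric --cipher-algo AES256 /content/secure_500k_tokenized_ids/validation.arrow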
