In [1]:
from datasets import load_from_disk
import json

# Load tokenized dataset
dataset = load_from_disk("data/pii43k_tokenized")

# Load label mappings
with open("data/label2id.json", encoding="utf-8") as f:
    label2id = json.load(f)

with open("data/id2label.json", encoding="utf-8") as f:
    # JSON object keys are always strings, so json.load returns {"0": ..., "1": ...}.
    # Later cells index id2label with integer class ids, which would raise KeyError
    # against string keys, so convert the keys back to int here.
    id2label = {int(idx): name for idx, name in json.load(f).items()}


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# The tokenizer and the model weights are both taken from the same pretrained checkpoint.
BASE_CHECKPOINT = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

# The number of output classes follows the label map, and both label mappings are
# handed to the model so they are stored alongside its configuration.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
!pip install 'accelerate>=0.26.0'



In [10]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="output/pii_ner_model",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated and is rejected by newer
    # transformers releases.
    eval_strategy="epoch",
    # Checkpoint on the same schedule as evaluation (once per epoch).
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="output/logs",
    logging_steps=10,
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [12]:
!pip uninstall torch accelerate -y

Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: accelerate 1.5.2
Uninstalling accelerate-1.5.2:
  Successfully uninstalled accelerate-1.5.2


In [14]:
!pip uninstall transformers -y

Found existing installation: transformers 4.49.0
Uninstalling transformers-4.49.0:
  Successfully uninstalled transformers-4.49.0


In [15]:
!pip install torch accelerate

Collecting torch
  Downloading torch-2.6.0-cp311-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting accelerate
  Using cached accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Downloading torch-2.6.0-cp311-none-macosx_11_0_arm64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m235.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:08[0m
[?25hUsing cached accelerate-1.5.2-py3-none-any.whl (345 kB)
Installing collected packages: torch, accelerate
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.20.1 requires torch==2.5.1, but you have torch 2.6.0 which is incompatible.[0m[31m
[0mSuccessfully installed accelerate-1.5.2 torch-2.6.0


In [None]:
pip install transformers

In [None]:
from transformers import Trainer
from seqeval.metrics import classification_report

def compute_metrics(p):
    """Turn raw evaluation output into macro-averaged sequence-labelling metrics.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores whose last axis is reduced with ``argmax``; ``labels`` holds
            the reference class ids, with ``-100`` at positions to be skipped
            (presumably special/sub-word tokens excluded from the loss —
            confirm against the tokenization step).

    Returns:
        dict with ``"precision"``, ``"recall"`` and ``"f1"`` taken from the
        ``"macro avg"`` row of seqeval's ``classification_report``.
    """
    scores, labels = p
    pred_ids = scores.argmax(-1)

    # id2label is loaded from JSON, whose object keys are always strings, while
    # the ids coming out of argmax / the label array are integers. Normalise the
    # keys to int so the lookup works whether or not the mapping was converted
    # upstream.
    id_to_name = {int(idx): name for idx, name in id2label.items()}

    true_preds = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, labels):
        # Keep only the positions that carry a real label.
        kept = [
            (int(pred_id), int(label_id))
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_preds.append([id_to_name[pred_id] for pred_id, _ in kept])
        true_labels.append([id_to_name[label_id] for _, label_id in kept])

    report = classification_report(true_labels, true_preds, output_dict=True)
    macro = report["macro avg"]
    return {
        "precision": macro["precision"],
        "recall": macro["recall"],
        "f1": macro["f1-score"],
    }


In [None]:
from transformers import DataCollatorForTokenClassification

# Pads input features AND the per-token label sequences to a common length within
# each batch (labels are padded with -100). The collator Trainer falls back to
# when none is given does not pad `labels`, which breaks batching when examples
# have different lengths.
# NOTE(review): assumes dataset rows carry a per-token "labels" column — confirm.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # `processing_class` is the current name of this argument; `tokenizer=` is
    # deprecated on Trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Runs fine-tuning; evaluation and checkpointing follow the schedule in training_args.
trainer.train()


In [None]:
# Persist the model first, then the tokenizer, into the same output directory
# so the pair can be reloaded together from one path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("output/pii_ner_model")
