In [None]:

import os
import json
import argparse
import random
from typing import List, Dict, Any

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score


# ---------- Data utilities ----------

def load_iob_data(path: str) -> List[Dict[str, Any]]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Input JSON must be a list of samples.")
    return data


def validate_data(data: List[Dict[str, Any]], sample_check: int = 3) -> None:
    if len(data) == 0:
        raise ValueError("Dataset is empty.")
    for i, ex in enumerate(data[:sample_check]):
        if "tokens" not in ex or "labels" not in ex:
            raise ValueError(f"Sample {i} missing 'tokens' or 'labels' keys.")
        if len(ex["tokens"]) != len(ex["labels"]):
            raise ValueError(f"Sample {i} has mismatched tokens({len(ex['tokens'])}) and labels({len(ex['labels'])}).")


def create_label_map(dataset: List[Dict[str, Any]]):
    unique = sorted({lab for ex in dataset for lab in ex["labels"]})
    label2id = {l: i for i, l in enumerate(unique)}
    id2label = {i: l for l, i in label2id.items()}
    return unique, label2id, id2label


# ---------- Tokenization & alignment ----------

def tokenize_and_align_labels(example, tokenizer, label2id, max_length: int = 256):
    # Dynamic padding (no padding here; DataCollator will pad batches)
    encoded = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
    )
    word_ids = encoded.word_ids()
    labels = example["labels"]

    aligned_labels = []
    prev_word_id = None
    for w_id in word_ids:
        if w_id is None:
            aligned_labels.append(-100)  # special/padding tokens
        elif w_id != prev_word_id:
            aligned_labels.append(label2id[labels[w_id]])  # first subtoken
        else:
            aligned_labels.append(-100)  # subsequent subtokens
        prev_word_id = w_id

    encoded["labels"] = aligned_labels
    return encoded


# ---------- Metrics ----------

def compute_metrics(eval_pred, id2label):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    true_labels = []
    pred_labels = []
    for p_seq, l_seq in zip(preds, labels):
        tl_seq = []
        pl_seq = []
        for p, l in zip(p_seq, l_seq):
            if l == -100:
                continue
            tl_seq.append(id2label[int(l)])
            pl_seq.append(id2label[int(p)])
        true_labels.append(tl_seq)
        pred_labels.append(pl_seq)

    return {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
    }


# ---------- Main ----------

def main():
    parser = argparse.ArgumentParser(description="Phase 1 NER fine-tuning (10k train, 1k eval)")
    parser.add_argument("--data_path", type=str, default="final_merged_iob_dataset.json")
    parser.add_argument("--model_name", type=str, default="bert-base-cased")
    parser.add_argument("--output_dir", type=str, default="./first_phase_ner_model")
    parser.add_argument("--train_size", type=int, default=10_000)
    parser.add_argument("--eval_size", type=int, default=1_000)
    parser.add_argument("--max_length", type=int, default=256)
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--warmup_ratio", type=float, default=0.1)
    parser.add_argument("--grad_accum", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_proc", type=int, default=1, help="Set >1 to parallelize Dataset.map() if your OS supports it.")
    args = parser.parse_args() if hasattr(__builtins__, "__IPYTHON__") is False else parser.parse_args([])

    os.makedirs(args.output_dir, exist_ok=True)

    print("Config:")
    for k, v in vars(args).items():
        print(f"  {k}: {v}")

    # Reproducibility
    set_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Load and validate data
    data = load_iob_data(args.data_path)
    validate_data(data)

    # Create label maps from full dataset
    label_list, label2id, id2label = create_label_map(data)
    print(f"Found {len(label_list)} labels.")

    # Shuffle and slice Phase 1 subsets
    data_shuffled = random.sample(data, len(data))
    if len(data_shuffled) < args.train_size + args.eval_size:
        raise ValueError(f"Dataset too small: need {args.train_size + args.eval_size}, have {len(data_shuffled)}")

    train_raw = data_shuffled[:args.train_size]
    eval_raw = data_shuffled[args.train_size:args.train_size + args.eval_size]

    # Tokenizer and datasets
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    def _map_fn(ex):
        return tokenize_and_align_labels(ex, tokenizer, label2id, max_length=args.max_length)

    train_ds = Dataset.from_list(train_raw).map(_map_fn, batched=False, num_proc=args.num_proc, desc="Tokenizing train")
    eval_ds = Dataset.from_list(eval_raw).map(_map_fn, batched=False, num_proc=args.num_proc, desc="Tokenizing eval")

    print(f"Phase 1 → Train: {len(train_ds)} | Eval: {len(eval_ds)}")

    # Model
    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )

    # Training args
    fp16 = torch.cuda.is_available()
    bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported() if hasattr(torch.cuda, "is_bf16_supported") else False

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=50,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        seed=args.seed,
        group_by_length=True,
        fp16=fp16 and not bf16,
        bf16=bf16,
        report_to="none",
    )

    collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=lambda p: compute_metrics(p, id2label),
    )

    # Train
    print("Starting training...")
    trainer.train()
    print("Training complete.")

    # Evaluate
    print("Evaluating best model...")
    metrics = trainer.evaluate()
    for k, v in metrics.items():
        print(f"{k}: {v}")

    # Save model, tokenizer, and label maps
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    with open(os.path.join(args.output_dir, "label_map.json"), "w", encoding="utf-8") as f:
        json.dump({"label_list": label_list, "label2id": label2id, "id2label": id2label}, f, ensure_ascii=False, indent=2)

    print(f"Artifacts saved to {args.output_dir}")


if __name__ == "__main__":
    main()


Config:
  data_path: final_merged_iob_dataset.json
  model_name: bert-base-cased
  output_dir: ./first_phase_ner_model
  train_size: 10000
  eval_size: 1000
  max_length: 256
  epochs: 5
  batch_size: 16
  learning_rate: 5e-05
  weight_decay: 0.01
  warmup_ratio: 0.1
  grad_accum: 1
  seed: 42
  num_proc: 1
Found 7 labels.


Tokenizing train: 100%|██████████| 10000/10000 [00:02<00:00, 3565.60 examples/s]
Tokenizing eval: 100%|██████████| 1000/1000 [00:00<00:00, 3717.54 examples/s]


Phase 1 → Train: 10000 | Eval: 1000


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0033,0.003996,0.99922,0.992703,0.997002,0.994848
2,0.0022,0.000433,0.999766,0.999001,0.999334,0.999167
3,0.0003,0.001834,0.99961,0.997004,0.997668,0.997336
4,0.0001,0.000345,0.999922,0.999667,0.999001,0.999334
5,0.0001,0.000111,0.999922,0.999667,0.999001,0.999334


Training complete.
Evaluating best model...


eval_loss: 0.0003853101225104183
eval_accuracy: 0.9999220212102308
eval_precision: 0.9996666666666667
eval_recall: 0.9990006662225184
eval_f1: 0.9993335554815062
eval_runtime: 3.8586
eval_samples_per_second: 259.164
eval_steps_per_second: 16.327
epoch: 5.0
Artifacts saved to ./first_phase_ner_model


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer (use absolute path if needed)
model_dir = "./first_phase_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
model = AutoModelForTokenClassification.from_pretrained(model_dir, local_files_only=True)
model.eval()

id2label = model.config.id2label

# Sample input
tokens = ["John", "is", "skilled", "in", "Python", "and", "machine", "learning", "at", "DeepMind"]
encoded = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoded)
logits = outputs.logits
preds = torch.argmax(logits, dim=-1).squeeze().tolist()
word_ids = encoded.word_ids()

# Align prediction to original tokens
labels = []
prev_word_id = None
for idx, word_id in enumerate(word_ids):
    if word_id is None or word_id == prev_word_id:
        labels.append(None)
    else:
        label_id = preds[idx]
        labels.append(id2label[label_id])
        prev_word_id = word_id

# Show result
for token, label in zip(tokens, labels):
    print(f"{token:12s} → {label}")


John         → None
is           → O
skilled      → O
in           → O
Python       → O
and          → B-SKILL
machine      → O
learning     → B-SKILL
at           → I-SKILL
DeepMind     → O
