In [5]:
#!/usr/bin/env python
# coding: utf-8

import os
import json
import argparse
from typing import List, Dict, Any

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score


# ---------- Data utilities ----------

def load_iob_data(path: str) -> List[Dict[str, Any]]:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Input JSON must be a list of samples.")
    return data


def validate_data(data: List[Dict[str, Any]], sample_check: int = 3) -> None:
    if len(data) == 0:
        raise ValueError("Dataset is empty.")
    for i, ex in enumerate(data[:sample_check]):
        if "tokens" not in ex or "labels" not in ex:
            raise ValueError(f"Sample {i} missing 'tokens' or 'labels'.")
        if len(ex["tokens"]) != len(ex["labels"]):
            raise ValueError(f"Sample {i} len(tokens) != len(labels).")


def create_label_map(dataset: List[Dict[str, Any]]):
    unique = sorted({lab for ex in dataset for lab in ex["labels"]})
    label2id = {l: i for i, l in enumerate(unique)}
    id2label = {i: l for l, i in label2id.items()}
    return unique, label2id, id2label


# ---------- Tokenization & alignment ----------

def tokenize_and_align_labels(example, tokenizer, label2id, max_length: int = 256):
    encoded = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
    )
    word_ids = encoded.word_ids()
    labels = example["labels"]

    aligned_labels = []
    prev_word_id = None
    for w_id in word_ids:
        if w_id is None:
            aligned_labels.append(-100)  # special/pad tokens
        elif w_id != prev_word_id:
            aligned_labels.append(label2id[labels[w_id]])  # first subtoken of a word
        else:
            aligned_labels.append(-100)  # subsequent subtokens
        prev_word_id = w_id

    encoded["labels"] = aligned_labels
    return encoded


# ---------- Metrics ----------

def compute_metrics(eval_pred, id2label):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    true_labels = []
    pred_labels = []
    for p_seq, l_seq in zip(preds, labels):
        tl_seq = []
        pl_seq = []
        for p, l in zip(p_seq, l_seq):
            if l == -100:
                continue
            tl_seq.append(id2label[int(l)])
            pl_seq.append(id2label[int(p)])
        true_labels.append(tl_seq)
        pred_labels.append(pl_seq)

    return {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
    }


# ---------- Main ----------

def main():
    parser = argparse.ArgumentParser(description="Phase 2 NER fine-tuning (30k train pool, 2k eval)")
    parser.add_argument("--data_path", type=str, default="final_merged_iob_dataset.json")
    parser.add_argument("--phase1_model_dir", type=str, default="./first_phase_ner_model")  # adjust if needed
    parser.add_argument("--output_dir", type=str, default="./models/phase2_ner_model")
    parser.add_argument("--logs_dir", type=str, default="./logs/logs_phase2")
    parser.add_argument("--report_path", type=str, default="./reports/phase2_eval_report.json")
    parser.add_argument("--indices_path", type=str, default="./reports/phase2_indices.json")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max_length", type=int, default=256)
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--warmup_ratio", type=float, default=0.05)
    parser.add_argument("--grad_accum", type=int, default=1)
    parser.add_argument("--phase2_pool", type=int, default=30_000)
    parser.add_argument("--eval_size", type=int, default=2_000)
    parser.add_argument("--num_proc", type=int, default=1)
    args = parser.parse_args() if hasattr(__builtins__, "__IPYTHON__") is False else parser.parse_args([])

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.dirname(args.logs_dir), exist_ok=True)
    os.makedirs(os.path.dirname(args.report_path), exist_ok=True)

    set_seed(args.seed)

    # Load and validate data
    data = load_iob_data(args.data_path)
    validate_data(data)
    n = len(data)
    print(f"Loaded {n} samples.")

    # Load label map from Phase 1 if available; else build from full data
    label_map_path = os.path.join(args.phase1_model_dir, "label_map.json")
    if os.path.exists(label_map_path):
        with open(label_map_path, "r", encoding="utf-8") as f:
            lm = json.load(f)
        label_list = lm["label_list"]
        label2id = {k: int(v) for k, v in lm["label2id"].items()}
        id2label = {int(k): v for k, v in lm["id2label"].items()}
        print(f"Loaded label map from {label_map_path} (num_labels={len(label_list)}).")
    else:
        print("Warning: label_map.json not found in Phase 1 dir. Rebuilding from data.")
        label_list, label2id, id2label = create_label_map(data)

    # Reconstruct deterministic non-overlapping indices using the same seed as Phase 1
    # Phase 1 used first 11,000 (10k train + 1k eval), so Phase 2 pool = next 30,000
    rng = np.random.default_rng(args.seed)
    indices = np.arange(n)
    rng.shuffle(indices)
    needed = 11_000 + args.phase2_pool
    if n < needed:
        raise ValueError(f"Dataset too small for Phase 2: need at least {needed}, have {n}.")

    phase2_pool_idx = indices[11_000:11_000 + args.phase2_pool]
    eval_size = min(args.eval_size, len(phase2_pool_idx)//10) if args.eval_size > len(phase2_pool_idx) else args.eval_size
    eval_idx = phase2_pool_idx[:eval_size]
    train_idx = phase2_pool_idx[eval_size:]

    # Persist indices for reproducibility
    with open(args.indices_path, "w", encoding="utf-8") as f:
        json.dump({
            "seed": args.seed,
            "phase2_pool_idx": phase2_pool_idx.tolist(),
            "train_idx": train_idx.tolist(),
            "eval_idx": eval_idx.tolist()
        }, f, indent=2)
    print(f"Phase 2 pool: {len(phase2_pool_idx)} | Train: {len(train_idx)} | Eval: {len(eval_idx)}")
    print(f"Saved indices to {args.indices_path}")

    # Build subsets
    train_raw = [data[i] for i in train_idx]
    eval_raw = [data[i] for i in eval_idx]

    # Tokenizer and datasets
    tokenizer = AutoTokenizer.from_pretrained(
        args.phase1_model_dir,
        local_files_only=True
    )

    def _map_fn(ex):
        return tokenize_and_align_labels(ex, tokenizer, label2id, max_length=args.max_length)

    train_ds = Dataset.from_list(train_raw).map(_map_fn, batched=False, num_proc=args.num_proc, desc="Tokenizing train")
    eval_ds = Dataset.from_list(eval_raw).map(_map_fn, batched=False, num_proc=args.num_proc, desc="Tokenizing eval")

    # Load Phase 1 checkpoint model
    model = AutoModelForTokenClassification.from_pretrained(
        args.phase1_model_dir,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        local_files_only=True
    )


    # Training setup
    fp16 = torch.cuda.is_available()
    bf16 = torch.cuda.is_available() and getattr(torch.cuda, "is_bf16_supported", lambda: False)()

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir=args.logs_dir,
        logging_steps=50,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        seed=args.seed,
        group_by_length=True,
        fp16=fp16 and not bf16,
        bf16=bf16,
        report_to="none",
    )

    collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=lambda p: compute_metrics(p, id2label),
    )

    # Train
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Starting training on {device}...")
    trainer.train()
    print("Training complete.")

    # Evaluate and save
    metrics = trainer.evaluate()
    print("Evaluation metrics:")
    for k, v in metrics.items():
        print(f"  {k}: {v}")

    # Save artifacts
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    with open(os.path.join(args.output_dir, "label_map.json"), "w", encoding="utf-8") as f:
        json.dump({"label_list": label_list, "label2id": label2id, "id2label": id2label}, f, ensure_ascii=False, indent=2)
    with open(args.report_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    print(f"Artifacts saved to {args.output_dir}")
    print(f"Metrics saved to {args.report_path}")


if __name__ == "__main__":
    main()


Loaded 116805 samples.
Loaded label map from ./first_phase_ner_model\label_map.json (num_labels=7).
Phase 2 pool: 30000 | Train: 28000 | Eval: 2000
Saved indices to ./reports/phase2_indices.json


Tokenizing train: 100%|██████████| 28000/28000 [00:10<00:00, 2668.61 examples/s]
Tokenizing eval: 100%|██████████| 2000/2000 [00:00<00:00, 2822.87 examples/s]
  trainer = Trainer(


Starting training on cuda...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0014,0.00159,0.999689,0.997836,0.999,0.998418
2,0.0002,1.3e-05,1.0,1.0,1.0,1.0
3,0.0004,0.000276,0.999961,0.999833,0.999833,0.999833
4,0.0,7e-06,1.0,1.0,1.0,1.0
5,0.0,4e-06,1.0,1.0,1.0,1.0


Training complete.


Evaluation metrics:
  eval_loss: 1.3432188097795006e-05
  eval_accuracy: 1.0
  eval_precision: 1.0
  eval_recall: 1.0
  eval_f1: 1.0
  eval_runtime: 7.5552
  eval_samples_per_second: 264.717
  eval_steps_per_second: 16.545
  epoch: 5.0
Artifacts saved to ./models/phase2_ner_model
Metrics saved to ./reports/phase2_eval_report.json


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load model and tokenizer
model_path = "./models/phase2_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample resume text
resume_text = """
"John", "is", "skilled", "in", "Python", "and", "machine", "learning", "at", "DeepMind"
"""

# Run NER
entities = ner_pipeline(resume_text)

# Display results
for ent in entities:
    print(f"{ent['word']} ({ent['entity_group']}) | Score: {ent['score']:.3f}")


  from .autonotebook import tqdm as notebook_tqdm


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './models/phase2_ner_model'. Use `repo_type` argument if needed.