In [36]:
# phase3_train_ready.py

from __future__ import annotations
import os
import sys
import json
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List, Dict

import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# --- Metrics: prefer seqeval, fallback to token accuracy only ---
# _SEQEVAL_OK records whether the seqeval metric functions could be imported.
# compute_metrics_builder and the TrainingArguments' metric_for_best_model both
# branch on this flag.
_SEQEVAL_OK = True
try:
    from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
except Exception:
    # Any failure while importing seqeval switches to the fallback path; the
    # metric names are bound to None so they still exist at module level.
    _SEQEVAL_OK = False
    precision_score = recall_score = f1_score = accuracy_score = None


# ----------------------------
# Config and env helpers
# ----------------------------
@dataclass
class Args:
    """Configuration for one Phase 3 fine-tuning run: data, checkpoint paths, hyperparameters."""
    # Data
    data_file: Optional[str] = None       # If None, run_phase3 falls back to discover_data_file()
    tokens_field: str = "tokens"          # dataset column holding the pre-split words
    tags_field: str = "labels"            # dataset column holding the per-word tags
    sample_size: int = 60000              # upper bound on examples drawn after shuffling
    eval_test_size: float = 0.1           # passed as test_size when splitting the sampled pool
    seed: int = 42                        # used for shuffling, splitting and training seeds

    # Model/checkpoint
    # NOTE(review): both defaults are machine-specific absolute paths; main() overrides them.
    # Checkpoint the tokenizer and model are loaded from (the Phase 2 model by default).
    model_name_or_path: str = "C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/phase2/phase2_ner_model"  # resumes from Phase 2 by default
    # Directory that receives checkpoints, the final model/tokenizer and phase3_split_info.json.
    output_dir: str = "C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/phase3/ner-bert-output"           # default Phase 3 output

    # Tokenization/training
    max_length: int = 256                 # tokenizer truncation length
    # The remaining fields are forwarded unchanged to TrainingArguments in run_phase3.
    learning_rate: float = 5e-5
    weight_decay: float = 0.01
    warmup_ratio: float = 0.1
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16
    gradient_accumulation_steps: int = 1
    logging_steps: int = 100


def is_notebook() -> bool:
    """Return True when an ``ipykernel`` or ``IPython`` module is loaded in this process."""
    loaded = sys.modules
    return any(marker in loaded for marker in ("ipykernel", "IPython"))


def discover_data_file() -> Optional[str]:
    """Locate a Phase 3 JSON/JSONL data file without explicit configuration.

    Lookup order:
      1. The ``PHASE3_DATA_FILE`` environment variable, if it names an existing file.
      2. The first ``.json``/``.jsonl`` file (in sorted path order) found recursively
         under ``../data/phase3``, ``../../data/phase3`` or ``data/phase3``, resolved
         relative to this script's directory (or the current working directory when
         ``__file__`` is not defined, e.g. in a notebook).

    Returns:
        The path as a string, or None when nothing was found.
    """
    # 1) env override
    env_path = os.environ.get("PHASE3_DATA_FILE")
    if env_path and Path(env_path).is_file():
        return env_path

    # 2) common project locations relative to this script
    here = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
    exts = {".json", ".jsonl"}
    for relative_root in ("../data/phase3", "../../data/phase3", "data/phase3"):
        root = here / relative_root
        if not root.is_dir():
            continue
        # Only regular files qualify: a directory that happens to be named
        # "something.json" must not be returned as the data file.
        files = sorted(
            p for p in root.rglob("*")
            if p.is_file() and p.suffix.lower() in exts
        )
        if files:
            # Sorted order makes the pick deterministic across runs.
            return str(files[0])
    return None


def load_model_and_labels(model_path: str):
    """Load the tokenizer and token-classification model stored at *model_path*.

    The checkpoint's ``id2label`` (a dict, possibly with non-int keys, or a plain
    sequence) is normalized to an int-keyed dict; ``label2id`` is rebuilt as its
    inverse. Both mappings are written back onto ``model.config``.

    Returns:
        ``(tokenizer, model, id2label, label2id)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_path)

    # A dict contributes its own (key, value) pairs; anything else is numbered by position.
    stored = model.config.id2label
    pairs = stored.items() if isinstance(stored, dict) else enumerate(stored)
    id2label = {int(index): name for index, name in pairs}
    label2id = {name: index for index, name in id2label.items()}

    # Keep the config in sync with the normalized mappings.
    model.config.id2label = id2label
    model.config.label2id = label2id
    return tokenizer, model, id2label, label2id


def align_labels_with_tokens(labels, word_ids):
    """Expand word-level *labels* to one entry per sub-token.

    *word_ids* gives, for each sub-token, the index of the word it came from, or
    None where there is no word. The first sub-token of each run of equal word
    ids receives ``labels[word_id]``; every other position receives -100.
    """
    expanded = []
    last_seen = None
    for current in word_ids:
        starts_word = current is not None and current != last_seen
        expanded.append(labels[current] if starts_word else -100)
        last_seen = current
    return expanded


def compute_metrics_builder(id2label: Dict[int, str]):
    """Build the ``compute_metrics`` callback handed to the Trainer.

    Args:
        id2label: int-keyed mapping from label id to tag string; used to turn
            ids back into tag strings for seqeval.

    Returns:
        A function taking an object with ``predictions`` and ``label_ids``
        attributes. Positions whose label id is -100 are ignored. With seqeval
        available it returns ``precision``, ``recall``, ``f1`` and ``accuracy``;
        otherwise it returns only ``accuracy_token``.
    """
    if _SEQEVAL_OK:
        def compute_metrics(p):
            preds = np.argmax(p.predictions, axis=-1)
            labels = p.label_ids
            true_labels, true_preds = [], []
            for pred_row, label_row in zip(preds, labels):
                keep = label_row != -100
                true_labels.append([id2label[int(i)] for i in label_row[keep]])
                true_preds.append([id2label[int(i)] for i in pred_row[keep]])
            return {
                "precision": precision_score(true_labels, true_preds),
                "recall":    recall_score(true_labels, true_preds),
                "f1":        f1_score(true_labels, true_preds),
                "accuracy":  accuracy_score(true_labels, true_preds),
            }
        return compute_metrics

    # Fallback: token-level accuracy only (no seqeval installed)
    def compute_metrics(p):
        preds = np.argmax(p.predictions, axis=-1)
        labels = np.asarray(p.label_ids)
        # One boolean mask replaces the per-token Python loop over the eval set.
        keep = labels != -100
        total = int(keep.sum())
        correct = int((preds[keep] == labels[keep]).sum())
        return {"accuracy_token": correct / total if total else 0.0}
    return compute_metrics


# ----------------------------
# Main training flow
# ----------------------------
def run_phase3(args: Args):
    """Fine-tune the token-classification checkpoint on a sampled subset of the data.

    Flow: resolve the data file, seed the RNGs, load the JSON dataset, shuffle
    and take up to ``args.sample_size`` examples, split them into train/eval,
    load the model and tokenizer from ``args.model_name_or_path``, tokenize and
    align labels, train, evaluate, then save the model, the tokenizer and a
    ``phase3_split_info.json`` file recording which original rows went where.

    Raises:
        FileNotFoundError: no usable data file could be resolved.
        ValueError: the training split has no non-empty tag sequence, or an
            integer label falls outside the checkpoint's label range.
        KeyError: a string label is not present in the checkpoint's label2id.
    """
    # Resolve data path
    data_path = args.data_file or discover_data_file()
    if not data_path or not Path(data_path).is_file():
        raise FileNotFoundError(
            "Could not find data file. Set Args.data_file to your JSON/JSONL path "
            "or place a file under ../data/phase3."
        )

    # Prepare output
    out_dir = Path(args.output_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    # Reproducibility
    set_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    print(f"[Phase 3] Data file: {data_path}")
    print(f"[Phase 3] Output dir: {out_dir}")
    print("[Phase 3] Loading dataset...")
    raw = load_dataset("json", data_files={"raw": data_path})["raw"]
    # Remember each row's position in the file so the split can be traced later.
    raw = raw.add_column("_orig_idx", list(range(len(raw))))
    print(f"[Phase 3] Total samples loaded: {len(raw)}")
    print(f"[Phase 3] Columns: {raw.column_names}")

    # Shuffle + sample 60k (or less if not enough)
    raw_shuffled = raw.shuffle(seed=args.seed)
    k = min(args.sample_size, len(raw_shuffled))
    sampled = raw_shuffled.select(range(k))
    print(f"[Phase 3] Sampled size (pre-split): {len(sampled)}")

    # Split into train/eval
    split = sampled.train_test_split(test_size=args.eval_test_size, seed=args.seed)
    train_ds, eval_ds = split["train"], split["test"]
    print(f"[Phase 3] Train: {len(train_ds)} | Eval: {len(eval_ds)}")

    # Load model + tokenizer from Phase 2 checkpoint (default)
    print(f"[Phase 3] Loading model from: {args.model_name_or_path}")
    tokenizer, model, id2label, label2id = load_model_and_labels(args.model_name_or_path)
    num_labels = len(id2label)
    print(f"[Phase 3] Labels ({num_labels}): {sorted(label2id.keys())}")

    # Detect if dataset labels are ints or strings. Probe the first NON-EMPTY tag
    # sequence: indexing element 0 of the first example fails when that example
    # happens to have no tokens.
    first_tag = next(
        (tag_seq[0] for tag_seq in train_ds[args.tags_field] if tag_seq),
        None,
    )
    if first_tag is None:
        raise ValueError(
            f"No non-empty '{args.tags_field}' sequence found in the training split; "
            "cannot tell whether labels are ints or strings."
        )
    labels_are_int = isinstance(first_tag, int)

    def to_label_ids(tag_seq):
        """Convert one example's tags to label ids, validating them against the checkpoint."""
        if labels_are_int:
            # Ensure within range
            bad = [t for t in tag_seq if not (0 <= int(t) < num_labels)]
            if bad:
                raise ValueError(f"Found out-of-range label ids {bad}; expected 0..{num_labels-1}")
            return [int(t) for t in tag_seq]
        else:
            try:
                return [int(label2id[t]) for t in tag_seq]
            except KeyError as e:
                known = sorted(label2id.keys())
                raise KeyError(f"Unknown label '{e.args[0]}'. Known labels: {known}") from e

    # Tokenization + alignment
    def tokenize_and_align(batch):
        """Tokenize a batch of pre-split words and attach sub-token-aligned label ids."""
        tokens_batch = batch[args.tokens_field]
        tags_batch = batch[args.tags_field]
        tag_ids_batch = [to_label_ids(tags) for tags in tags_batch]

        enc = tokenizer(
            tokens_batch,
            is_split_into_words=True,
            truncation=True,
            max_length=args.max_length,
            return_offsets_mapping=False,
        )

        aligned = []
        for i, tags in enumerate(tag_ids_batch):
            word_ids = enc.word_ids(batch_index=i)
            aligned.append(align_labels_with_tokens(tags, word_ids))

        enc["labels"] = aligned
        return enc

    # Keep minimal original columns
    cols_keep = (args.tokens_field, args.tags_field, "_orig_idx")
    rem_train = [c for c in train_ds.column_names if c not in cols_keep]
    rem_eval  = [c for c in eval_ds.column_names if c not in cols_keep]

    print("[Phase 3] Tokenizing train...")
    train_tok = train_ds.map(tokenize_and_align, batched=True, remove_columns=rem_train, desc="Tokenize train")
    print("[Phase 3] Tokenizing eval...")
    eval_tok  = eval_ds.map(tokenize_and_align,  batched=True, remove_columns=rem_eval,  desc="Tokenize eval")

    # Data collator + metrics
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    compute_metrics = compute_metrics_builder(id2label)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=str(out_dir),
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        # The fallback metric function only reports "accuracy_token".
        metric_for_best_model="f1" if _SEQEVAL_OK else "accuracy_token",
        greater_is_better=True,
        logging_steps=args.logging_steps,
        report_to="none",
        seed=args.seed,
    )

    # NOTE(review): recent transformers releases appear to deprecate `tokenizer=` on
    # Trainer in favour of `processing_class=` — confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=eval_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train + evaluate
    print("[Phase 3] Starting training...")
    trainer.train()
    print("[Phase 3] Evaluating...")
    metrics = trainer.evaluate()
    print("[Phase 3] Eval metrics:", json.dumps({k: float(v) for k, v in metrics.items() if isinstance(v, (int, float))}, indent=2))

    # Save model + tokenizer
    print(f"[Phase 3] Saving model to: {out_dir}")
    trainer.save_model(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))

    # Save split traceability
    split_info = {
        "seed": args.seed,
        "data_file": data_path,
        "total_full": int(len(raw)),
        "sampled_k": int(k),
        "train_ratio": 1.0 - args.eval_test_size,
        "train_orig_indices": [int(i) for i in train_ds["_orig_idx"]],
        "eval_orig_indices": [int(i) for i in eval_ds["_orig_idx"]],
        "labels_from_checkpoint": sorted(label2id.keys()),
        "model_loaded_from": args.model_name_or_path,
    }
    with open(out_dir / "phase3_split_info.json", "w", encoding="utf-8") as f:
        json.dump(split_info, f, indent=2, ensure_ascii=False)

    print("[Phase 3] Done.")


# ----------------------------
# Entrypoint (works in notebook and CLI)
# ----------------------------
def main():
    """Build the Phase 3 configuration with explicit local paths and start training."""
    # Notebook-friendly: no argparse needed.
    a = Args()
    # Explicit, machine-specific paths. Because data_file is set here, run_phase3
    # uses it directly and never falls back to discover_data_file().
    a.data_file = r"C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\data\final_merged_iob_dataset.json"
    # Raw string with single separators: doubling them inside a raw string puts
    # two literal backslashes into the path (and into phase3_split_info.json).
    a.model_name_or_path = r"C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\phase2\phase2_ner_model"

    a.output_dir = r"C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\phase3\ner-bert-output"

    # If running as CLI and you want explicit args, uncomment and use argparse:
    # if not is_notebook():
    #     import argparse
    #     p = argparse.ArgumentParser()
    #     p.add_argument("--data_file", required=False)
    #     p.add_argument("--model_name_or_path", default=a.model_name_or_path)
    #     p.add_argument("--output_dir", default=a.output_dir)
    #     ns = p.parse_args()
    #     if ns.data_file: a.data_file = ns.data_file
    #     a.model_name_or_path = ns.model_name_or_path
    #     a.output_dir = ns.output_dir

    run_phase3(a)


if __name__ == "__main__":
    main()


[Phase 3] Data file: C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\data\final_merged_iob_dataset.json
[Phase 3] Output dir: C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\phase3\ner-bert-output
[Phase 3] Loading dataset...
[Phase 3] Total samples loaded: 116805
[Phase 3] Columns: ['tokens', 'labels', '_orig_idx']
[Phase 3] Sampled size (pre-split): 60000
[Phase 3] Train: 54000 | Eval: 6000
[Phase 3] Loading model from: C:\\Users\\WIN11\\OneDrive\\Desktop\\resume proj\\projwithml\\phase2\\phase2_ner_model
[Phase 3] Labels (7): ['B-COMPANY', 'B-ROLE', 'B-SKILL', 'I-COMPANY', 'I-ROLE', 'I-SKILL', 'O']
[Phase 3] Tokenizing train...
[Phase 3] Tokenizing eval...


  trainer = Trainer(


[Phase 3] Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0001,0.000673,0.999001,0.999445,0.999222,0.999882
2,0.0001,0.000845,0.998114,0.999333,0.998723,0.99983
3,0.0,1.7e-05,0.999889,0.999944,0.999917,0.999987


[Phase 3] Evaluating...


[Phase 3] Eval metrics: {
  "eval_loss": 1.6940721252467483e-05,
  "eval_precision": 0.9998889074043215,
  "eval_recall": 0.9999444506165982,
  "eval_f1": 0.9999166782391333,
  "eval_accuracy": 0.9999869166459514,
  "eval_runtime": 102.2422,
  "eval_samples_per_second": 58.684,
  "eval_steps_per_second": 3.668,
  "epoch": 3.0
}
[Phase 3] Saving model to: C:\Users\WIN11\OneDrive\Desktop\resume proj\projwithml\phase3\ner-bert-output
[Phase 3] Done.


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd

# --- Config ---
MODEL_PATH = "C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/phase3/ner-bert-output"
ALLOWED_LABELS = ["COMPANY", "ROLE", "SKILL"]  # Only keep these

# --- Load the fine-tuned checkpoint and wrap it in an aggregating NER pipeline ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# --- Resume-style sample data ---
sample_texts = [
    "John Doe is a Senior Software Engineer at Google with expertise in Python and Machine Learning.",
    "Jane Smith completed her Master's in Data Science from Stanford University in 2020.",
    "Rahul Verma worked as a Data Analyst at Deloitte and has experience with SQL, Tableau, and Excel.",
    "Priya Mehta holds a Bachelor's degree in Computer Engineering from IIT Bombay.",
    "Amit Sharma is currently employed at Infosys as a Cloud Solutions Architect.",
]
df = pd.DataFrame({"text": sample_texts})

# --- Run predictions: keep the raw output and an allow-listed view side by side ---
results_all = []
results_filtered = []
for text in sample_texts:
    prediction = ner_pipeline(text)
    filtered = list(filter(lambda found: found["entity_group"] in ALLOWED_LABELS, prediction))
    results_all.append(prediction)
    results_filtered.append(filtered)

# --- Diagnostics: print only the samples that produced allow-listed entities ---
for idx, (text, res) in enumerate(zip(sample_texts, results_filtered)):
    if not res:
        continue
    print(f"\n[Sample {idx+1}]")
    print("Text:", text)
    print("Filtered Entities (COMPANY, ROLE, SKILL only):")
    for entity in res:
        print(f"  - {entity['word']} ({entity['entity_group']} | score: {entity['score']:.3f})")

# --- Save results: one row per input text, with both prediction lists ---
df["predicted_entities_all"] = results_all
df["predicted_entities_filtered"] = results_filtered
df.to_csv("ner_resume_output_filtered.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0



[Sample 1]
Text: John Doe is a Senior Software Engineer at Google with expertise in Python and Machine Learning.
Filtered Entities (COMPANY, ROLE, SKILL only):
  - Senior Software Engineer (ROLE | score: 1.000)
  - Google (COMPANY | score: 1.000)
  - Python (SKILL | score: 1.000)
  - Machine Learning (SKILL | score: 1.000)

[Sample 2]
Text: Jane Smith completed her Master's in Data Science from Stanford University in 2020.
Filtered Entities (COMPANY, ROLE, SKILL only):
  - Master ' s (ROLE | score: 0.937)
  - Data Science (ROLE | score: 0.760)
  - Stanford University (COMPANY | score: 0.974)
  - 2020 (COMPANY | score: 0.638)

[Sample 3]
Text: Rahul Verma worked as a Data Analyst at Deloitte and has experience with SQL, Tableau, and Excel.
Filtered Entities (COMPANY, ROLE, SKILL only):
  - V (ROLE | score: 0.418)
  - Data Analyst (ROLE | score: 1.000)
  - Deloitte (COMPANY | score: 0.999)
  - SQL (SKILL | score: 0.997)
  - , Tableau, (COMPANY | score: 0.989)
  - Excel (COMPANY | score:

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd

# --- Config ---
# Directory holding the fine-tuned checkpoint.
# NOTE(review): this path contains "phase 3" (with a space), whereas the training
# cell writes to ".../phase3/ner-bert-output" — confirm which directory is current.
MODEL_PATH = r"C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/phase 3/ner-bert-output"

# --- Load tokenizer and model ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

# --- Inference pipeline ---
# aggregation_strategy="simple" is what makes each result carry "entity_group"
# and "word" keys, which clean_entities below relies on.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# --- Post-processing helper ---
# Lower-cased tokens that mark an entity as an academic qualification rather
# than a company, role or skill.
_DEGREE_KEYWORDS = frozenset({
    "bachelor", "master", "mba", "phd", "m.sc", "b.sc",
    "msc", "bsc", "degree", "diploma", "btech", "mtech",
})
# Characters treated as token separators when searching for degree keywords.
_DEGREE_SEPARATORS = str.maketrans({ch: " " for ch in "'\u2019-/"})


def _mentions_degree(word):
    """Return True if any whole token of *word* is a degree keyword (or its plural)."""
    for token in word.lower().translate(_DEGREE_SEPARATORS).split():
        token = token.strip(".,;:()")
        if token in _DEGREE_KEYWORDS:
            return True
        if token.endswith("s") and token[:-1] in _DEGREE_KEYWORDS:
            return True
    return False


def clean_entities(entities):
    """Filter and tidy aggregated NER pipeline output.

    Args:
        entities: iterable of dicts with ``entity_group``, ``word`` and ``score`` keys.

    Returns:
        A list of ``{"word", "entity_group", "score"}`` dicts containing only
        COMPANY/ROLE/SKILL entities, with a leading ``##`` and surrounding
        ``",. "`` characters removed from the word. Entities that end up empty,
        are a single character other than "c"/"C", or contain a degree keyword
        as a whole token are dropped. ``score`` is converted to a built-in float.
    """
    cleaned = []
    for ent in entities:
        label = ent["entity_group"].upper()
        if label not in {"COMPANY", "ROLE", "SKILL"}:
            continue

        word = ent["word"].strip()
        if word.startswith("##"):
            word = word[2:]
        # Strip punctuation BEFORE the length check so that "V," is judged as "V".
        word = word.strip(",. ")
        if not word:
            continue
        if len(word) == 1 and word.lower() != "c":
            continue
        # Whole-token match: a substring test would also discard "IIT Bombay"
        # and "Mumbai", because both contain "mba".
        if _mentions_degree(word):
            continue

        cleaned.append({
            "word": word,
            "entity_group": label,
            # The score may be a NumPy scalar; a built-in float serializes cleanly.
            "score": float(ent["score"]),
        })

    return cleaned

# --- Resume-style sample data ---
sample_texts = [
    "John Doe is a Senior Software Engineer at Google with expertise in Python and Machine Learning.",
    "Jane Smith completed her Master's in Data Science from Stanford University in 2020.",
    "Rahul Verma worked as a Data Analyst at Deloitte and has experience with SQL, Tableau, and Excel.",
    "Priya Mehta holds a Bachelor's degree in Computer Engineering from IIT Bombay.",
    "Amit Sharma is currently employed at Infosys as a Cloud Solutions Architect.",
]
df = pd.DataFrame({"text": sample_texts})

# --- Run predictions: keep the raw pipeline output next to the cleaned view ---
results_all = []
results_filtered = []
for text in sample_texts:
    prediction = ner_pipeline(text)
    filtered = clean_entities(prediction)
    results_all.append(prediction)
    results_filtered.append(filtered)

# --- Display results inline in Jupyter (samples with no cleaned entities are skipped) ---
for idx, (text, res) in enumerate(zip(sample_texts, results_filtered)):
    if not res:
        continue
    print(f"\n[Sample {idx+1}]")
    print("Text:", text)
    print("Cleaned Entities (COMPANY, ROLE, SKILL only):")
    for entity in res:
        print(f"  - {entity['word']} ({entity['entity_group']} | score: {entity['score']:.3f})")

# Optional: attach both prediction lists to the frame (nothing is written to disk here)
df["predicted_entities_all"] = results_all
df["predicted_entities_cleaned"] = results_filtered



Device set to use cuda:0



[Sample 1]
Text: John Doe is a Senior Software Engineer at Google with expertise in Python and Machine Learning.
Cleaned Entities (COMPANY, ROLE, SKILL only):
  - Senior Software Engineer (ROLE | score: 1.000)
  - Google (COMPANY | score: 1.000)
  - Python (SKILL | score: 1.000)
  - Machine Learning (SKILL | score: 1.000)

[Sample 2]
Text: Jane Smith completed her Master's in Data Science from Stanford University in 2020.
Cleaned Entities (COMPANY, ROLE, SKILL only):
  - Data Science (ROLE | score: 0.760)
  - Stanford University (COMPANY | score: 0.974)
  - 2020 (COMPANY | score: 0.638)

[Sample 3]
Text: Rahul Verma worked as a Data Analyst at Deloitte and has experience with SQL, Tableau, and Excel.
Cleaned Entities (COMPANY, ROLE, SKILL only):
  - Data Analyst (ROLE | score: 1.000)
  - Deloitte (COMPANY | score: 0.999)
  - SQL (SKILL | score: 0.997)
  - Tableau (COMPANY | score: 0.989)
  - Excel (COMPANY | score: 1.000)

[Sample 4]
Text: Priya Mehta holds a Bachelor's degree in Comp

In [4]:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Application object; the /extract_entities route below is registered on it.
app = FastAPI()

# Directory holding the fine-tuned checkpoint.
# NOTE(review): this path contains "phase 3" (with a space), whereas the training
# cell writes to ".../phase3/ner-bert-output" — confirm which directory is current.
MODEL_PATH = r"C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/phase 3/ner-bert-output"
# Tokenizer, model and pipeline are created once at import time and shared by
# every request. Both loaders are called with local_files_only=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH, local_files_only=True)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Lower-cased tokens that mark an entity as an academic qualification rather
# than a company, role or skill.
_DEGREE_KEYWORDS = frozenset({
    "bachelor", "master", "mba", "phd", "m.sc", "b.sc",
    "msc", "bsc", "degree", "diploma", "btech", "mtech",
})
# Characters treated as token separators when searching for degree keywords.
_DEGREE_SEPARATORS = str.maketrans({ch: " " for ch in "'\u2019-/"})


def _mentions_degree(word):
    """Return True if any whole token of *word* is a degree keyword (or its plural)."""
    for token in word.lower().translate(_DEGREE_SEPARATORS).split():
        token = token.strip(".,;:()")
        if token in _DEGREE_KEYWORDS:
            return True
        if token.endswith("s") and token[:-1] in _DEGREE_KEYWORDS:
            return True
    return False


def clean_entities(entities):
    """Filter and tidy aggregated NER pipeline output for the API response.

    Args:
        entities: iterable of dicts with ``entity_group``, ``word`` and ``score`` keys.

    Returns:
        A list of ``{"word", "entity_group", "score"}`` dicts containing only
        COMPANY/ROLE/SKILL entities, with a leading ``##`` and surrounding
        ``",. "`` characters removed from the word. Entities that end up empty,
        are a single character other than "c"/"C", or contain a degree keyword
        as a whole token are dropped. ``score`` is converted to a built-in float
        so the result is JSON-serializable.
    """
    cleaned = []
    for ent in entities:
        label = ent["entity_group"].upper()
        if label not in {"COMPANY", "ROLE", "SKILL"}:
            continue

        word = ent["word"].strip()
        if word.startswith("##"):
            word = word[2:]
        # Strip punctuation BEFORE the length check so that "V," is judged as "V".
        word = word.strip(",. ")
        if not word:
            continue
        if len(word) == 1 and word.lower() != "c":
            continue
        # Whole-token match: a substring test would also discard "IIT Bombay"
        # and "Mumbai", because both contain "mba".
        if _mentions_degree(word):
            continue

        cleaned.append({
            "word": word,
            "entity_group": label,
            # The score may be a NumPy scalar, which the JSON response encoder
            # may not accept; a built-in float always serializes.
            "score": float(ent["score"]),
        })
    return cleaned

# Request body for /extract_entities: a single free-text field to run NER over.
class TextInput(BaseModel):
    text: str

@app.post("/extract_entities")
def extract_entities(data: TextInput):
    # Run the NER pipeline on the submitted text, keep only the cleaned
    # COMPANY/ROLE/SKILL entities, and wrap them as {"entities": [...]}.
    return {"entities": clean_entities(ner_pipeline(data.text))}


Device set to use cuda:0
