In [1]:
import random
import json

# -------------------------------
# 1. Define sample entities
# -------------------------------
indian_names = ["John Doe", "Jane Smith", "Rahul Sharma", "Priya Singh",
    "Ankit Kumar", "Neha Patel", "Vikram Joshi", "Aisha Khan"]
roles = [
    "Senior AI/ML Engineer", "Full-stack Developer", "Cloud Solutions Architect",
    "Data Scientist", "Machine Learning Engineer"
]
companies = [
    "DeepTech Solutions", "InnoData Labs", "CloudNova Pvt Ltd", "NextGen AI", "TechVantage"
]
skills = [
     "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "Rust",
    "TensorFlow", "PyTorch", "FastAPI", "Flask", "Docker", "Kubernetes",
    "Git", "Terraform", "Jenkins", "PostgreSQL", "MongoDB", "Redis",
    "Apache Kafka", "Spark", "Hadoop", "NLP", "Reinforcement Learning", "Distributed Systems",
]

# -------------------------------
# 2. Generate synthetic sentences
# -------------------------------
def generate_sentence():
    name = random.choice(indian_names)
    role = random.choice(roles)
    company = random.choice(companies)
    skill_sample = random.sample(skills, k=5)  # pick 5 random skills

    sentence = f"{name} is a {role} at {company}. Skilled in {', '.join(skill_sample)}."
    
    entities = []

    # BIO tagging
    # Name
    name_tokens = name.split()
    entities.extend([{"word": token, "entity_group": "B-PER"} if i==0 else {"word": token, "entity_group": "I-PER"} for i, token in enumerate(name_tokens)])
    
    # Role
    role_tokens = role.split()
    entities.extend([{"word": token, "entity_group": "B-ROLE"} if i==0 else {"word": token, "entity_group": "I-ROLE"} for i, token in enumerate(role_tokens)])
    
    # Company
    company_tokens = company.split()
    entities.extend([{"word": token, "entity_group": "B-COMPANY"} if i==0 else {"word": token, "entity_group": "I-COMPANY"} for i, token in enumerate(company_tokens)])
    
    # Skills
    for skill in skill_sample:
        skill_tokens = skill.split()
        entities.extend([{"word": token, "entity_group": "B-SKILL"} if i==0 else {"word": token, "entity_group": "I-SKILL"} for i, token in enumerate(skill_tokens)])
    
    return {"text": sentence, "entities": entities}

# -------------------------------
# 3. Generate dataset
# -------------------------------
num_samples = 50  # generate 50 synthetic examples
synthetic_dataset = [generate_sentence() for _ in range(num_samples)]

# -------------------------------
# 4. Save to JSON for later use
# -------------------------------
with open("synthetic_ner_data.json", "w") as f:
    json.dump(synthetic_dataset, f, indent=4)

print("✅ Synthetic NER dataset generated and saved as synthetic_ner_data.json")


✅ Synthetic NER dataset generated and saved as synthetic_ner_data.json


In [16]:
# -------------------------------
# 0. Imports
# -------------------------------
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import f1_score

# -------------------------------
# 1. Example data
# -------------------------------
# Replace this with your real dataset
data = [
    {"text": "John works at OpenAI", "entities": [{"word": "John", "label": "B-PER"}, {"word": "OpenAI", "label": "B-ORG"}]},
    {"text": "Alice joined Google", "entities": [{"word": "Alice", "label": "B-PER"}, {"word": "Google", "label": "B-ORG"}]}
]

# Convert B-PER/I-PER to O if not trained
for item in data:
    for ent in item["entities"]:
        if ent["label"] in ["B-PER", "I-PER"]:
            ent["label"] = "O"

# -------------------------------
# 2. Create label maps
# -------------------------------
unique_labels = set(ent["label"] for item in data for ent in item["entities"])
label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
id2label = {i: label for label, i in label2id.items()}

# -------------------------------
# 3. Tokenizer
# -------------------------------
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# -------------------------------
# 4. Preprocess & align labels
# -------------------------------
texts, labels = [], []

for item in data:
    words = [ent["word"] for ent in item["entities"]]
    word_labels = [ent["label"] for ent in item["entities"]]

    # Tokenize
    tokenized = tokenizer(words, is_split_into_words=True, truncation=True, padding="max_length", max_length=128)

    # Align labels with subword tokens
    aligned_labels = []
    for i, word_id in enumerate(tokenized.word_ids(batch_index=0)):
        if word_id is None:
            aligned_labels.append(-100)  # special tokens
        else:
            aligned_labels.append(label2id[word_labels[word_id]])

    texts.append(tokenized)
    labels.append(aligned_labels)

# Convert to HuggingFace Dataset
dataset = Dataset.from_dict({
    "input_ids": [t["input_ids"] for t in texts],
    "attention_mask": [t["attention_mask"] for t in texts],
    "labels": labels
})

# -------------------------------
# 5. Model
# -------------------------------
num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels,
                                                        id2label=id2label, label2id=label2id)

# -------------------------------
# 6. Data collator
# -------------------------------
data_collator = DataCollatorForTokenClassification(tokenizer)

# -------------------------------
# 7. Metrics
# -------------------------------
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
                  for prediction, label in zip(predictions, labels)]

    f1 = f1_score(true_labels, true_preds)
    return {"f1": f1}

# -------------------------------
# 8. Training arguments
# -------------------------------
training_args = TrainingArguments(
    output_dir="./ner_model",
    eval_strategy="steps",
    save_steps=100,
    eval_steps=100,
    logging_steps=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)

# -------------------------------
# 9. Trainer
# -------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# -------------------------------
# 10. Train
# -------------------------------
trainer.train()

# -------------------------------
# 11. Save final model
# -------------------------------
trainer.save_model("./ner_model_final")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss


In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# -------------------------------
# 1. Load model and tokenizer
# -------------------------------
model_name_or_path = "C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/src/ner_model_final"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
model.eval()

# -------------------------------
# 2. Example tech resumes
# -------------------------------
texts = [
    """John Doe is a software engineer with 5 years of experience in Python, PyTorch, 
    and NLP. He has designed AI-based resume parsing systems, chatbots, and recommendation engines. 
    He contributed to open-source ML libraries and published papers on deep learning.""",
    
    """Jane Smith is a data scientist specialized in large-scale data processing with Spark and Hadoop. 
    She implemented predictive models for user behavior analysis, optimized SQL queries, 
    and led the development of cloud-based analytics pipelines using AWS services."""
]

# -------------------------------
# 3. Run NER safely
# -------------------------------
for text in texts:
    # Tokenize
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True, return_offsets_mapping=True)
    offsets = encoding.pop("offset_mapping")[0]  # remove offsets before passing to model

    # Predict
    with torch.no_grad():
        outputs = model(**encoding)
    predictions = torch.argmax(outputs.logits, dim=2)[0]

    # Map tokens back to original text
    pred_labels = []
    current_word = None

    for idx, word_id in enumerate(encoding.word_ids(batch_index=0)):
        if word_id is None:
            continue  # skip special tokens
        label_id = predictions[idx].item()
        label = model.config.id2label[label_id]

        if word_id != current_word:
            start, end = offsets[idx].tolist()
            word_text = text[start:end]
            pred_labels.append((word_text, label))
            current_word = word_id

    # -------------------------------
    # 4. Display results
    # -------------------------------
    print("\n===============================")
    print("Resume Text:\n")
    print(text)
    print("\nNER Predictions:\n")
    for word, label in pred_labels:
        print(f"{word} -> {label}")



Resume Text:

John Doe is a software engineer with 5 years of experience in Python, PyTorch, 
    and NLP. He has designed AI-based resume parsing systems, chatbots, and recommendation engines. 
    He contributed to open-source ML libraries and published papers on deep learning.

NER Predictions:

John -> O
Doe -> O
is -> O
a -> O
software -> B-ORG
engineer -> B-ORG
with -> B-ORG
5 -> B-ORG
years -> B-ORG
of -> B-ORG
experience -> B-ORG
in -> O
Python -> O
, -> O
P -> B-ORG
, -> O
and -> B-ORG
NL -> B-ORG
. -> O
He -> O
has -> B-ORG
designed -> B-ORG
AI -> O
- -> B-ORG
based -> B-ORG
resume -> O
par -> B-ORG
systems -> B-ORG
, -> O
chat -> B-ORG
, -> O
and -> B-ORG
recommendation -> B-ORG
engines -> B-ORG
. -> O
He -> O
contributed -> B-ORG
to -> B-ORG
open -> B-ORG
- -> B-ORG
source -> B-ORG
ML -> O
libraries -> B-ORG
and -> B-ORG
published -> O
papers -> B-ORG
on -> B-ORG
deep -> B-ORG
learning -> B-ORG
. -> B-ORG

Resume Text:

Jane Smith is a data scientist specialized in large-s