In [None]:
import json
from collections import defaultdict
import re

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(data, path):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

def extract_skills_from_text(text):
    """
    Very simple keyword-based skill extractor from text.
    Replace this with spaCy or ML NER model for better results.
    """
    skill_keywords = [
        "project management", "quality", "budget", "client", "proposal",
        "reporting", "execution", "operations", "risk", "planning", "SOW", "RFQ"
    ]
    
    text = text.lower()
    found_skills = []
    for keyword in skill_keywords:
        if keyword.lower() in text:
            found_skills.append(keyword.lower())

    return found_skills

def extract_from_experience_blocks(resume_data):
    skills_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for profile in resume_data:
        experiences = profile.get("experience", [])
        for exp in experiences:
            raw_company = exp.get("company", "").strip().lower()
            company = re.sub(r'[^a-zA-Z0-9 ]', '', raw_company).strip()

            role = (exp.get("title") or "").strip().lower()

            responsibilities = exp.get("responsibilities", [])
            if not responsibilities:
                continue

            for line in responsibilities:
                inferred_skills = extract_skills_from_text(line)
                for skill in inferred_skills:
                    skills_dict[company][role][skill] += 1

    return skills_dict

def merge_skills_into_benchmark(existing_benchmark, new_skills):
    for company, roles in new_skills.items():
        if company not in existing_benchmark:
            existing_benchmark[company] = {}

        for role, skills in roles.items():
            if role not in existing_benchmark[company]:
                existing_benchmark[company][role] = {}

            for skill, count in skills.items():
                existing_count = existing_benchmark[company][role].get(skill, 0)
                existing_benchmark[company][role][skill] = existing_count + count

    return existing_benchmark

# === Paths ===
resume_path = "resumes_dataset.json"
benchmark_path = "merged_benchmark_by_company_role.json"
output_path = "merged_benchmark_by_company_role.json"

# === Pipeline ===
print("📂 Loading files...")
resume_data = load_json(resume_path)
existing_benchmark = load_json(benchmark_path)

print("🔍 Extracting from responsibilities + roles...")
inferred_skills = extract_from_experience_blocks(resume_data)

print("🔗 Merging into benchmark...")
updated = merge_skills_into_benchmark(existing_benchmark, inferred_skills)

print("💾 Saving merged file...")
save_json(updated, output_path)

print("✅ Resume experiences merged into benchmark.")


In [5]:
import json
import random
from transformers import AutoTokenizer

# ==== CONFIG ====
benchmark_file = "kaggle_resume_benchmarkdata.json"  # Your benchmark data file
output_file = "benchmark_iob_dataset.json"           # Output IOB dataset
pretrained_model = "bert-base-cased"                 # Match your NER model tokenizer
samples_per_role = 5                                 # How many synthetic sentences per role

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Load benchmark data
with open(benchmark_file, "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)

# Sentence templates
role_templates = [
    "Worked as a {role} at {company} specializing in {skill}.",
    "Served as {role} for {company} with expertise in {skill}.",
    "Held the position of {role} at {company}, focusing on {skill}.",
    "As a {role}, contributed to {company} using {skill}.",
    "Responsible for {skill} while working as a {role} at {company}."
]

# Tokenize & label
def tokenize_and_label(sentence, role, company, skill):
    tokens = tokenizer.tokenize(sentence)
    labels = ["O"] * len(tokens)

    def label_entity(entity, label_prefix):
        entity_tokens = tokenizer.tokenize(entity)
        for i in range(len(tokens) - len(entity_tokens) + 1):
            if tokens[i:i+len(entity_tokens)] == entity_tokens:
                labels[i] = f"B-{label_prefix}"
                for j in range(1, len(entity_tokens)):
                    labels[i+j] = f"I-{label_prefix}"

    label_entity(company, "COMPANY")
    label_entity(role, "ROLE")
    label_entity(skill, "SKILL")
    return tokens, labels

# Build dataset
iob_dataset = []

# Detect if data is dict or list
if isinstance(benchmark_data, dict):
    company_items = benchmark_data.items()
else:
    company_items = [(entry.get("company"), entry.get("roles", [])) for entry in benchmark_data]

for company, roles in company_items:
    if not company or not roles:
        continue

    for role_entry in roles:
        # Role might be a dict or string
        if isinstance(role_entry, dict):
            role = role_entry.get("role")
            skills = role_entry.get("skills", [])
        else:
            role = str(role_entry)
            skills = []

        if not role:
            continue

        # If no skills, still make a generic skill placeholder
        if not skills:
            skills = ["technology", "software development"]

        for skill in skills:
            for _ in range(samples_per_role):
                template = random.choice(role_templates)
                sentence = template.format(role=role, company=company, skill=skill)
                tokens, labels = tokenize_and_label(sentence, role, company, skill)
                iob_dataset.append({"tokens": tokens, "labels": labels})

# Save to file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(iob_dataset, f, indent=2)

print(f"✅ Generated {len(iob_dataset)} labeled samples → {output_file}")


✅ Generated 1440 labeled samples → benchmark_iob_dataset.json


In [5]:
import json
import random


# ===============================
# Config
# ===============================
benchmark_file = "benchmark_iob_dataset.json"
companies_file = "extra_companies.json"
roles_file = "extra_roles.json"
skills_file = "extra_skills.json"
output_file = "benchmark_iob_dataset_augmented.json"
samples_per_triplet = 5  # how many variations per (company, role, skill)

# ===============================
# Load spaCy model
# ===============================


# ===============================
# Helper function using spaCy for IOB labeling
# ===============================
import re

def tokenize_and_label(sentence, role, company, skill):
    """
    Tokenizes and labels the sentence with IOB tags for COMPANY, ROLE, SKILL.
    - Case-insensitive
    - Longest-first entity matching
    - Strips punctuation from tokens
    - Avoids overlapping spans
    """

    # ====== Tokenize ======
    tokens = re.findall(r"\w+|\S", sentence)  # Keeps punctuation as separate tokens
    norm_tokens = [t.lower() for t in tokens]
    labels = ["O"] * len(tokens)

    # ====== Normalize Entities ======
    def normalize(text):
        return [w.lower() for w in re.findall(r"\w+|\S", text)]

    entities = [
        (normalize(company), "COMPANY"),
        (normalize(role), "ROLE"),
        (normalize(skill), "SKILL"),
    ]

    # Sort longest-first to prefer multi-word matches
    entities.sort(key=lambda e: len(e[0]), reverse=True)

    # ====== Match and Label ======
    for ent_tokens, ent_type in entities:
        L = len(ent_tokens)
        if L == 0:
            continue
        for i in range(len(norm_tokens) - L + 1):
            window = norm_tokens[i:i+L]
            if window == ent_tokens and all(lbl == "O" for lbl in labels[i:i+L]):
                labels[i] = f"B-{ent_type}"
                for j in range(1, L):
                    labels[i + j] = f"I-{ent_type}"
                break

    return tokens, labels


# ===============================
# Load datasets
# ===============================
with open(benchmark_file, "r", encoding="utf-8") as f:
    benchmark_data = json.load(f)

with open(companies_file, "r", encoding="utf-8") as f:
    extra_companies = json.load(f)

with open(roles_file, "r", encoding="utf-8") as f:
    extra_roles = json.load(f)

with open(skills_file, "r", encoding="utf-8") as f:
    extra_skills = json.load(f)

# ===============================
# Templates for synthetic sentences
# ===============================
templates = [
    "Worked as a {role} at {company} specializing in {skill}.",
    "Served as {role} for {company} with expertise in {skill}.",
    "Held the position of {role} at {company}, focusing on {skill}.",
    "As a {role}, contributed to {company} using {skill}.",
    "Responsible for {skill} development at {company} as a {role}.",
    "Implemented {skill} solutions while working as a {role} at {company}.",
    "Developed {skill} projects at {company} in the role of {role}."
]

# ===============================
# Generate synthetic dataset
# ===============================
synthetic_data = []

for company in extra_companies:
    for role in extra_roles:
        for skill in extra_skills:
            for _ in range(samples_per_triplet):
                template = random.choice(templates)
                sentence = template.format(role=role, company=company, skill=skill)
                tokens, labels = tokenize_and_label(sentence, role, company, skill)
                synthetic_data.append({
                    "tokens": tokens,
                    "labels": labels
                })

# ===============================
# Merge with benchmark
# ===============================
augmented_dataset = benchmark_data + synthetic_data

# ===============================
# Save output
# ===============================
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(augmented_dataset, f, indent=2, ensure_ascii=False)

print(f"✅ Augmented dataset saved to {output_file}")
print(f"Original dataset size: {len(benchmark_data)}")
print(f"Synthetic dataset size: {len(synthetic_data)}")
print(f"Total dataset size: {len(augmented_dataset)}")


✅ Augmented dataset saved to benchmark_iob_dataset_augmented.json
Original dataset size: 1440
Synthetic dataset size: 113925
Total dataset size: 115365


In [6]:
import json
import random

# ===============================
# File paths (update if needed)
# ===============================
original_file = "benchmark_iob_dataset.json"  # Your original dataset
synthetic_file = "benchmark_iob_dataset_augmented.json"     # Your generated synthetic dataset
merged_file = "final_merged_iob_dataset.json"       # Output merged dataset

# ===============================
# Load datasets
# ===============================
with open(original_file, "r", encoding="utf-8") as f:
    original_data = json.load(f)

with open(synthetic_file, "r", encoding="utf-8") as f:
    synthetic_data = json.load(f)

print(f"Original dataset size: {len(original_data)}")
print(f"Synthetic dataset size: {len(synthetic_data)}")

# ===============================
# Merge and shuffle
# ===============================
merged_data = original_data + synthetic_data
random.shuffle(merged_data)

# ===============================
# Save merged dataset
# ===============================
with open(merged_file, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2, ensure_ascii=False)

print(f"✅ Merged dataset saved to {merged_file}")
print(f"Total samples in merged dataset: {len(merged_data)}")


Original dataset size: 1440
Synthetic dataset size: 115365
✅ Merged dataset saved to final_merged_iob_dataset.json
Total samples in merged dataset: 116805
