In [1]:
import json
from collections import defaultdict
from pathlib import Path

def build_feedback_dictionaries(ner_dataset_path: str, 
                                output_company_dict: str, 
                                output_role_dict: str):
    """
    Converts token/label NER dataset into two dictionaries:
    1. Company → Role → [Skills]
    2. Role → Company → [Skills]
    
    Args:
        ner_dataset_path: path to your NER dataset (json or jsonl)
        output_company_dict: path to save company-role dictionary
        output_role_dict: path to save role-company dictionary
    """
    
    # --- Step 1: Load dataset ---
    dataset_path = Path(ner_dataset_path)
    if not dataset_path.exists():
        raise FileNotFoundError(f"❌ Dataset not found at {dataset_path}")
    
    # Support jsonl (line by line) or json (array)
    data = []
    with open(dataset_path, "r", encoding="utf-8") as f:
        if dataset_path.suffix == ".jsonl":
            for line in f:
                data.append(json.loads(line.strip()))
        else:
            data = json.load(f)
    
    # --- Step 2: Prepare dicts ---
    company_dict = defaultdict(lambda: defaultdict(set))  # company → role → skills
    role_dict = defaultdict(lambda: defaultdict(set))     # role → company → skills
    
    # --- Step 3: Process dataset ---
    for example in data:
        tokens = example["tokens"]
        labels = example["labels"]
        
        current_role, current_company, current_skills = [], [], []
        
        for tok, lab in zip(tokens, labels):
            if lab.endswith("ROLE"):
                current_role.append(tok)
            elif lab.endswith("COMPANY"):
                current_company.append(tok)
            elif lab.endswith("SKILL"):
                current_skills.append(tok)
        
        # Join multi-token entities
        role = " ".join(current_role).strip()
        company = " ".join(current_company).strip()
        skill = " ".join(current_skills).strip()
        
        # Skip empty entities
        if not role or not company or not skill:
            continue
        
        # --- Step 4: Update dictionaries ---
        company_dict[company][role].add(skill)
        role_dict[role][company].add(skill)
    
    # --- Step 5: Convert sets → lists ---
    company_dict = {
        comp: {role: sorted(list(skills)) for role, skills in roles.items()}
        for comp, roles in company_dict.items()
    }
    role_dict = {
        role: {comp: sorted(list(skills)) for comp, skills in comps.items()}
        for role, comps in role_dict.items()
    }
    
    # --- Step 6: Save to JSON ---
    with open(output_company_dict, "w", encoding="utf-8") as f:
        json.dump(company_dict, f, indent=2, ensure_ascii=False)
    with open(output_role_dict, "w", encoding="utf-8") as f:
        json.dump(role_dict, f, indent=2, ensure_ascii=False)
    
    print(f"✅ Saved Company→Role→Skills dictionary to {output_company_dict}")
    print(f"✅ Saved Role→Company→Skills dictionary to {output_role_dict}")


# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    build_feedback_dictionaries(
        ner_dataset_path="C:\\Users\\WIN11\\OneDrive\\Desktop\\resume proj\\projwithml\\data\\final_merged_iob_dataset.json",          # your 100k dataset file
        output_company_dict="company_role_skills.json",
        output_role_dict="role_company_skills.json"
    )


✅ Saved Company→Role→Skills dictionary to company_role_skills.json
✅ Saved Role→Company→Skills dictionary to role_company_skills.json


In [3]:
import json
import random
from transformers import pipeline
from tqdm import tqdm

# Paths
INPUT_FILE = r"C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/data/t5_training_data.json"
OUTPUT_FILE = r"C:/Users/WIN11/OneDrive/Desktop/resume proj/projwithml/data/t5_training_data_augmented.json"

# 🔹 Load JSON array dataset
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} samples")

# 🔹 Setup paraphrasing pipeline (runs on GPU if available)
paraphraser = pipeline("text2text-generation", model="t5-small", device=0)

# 🔹 Skill synonym dictionary
skill_synonyms = {
    "Python": ["Python programming", "Python development"],
    "SQL": ["SQL databases", "SQL queries"],
    "Machine Learning": ["ML", "machine learning algorithms"],
    "TensorFlow": ["TF", "TensorFlow framework"],
    "JavaScript": ["JS", "JavaScript coding"],
    "React": ["ReactJS", "React framework"],
    "AWS": ["Amazon Web Services", "AWS cloud"],
    "Docker": ["Docker containers", "containerization with Docker"],
    "Kubernetes": ["K8s", "Kubernetes orchestration"],
    "Data Analysis": ["data analytics", "analyzing data"],
}

def vary_skills(input_text: str) -> str:
    """Shuffle and add synonyms to skills inside input_text"""
    if "user_skills:" not in input_text:
        return input_text

    try:
        before, skills_part = input_text.split("user_skills:", 1)
        skills_str, after = skills_part.split("| required_skills:", 1)
        skills = eval(skills_str.strip())  # convert string list to Python list
    except:
        return input_text

    # Shuffle
    random.shuffle(skills)

    # Replace some with synonyms
    new_skills = []
    for s in skills:
        if s in skill_synonyms and random.random() > 0.5:
            new_skills.append(random.choice(skill_synonyms[s]))
        else:
            new_skills.append(s)

    return before + "user_skills: " + str(new_skills) + " | required_skills:" + after

# ------------------------
# 🔹 Create augmentation tasks
# ------------------------
augmented = []

# Collect texts to paraphrase (all targets)
to_paraphrase = [s["target"] for s in data]

# Run paraphrasing in batches
print("⚡ Paraphrasing targets in batches...")
batch_size = 16
paraphrased_targets = []
for i in tqdm(range(0, len(to_paraphrase), batch_size)):
    batch = ["paraphrase: " + t for t in to_paraphrase[i:i+batch_size]]
    results = paraphraser(batch, max_length=128, num_beams=4, do_sample=True, temperature=0.7)
    paraphrased_targets.extend([r["generated_text"] for r in results])

# ------------------------
# 🔹 Build augmented dataset
# ------------------------
for i, sample in enumerate(data):
    inp, tgt = sample["input"], sample["target"]
    para_tgt = paraphrased_targets[i]

    # 1. Original
    augmented.append({"input": inp, "target": tgt})

    # 2. Paraphrased feedback
    augmented.append({"input": inp, "target": para_tgt})

    # 3. Skill-variation
    varied_inp = vary_skills(inp)
    if varied_inp != inp:
        augmented.append({"input": varied_inp, "target": tgt})

    # 4. Skill-variation + paraphrased feedback
    varied_inp2 = vary_skills(inp)
    if varied_inp2 != inp:
        augmented.append({"input": varied_inp2, "target": para_tgt})

# ------------------------
# 🔹 Save new dataset
# ------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(augmented, f, indent=2)

print(f"✅ Augmented dataset saved: {len(augmented)} samples (original + variations)")


Loaded 768 samples


Device set to use cuda:0


⚡ Paraphrasing targets in batches...


  0%|          | 0/48 [00:00<?, ?it/s]Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/

✅ Augmented dataset saved: 2783 samples (original + variations)





In [5]:
import json
import random

# Load skill dataset
with open("C:\\Users\\WIN11\\OneDrive\\Desktop\\resume proj\\projwithml\\data\\skill_requirement_dataset.json", "r") as f:
    skill_dataset = json.load(f)

missing_templates = [
    "You already know {skills}. To qualify for the {role} role at {company}, you should also focus on {missing}. Your current match is {match}%.",
    "Strong base with {skills}. Still missing {missing}, which are important for this role. You meet around {match}% of the requirements.",
    "Good start! You have {skills}, but need to add {missing} for the {role} role at {company}. Current match: {match}%."
]

perfect_templates = [
    "Excellent! Your skills {skills} fully match the requirements for {role} at {company}. 🎉 This is a perfect match (100%).",
    "Congratulations, you already have all the required skills for {role} at {company}. This is a 100% match!",
    "Great job — {skills} cover everything needed for the {role} role at {company}. ✅ You are a perfect fit (100%)."
]

def generate_feedback(company, role, required_skills, candidate_skills):
    missing = [s for s in required_skills if s not in candidate_skills]
    match_percent = int((len(candidate_skills) / len(required_skills)) * 100)

    input_text = f"Company: {company} | Role: {role} | Candidate Skills: {', '.join(candidate_skills)}"

    if missing:
        template = random.choice(missing_templates)
        output_text = template.format(
            skills=", ".join(candidate_skills),
            role=role,
            company=company,
            missing=", ".join(missing),
            match=match_percent
        )
    else:
        template = random.choice(perfect_templates)
        output_text = template.format(
            skills=", ".join(candidate_skills),
            role=role,
            company=company
        )

    return {"input": input_text, "output": output_text}

examples = []

# Generate ~8 examples per role (~5k total)
for company, roles in skill_dataset.items():
    for role, required_skills in roles.items():
        if not required_skills or required_skills == ["TBD"]:
            continue
        for _ in range(8):
            k = random.randint(3, len(required_skills))
            candidate_skills = random.sample(required_skills, k=k)
            examples.append(generate_feedback(company, role, required_skills, candidate_skills))

# Shuffle & split
random.shuffle(examples)
split = int(0.9 * len(examples))
train_examples = examples[:split]
valid_examples = examples[split:]

# Save
with open("train.jsonl", "w") as f:
    for ex in train_examples:
        f.write(json.dumps(ex) + "\n")

with open("valid.jsonl", "w") as f:
    for ex in valid_examples:
        f.write(json.dumps(ex) + "\n")

print(f"✅ Generated {len(train_examples)} training and {len(valid_examples)} validation examples with varied feedback (including permatch).")


✅ Generated 1461 training and 163 validation examples with varied feedback (including permatch).
