In [1]:
import json
from collections import defaultdict, Counter
from pathlib import Path

def _record_skills(skills, role_tokens, company_tokens, company_counts, role_counts):
    """Count every skill in `skills` under the role/company collected so far, in both indexes."""
    role = " ".join(role_tokens).strip()
    company = " ".join(company_tokens).strip()
    # A skill can only be filed once both a role and a company are known.
    if role and company:
        for skill in skills:
            company_counts[company][role][skill] += 1
            role_counts[role][company][skill] += 1


def _summarise_counts(nested_counts):
    """Turn outer → inner → Counter into outer → inner → {"skills": sorted names, "freq": counts}."""
    return {
        outer: {
            inner: {"skills": sorted(counter), "freq": dict(counter)}
            for inner, counter in inner_map.items()
        }
        for outer, inner_map in nested_counts.items()
    }


def build_feedback_dictionaries(ner_dataset_path: str, 
                                output_company_dict: str, 
                                output_role_dict: str):
    """
    Converts token/label NER dataset into two dictionaries:
    1. Company → Role → {skills: [...], freq: {...}}
    2. Role → Company → {skills: [...], freq: {...}}

    Both files hold the same counts, keyed in opposite order. ``skills`` is
    the sorted list of distinct skill strings and ``freq`` maps each skill to
    the number of times it was recorded.

    How one example is read:
    * Each example must provide ``"tokens"`` and ``"labels"``; the two are
      walked in parallel with ``zip`` (extra items on the longer side are
      ignored).
    * Labels are matched by suffix (``ROLE`` / ``COMPANY`` / ``SKILL``), so
      prefixed tags such as ``B-SKILL`` / ``I-SKILL`` work.
    * ROLE tokens and COMPANY tokens are each accumulated over the whole
      example and joined with single spaces.
    * A skill span ends at any non-SKILL token or when a new ``B-`` SKILL
      tag starts, so neighbouring skills are kept separate.
    * Finished skills are recorded when a token that is none of
      ROLE/COMPANY/SKILL is reached, or at the end of the example, using the
      role and company text collected up to that point. If either is still
      empty, those skills are dropped.

    Args:
        ner_dataset_path: path to your NER dataset (json or jsonl)
        output_company_dict: path to save company-role dictionary
        output_role_dict: path to save role-company dictionary

    Raises:
        FileNotFoundError: if ``ner_dataset_path`` does not exist.
        json.JSONDecodeError: if the dataset is not valid JSON / JSON Lines.
        KeyError: if an example lacks ``"tokens"`` or ``"labels"``.
    """

    dataset_path = Path(ner_dataset_path)
    if not dataset_path.exists():
        raise FileNotFoundError(f"❌ Dataset not found at {dataset_path}")

    # Load dataset
    with open(dataset_path, "r", encoding="utf-8") as f:
        if dataset_path.suffix == ".jsonl":
            # Blank lines (e.g. a trailing empty line) are not records; skip
            # them instead of letting json.loads("") raise.
            data = [json.loads(line) for line in f if line.strip()]
        else:
            data = json.load(f)

    # Counters: company→role→Counter(skills), role→company→Counter(skills)
    company_counts = defaultdict(lambda: defaultdict(Counter))
    role_counts = defaultdict(lambda: defaultdict(Counter))

    for example in data:
        tokens = example["tokens"]
        labels = example["labels"]

        role_tokens, company_tokens = [], []
        skill_tokens = []     # tokens of the skill span currently being read
        pending_skills = []   # finished skill spans waiting to be recorded

        for tok, lab in zip(tokens, labels):
            is_skill = lab.endswith("SKILL")

            # Close the open span at a real boundary: any non-skill token, or
            # a "B-" tag announcing that a *new* skill starts here.
            if skill_tokens and (not is_skill or lab.startswith("B-")):
                skill = " ".join(skill_tokens).strip()
                if skill:
                    pending_skills.append(skill)
                skill_tokens = []

            if lab.endswith("ROLE"):
                role_tokens.append(tok)
            elif lab.endswith("COMPANY"):
                company_tokens.append(tok)
            elif is_skill:
                skill_tokens.append(tok)
            elif pending_skills:
                # Non-entity token: file everything gathered since the last
                # flush under the role/company seen so far.
                _record_skills(pending_skills, role_tokens, company_tokens,
                               company_counts, role_counts)
                pending_skills = []

        # Handle a skill span that runs to the end of the example
        if skill_tokens:
            skill = " ".join(skill_tokens).strip()
            if skill:
                pending_skills.append(skill)
        if pending_skills:
            _record_skills(pending_skills, role_tokens, company_tokens,
                           company_counts, role_counts)

    # Convert Counter → dict {skills: [...], freq: {...}}
    company_dict = _summarise_counts(company_counts)
    role_dict = _summarise_counts(role_counts)

    # Save to JSON
    with open(output_company_dict, "w", encoding="utf-8") as f:
        json.dump(company_dict, f, indent=2, ensure_ascii=False)
    with open(output_role_dict, "w", encoding="utf-8") as f:
        json.dump(role_dict, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved Company→Role→Skills+Freq dictionary to {output_company_dict}")
    print(f"✅ Saved Role→Company→Skills+Freq dictionary to {output_role_dict}")


# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    # Keyword arguments for the run, gathered in one place so the input and
    # output locations are easy to spot and edit.
    run_config = {
        # your 100k dataset
        "ner_dataset_path": "C:\\Users\\WIN11\\OneDrive\\Desktop\\resume proj\\projwithml\\data\\final_merged_iob_dataset.json",
        # Relative paths: both files are written to the current working directory.
        "output_company_dict": "company_role_skills.json",
        "output_role_dict": "role_company_skills.json",
    }
    build_feedback_dictionaries(**run_config)


✅ Saved Company→Role→Skills+Freq dictionary to company_role_skills.json
✅ Saved Role→Company→Skills+Freq dictionary to role_company_skills.json


In [2]:
import json
import random

# Fixed seed so the simulated candidate skill sets -- and therefore the
# generated dataset -- are identical on every Restart & Run All.
SEED = 42
rng = random.Random(SEED)  # private generator; leaves the global `random` state untouched

# Load your company-role-skill dictionary.
# Read explicitly as UTF-8: the file may contain non-ASCII company/role/skill
# text, and the platform default encoding (e.g. cp1252 on Windows) would
# misread it or raise UnicodeDecodeError.
with open("company_role_skills.json", "r", encoding="utf-8") as f:
    company_role_dict = json.load(f)

train_data = []

for company, roles in company_role_dict.items():
    for role, details in roles.items():
        required_skills = details["skills"]

        # Nothing to sample from -> no training example for this pair
        if not required_skills:
            continue

        # Simulate candidate's skills (random non-empty subset of the required ones)
        user_skills = rng.sample(required_skills,
                                 k=rng.randint(1, len(required_skills)))

        # Prepare input string (lists are rendered with their Python repr)
        input_text = (
            f"role: {role} | company: {company} | "
            f"user_skills: {user_skills} | required_skills: {required_skills}"
        )

        # Prepare output (feedback)
        # Set lookup keeps the "missing" scan linear in the number of skills.
        user_skill_set = set(user_skills)
        missing = [s for s in required_skills if s not in user_skill_set]
        # Every sampled skill comes from required_skills, so all of them match.
        matched = list(user_skills)

        if missing:
            output_text = (
                f"You already have {matched}, which is useful for {role} at {company}. "
                f"To improve your chances, focus on learning {missing}."
            )
        else:
            output_text = (
                f"Excellent! You already have all required skills for {role} at {company}. "
                f"You are a strong fit."
            )

        # Save training example
        train_data.append({"input": input_text, "target": output_text})

# Save dataset for T5
with open("t5_training_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=2)

print(f"Generated {len(train_data)} training samples")


Generated 768 training samples
