# In [3]:
import random
import pandas as pd
import json

# ------------------------------
# Master Skill Vocabulary
# ------------------------------

# Technical vocabulary, laid out one topical group per row.
technical_skills = [
    "Python", "Java", "C++", "JavaScript", "HTML", "CSS",
    "SQL", "NoSQL",
    "Machine Learning", "Deep Learning",
    "Data Analysis", "Data Engineering",
    "Docker", "Kubernetes",
    "AWS", "Azure", "GCP",
    "REST API", "GraphQL",
    "Flask", "Django",
    "TensorFlow", "PyTorch",
    "Spark", "Hadoop",
    "Linux", "CI/CD", "Git", "DevOps", "MLOps",
]

# Interpersonal / non-technical vocabulary.
soft_skills = [
    "Communication",
    "Leadership",
    "Teamwork",
    "Problem Solving",
    "Critical Thinking",
    "Time Management",
    "Adaptability",
    "Creativity",
    "Collaboration",
    "Presentation Skills",
]

# Business-domain vocabulary.
domain_skills = [
    "Financial Modeling",
    "Cybersecurity",
    "UI/UX Design",
    "Cloud Architecture",
    "Business Analysis",
    "Product Management",
    "HR Analytics",
    "Marketing Strategy",
    "Sales Forecasting",
]

# Master list: technical first, then soft, then domain skills.
all_skills = [*technical_skills, *soft_skills, *domain_skills]


# ------------------------------
# Row Generator
# ------------------------------

def generate_row(skills=None, rng=None):
    """Build one synthetic skill-gap record.

    A record consists of a list of required skills, a candidate skill list
    that covers only part of them (plus some random extras), and the list of
    required skills the candidate is missing.

    Args:
        skills: Pool of skill names to draw from. Defaults to the module-level
            ``all_skills``. Up to 10 distinct skills are drawn, so the pool
            should contain at least 10 entries.
        rng: Object providing ``sample``, ``randint`` and ``uniform`` (for
            example a ``random.Random`` instance). Defaults to the ``random``
            module's shared generator. Pass a seeded instance for
            reproducible rows.

    Returns:
        A dict with three list-valued keys:
        ``"required_skills"`` (5–10 distinct skills),
        ``"candidate_skills"`` (distinct skills, in the order they were drawn)
        and ``"expected_gap"`` (required skills absent from the candidate,
        in ``required_skills`` order).

    Raises:
        ValueError: If the pool holds fewer skills than a draw asks for
            (raised by ``sample``).
    """
    pool = all_skills if skills is None else list(skills)
    rng = random if rng is None else rng

    # Randomly pick 5–10 required skills
    required = rng.sample(pool, rng.randint(5, 10))

    # Candidate keeps a random 40–90% of the required skills; int() rounds
    # the count down, so short requirement lists lose proportionally more.
    coverage_rate = rng.uniform(0.4, 0.9)
    num_cover = int(len(required) * coverage_rate)
    candidate = rng.sample(required, num_cover)

    # Add noise skills (0–4 extra skills). Noise is drawn from the whole pool,
    # so it may coincide with required skills and raise the actual coverage.
    candidate += rng.sample(pool, rng.randint(0, 4))

    # Remove duplicates while keeping first-occurrence order. A plain set()
    # would make the order depend on per-process string hashing, so output
    # would differ between runs even with a seeded generator.
    candidate = list(dict.fromkeys(candidate))

    # Compute expected gap (set lookup instead of scanning the list per skill)
    held = set(candidate)
    expected_gap = [skill for skill in required if skill not in held]

    return {
        "required_skills": required,
        "candidate_skills": candidate,
        "expected_gap": expected_gap
    }


# ------------------------------
# Create 1000 Rows
# ------------------------------

# Generate the synthetic records; each one is a dict of three skill lists.
rows = [generate_row() for _ in range(1000)]

# Flatten every skill list into a single ", "-separated string so that each
# record becomes one CSV row with three text columns.
df = pd.DataFrame({
    "required_skills": [", ".join(r["required_skills"]) for r in rows],
    "candidate_skills": [", ".join(r["candidate_skills"]) for r in rows],
    "expected_gap": [", ".join(r["expected_gap"]) for r in rows]
})

# Save CSV (written once; index=False keeps the row index out of the file)
file_path = "skill_gap_dataset_1000.csv"
df.to_csv(file_path, index=False)

print("Generated 1000-row dataset!")
print(file_path)


# Output:
# Generated 1000-row dataset!
# skill_gap_dataset_1000.csv
