In [None]:
!pip uninstall tensorflow tensorflow-gpu keras

In [None]:
# Alternative to the pip cell above: use this when the packages were
# installed via conda. `--yes` is passed so no confirmation is requested.
# NOTE(review): a bare `conda ...` line presumably relies on IPython
# automagic to run inside a notebook cell -- confirm in your environment.
conda remove tensorflow keras --yes

In [1]:
"""
End-to-end Semantic Skill Gap Analysis

Steps:
1. Generate synthetic dataset: 1000 rows with required_skills, candidate_skills, expected_gap
2. Load dataset from CSV
3. Use sentence-transformer embeddings to infer gaps (predicted_gap)
4. Compare predicted_gap vs expected_gap to compute precision, recall, F1
"""

import json
import os
import random
from typing import List, Dict, Any, Optional

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score


# ============================================================
# 1. Synthetic dataset generation
# ============================================================

# Skill vocabularies used to build the synthetic dataset. Each entry is a
# display-cased skill phrase.

# Programming languages, frameworks, platforms and tooling.
TECHNICAL_SKILLS = [
    "Python", "Java", "C++", "JavaScript", "HTML", "CSS", "SQL", "NoSQL",
    "Machine Learning", "Deep Learning", "Data Analysis", "Data Engineering",
    "Docker", "Kubernetes", "AWS", "Azure", "GCP", "REST API", "GraphQL",
    "Flask", "Django", "TensorFlow", "PyTorch", "Spark", "Hadoop", "Linux",
    "CI/CD", "Git", "DevOps", "MLOps"
]

# Interpersonal / non-technical skills.
SOFT_SKILLS = [
    "Communication", "Leadership", "Teamwork", "Problem Solving",
    "Critical Thinking", "Time Management", "Adaptability", "Creativity",
    "Collaboration", "Presentation Skills"
]

# Business- or field-specific skills.
DOMAIN_SKILLS = [
    "Financial Modeling", "Cybersecurity", "UI/UX Design", "Cloud Architecture",
    "Business Analysis", "Product Management", "HR Analytics",
    "Marketing Strategy", "Sales Forecasting"
]

# Combined pool (technical, then soft, then domain) that rows are sampled from.
ALL_SKILLS = TECHNICAL_SKILLS + SOFT_SKILLS + DOMAIN_SKILLS


def generate_row(skill_pool: Optional[List[str]] = None) -> Dict[str, str]:
    """
    Generate a single synthetic row.

    Args:
        skill_pool: Skills to sample from. Defaults to ALL_SKILLS when None.
            Must contain at least 10 entries, because up to 10 required
            skills are drawn without replacement.

    Returns:
        A dict of three comma-separated strings:
        - required_skills: 5-10 unique skills drawn from the pool
        - candidate_skills: a random subset of the required skills (a
          coverage rate between 40% and 90% is drawn, the count is rounded
          down, and at least one skill is kept) plus 0-4 random noise
          skills, which may themselves overlap with the required skills
        - expected_gap: required skills absent from candidate_skills, in
          required-skill order

    Raises:
        ValueError: from random.sample if the pool is smaller than the
            number of skills requested.
    """
    pool = ALL_SKILLS if skill_pool is None else skill_pool

    required = random.sample(pool, random.randint(5, 10))

    # Candidate skills: subset of required + noise
    coverage_rate = random.uniform(0.4, 0.9)  # share of required skills covered
    num_cover = max(1, int(len(required) * coverage_rate))
    covered = random.sample(required, num_cover)

    # Noise is drawn from the whole pool, so it can repeat a covered skill.
    noise = random.sample(pool, random.randint(0, 4))

    # Order-preserving dedup. A plain set() would order the strings by their
    # per-process randomized hash, so the output would differ between runs
    # even with a fixed random seed.
    candidate = list(dict.fromkeys(covered + noise))

    # Expected gap = required - candidate (set used for O(1) membership).
    candidate_lookup = set(candidate)
    expected_gap = [skill for skill in required if skill not in candidate_lookup]

    return {
        "required_skills": ", ".join(required),
        "candidate_skills": ", ".join(candidate),
        "expected_gap": ", ".join(expected_gap),
    }


def generate_dataset_csv(
    num_rows: int = 1000,
    path: str = "skill_gap_dataset_1000.csv"
) -> str:
    """
    Build a synthetic skill-gap dataset and write it to a CSV file.

    Each of the `num_rows` rows comes from one call to generate_row(). The
    file is written to `path` without the DataFrame index column. The path is
    returned so the call can be chained with a loader.
    """
    records = []
    for _ in range(num_rows):
        records.append(generate_row())

    frame = pd.DataFrame(records)
    frame.to_csv(path, index=False)

    print(f"Generated dataset with {num_rows} rows at: {path}")
    return path


# ============================================================
# 2. Utility functions for parsing and embeddings
# ============================================================

def to_list(skill_str: str) -> List[str]:
    """
    Split a comma-separated skills string into a list of trimmed skill names.

    Empty or whitespace-only fragments are dropped. Any non-string input
    (for example a NaN read from an empty CSV cell) yields an empty list.
    """
    if not isinstance(skill_str, str):
        return []

    trimmed = (piece.strip() for piece in skill_str.split(","))
    return [piece for piece in trimmed if piece]


def normalize_skill_text(skill: str) -> str:
    """
    Return the skill with surrounding whitespace removed and in lowercase.

    This is the single hook for text normalization before embedding; extend
    it here if more cleanup is needed.
    """
    trimmed = skill.strip()
    return trimmed.lower()


def compute_embeddings(
    model: SentenceTransformer,
    skills: List[str]
) -> np.ndarray:
    """
    Encode skill phrases into float32 embeddings, one row per skill.

    Skills are passed through normalize_skill_text() and encoded with
    `normalize_embeddings=True`. An empty skill list yields an array with
    zero rows and as many columns as the model's embedding dimension.
    """
    if skills:
        cleaned = list(map(normalize_skill_text, skills))
        vectors = model.encode(cleaned, normalize_embeddings=True)
        return np.array(vectors, dtype=np.float32)

    # Nothing to encode: keep the column count consistent with the model.
    dim = model.get_sentence_embedding_dimension()
    return np.zeros((0, dim), dtype=np.float32)


def compute_gap_for_row(
    required: List[str],
    candidate: List[str],
    model: SentenceTransformer,
    partial_threshold: float = 0.55
) -> List[str]:
    """
    Predict which required skills a candidate is missing.

    Both skill lists are embedded with compute_embeddings(). A required skill
    counts as a gap when its best similarity against every candidate skill is
    strictly below `partial_threshold`.

    Returns the gap skills in the order they appear in `required`. With no
    required skills the result is empty; with no candidate skills every
    required skill is returned.
    """
    if not required:
        return []

    req_emb = compute_embeddings(model, required)
    cand_emb = compute_embeddings(model, candidate)

    if cand_emb.shape[0] == 0:
        # No candidate skills: all required are gaps
        return list(required)

    # Dot product of the embeddings, used as cosine similarity because
    # compute_embeddings requests normalized vectors. Shape: [num_req, num_cand].
    similarity = np.matmul(req_emb, cand_emb.T)

    # Best candidate match per required skill, as plain Python floats.
    best_scores = similarity.max(axis=1).tolist()

    return [
        skill
        for skill, score in zip(required, best_scores)
        if score < partial_threshold
    ]


# ============================================================
# 3. Evaluation: precision, recall, F1
# ============================================================

def evaluate_gap_detection(df: pd.DataFrame) -> Dict[str, float]:
    """
    Score predicted gaps against expected gaps over the whole DataFrame.

    Every (row, required skill) pair is one binary sample:
    - label is 1 when the skill is in the row's `expected_gap_list`
    - prediction is 1 when the skill is in the row's `predicted_gap`

    Reads the columns `required_list`, `expected_gap_list` and
    `predicted_gap`. Returns a dict with the global "precision", "recall"
    and "f1" (each computed with zero_division=0).
    """
    labels = []
    predictions = []

    for _, record in df.iterrows():
        expected = set(record["expected_gap_list"])
        predicted = set(record["predicted_gap"])

        for skill in record["required_list"]:
            labels.append(int(skill in expected))
            predictions.append(int(skill in predicted))

    return {
        "precision": precision_score(labels, predictions, zero_division=0),
        "recall": recall_score(labels, predictions, zero_division=0),
        "f1": f1_score(labels, predictions, zero_division=0),
    }


# ============================================================
# 4. Main script
# ============================================================

def main():
    """
    Run the full skill-gap pipeline end to end.

    Steps:
    1. Seed `random` and generate the synthetic CSV if it does not exist yet
       (an existing file is reused as-is).
    2. Load the CSV and parse the comma-separated columns into lists.
    3. Load the sentence-transformer model once and predict gaps per row.
    4. Print global precision / recall / F1 and up to five sample rows.
    5. Save the DataFrame, including predictions, to a second CSV.
    """
    random.seed(42)

    csv_path = "skill_gap_dataset_1000.csv"

    # Step 1: generate dataset if not present
    if not os.path.exists(csv_path):
        generate_dataset_csv(num_rows=1000, path=csv_path)
    else:
        print(f"Found existing dataset: {csv_path}")

    # Step 2: load dataset
    df = pd.read_csv(csv_path)
    print("Loaded dataset with shape:", df.shape)

    # Step 3: parse lists from comma-separated strings
    df["required_list"] = df["required_skills"].apply(to_list)
    df["candidate_list"] = df["candidate_skills"].apply(to_list)
    df["expected_gap_list"] = df["expected_gap"].apply(to_list)

    # Step 4: load embedding model once (reused for every row)
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    print(f"Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)

    # Step 5: compute predicted gap for each row
    partial_threshold = 0.55  # you can tune this
    print(f"Computing predicted gaps with partial_threshold={partial_threshold}...")
    df["predicted_gap"] = df.apply(
        lambda row: compute_gap_for_row(
            row["required_list"],
            row["candidate_list"],
            model=model,
            partial_threshold=partial_threshold
        ),
        axis=1
    )

    # Step 6: evaluate precision, recall, F1
    metrics = evaluate_gap_detection(df)

    print("\n=== Global Skill Gap Detection Metrics ===")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall   : {metrics['recall']:.4f}")
    print(f"F1-score : {metrics['f1']:.4f}")

    # Step 7: show a few example rows. Bounded by the row count so a
    # pre-existing CSV with fewer than 5 rows does not raise IndexError.
    print("\n=== Sample Rows (first 5) ===")
    for i in range(min(5, len(df))):
        row = df.iloc[i]
        print(f"\nRow {i+1}:")
        print("Required skills : ", row['required_list'])
        print("Candidate skills: ", row['candidate_list'])
        print("Expected gap    : ", row['expected_gap_list'])
        print("Predicted gap   : ", row['predicted_gap'])

    # Optional: save predictions to a new CSV
    out_path = "skill_gap_dataset_1000_with_predictions.csv"
    df.to_csv(out_path, index=False)
    print(f"\nSaved dataset with predictions to: {out_path}")


# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


Found existing dataset: skill_gap_dataset_1000.csv
Loaded dataset with shape: (1000, 3)
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing predicted gaps with partial_threshold=0.55...

=== Global Skill Gap Detection Metrics ===
Precision: 1.0000
Recall   : 0.9658
F1-score : 0.9826

=== Sample Rows (first 5) ===

Row 1:
Required skills :  ['Linux', 'Adaptability', 'Machine Learning', 'Communication', 'Cloud Architecture']
Candidate skills:  ['Critical Thinking', 'Django', 'Communication', 'Machine Learning']
Expected gap    :  ['Linux', 'Adaptability', 'Cloud Architecture']
Predicted gap   :  ['Linux', 'Adaptability', 'Cloud Architecture']

Row 2:
Required skills :  ['MLOps', 'CI/CD', 'Sales Forecasting', 'Kubernetes', 'HTML']
Candidate skills:  ['Sales Forecasting', 'Kubernetes', 'MLOps', 'CI/CD', 'HTML']
Expected gap    :  []
Predicted gap   :  []

Row 3:
Required skills :  ['C++', 'Data Engineering', 'Creativity', 'TensorFlow', 'Kubernetes']
Candidate skills:  ['Flask', 'TensorFlow', 'C++', 'Data Engineering', 'Spark']
Expected gap    :  ['Creativity', 'Kubernetes']
Predicted gap   :  ['Creativity', 'Kubernet