In [None]:
!pip uninstall tensorflow tensorflow-gpu keras

In [None]:
# If installed via conda
# NOTE(review): a bare `conda ...` line presumably relies on IPython automagic
# resolving it to the `%conda` magic — confirm it runs in your kernel.
conda remove tensorflow keras --yes

In [5]:
"""
End-to-end Semantic Skill Gap Analysis

Steps:
1. Generate synthetic dataset: 1000 rows with required_skills, candidate_skills, expected_gap
2. Load dataset from CSV
3. Use sentence-transformer embeddings to infer gaps (predicted_gap)
4. Compare predicted_gap vs expected_gap to compute precision, recall, F1
"""

import os
import random
import json
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score



# ============================================================
# 2. Utility functions for parsing and embeddings
# ============================================================

def to_list(skill_str: str) -> List[str]:
    """
    Split a comma-separated skills string into a list of skill names.

    Whitespace around each entry is removed and empty entries are dropped.
    Any non-string input (for example None or a float NaN) yields an empty
    list.
    """
    if isinstance(skill_str, str):
        pieces = (piece.strip() for piece in skill_str.split(","))
        return [piece for piece in pieces if piece]
    return []


def normalize_skill_text(skill: str) -> str:
    """
    Return the canonical form of a skill name: trimmed, then lowercased.
    """
    trimmed = skill.strip()
    return trimmed.lower()


def compute_embeddings(
    model: SentenceTransformer,
    skills: List[str]
) -> np.ndarray:
    """
    Encode skill phrases into float32 embedding vectors.

    Every skill is passed through normalize_skill_text before encoding, and
    the model is asked for normalized embeddings. An empty input yields an
    array of shape (0, dim), where dim is the model's sentence embedding
    dimension.
    """
    if skills:
        cleaned = list(map(normalize_skill_text, skills))
        vectors = model.encode(cleaned, normalize_embeddings=True)
        return np.array(vectors, dtype=np.float32)

    # Nothing to encode: keep the column count consistent with the model.
    dim = model.get_sentence_embedding_dimension()
    return np.zeros((0, dim), dtype=np.float32)


def compute_gap_for_row(
    required: List[str],
    candidate: List[str],
    model: SentenceTransformer,
    partial_threshold: float = 0.55
) -> List[str]:
    """
    Determine which required skills a candidate is missing.

    Both skill lists are embedded with the given model. A required skill is
    reported as a gap when its best cosine similarity against all candidate
    skills is strictly below partial_threshold.

    Args:
        required: required skills for the row.
        candidate: skills the candidate has.
        model: sentence embedding model used to encode both lists.
        partial_threshold: minimum best-match similarity for a required
            skill to count as covered.

    Returns:
        The required skills considered gaps, in their original order.
    """
    if not required:
        return []

    required_vecs = compute_embeddings(model, required)
    candidate_vecs = compute_embeddings(model, candidate)

    # With no candidate skills nothing can match, so every requirement is a gap.
    if candidate_vecs.shape[0] == 0:
        return list(required)

    # Embeddings are requested normalized, so the dot product is the cosine
    # similarity. Resulting shape: [num_required, num_candidate].
    similarity = required_vecs @ candidate_vecs.T
    best_match = similarity.max(axis=1)

    return [
        skill
        for skill, score in zip(required, best_match)
        if float(score) < partial_threshold
    ]


# ============================================================
# 3. Evaluation: precision, recall, F1
# ============================================================

def evaluate_gap_detection(df: pd.DataFrame) -> Dict[str, float]:
    """
    Score predicted gaps against expected gaps over the whole dataframe.

    Every (row, required skill) pair is one binary sample:
    - the label is 1 when the skill appears in the row's expected_gap_list
    - the prediction is 1 when the skill appears in the row's predicted_gap

    Args:
        df: dataframe with "required_list", "expected_gap_list" and
            "predicted_gap" columns holding lists of skills.

    Returns:
        Dict with the global "precision", "recall" and "f1" scores.
    """
    labels = []
    predictions = []

    for _, record in df.iterrows():
        skills = record["required_list"]
        # Sets give constant-time membership checks per skill.
        expected = set(record["expected_gap_list"])
        predicted = set(record["predicted_gap"])

        labels.extend(int(skill in expected) for skill in skills)
        predictions.extend(int(skill in predicted) for skill in skills)

    return {
        "precision": precision_score(labels, predictions, zero_division=0),
        "recall": recall_score(labels, predictions, zero_division=0),
        "f1": f1_score(labels, predictions, zero_division=0),
    }


# ============================================================
# 4. Main script
# ============================================================

def main():
    """
    Run the skill gap pipeline end to end.

    Loads the dataset CSV, parses the comma-separated skill columns into
    lists, predicts the gap for every row with sentence embeddings, prints
    global precision/recall/F1 plus a few sample rows, and writes the
    dataframe with predictions to a result CSV.
    """
    random.seed(42)

    csv_path = "clustered_skill_gap_dataset_2000.csv"

    # Step 2: load dataset
    df = pd.read_csv(csv_path)
    print("Loaded dataset with shape:", df.shape)

    # Step 3: parse lists from comma-separated strings
    df["required_list"] = df["required_skills"].apply(to_list)
    df["candidate_list"] = df["candidate_skills"].apply(to_list)
    df["expected_gap_list"] = df["expected_gap"].apply(to_list)

    # Step 4: load embedding model once and reuse it for every row
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    print(f"Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)

    # Step 5: compute predicted gap for each row
    partial_threshold = 0.55  # you can tune this
    print(f"Computing predicted gaps with partial_threshold={partial_threshold}...")
    df["predicted_gap"] = df.apply(
        lambda row: compute_gap_for_row(
            row["required_list"],
            row["candidate_list"],
            model=model,
            partial_threshold=partial_threshold
        ),
        axis=1
    )

    # Step 6: evaluate precision, recall, F1
    metrics = evaluate_gap_detection(df)

    print("\n=== Global Skill Gap Detection Metrics ===")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall   : {metrics['recall']:.4f}")
    print(f"F1-score : {metrics['f1']:.4f}")

    # Step 7: show a few example rows.
    # Clamp to the dataset size so a CSV with fewer than 20 rows does not
    # raise IndexError on df.iloc[i].
    sample_count = min(20, len(df))
    print(f"\n=== Sample Rows (first {sample_count}) ===")
    for i in range(sample_count):
        row = df.iloc[i]
        print(f"\nRow {i+1}:")
        print("Required skills : ", row['required_list'])
        print("Candidate skills: ", row['candidate_list'])
        print("Expected gap    : ", row['expected_gap_list'])
        print("Predicted gap   : ", row['predicted_gap'])

    # Optional: save predictions to a new CSV
    out_path = "clustered_skill_gap_dataset_2000_Result.csv"
    df.to_csv(out_path, index=False)
    print(f"\nSaved dataset with predictions to: {out_path}")


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Found existing dataset: clustered_skill_gap_dataset_2000.csv
Loaded dataset with shape: (2000, 3)
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
Computing predicted gaps with partial_threshold=0.55...

=== Global Skill Gap Detection Metrics ===
Precision: 0.8964
Recall   : 0.9508
F1-score : 0.9228

=== Sample Rows (first 5) ===

Row 1:
Required skills :  ['Go', 'B2B Sales', 'Marketing Analytics', 'SEM', 'Linux', 'Kubernetes']
Candidate skills:  ['Adaptability', 'Diagnosis', 'Digital Marketing', 'Go', 'Kubernetes', 'Linux', 'Marketing Analytics', 'PHP', 'SEM', 'Teamwork']
Expected gap    :  ['B2B Sales']
Predicted gap   :  ['B2B Sales']

Row 2:
Required skills :  ['Patient Examination', 'Creativity', 'B2C Sales', 'Java', 'Deep Learning', 'Arbitration', 'Accounting']
Candidate skills:  ['Accounting', 'Arbitration', 'B2C Sales', 'Communication', 'Creativity', 'Deep Learning', 'HR', 'Java', 'Kotlin', 'Patient Examination']
Expected gap    :  []
Predicted gap   :  []

Row 3