In [5]:
import re, unicodedata, numpy as np, pandas as pd
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer, util
import torch


def normalize(t: str) -> str:
    """Canonicalise a skill label before it is embedded.

    Steps, in order:
      1. NFKD-decompose, so accented letters split into base letter + combining
         mark and compatibility forms (e.g. full-width letters) map to ASCII.
      2. Drop the combining marks, so an accent in the middle of a word does not
         turn into a space later ("naïve" -> "naive", not "nai ve").
      3. Lower-case (after decomposition, so letters produced by step 1 are
         lower-cased as well).
      4. Replace every character outside ``a-z 0-9 . + #`` and whitespace with a
         space. ``.``, ``+`` and ``#`` are kept so that names such as
         "Node.js", "C++" and "C#" stay distinguishable.
      5. Collapse runs of whitespace and trim both ends.

    Args:
        t: Raw skill text.

    Returns:
        The normalised, lower-case, single-spaced string (may be empty).
    """
    decomposed = unicodedata.normalize("NFKD", t)
    # Remove the accents themselves rather than letting the regex below
    # replace them with spaces and split the word in two.
    t = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    t = re.sub(r"[^a-z0-9.+#\s]", " ", t)
    return " ".join(t.split())


def embed(texts: List[str], model: SentenceTransformer) -> torch.Tensor:
    """Encode ``texts`` with ``model`` and return the embeddings as a tensor.

    Encoding runs with gradient tracking disabled and with the progress bar
    turned off.

    Args:
        texts: Strings to encode.
        model: Model whose ``encode`` method produces the embeddings.

    Returns:
        Whatever ``model.encode(..., convert_to_tensor=True)`` returns
        (presumably one embedding row per input text — confirm against the
        installed sentence-transformers version).
    """
    with torch.no_grad():
        vectors = model.encode(
            texts,
            show_progress_bar=False,
            convert_to_tensor=True,
        )
    return vectors


# Models already loaded in this kernel, keyed by model name, so repeated
# compare_skills() calls do not re-instantiate the same model every time.
# Re-running this cell resets the cache.
_MODEL_CACHE: Dict[str, Any] = {}


def _get_model(model_name: str) -> Any:
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return model


def compare_skills(
    cv: List[str],
    jd: List[str],
    thr: float = 0.6,
    model_name: str = "sentence-transformers/all-mpnet-base-v2",
) -> Dict[str, Any]:
    """Match CV skills against job-description (JD) skills by embedding similarity.

    Every skill is normalised with ``normalize`` and embedded with ``embed``;
    the cosine similarity of each (CV skill, JD skill) pair is then thresholded.
    A JD skill counts as matched when at least one CV skill reaches ``thr``.

    Args:
        cv: Skills found in the CV (row labels of the returned frames).
        jd: Skills required by the job description (column labels).
        thr: Minimum cosine similarity (inclusive) for a pair to count as a match.
        model_name: Name passed to ``SentenceTransformer``; each distinct name is
            loaded once and then reused.

    Returns:
        A dict with:
          * ``similarity_df`` - cosine similarities rounded to 3 decimals,
            indexed by ``cv`` with ``jd`` as columns.
          * ``binary_df`` - same layout, 1 where similarity >= ``thr`` else 0.
          * ``matched_jd`` / ``unmatched_jd`` - JD skills (original spelling, in
            input order) with / without at least one matching CV skill.
          * ``score`` - number of matched JD skills, as a plain ``int``.
          * ``score_pct`` - ``score`` as a percentage of ``len(jd)``, rounded to
            2 decimals; ``0.0`` when ``jd`` is empty.

    If ``cv`` or ``jd`` is empty the model is not loaded at all: the matrices are
    simply empty along that axis and nothing is reported as matched.
    """
    cv, jd = list(cv), list(jd)

    if cv and jd:
        model = _get_model(model_name)
        cv_emb = embed([normalize(s) for s in cv], model)
        jd_emb = embed([normalize(s) for s in jd], model)
        sim = util.cos_sim(cv_emb, jd_emb).cpu().numpy()
    else:
        # Nothing to compare: skip the encoder (whose handling of an empty
        # batch may differ between library versions) and use an empty matrix
        # of the right shape.
        sim = np.zeros((len(cv), len(jd)), dtype=np.float32)

    bin_ = (sim >= thr).astype(int)

    # DataFrames labelled with the original skill strings (rows = CV, columns = JD).
    df_sim = pd.DataFrame(sim, index=cv, columns=jd).round(3)
    df_bin = pd.DataFrame(bin_, index=cv, columns=jd)

    # A JD skill (column) is matched when at least one CV skill (row) has a 1.
    matched_mask = bin_.sum(axis=0) > 0
    # Build the lists from ``jd`` itself so the entries stay plain ``str``
    # rather than NumPy string scalars.
    matched_jd = [skill for skill, hit in zip(jd, matched_mask) if hit]
    unmatched_jd = [skill for skill, hit in zip(jd, matched_mask) if not hit]

    # Plain int/float so the result can be serialised (e.g. to JSON) directly.
    score = int(matched_mask.sum())
    score_pct = round(score / len(jd) * 100, 2) if jd else 0.0

    return {
        "similarity_df": df_sim,
        "binary_df": df_bin,
        "matched_jd": matched_jd,
        "unmatched_jd": unmatched_jd,
        "score": score,
        "score_pct": score_pct,
    }

In [None]:
if __name__ == "__main__":
    # Demo: score one candidate's skill list against one job description.
    cv_skills = [
        "Python",
        "SQL",
        "Docker",
        "TensorFlow",
        "Data Analysis",
        "Kubernetes",
        "React",
        "Node.js",
        "AWS",
        "Git",
    ]
    jd_skills = [
        "Python Programming",
        "Machine Learning",
        "Docker",
        "K8s",
        "Data Analytics",
        "ReactJS",
        "Amazon Web Services",
        "Git Version Control",
        "SQL",
        "Deep Learning",
    ]

    print("\n======= SINGLE DATASET =======")
    res = compare_skills(cv_skills, jd_skills, thr=0.6)

    # Print both matrices, each under its own heading.
    for heading, key in (
        ("\nCosine Similarity Matrix:", "similarity_df"),
        ("\nBinary Match Matrix (1 = match):", "binary_df"),
    ):
        print(heading)
        print(res[key])

    # An empty list joins to "", which falls back to the "(none)" placeholder.
    matched_text = ", ".join(res["matched_jd"]) or "(none)"
    unmatched_text = ", ".join(res["unmatched_jd"]) or "(none)"
    print("\n✓ Matched JD skills   :", matched_text)
    print("✗ Unmatched JD skills :", unmatched_text)

    summary = (
        f"\nSkill-match score: {res['score']} / {len(jd_skills)}  "
        f"({res['score_pct']}%)"
    )
    print(summary)




Cosine Similarity Matrix:
               Python Programming  Machine Learning  Docker    K8s  \
Python                      0.788             0.385   0.181  0.117   
SQL                         0.295             0.316   0.078  0.148   
Docker                      0.135             0.148   1.000  0.133   
TensorFlow                  0.299             0.409   0.262  0.131   
Data Analysis               0.359             0.485   0.094  0.105   
Kubernetes                  0.074             0.134   0.582  0.264   
React                       0.148             0.233   0.274  0.196   
Node.js                     0.232             0.146   0.236  0.049   
AWS                         0.198             0.196   0.337  0.283   
Git                         0.220             0.218   0.298  0.159   

               Data Analytics  ReactJS  Amazon Web Services  \
Python                  0.430    0.069                0.197   
SQL                     0.504    0.032                0.197   
Docker      