Written by Zetian


	1.	Read five CSV files—job descriptions (system_jd), peer resumes (system_resume), and three keyword-frequency tables—and load the user’s resume text; initialize a SentenceTransformer BERT model.
	2.	Prompt the user for a job category, then match it to the closest JD, resume, and skill categories by computing cosine similarities over BERT embeddings.
	3.	Compute the semantic similarity between the full user resume and the concatenated texts of the matched job descriptions and peer resumes.
	4.	Parse the stored keyword lists, then for each category identify which top keywords are absent from the user’s resume by combining substring checks with an embedding-based similarity threshold.
	5.	Print concise reports showing similarity scores and lists of missing keywords for job descriptions, peer resumes, and skill requirements.

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
# Absolute path of the plain-text resume file that get_user_resume() opens.
# NOTE(review): this is a machine-specific location — adjust it before running
# the notebook on another computer.
user_resume_address = '/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/' \
    'datasets/user_resume/zetian_resume.txt'

In [3]:
# Shared sentence-embedding model; the functions in this file reach it through
# the module-level name `bert` rather than taking it as a parameter.
bert = SentenceTransformer('all-mpnet-base-v2')

In [4]:
# work
def get_cos_similarity(user_text: str, texts: list[str]) -> np.ndarray:
    """
    Score every entry of `texts` against `user_text` by cosine similarity.

    Both sides are embedded with the module-level `bert` model. The result
    holds one score per entry of `texts`, in the same order.
    """
    query_vec = bert.encode(user_text, convert_to_numpy=True)
    candidate_mat = bert.encode(texts, convert_to_numpy=True)

    # cos(a, b) = (a . b) / (|a| * |b|); the small epsilon keeps the division
    # defined when an embedding has zero norm.
    row_norms = np.linalg.norm(candidate_mat, axis=1)
    query_norm = np.linalg.norm(query_vec)
    return (candidate_mat @ query_vec) / (row_norms * query_norm + 1e-8)

#work
def get_most_similar(user_input_category: str, system_category_list: list[str]) -> tuple[str, float]:
    """
    Pick the entry of `system_category_list` closest to `user_input_category`.

    Closeness is the cosine similarity computed by get_cos_similarity.
    Returns a (category, score) pair for the highest-scoring entry.
    """
    scores = get_cos_similarity(user_input_category, system_category_list)
    winner = int(np.argmax(scores))
    best_category = system_category_list[winner]
    best_score = float(scores[winner])
    return best_category, best_score


In [5]:
# work
def get_user_category(jd_categories, resume_categories, skill_categories):
    """
    Ask the user for a job category and map it onto the system's categories.

    Reads one line from stdin, then uses get_most_similar to choose the
    closest entry from each of the three category lists. The matches are
    printed and returned as (jd_match, res_match, skill_match).
    """
    requested = input("Enter job category: ")
    print(f'category that user entered: {requested}')
    print()

    # Only the matched name is used; the similarity score is discarded.
    jd_match, _ = get_most_similar(requested, jd_categories)
    res_match, _ = get_most_similar(requested, resume_categories)
    skill_match, _ = get_most_similar(requested, skill_categories)

    note = "(the most similar category we found for you!)"
    print(f"Matched JD category : {jd_match} {note}")
    print(f"Matched Resume category: {res_match} {note}")
    print(f"Matched Skill category: {skill_match} {note}")
    return jd_match, res_match, skill_match

#work
def get_user_resume() -> str:
    """Return the whole text of the file at `user_resume_address`, decoded as UTF-8."""
    with open(user_resume_address, 'r', encoding='utf-8') as resume_file:
        return resume_file.read()


In [14]:
import ast
import re
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Keyword-string parsing
def parse_keyword_str(s: str) -> list[tuple[str, int]]:
    """
    Parse a stringified keyword list such as "[('python', np.int64(14)), ...]".

    The string is expected to be the repr of a list of (keyword, count)
    tuples. NumPy scalar wrappers around numeric literals (np.int64(3),
    np.int32(3), np.float64(1.5), negative values, ...) are unwrapped to the
    bare number first, because ast.literal_eval only accepts plain Python
    literals.

    Args:
        s: The stringified list.

    Returns:
        The parsed list of (keyword, count) tuples. Counts are normally ints;
        a float-typed NumPy scalar is returned as a float.

    Raises:
        ValueError, SyntaxError: if the cleaned string is not a valid Python
            literal (raised by ast.literal_eval).
    """
    # Unwrap any np.<type>(<number>) to <number>; the original only handled
    # the exact spelling np.int64(<digits>).
    clean = re.sub(
        r'np\.\w+\(\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*\)',
        r'\1',
        s,
    )
    return ast.literal_eval(clean)


def _find_missing_keywords(keywords_df, category_column, category, covers):
    """Return the (keyword, count) pairs of `category` that `covers` rejects, or None if the category has no row."""
    rows = keywords_df.loc[keywords_df[category_column] == category, 'top_keywords']
    if rows.empty:
        return None
    # Parse only the row that is needed and keep the result local, so the
    # caller's DataFrame is left untouched.
    keyword_counts = parse_keyword_str(str(rows.iloc[0]))
    return [(w, c) for w, c in keyword_counts if not covers(w)]


def _print_missing_keywords(title, all_covered_message, category, missing):
    """Print one section of the missing-keyword report."""
    if missing is None:
        print(f"\nNo keyword list found for category '{category}'.")
        return
    print(f"\n{title} ({category}):")
    if not missing:
        print(all_covered_message)
        return
    for w, c in missing:
        print(f"  {w}: {c}")


def analyze_keywords_similarity(jd_category,
                                resume_category,
                                skill_category,
                                user_resume,
                                system_jd_keywords,
                                system_resume_keywords,
                                system_skill_keywords,
                                sim_threshold: float = 0.75):
    """
    Report which top keywords of each matched category the resume lacks.

    A keyword counts as covered when it is a substring of some resume token,
    or when its embedding has cosine similarity >= sim_threshold with at
    least one resume token embedding (both computed with the module-level
    `bert` model).

    Args:
        jd_category: Value looked up in the 'job' column of system_jd_keywords.
        resume_category: Value looked up in the 'Category' column of
            system_resume_keywords.
        skill_category: Value looked up in the 'Category' column of
            system_skill_keywords.
        user_resume: Full text of the user's resume.
        system_jd_keywords, system_resume_keywords, system_skill_keywords:
            DataFrames with a 'top_keywords' column holding stringified
            [(keyword, count), ...] lists. They are not modified.
        sim_threshold: Minimum cosine similarity for a semantic match.

    Returns:
        None. The three reports are printed. A category with no row in its
        table produces a one-line notice instead of raising IndexError.
    """
    # Resume tokens: lowercase alphabetic words of three or more letters.
    # Sorted so the encoding input has a stable order.
    user_tokens = sorted(set(re.findall(r'\b[a-zA-Z]{3,}\b', user_resume.lower())))
    # With no tokens there is nothing to embed or compare against.
    user_embeds = bert.encode(user_tokens, convert_to_tensor=True) if user_tokens else None

    # Keywords repeat across the three lists; remember each verdict so a
    # keyword is embedded at most once.
    verdicts = {}

    def covers(keyword: str) -> bool:
        w = keyword.lower()
        if w in verdicts:
            return verdicts[w]
        # Substring match against any resume token.
        covered = any(w in ut for ut in user_tokens)
        # Semantic match against the token embeddings.
        if not covered and user_embeds is not None:
            kw_embed = bert.encode(w, convert_to_tensor=True)
            cos_scores = util.cos_sim(kw_embed, user_embeds)  # shape [1, N]
            covered = cos_scores.max().item() >= sim_threshold
        verdicts[w] = covered
        return covered

    # Keywords from job descriptions.
    missing_jd = _find_missing_keywords(system_jd_keywords, 'job', jd_category, covers)
    _print_missing_keywords(
        "Missing keywords from job descriptions",
        "  None! Your resume covers all top JD keywords.",
        jd_category,
        missing_jd,
    )

    # Keywords from other people's resumes.
    missing_res = _find_missing_keywords(system_resume_keywords, 'Category', resume_category, covers)
    _print_missing_keywords(
        "Missing keywords from others' resumes",
        "  None! Your resume covers all top resume keywords.",
        resume_category,
        missing_res,
    )

    # Keywords from skill requirements.
    missing_skill = _find_missing_keywords(system_skill_keywords, 'Category', skill_category, covers)
    _print_missing_keywords(
        "Missing keywords from skill requirements",
        "  None! Your resume covers all top skill keywords.",
        skill_category,
        missing_skill,
    )

In [7]:
#work
def _report_embedding_similarity(texts, user_resume_emb, category, plural_noun, label):
    """Print the cosine similarity between the resume embedding and the joined `texts` of one category."""
    if not texts:
        print(f"No {plural_noun} found for category '{category}'.")
        return

    # NOTE(review): all records are joined into one string before encoding.
    # SentenceTransformer models cap input at max_seq_length tokens, so most
    # of a long concatenation is presumably truncated — confirm, and consider
    # encoding records individually and averaging their embeddings.
    combined_emb = bert.encode(" ".join(texts), convert_to_numpy=True)

    # Cosine similarity; the epsilon guards against a zero-norm embedding.
    cos_sim = np.dot(combined_emb, user_resume_emb) / (
        np.linalg.norm(combined_emb) * np.linalg.norm(user_resume_emb) + 1e-8
    )

    print()
    print(f'We found {len(texts)} records of {plural_noun} in the category you entered! ')
    print(f"Cosine similarity between {label} category '{category}' "
          f"and your resume: {cos_sim:.4f}")
    print()


def analyze_embedding_similarity(jd_category,
                                resume_category,
                                user_resume,
                                system_jd,
                                system_resume):
    """
    Print how similar the user's resume is to the matched JDs and peer resumes.

    For each of the two categories, the matching records are joined into one
    text, embedded with the module-level `bert` model, and compared with the
    embedding of `user_resume` by cosine similarity.

    Args:
        jd_category: Value looked up in the 'job' column of system_jd.
        resume_category: Value looked up in the 'Category' column of
            system_resume.
        user_resume: Full text of the user's resume.
        system_jd: DataFrame with 'job' and 'description' columns.
        system_resume: DataFrame with 'Category' and 'Resume' columns.

    Returns:
        None. Results are printed. A category with no records prints a
        notice naming the correct record type, and the other comparison
        still runs.
    """
    user_resume_emb = bert.encode(user_resume, convert_to_numpy=True)

    jd_texts = system_jd.loc[system_jd['job'] == jd_category, 'description'].astype(str).tolist()
    _report_embedding_similarity(jd_texts, user_resume_emb, jd_category, 'job descriptions', 'JD')

    resume_texts = system_resume.loc[
        system_resume['Category'] == resume_category, 'Resume'
    ].astype(str).tolist()
    _report_embedding_similarity(resume_texts, user_resume_emb, resume_category, 'resumes', 'resume')


In [15]:
def main(project_root: str = '/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/'):
    """
    Run the whole resume analysis: load data, match categories, print reports.

    Args:
        project_root: Directory that contains 'datasets/processed_data/'.
            The default is the original hard-coded location, so calling
            main() with no arguments reads the same files as before. The
            user's resume is still read from the module-level
            `user_resume_address` and is not affected by this argument.
    """
    import os

    processed_dir = os.path.join(project_root, 'datasets', 'processed_data')
    keywords_dir = os.path.join(processed_dir, 'keywords')

    system_jd = pd.read_csv(os.path.join(processed_dir, 'cleaned_job_data.csv'))
    system_resume = pd.read_csv(os.path.join(processed_dir, 'plain_resume.csv'))
    system_jd_keywords = pd.read_csv(os.path.join(keywords_dir, 'cleaned_jobdata_keyword_freq.csv'))
    system_resume_keywords = pd.read_csv(os.path.join(keywords_dir, 'plain_resume_keyword_freq.csv'))
    system_skill_keywords = pd.read_csv(os.path.join(keywords_dir, 'industry_top_skill_keywords.csv'))

    jd_categories = system_jd['job'].unique().tolist()
    resume_categories = system_resume['Category'].unique().tolist()
    skill_categories = system_skill_keywords['Category'].unique().tolist()

    # Map the category the user types to the closest category in each table.
    jd_category, resume_category, skill_category = get_user_category(jd_categories, resume_categories, skill_categories)

    user_resume = get_user_resume()

    # Embedding-based report: similarity between the user's resume and the
    # job descriptions / other resumes stored for the matched categories.
    analyze_embedding_similarity(jd_category, resume_category, user_resume, system_jd, system_resume)

    # Keyword-based report: top keywords of the matched categories that the
    # user's resume does not cover.
    analyze_keywords_similarity(jd_category, resume_category, skill_category, user_resume, system_jd_keywords, system_resume_keywords, system_skill_keywords)

In [16]:
# Run the full analysis; get_user_category() prompts for a job category on stdin.
main()

category that user entered: java developer

Matched JD category : application developer (the most similar category we found for you!)
Matched Resume category: Java Developer (the most similar category we found for you!)
Matched Skill category: IT-Software, Software Services (the most similar category we found for you!)

We found 25 records of job descriptions in the category you entered! 
Cosine similarity between JD category 'application developer' and your resume: 0.4791


We found 84 records of resumes in the category you entered! 
Cosine similarity between resume category 'Java Developer' and your resume: 0.4938


Missing keywords from job descriptions (application developer):
  cloud: 14
  understanding: 13
  solutions: 12
  technical: 12
  embo: 11
  architecture: 10
  tools: 10
  culture: 9
  environment: 9
  hands: 9
  server: 9
  access: 8
  agile: 8
  angular: 8
  communication: 8
  commercial: 7
  platform: 7
  platforms: 7
  solution: 7
  asp: 6
  based: 6
  degree: 6
  emb