Written by Zetian


How we use job descriptions and resumes to give specific advice on a resume supplied by a user: use BERT to encode the user’s resume alongside industry job descriptions and other resumes in the industry, and measure their cosine similarity to identify alignment gaps.
Then extract the top keywords from the job descriptions that are missing in the user’s resume and suggest adding them as relevant skills or terms.

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Absolute path of the plain-text resume that get_user_resume() reads.
# The path is specific to the author's machine; change it before running elsewhere.
user_resume_address = '/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/' \
    'datasets/user_resume/zetian_resume.txt'

In [3]:
# Sentence-embedding model used by every encode() call below. It is created once
# at module level; the variable is named `bert` although the checkpoint requested
# is 'all-mpnet-base-v2'.
bert = SentenceTransformer('all-mpnet-base-v2')

In [4]:
# work
def get_cos_similarity(user_text: str, texts: list[str]) -> np.ndarray:
    """
    Compute cosine similarity between a single user_text and a list of texts.

    Args:
        user_text: The query text to compare.
        texts: Candidate texts; each one is compared against user_text.

    Returns:
        A 1-D array of similarity scores of length len(texts), in the same
        order as texts. An empty array is returned when texts is empty.
    """
    # Nothing to compare against. The matrix product and the axis=1 norm below
    # need a 2-D (n_texts, dim) array, which encoding an empty list is not
    # guaranteed to produce, so answer directly instead of calling the model.
    if len(texts) == 0:
        return np.zeros(0)

    # Encode the user text and all candidate texts
    user_emb = bert.encode(user_text, convert_to_numpy=True)
    texts_emb = bert.encode(texts, convert_to_numpy=True)

    # Compute cosine similarity: (A·B) / (||A|| * ||B||)
    dot_products = texts_emb @ user_emb
    text_norms = np.linalg.norm(texts_emb, axis=1)
    user_norm = np.linalg.norm(user_emb)
    # The small epsilon keeps the division defined if a norm is zero.
    sims = dot_products / (text_norms * user_norm + 1e-8)
    return sims

#work
def get_most_similar(user_input_category: str, system_category_list: list[str]) -> tuple[str, float]:
    """
    Find which category in system_category_list is most similar to user_input_category.

    Args:
        user_input_category: Free-text category to look up.
        system_category_list: Known category names to match against.

    Returns:
        The best-matching category and its similarity score.

    Raises:
        ValueError: If system_category_list is empty, since there is nothing
            to pick a best match from.
    """
    # Fail early with a clear message rather than an obscure error further down
    # (np.argmax rejects an empty sequence).
    if len(system_category_list) == 0:
        raise ValueError("system_category_list is empty; there is no category to match against.")

    sims = get_cos_similarity(user_input_category, system_category_list)
    best_idx = int(np.argmax(sims))
    return system_category_list[best_idx], float(sims[best_idx])


In [5]:
# work
def get_user_category(jd_categories, resume_categories):
    """
    Prompt for a job category and map it onto the known category lists.

    The typed text is matched separately against jd_categories and
    resume_categories via get_most_similar. Both winning categories are
    printed and then returned as (jd_match, resume_match).
    """
    query = input("Enter job category: ")

    # get_most_similar returns (category, score); only the category is needed.
    closest_jd, _ = get_most_similar(query, jd_categories)
    closest_resume, _ = get_most_similar(query, resume_categories)

    print(f"Matched JD: {closest_jd}")
    print(f"Matched Resume: {closest_resume}")
    return closest_jd, closest_resume

#work
def get_user_resume() -> str:
    """Return the full text of the file at user_resume_address, read as UTF-8."""
    with open(user_resume_address, mode='r', encoding='utf-8') as resume_file:
        return resume_file.read()


In [None]:
def analyze_keywords_similarity(jd_category,
                                resume_category,
                                user_resume,
                                system_jd_keywords,
                                system_resume_keywords):
    """
    Placeholder for the keyword-based comparison of a resume against a category.

    The function previously had a signature but no body, which is a syntax
    error. Until the analysis is written it fails loudly instead.

    Args:
        jd_category: Job-description category chosen for the user.
        resume_category: Resume category chosen for the user.
        user_resume: Text of the user's resume.
        system_jd_keywords: Keyword data for job descriptions (format not yet defined).
        system_resume_keywords: Keyword data for resumes (format not yet defined).

    Raises:
        NotImplementedError: Always; the keyword analysis is not implemented yet.
    """
    raise NotImplementedError("analyze_keywords_similarity is not implemented yet.")


In [13]:
#work
def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors; 1e-8 keeps the division defined for a zero norm."""
    return np.dot(vec_a, vec_b) / (
        np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-8
    )


def analyze_embedding_similarity(jd_category,
                                resume_category,
                                user_resume,
                                system_jd,
                                system_resume):
    """
    Print how similar the user's resume is to a job category and a resume category.

    All job descriptions of jd_category are joined into one text, embedded, and
    compared with the embedded user resume by cosine similarity. The same is
    then done with all resumes of resume_category. Results are printed, not
    returned.

    Args:
        jd_category: Value to select in the 'job' column of system_jd.
        resume_category: Value to select in the 'Category' column of system_resume.
        user_resume: Text of the user's resume.
        system_jd: DataFrame with 'job' and 'description' columns.
        system_resume: DataFrame with 'Category' and 'Resume' columns.

    Returns:
        None. If no job descriptions match, a message is printed and the
        function returns before the resume comparison. If no resumes match,
        a message is printed after the job-description result.
    """
    # 1. Collect every job description filed under the requested category.
    jd_texts = system_jd.loc[system_jd['job'] == jd_category, 'description'].astype(str).tolist()
    if not jd_texts:
        print(f"No job descriptions found for category '{jd_category}'.")
        return
    long_jd_text = " ".join(jd_texts)

    # 2. Encode.
    # NOTE(review): the whole category is embedded as ONE joined string. Sentence
    # embedding models usually truncate input beyond their maximum sequence
    # length, so presumably only the start of the joined text influences the
    # vector. Confirm, and consider encoding each record separately.
    jd_emb = bert.encode(long_jd_text, convert_to_numpy=True)
    user_resume_emb = bert.encode(user_resume, convert_to_numpy=True)

    # 3. Cosine similarity between the category text and the user's resume.
    cos_sim = _cosine_similarity(jd_emb, user_resume_emb)

    print()
    print(f'We found {len(jd_texts)} records of job descriptions in the category you entered! ')
    print(f"Cosine similarity between JD category '{jd_category}' "
          f"and your resume: {cos_sim:.4f}")
    print()

    # Repeat the same comparison against the stored resumes of the category.
    resume_text = system_resume.loc[system_resume['Category'] == resume_category, 'Resume'].astype(str).tolist()
    if not resume_text:
        # This message used to say "No job descriptions found" (copy-paste slip).
        print(f"No resumes found for category '{resume_category}'.")
        return

    long_resume_text = " ".join(resume_text)

    # Encode the joined resumes (same truncation caveat as above).
    resume_emb = bert.encode(long_resume_text, convert_to_numpy=True)

    # Cosine similarity between the category resumes and the user's resume.
    cos_sim = _cosine_similarity(resume_emb, user_resume_emb)
    print()
    print(f'We found {len(resume_text)} records of resumes in the category you entered! ')
    print(f"Cosine similarity between resume category '{resume_category}' "
          f"and your resume: {cos_sim:.4f}")
    print()


In [None]:
def main():
    """
    Run the interactive resume analysis end to end.

    Loads the job-description and resume tables, asks the user for a job
    category, reads the user's resume from disk, and prints the
    embedding-based similarity report.
    """
    # Machine-specific project root; both dataset paths below are relative to it.
    project_root = '/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/'

    system_jd = pd.read_csv(project_root + 'datasets/processed_data/cleaned_job_data.csv')
    system_resume = pd.read_csv(project_root + 'datasets/processed_data/plain_resume.csv')

    # TODO: load the job-description and resume keyword tables once their CSV
    # paths exist, then call analyze_keywords_similarity(jd_category,
    # resume_category, user_resume, system_jd_keywords, system_resume_keywords)
    # to report similarity in terms of keywords. They were previously loaded
    # with pd.read_csv(''), which fails on the empty path, and nothing used
    # them because the keyword analysis is disabled.

    jd_categories = system_jd['job'].unique().tolist()
    resume_categories = system_resume['Category'].unique().tolist()

    # Map the user's free-text category onto the categories present in the data.
    jd_category, resume_category = get_user_category(jd_categories, resume_categories)

    user_resume = get_user_resume()

    # Report the similarity between the user's resume and the job descriptions
    # and other resumes in our database, in terms of embeddings.
    analyze_embedding_similarity(jd_category, resume_category, user_resume, system_jd, system_resume)

In [15]:
# Run the analysis; this prompts for a job category and prints the similarity report.
main()

Matched JD: physicist
Matched Resume: Arts

We found 45 records of job descriptions in the category you entered! 
Cosine similarity between JD category 'physicist' and your resume: 0.3798


We found 36 records of resumes in the category you entered! 
Cosine similarity between resume category 'Arts' and your resume: 0.3612

