Written BY Zetian


how we could deal with Job description and resume to give specific advise to a resume given by a user: Use BERT to encode the user’s resume alongside industry job descriptions and other resumes in the industry and measure their cosine similarity to identify alignment gaps.
Then extract the top keywords from the job descriptions that are missing in the user’s resume and suggest adding them as relevant skills or terms. 

In [21]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
user_resume_address = '/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/' \
    'datasets/user_resume/zetian_resume.txt'

In [3]:
bert = SentenceTransformer('all-mpnet-base-v2')

In [4]:
# work
def get_cos_similarity(user_text: str, texts: list[str]) -> np.ndarray:
    """
    Compute cosine similarity between a single user_text and a list of texts.
    Returns an array of similarity scores of length len(texts).
    """
    # Encode the user text and all candidate texts
    user_emb = bert.encode(user_text, convert_to_numpy=True)
    texts_emb = bert.encode(texts, convert_to_numpy=True)
    
    # Compute cosine similarity: (A·B) / (||A|| * ||B||)
    dot_products = texts_emb @ user_emb
    text_norms = np.linalg.norm(texts_emb, axis=1)
    user_norm = np.linalg.norm(user_emb)
    sims = dot_products / (text_norms * user_norm + 1e-8)
    return sims

#work
def get_most_similar(user_input_category: str, system_category_list: list[str]) -> tuple[str, float]:
    """
    Find which category in system_category_list is most similar to user_input_category.
    Returns the best-matching category and its similarity score.
    """
    sims = get_cos_similarity(user_input_category, system_category_list)
    best_idx = int(np.argmax(sims))
    return system_category_list[best_idx], float(sims[best_idx])


In [None]:
# work
def get_user_category(jd_categories, resume_categories): 
    user_input = input("Enter job category: ")
    jd_match = get_most_similar(user_input, jd_categories)[0]
    res_match = get_most_similar(user_input, resume_categories)[0]
    print(f"Matched JD: {jd_match}")
    print(f"Matched Resume: {res_match}")
    return jd_match, res_match

#work
def get_user_resume() -> str:
    file_path = user_resume_address
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content


In [51]:
import ast, re

def parse_keyword_str(s: str) -> list[tuple[str,int]]:
    # 1) strip off the surrounding quotes if any, 2) replace np.int64(60) → 60
    clean = re.sub(r'np\.int64\((\d+)\)', r'\1', s)
    # 3) safely evaluate into Python literals
    return ast.literal_eval(clean)



def analyze_keywords_similarity(jd_category, 
                                resume_category, 
                                user_resume, 
                                system_jd_keywords, 
                                system_resume_keywords):
    
    system_jd_keywords['top_keywords'] = (
    system_jd_keywords['top_keywords']
    .astype(str)          # ensure it's a string
    .apply(parse_keyword_str)
    )
    system_resume_keywords['top_keywords'] = (
    system_resume_keywords['top_keywords']
    .astype(str)
    .apply(parse_keyword_str)
    )

    user_tokens = set(re.findall(r'\b[a-zA-Z]{3,}\b', user_resume.lower()))

    jd_keywords_list = system_jd_keywords.loc[system_jd_keywords['job']==jd_category, 'top_keywords']
    jd_keywords_list = jd_keywords_list.iloc[0]
    missing_jd_words = [(w, c) for w, c in jd_keywords_list if w.lower() not in user_tokens]
    # jd_keywords_list's every element is (word, np.int() of its count)
    #print out all the keywords that user_resume lacks and its importance(just its count)

    print(f"\nMissing keywords from job descriptions ({jd_category}):")

    if not missing_jd_words:
        print("  None! Your resume covers all top JD keywords.")
    else:
        for word, count in missing_jd_words:
            print(f"  {word}: {count}")


    resume_keywords_list = system_resume_keywords.loc[system_resume_keywords['Category']==resume_category, 'top_keywords']
    resume_keywords_list = resume_keywords_list.iloc[0]
    # resume_keywords_list's every element is (word, np.int() of its count)
    #print out all the keywords that user_resume lacks and its importance(just its count)
    missing_resume_words = [(w, c) for w, c in resume_keywords_list if w.lower() not in user_tokens]
    print(f"\nMissing keywords from others' resumes ({missing_resume_words}):")

    if not missing_resume_words:
        print("  None! Your resume covers all top resumes keywords.")
    else:
        for word, count in missing_resume_words:
            print(f"  {word}: {count}")

In [13]:
#work
def analyze_embedding_similarity(jd_category, 
                                resume_category, 
                                user_resume, 
                                system_jd, 
                                system_resume):

    jd_texts = system_jd.loc[system_jd['job'] == jd_category, 'description'].astype(str).tolist()
    if not jd_texts:
        print(f"No job descriptions found for category '{jd_category}'.")
        return
    long_jd_text = " ".join(jd_texts)



    # 2. 编码
    jd_emb = bert.encode(long_jd_text, convert_to_numpy=True)
    user_resume_emb = bert.encode(user_resume, convert_to_numpy=True)

    # 3. 计算余弦相似度
    cos_sim = np.dot(jd_emb, user_resume_emb) / (
        np.linalg.norm(jd_emb) * np.linalg.norm(user_resume_emb) + 1e-8
    )

    print()
    print(f'We found {len(jd_texts)} records of job descriptions in the category you entered! ')
    print(f"Cosine similarity between JD category '{jd_category}' "
          f"and your resume: {cos_sim:.4f}")
    print() 

    resume_text = system_resume.loc[system_resume['Category'] == resume_category, 'Resume'].astype(str).tolist()
    if not resume_text:
        print(f"No job descriptions found for category '{resume_category}'.")
        return
    

    long_resume_text = " ".join(resume_text)


    # 2. 编码
    resume_emb = bert.encode(long_resume_text, convert_to_numpy=True)

    # 3. 计算余弦相似度
    cos_sim = np.dot(resume_emb, user_resume_emb) / (
        np.linalg.norm(resume_emb) * np.linalg.norm(user_resume_emb) + 1e-8
    )
    print()
    print(f'We found {len(resume_text)} records of resumes in the category you entered! ')
    print(f"Cosine similarity between resume category '{resume_category}' "
          f"and your resume: {cos_sim:.4f}")
    print()


In [53]:
def main():

    system_jd = pd.read_csv('/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/'
+ 'datasets/processed_data/cleaned_job_data.csv')
    system_resume = pd.read_csv('/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/'
    + 'datasets/processed_data/plain_resume.csv')
    system_jd_keywords = pd.read_csv('/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/' \
    'datasets/processed_data/cleaned_jobdata_keyword_freq.csv')
    system_resume_keywords = pd.read_csv('/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/' \
    'datasets/processed_data/plain_resume_keyword_freq.csv') 

    

    jd_categories = system_jd['job'].unique().tolist()
    resume_categories = system_resume['Category'].unique().tolist()


    jd_category, resume_category = get_user_category(jd_categories, resume_categories)
    # get the right category for user's resume to be analyze

    user_resume = get_user_resume()


    analyze_embedding_similarity(jd_category, resume_category, user_resume, system_jd, system_resume)
    #get the report of the similarity between user's resume and job desciprtion and other resumes in our database
    #in terms of embedding using BERT

    analyze_keywords_similarity(jd_category, resume_category, user_resume, system_jd_keywords, system_resume_keywords)
    # #get the report of the similarity between user's resume and job desciprtion and other resumes in our database
    # #in terms of keywords

In [54]:
main()

Matched JD: application developer
Matched Resume: Java Developer

We found 25 records of job descriptions in the category you entered! 
Cosine similarity between JD category 'application developer' and your resume: 0.4791


We found 84 records of resumes in the category you entered! 
Cosine similarity between resume category 'Java Developer' and your resume: 0.4938


Missing keywords from job descriptions (application developer):
  applications: 25
  cloud: 14
  understanding: 13
  solutions: 12
  technical: 12
  embo: 11
  net: 11
  architecture: 10
  teams: 10
  tools: 10
  culture: 9
  environment: 9
  hands: 9
  js: 9
  server: 9
  access: 8
  agile: 8
  angular: 8
  communication: 8
  technologies: 8
  commercial: 7
  developing: 7
  platform: 7
  platforms: 7
  solution: 7
  sql: 7
  ability: 6
  asp: 6
  based: 6
  degree: 6
  ember: 6
  enable: 6
  learning: 6

Missing keywords from others' resumes ([('exprience', 408), ('developer', 288), ('details', 276), ('ajax', 138), ('jsp