In [2]:
import pandas as pd

### Load Resume

In [3]:
resume = pd.read_csv("Resume.csv")
resume.shape

(2484, 4)

In [4]:
resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [5]:
resume['Category'].unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [6]:
top_categories = resume['Category'].value_counts().head(5).index.tolist()
top_categories

['INFORMATION-TECHNOLOGY',
 'BUSINESS-DEVELOPMENT',
 'FINANCE',
 'ADVOCATE',
 'ACCOUNTANT']

In [7]:
selected_resume = resume[resume['Category'].isin(top_categories)]
selected_resume.shape

(594, 4)

### Load JD

In [8]:
jd = pd.read_csv("jd.csv")

In [9]:
jd.head()

Unnamed: 0,category,title,company,description
0,information-technology,Systems Administrator,Delphon,About the job\n\n\nWe are seeking a skilled Sy...
1,information-technology,Head of Digital Technology,Transcendental,About the job\nTranscendental is looking for a...
2,information-technology,Information Technology Technian,Bishop-Wisecarver,"About the job\nAbout BW\n\nFor over 70 years, ..."
3,information-technology,Information Technology Support Technian,Grassland Dairy Products,"About the job\nGrassland Dairy Products, Inc. ..."
4,information-technology,Information Technology Intern,Newmark,About the job\nJob Description\n\nOur customiz...


### Preprocess - Resume

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)       # Remove html
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # Remove numbers and punctuation
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return ' '.join(tokens)

selected_resume['clean_resume'] = selected_resume['Resume_str'].apply(clean_text)
selected_resume = selected_resume[selected_resume['clean_resume'].str.len() > 50]  # remove resume that have short texts


[nltk_data] Downloading package stopwords to /Users/yewei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yewei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_resume['clean_resume'] = selected_resume['Resume_str'].apply(clean_text)


### Preprocess - JD

In [11]:
jd['clean_jd'] = jd['description'].astype(str).apply(clean_text)

jd = jd[jd['clean_jd'].str.len() > 50]


# Resume Categorization

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
selected_resume.shape

(593, 5)

In [43]:
selected_resume.head(5)

Unnamed: 0,ID,Resume_str,Resume_html,Category,clean_resume,predicted_category
217,36856210,INFORMATION TECHNOLOGY Summar...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology summary dedicated infor...,information-technology
218,21780877,INFORMATION TECHNOLOGY SPECIALIST\tGS...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology specialist experience c...,information-technology
219,33241454,INFORMATION TECHNOLOGY SUPERVISOR ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology supervisor summary seek...,information-technology
220,25990239,INFORMATION TECHNOLOGY INSTRUCTOR ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology instructor summary seve...,information-technology
221,16899268,INFORMATION TECHNOLOGY MANAGER/ANALYS...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology manageranalyst professi...,information-technology


In [44]:
jd.head(5)

Unnamed: 0,category,title,company,description,clean_jd
0,information-technology,Systems Administrator,Delphon,About the job\n\n\nWe are seeking a skilled Sy...,job seeking skilled system administrator join ...
1,information-technology,Head of Digital Technology,Transcendental,About the job\nTranscendental is looking for a...,job transcendental looking highly motivated he...
2,information-technology,Information Technology Technian,Bishop-Wisecarver,"About the job\nAbout BW\n\nFor over 70 years, ...",job year bishopwisecarver wbenc certified woma...
3,information-technology,Information Technology Support Technian,Grassland Dairy Products,"About the job\nGrassland Dairy Products, Inc. ...",job grassland dairy product inc seeking enthus...
4,information-technology,Information Technology Intern,Newmark,About the job\nJob Description\n\nOur customiz...,job job description customized week program de...


### Generate category keywords based on job description dataset.

In [74]:
!pip install pandas numpy sentence-transformers scikit-learn tqdm

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl.metadata (3.8 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m34

In [12]:
jd_df = jd.copy()
resume_df = selected_resume.copy()
resume_df['Category'] = resume_df['Category'].str.lower().str.strip()
jd_df['category'] = jd_df['category'].str.lower().str.strip()

In [13]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

# Sample format assumption
# resume_df.columns = ['ID', 'Resume_str', 'Resume_html', 'Category', 'clean_resume']
# jd_df.columns = ['category', 'title', 'company', 'description', 'clean_jd']

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and fast

# Step 1: Generate keywords dynamically per category using JDs
def extract_top_keywords(jd_texts, top_n=20):
    vectorizer = CountVectorizer(stop_words='english', max_features=top_n)
    X = vectorizer.fit_transform(jd_texts)
    keywords = vectorizer.get_feature_names_out()
    return list(keywords)

# Create a dictionary: {category -> [top keywords]}
category_keywords = {}
for category in jd_df['category'].unique():
    jd_texts = jd_df[jd_df['category'] == category]['clean_jd'].tolist()
    if jd_texts:
        keywords = extract_top_keywords(jd_texts, top_n=30)
        category_keywords[category] = keywords

# Step 2: Compute semantic similarity + keyword matching
def compute_semantic_similarity(resume_text, jd_texts):
    resume_embedding = model.encode(resume_text, convert_to_tensor=True)
    jd_embeddings = model.encode(jd_texts, convert_to_tensor=True)
    scores = util.cos_sim(resume_embedding, jd_embeddings)[0]
    return float(scores.max()), float(scores.mean())

def compute_keyword_score(resume_text, keyword_list):
    resume_text_lower = resume_text.lower()
    matched = [kw for kw in keyword_list if kw.lower() in resume_text_lower]
    return len(matched) / len(keyword_list) if keyword_list else 0

# Step 3: Resume-JD scoring loop
results = []

for idx, row in tqdm(resume_df.iterrows(), total=len(resume_df)):
    resume_id = row['ID']
    resume_text = row['clean_resume']
    category = row['Category']

    # Get matching JDs for same category
    jd_texts = jd_df[jd_df['category'] == category]['clean_jd'].tolist()
    keyword_list = category_keywords.get(category, [])

    if not jd_texts:
        max_sim, avg_sim, keyword_score = 0, 0, 0
    else:
        max_sim, avg_sim = compute_semantic_similarity(resume_text, jd_texts)
        keyword_score = compute_keyword_score(resume_text, keyword_list)

    results.append({
        'ID': resume_id,
        'Category': category,
        'Max_Similarity': max_sim,
        'Avg_Similarity': avg_sim,
        'Keyword_Score': keyword_score
    })

scores_df = pd.DataFrame(results)

# Step 4: Normalize and label
scaler = MinMaxScaler()
scores_df[['Max_Similarity', 'Avg_Similarity', 'Keyword_Score']] = scaler.fit_transform(
    scores_df[['Max_Similarity', 'Avg_Similarity', 'Keyword_Score']]
)

# Combine into final score (weights can be adjusted)
scores_df['Final_Score'] = (
    0.6 * scores_df['Max_Similarity'] +
    0.2 * scores_df['Avg_Similarity'] +
    0.2 * scores_df['Keyword_Score']
)

# Assign label based on score
def assign_label(score):
    if score >= 0.75:
        return 'good'
    elif score >= 0.5:
        return 'average'
    else:
        return 'bad'

scores_df['Label'] = scores_df['Final_Score'].apply(assign_label)

# Merge back to resumes
resume_labeled_df = resume_df.merge(scores_df[['ID', 'Final_Score', 'Label']], on='ID')

# Preview result
print(resume_labeled_df[['ID', 'Category', 'Final_Score', 'Label']].head())


100%|██████████| 593/593 [04:52<00:00,  2.03it/s]

         ID                Category  Final_Score    Label
0  36856210  information-technology     0.786784     good
1  21780877  information-technology     0.726022  average
2  33241454  information-technology     0.711078  average
3  25990239  information-technology     0.471817      bad
4  16899268  information-technology     0.659673  average





In [15]:
print(scores_df[['Max_Similarity', 'Avg_Similarity', 'Keyword_Score']].describe())

       Max_Similarity  Avg_Similarity  Keyword_Score
count      593.000000      593.000000     593.000000
mean         0.644965        0.667395       0.568939
std          0.176412        0.174405       0.188184
min          0.000000        0.000000       0.000000
25%          0.532870        0.546550       0.428571
50%          0.642017        0.678759       0.571429
75%          0.777100        0.811690       0.714286
max          1.000000        1.000000       1.000000


In [16]:
resume_labeled_df.head(5)

Unnamed: 0,ID,Resume_str,Resume_html,Category,clean_resume,Final_Score,Label
0,36856210,INFORMATION TECHNOLOGY Summar...,"<div class=""fontsize fontface vmargins hmargin...",information-technology,information technology summary dedicated infor...,0.786784,good
1,21780877,INFORMATION TECHNOLOGY SPECIALIST\tGS...,"<div class=""fontsize fontface vmargins hmargin...",information-technology,information technology specialist experience c...,0.726022,average
2,33241454,INFORMATION TECHNOLOGY SUPERVISOR ...,"<div class=""fontsize fontface vmargins hmargin...",information-technology,information technology supervisor summary seek...,0.711078,average
3,25990239,INFORMATION TECHNOLOGY INSTRUCTOR ...,"<div class=""fontsize fontface vmargins hmargin...",information-technology,information technology instructor summary seve...,0.471817,bad
4,16899268,INFORMATION TECHNOLOGY MANAGER/ANALYS...,"<div class=""fontsize fontface vmargins hmargin...",information-technology,information technology manageranalyst professi...,0.659673,average
