# Load data & handling

In [1]:
import kagglehub

# Download latest version
Resume_path = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
JOB_path = kagglehub.dataset_download("ravindrasinghrana/job-description-dataset")

print("Path to Resume dataset files:", Resume_path)
print("Path to JOB dataset files:", JOB_path)

Path to Resume dataset files: C:\Users\AIJimmy\.cache\kagglehub\datasets\snehaanbhawal\resume-dataset\versions\1
Path to JOB dataset files: C:\Users\AIJimmy\.cache\kagglehub\datasets\ravindrasinghrana\job-description-dataset\versions\1


In [22]:
import pandas as pd

Resume_data = pd.read_csv(Resume_path + "/Resume/Resume.csv")
JOB_data = pd.read_csv(JOB_path + "/Job_Descriptions.csv")

In [23]:
Resume_data_sample = Resume_data[:1000]
JOB_data_sample = JOB_data[:1000]

pd.DataFrame(Resume_data_sample).to_csv("Resume_data_sample.csv", index=False)
pd.DataFrame(JOB_data_sample).to_csv("JOB_data_sample.csv", index=False)

In [None]:
from ydata_profiling import ProfileReport

Resume_profile = ProfileReport(Resume_data, title="Resumes Description Data Profiling Report")
JOB_profile = ProfileReport(JOB_data, title="Job Description Data Profiling Report")

Resume_profile.to_file("Resumes Description Data Profiling Report.html")
JOB_profile.to_file("Job Description Data Profiling Report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:04<00:00,  1.20s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 23/23 [00:31<00:00,  1.39s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Preprocess

In [3]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from num2words import num2words
from nltk.corpus import wordnet
import contractions
from nltk.corpus import stopwords
import re 
import nltk

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [4]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
def preprocess_text(text):
    text = text.lower()
    text = contractions.fix(text)
    text = text.replace('.', ' . ')
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    text = "".join(num2words(int(word)) if word.isdigit() else word for word in text)
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words]
    tagged = nltk.tag.pos_tag(text)
    lemmatized_words = []

    for word, tag in tagged:
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmatized_words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return ' '.join(lemmatized_words)


## Handle job dataset

In [5]:
for row,col in zip(JOB_data.iloc[0], JOB_data.columns):
    print(col,"\t",row)

Job Id 	 1089843540111562
Experience 	 5 to 15 Years
Qualifications 	 M.Tech
Salary Range 	 $59K-$99K
location 	 Douglas
Country 	 Isle of Man
latitude 	 54.2361
longitude 	 -4.5481
Work Type 	 Intern
Company Size 	 26801
Job Posting Date 	 2022-04-24
Preference 	 Female
Contact Person 	 Brandon Cunningham
Contact 	 001-381-930-7517x737
Job Title 	 Digital Marketing Specialist
Role 	 Social Media Manager
Job Portal 	 Snagajob
Job Description 	 Social Media Managers oversee an organizations social media presence. They create and schedule content, engage with followers, and analyze social media metrics to drive brand awareness and engagement.
Benefits 	 {'Flexible Spending Accounts (FSAs), Relocation Assistance, Legal Assistance, Employee Recognition Programs, Financial Counseling'}
skills 	 Social media platforms (e.g., Facebook, Twitter, Instagram) Content creation and scheduling Social media analytics and insights Community engagement Paid social advertising
Responsibilities 	 Manage 

In [6]:
if 'Job Id' in JOB_data.columns:
    JOB_data.drop(columns=['Job Id','latitude','longitude','Job Posting Date','Company Size','location','Country'], inplace=True)
JOB_data.head()

Unnamed: 0,Experience,Qualifications,Salary Range,Work Type,Preference,Contact Person,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,5 to 15 Years,M.Tech,$59K-$99K,Intern,Female,Brandon Cunningham,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,2 to 12 Years,BCA,$56K-$116K,Intern,Female,Francisco Larsen,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,0 to 12 Years,PhD,$61K-$104K,Temporary,Male,Gary Gibson,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,4 to 11 Years,PhD,$65K-$91K,Full-Time,Female,Joy Lucero,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,1 to 12 Years,MBA,$64K-$87K,Intern,Female,Julie Johnson,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [7]:
if 'Experience' in JOB_data.columns and 'Salary Range' in JOB_data.columns:
    JOB_data["MIN Experience"] = JOB_data["Experience"].apply(lambda x: str(x).split()[0])
    JOB_data["MAX Experience"] = JOB_data["Experience"].apply(lambda x: str(x).split()[-2])
    JOB_data["MIN Salary"] = JOB_data["Salary Range"].apply(lambda x: str(x).split('-')[0])
    JOB_data["MAX Salary"] = JOB_data["Salary Range"].apply(lambda x: str(x).split('-')[-1])
    JOB_data["MIN Salary"] = JOB_data["MIN Salary"].str.replace('$','').str.replace('K','').astype(int)*1000
    JOB_data["MAX Salary"] = JOB_data["MAX Salary"].str.replace('$','').str.replace('K','').astype(int)*1000
    JOB_data.drop(columns=['Experience','Salary Range'], inplace=True)
JOB_data.head()

Unnamed: 0,Qualifications,Work Type,Preference,Contact Person,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile,MIN Experience,MAX Experience,MIN Salary,MAX Salary
0,M.Tech,Intern,Female,Brandon Cunningham,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie...",5,15,59000,99000
1,BCA,Intern,Female,Francisco Larsen,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com...",2,12,56000,116000
2,PhD,Temporary,Male,Gary Gibson,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P...",0,12,61000,104000
3,PhD,Full-Time,Female,Joy Lucero,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O...",4,11,65000,91000
4,MBA,Intern,Female,Julie Johnson,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ...",1,12,64000,87000


In [8]:
print('Qualifications unique values:', JOB_data['Qualifications'].unique())
print('Work Type unique values:', JOB_data['Work Type'].unique())
print('Job Portal unique values:', JOB_data['Job Portal'].unique())

Qualifications unique values: ['M.Tech' 'BCA' 'PhD' 'MBA' 'MCA' 'M.Com' 'BBA' 'B.Tech' 'B.Com' 'BA']
Work Type unique values: ['Intern' 'Temporary' 'Full-Time' 'Contract' 'Part-Time']
Job Portal unique values: ['Snagajob' 'Idealist' 'Jobs2Careers' 'FlexJobs' 'Indeed'
 'Stack Overflow Jobs' 'Glassdoor' 'USAJOBS' 'SimplyHired'
 'Internships.com' 'The Muse' 'Dice' 'CareerBuilder' 'Monster'
 'ZipRecruiter' 'LinkedIn']


In [9]:
JOB_data["Job_requirements"] = JOB_data["Job Title"].astype(str) + ". " + JOB_data["Job Description"].astype(str) + " " + JOB_data["Qualifications"].astype(str) + ". " + JOB_data["Responsibilities"].astype(str)+". " + JOB_data["skills"].astype(str)

In [10]:
JOB_data["Job_preprocessed_requirements"] = JOB_data["Job_requirements"][:1000].apply(preprocess_text)

In [11]:
JOB_data["Job_preprocessed_requirements"][0]

'digital marketing specialist social medium manager oversee organization social medium presence create schedule content engage follower analyze social medium metric drive brand awareness engagement tech manage grow social medium account create engaging content interact online community develop social medium content calendar strategy monitor social medium trend engagement metric social medium platform e g facebook twitter instagram content creation schedule social medium analytics insight community engagement pay social advertising'

## Handle resume dataset

In [12]:
Resume_data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [13]:
if 'ID' in Resume_data.columns and 'Resume_html' in Resume_data.columns:
    Resume_data.drop(columns=['ID','Resume_html'], inplace=True)
Resume_data.head()

Unnamed: 0,Resume_str,Category
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR


In [14]:
print('Category unique values:', Resume_data['Category'].unique())

Category unique values: ['HR' 'DESIGNER' 'INFORMATION-TECHNOLOGY' 'TEACHER' 'ADVOCATE'
 'BUSINESS-DEVELOPMENT' 'HEALTHCARE' 'FITNESS' 'AGRICULTURE' 'BPO' 'SALES'
 'CONSULTANT' 'DIGITAL-MEDIA' 'AUTOMOBILE' 'CHEF' 'FINANCE' 'APPAREL'
 'ENGINEERING' 'ACCOUNTANT' 'CONSTRUCTION' 'PUBLIC-RELATIONS' 'BANKING'
 'ARTS' 'AVIATION']


In [15]:
Resume_data["Resume_preprocessed_str"] = Resume_data["Resume_str"][:1000].apply(preprocess_text)

In [16]:
Resume_data["Resume_preprocessed_str"][0]

'hr administrator marketing associate hr administrator summary dedicate customer service manager onefive year experience hospitality customer service management respect builder leader customer focus team strives instill share enthusiastic commitment customer service highlight focus customer satisfaction team management marketing savvy conflict resolution technique train development skilled multi tasker client relation specialist accomplishment missouri dot supervisor training certification certify ihg customer loyalty marketing segment hilton worldwide general manager training certification accomplish trainer cross server hospitality system hilton onq micros opera pms fidelio opera reservation system or holidex complete course seminar customer service sale strategies inventory control loss prevention safety time management leadership performance assessment experience hr administrator marketing associate hr administrator dec twozeroonethree current company name city state help develop p

# Read Resume

In [63]:
import os
import PyPDF2
from docx import Document

def read_file(file_path):
    extension = os.path.splitext(file_path)[1].lower()
    if extension == '.txt':
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            return text
    elif extension == '.pdf':
        return extract_text_from_pdf(file_path)
    elif extension == '.docx':
         return extract_text_from_docx(file_path)
    else:
        return "Unsupported file type"
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text
def extract_text_from_docx(file_path):
    text = ""
    doc = Document(file_path)
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

## Test PDF

In [None]:
text = read_file("example_resume.pdf")
print(text)

Hayden  Smith   
214 Mitre Avenue , Park Hill, 3045 |  04 501 123 456  |  haydensmith@email.com 
Page 1 
  
(Tip: Name, m obile number and email address are essential. Current address could be included 
especially if you live nearby. Ensur e that it is clearly displayed)  
Career  Objective  
I am r eliable hard working Y ear 11 student seeking casual or part -time custo mer service  work  in a 
sports retail environment . Having played soccer for nine years and a keen all-round sports 
enthusiast,  I am looking to contribute knowledge and proven communications skills.  
 (Tip: A career objective isn’t essential , but it’s useful if you don't have much experience and can 
convey enthusiasm and motivation. Briefly summarise any work you have done, your strengths and 
relevant expertise and state how you aim to apply this to your career goal. Adjust the statement to 
reflect the role yo u are applying for.)   
Availability  
Monday – Friday: 4.30pm – 10.00pm  
Saturday – Sunday: 8 .00am 

## Test txt

In [None]:
text = read_file("example_resume.txt")
print(text)

Hayden Smith
214 Mitre Avenue, Park Hill, 3045 | 04501 123 456 | haydensmith@email.com

(Tip: Name, mobile number and email address are essential. Current address could be included especially if you live nearby. Ensure that it is clearly displayed)
Career Objective
I am reliable hard working Year 11 student seeking casual or part-time customer service work in a sports retail environment. Having played soccer for nine years and a keen all-round sports enthusiast, I am looking to contribute knowledge and proven communications skills.

(Tip: A career objective isn’t essential, but it’s useful if you don't have much experience and can convey enthusiasm and motivation. Briefly summarise any work you have done, your strengths and relevant expertise and state how you aim to apply this to your career goal. Adjust the statement to reflect the role you are applying for.)
Availability
Monday – Friday: 4.30pm – 10.00pm Saturday – Sunday: 8.00am – 11.00pm (up to 20 hours per week)
(Tip: When lookin

## Test docx

In [None]:
text = read_file("example_resume.docx")
print(text)

Hayden Smith
214 Mitre Avenue, Park Hill, 3045 | 04501 123 456 | haydensmith@email.com

(Tip: Name, mobile number and email address are essential. Current address could be included especially if you live nearby. Ensure that it is clearly displayed)
Career Objective
I am reliable hard working Year 11 student seeking casual or part-time customer service work in a sports retail environment. Having played soccer for nine years and a keen all-round sports enthusiast, I am looking to contribute knowledge and proven communications skills.

(Tip: A career objective isn’t essential, but it’s useful if you don't have much experience and can convey enthusiasm and motivation. Briefly summarise any work you have done, your strengths and relevant expertise and state how you aim to apply this to your career goal. Adjust the statement to reflect the role you are applying for.)
Availability
Monday – Friday: 4.30pm – 10.00pm Saturday – Sunday: 8.00am – 11.00pm (up to 20 hours per week)
(Tip: When lookin

# Model

In [None]:
from huggingface_hub import login
login("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")

HTTPError: Invalid user token. The token stored is invalid. Please run `hf auth login` to update it.

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

OSError: sentence-transformers/all-mpnet-base-v2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [18]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

job_text = JOB_data["Job_preprocessed_requirements"].tolist()
job_embeddings = []
for req in job_text[:1000]:
    emb = model.encode(req, convert_to_tensor=False)
    job_embeddings.append(emb)

In [19]:
resume_text = Resume_data["Resume_preprocessed_str"].tolist()
resume_embeddings = []
for res in resume_text[:1000]:
    emb = model.encode(res)
    resume_embeddings.append(emb)

In [24]:
import pickle
import os

# Create a model directory if it doesn't exist
models_dir = "model"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"Created '{models_dir}' directory")

# Save embeddings
print("\n" + "="*80)
print("SAVING MODELS AND EMBEDDINGS")
print("="*80)

# Save job embeddings
job_embeddings_path = os.path.join(models_dir, "job_embeddings.pkl")
with open(job_embeddings_path, 'wb') as f:
    pickle.dump(job_embeddings, f)
print(f"✓ Saved job embeddings to: {job_embeddings_path}")
print(f"  Size: {len(job_embeddings)} embeddings")

# Save resume embeddings
resume_embeddings_path = os.path.join(models_dir, "resume_embeddings.pkl")
with open(resume_embeddings_path, 'wb') as f:
    pickle.dump(resume_embeddings, f)
print(f"✓ Saved resume embeddings to: {resume_embeddings_path}")
print(f"  Size: {len(resume_embeddings)} embeddings")

# Save preprocessed job data
job_data_path = os.path.join(models_dir, "job_data.pkl")
with open(job_data_path, 'wb') as f:
    pickle.dump(JOB_data_sample, f)
print(f"✓ Saved job data to: {job_data_path}")

# Save preprocessed resume data
resume_data_path = os.path.join(models_dir, "resume_data.pkl")
with open(resume_data_path, 'wb') as f:
    pickle.dump(Resume_data_sample, f)
print(f"✓ Saved resume data to: {resume_data_path}")

Created 'model' directory

SAVING MODELS AND EMBEDDINGS
✓ Saved job embeddings to: model\job_embeddings.pkl
  Size: 1000 embeddings
✓ Saved resume embeddings to: model\resume_embeddings.pkl
  Size: 1000 embeddings
✓ Saved job data to: model\job_data.pkl
✓ Saved resume data to: model\resume_data.pkl
✓ Saved resume data to: model\resume_data.pkl


# Matching dataset examples

In [None]:
from sentence_transformers import util
import torch
import numpy as np

def search_resumes_by_similarity(resume_embeddings, job_embeddings, top_k=5, similarity_threshold=0.0):
    results = []
    
    for i, resume_embedding in enumerate(resume_embeddings):
        # Calculate cosine similarity between this resume and all jobs
        resume_emb_tensor = torch.from_numpy(np.array(resume_embedding)).unsqueeze(0)
        job_embs_tensor = torch.from_numpy(np.array(job_embeddings))
        
        # Compute cosine similarity
        cosine_scores = util.pytorch_cos_sim(resume_emb_tensor, job_embs_tensor)[0]
        
        # Get top k matches
        top_k_actual = min(top_k, len(job_embeddings))
        top_results = torch.topk(cosine_scores, k=top_k_actual)
        
        # Filter by similarity threshold and store results
        matches = []
        for score, job_idx in zip(top_results.values, top_results.indices):
            if score.item() >= similarity_threshold:
                matches.append({
                    'job_index': job_idx.item(),
                    'similarity_score': score.item()
                })
        
        results.append({
            'resume_index': i,
            'top_matches': matches
        })
    
    return results

# Run the search
search_results = search_resumes_by_similarity(
    resume_embeddings=resume_embeddings,
    job_embeddings=job_embeddings,
    top_k=3,
    similarity_threshold=0.0
)

# Display results
for result in search_results[:100]:  # Show first 5 resumes
    print(f"\nResume {result['resume_index']}:")
    for idx, match in enumerate(result['top_matches'], 1):
        print(f"  {idx}. Job {match['job_index']}: Similarity = {match['similarity_score']:.4f}")


Resume 0:
  1. Job 75: Similarity = 0.6636
  2. Job 19: Similarity = 0.6292
  3. Job 94: Similarity = 0.5993

Resume 1:
  1. Job 75: Similarity = 0.6904
  2. Job 94: Similarity = 0.6511
  3. Job 19: Similarity = 0.6143

Resume 2:
  1. Job 19: Similarity = 0.5965
  2. Job 94: Similarity = 0.5836
  3. Job 75: Similarity = 0.5356

Resume 3:
  1. Job 75: Similarity = 0.7289
  2. Job 19: Similarity = 0.6669
  3. Job 94: Similarity = 0.6564

Resume 4:
  1. Job 94: Similarity = 0.6850
  2. Job 19: Similarity = 0.6561
  3. Job 65: Similarity = 0.5636

Resume 5:
  1. Job 19: Similarity = 0.5876
  2. Job 75: Similarity = 0.5863
  3. Job 94: Similarity = 0.5859

Resume 6:
  1. Job 19: Similarity = 0.7248
  2. Job 94: Similarity = 0.6946
  3. Job 75: Similarity = 0.6055

Resume 7:
  1. Job 94: Similarity = 0.6785
  2. Job 65: Similarity = 0.6180
  3. Job 19: Similarity = 0.6031

Resume 8:
  1. Job 94: Similarity = 0.6839
  2. Job 19: Similarity = 0.6623
  3. Job 97: Similarity = 0.5236

Resume 9:

In [None]:
def search_jobs_by_similarity(job_embeddings, resume_embeddings, top_k=5, similarity_threshold=0.0):
    results = []
    
    for i, job_embedding in enumerate(job_embeddings):
        # Calculate cosine similarity between this job and all resumes
        job_emb_tensor = torch.from_numpy(np.array(job_embedding)).unsqueeze(0)
        resume_embs_tensor = torch.from_numpy(np.array(resume_embeddings))
        
        # Compute cosine similarity
        cosine_scores = util.pytorch_cos_sim(job_emb_tensor, resume_embs_tensor)[0]
        
        # Get top k matches
        top_k_actual = min(top_k, len(resume_embeddings))
        top_results = torch.topk(cosine_scores, k=top_k_actual)
        
        # Filter by similarity threshold and store results
        matches = []
        for score, resume_idx in zip(top_results.values, top_results.indices):
            if score.item() >= similarity_threshold:
                matches.append({
                    'resume_index': resume_idx.item(),
                    'similarity_score': score.item()
                })
        
        results.append({
            'job_index': i,
            'top_matches': matches
        })
    
    return results

# Run the search
job_search_results = search_jobs_by_similarity(
    job_embeddings=job_embeddings,
    resume_embeddings=resume_embeddings,
    top_k=3,
    similarity_threshold=0.0
)

# Display results
for result in job_search_results[:100]:  # Show first 100 jobs
    print(f"\nJob {result['job_index']}:")
    for idx, match in enumerate(result['top_matches'], 1):
        print(f"  {idx}. Resume {match['resume_index']}: Similarity = {match['similarity_score']:.4f}")


Job 0:
  1. Resume 1: Similarity = 0.4203
  2. Resume 0: Similarity = 0.4140
  3. Resume 12: Similarity = 0.4117

Job 1:
  1. Resume 1: Similarity = 0.4414
  2. Resume 94: Similarity = 0.4358
  3. Resume 47: Similarity = 0.4351

Job 2:
  1. Resume 12: Similarity = 0.5464
  2. Resume 7: Similarity = 0.5148
  3. Resume 11: Similarity = 0.5037

Job 3:
  1. Resume 94: Similarity = 0.4226
  2. Resume 12: Similarity = 0.4106
  3. Resume 78: Similarity = 0.4067

Job 4:
  1. Resume 43: Similarity = 0.6191
  2. Resume 0: Similarity = 0.5943
  3. Resume 3: Similarity = 0.5920

Job 5:
  1. Resume 94: Similarity = 0.4934
  2. Resume 78: Similarity = 0.4808
  3. Resume 12: Similarity = 0.4729

Job 6:
  1. Resume 15: Similarity = 0.4338
  2. Resume 64: Similarity = 0.4324
  3. Resume 5: Similarity = 0.4309

Job 7:
  1. Resume 59: Similarity = 0.4645
  2. Resume 3: Similarity = 0.4406
  3. Resume 1: Similarity = 0.4304

Job 8:
  1. Resume 3: Similarity = 0.4977
  2. Resume 1: Similarity = 0.4953
  3

# Matching input examples

In [None]:
def search_jobs_by_resume(resume_text, model, job_embeddings, job_data, 
                          top_k=3, min_similarity=0.0, normalize_scores=True):
    # Preprocess and encode the resume
    processed_resume = preprocess_text(resume_text)
    resume_embedding = model.encode(processed_resume)
    
    # Convert to tensor
    resume_emb_tensor = torch.from_numpy(np.array(resume_embedding)).unsqueeze(0).float()
    job_embs_tensor = torch.from_numpy(np.array(job_embeddings)).float()
    
    # Compute cosine similarity
    cosine_scores = util.pytorch_cos_sim(resume_emb_tensor, job_embs_tensor)[0]
    
    # Get top k matches
    top_k_actual = min(top_k, len(job_embeddings))
    top_results = torch.topk(cosine_scores, k=top_k_actual)
    
    # Filter and format results
    matches = []
    for score, job_idx in zip(top_results.values, top_results.indices):
        score_val = score.item()
        if score_val >= min_similarity:
            if normalize_scores:
                score_normalized = score_val * 100
            else:
                score_normalized = score_val
            
            job_idx_int = job_idx.item()
            job_row = job_data.iloc[job_idx_int]
            
            matches.append({
                'job_index': job_idx_int,
                'similarity_score': score_normalized,
                'match_quality': 'Excellent' if score_normalized >= 80 else 'Very Good' if score_normalized >= 75 else 'Good' if score_normalized >= 65 else 'Fair',
                'job_title': job_row.get('Job Title', 'N/A') if 'Job Title' in job_row.index else 'N/A',
                'company': job_row.get('Company', 'N/A') if 'Company' in job_row.index else 'N/A',
                'job_description_preview': str(job_row.get('Job Description', 'N/A'))[:100] if 'Job Description' in job_row.index else 'N/A',
                'min_salary': job_row.get('MIN Salary', 'N/A') if 'MIN Salary' in job_row.index else 'N/A',
                'max_salary': job_row.get('MAX Salary', 'N/A') if 'MAX Salary' in job_row.index else 'N/A',
                'min_experience': job_row.get('MIN Experience', 'N/A') if 'MIN Experience' in job_row.index else 'N/A',
                'max_experience': job_row.get('MAX Experience', 'N/A') if 'MAX Experience' in job_row.index else 'N/A',
                'work_type': job_row.get('Work Type', 'N/A') if 'Work Type' in job_row.index else 'N/A',
                'qualifications': job_row.get('Qualifications', 'N/A') if 'Qualifications' in job_row.index else 'N/A',
                'company profile': job_row.get('Company Profile', 'N/A') if 'Company Profile' in job_row.index else 'N/A'
            })
    
    return {
        'total_matches': len(matches),
        'best_match_score': matches[0]['similarity_score'] if matches else 0,
        'top_matches': matches
    }

# Example usage: Search for jobs matching a single resume
print("=" * 80)
print("SEARCH JOBS FOR A SINGLE RESUME")
print("=" * 80)

# Get a sample resume
sample_resume = read_file("example_resume.txt")

# Search for matching jobs
results = search_jobs_by_resume(
    resume_text=sample_resume,
    model=model,
    job_embeddings=job_embeddings,
    job_data=JOB_data,
    top_k=3,
    min_similarity=0.4,
    normalize_scores=True
)

print(f"Found {results['total_matches']} relevant jobs (>40% similarity)")
print(f"Best Match Score: {results['best_match_score']:.1f}%\n")

for idx, match in enumerate(results['top_matches'], 1):
    print(f"{idx}. Job {match['job_index']}: {match['similarity_score']:.1f}% ({match['match_quality']})")
    print(f"   Title: {match['job_title']}")
    print(f"   Company: {match['company']}")
    print(f"   Preview: {match['job_description_preview']}")
    print(f"   Salary Range: {match['min_salary']} - {match['max_salary']}")
    print(f"   Experience Required: {match['min_experience']} - {match['max_experience']}")
    print(f"   Work Type: {match['work_type']}")
    print(f"   Qualifications: {match['qualifications']}")
    print(f"   Company Profile: {match['company profile']}")
    print()

SEARCH JOBS FOR A SINGLE RESUME
Found 3 relevant jobs (>40% similarity)
Best Match Score: 51.2%

1. Job 94: 51.2% (Fair)
   Title: Human Resources Manager
   Company: Titan Company
   Preview: Talent Acquisition Managers oversee the recruitment and hiring process. They develop recruitment str
   Salary Range: 58000 - 83000
   Experience Required: 1 - 8
   Work Type: Part-Time
   Qualifications: MCA
   Company Profile: {"Sector":"Consumer Goods","Industry":"Jewelry and Watches","City":"Bengaluru","State":"Karnataka","Zip":"560001","Website":"https://www.titancompany.in/","Ticker":"TITAN","CEO":"C.K. Venkataraman"}

2. Job 75: 45.7% (Fair)
   Title: Technical Writer
   Company: Reynolds American Inc.
   Preview: Documentation Specialists create and manage documents, manuals, or technical materials. They ensure 
   Salary Range: 60000 - 113000
   Experience Required: 1 - 10
   Work Type: Full-Time
   Qualifications: B.Com
   Company Profile: {"Sector":"Tobacco","Industry":"Tobacco","City"

In [None]:
def search_resumes_by_job(job_text, model, resume_embeddings, resume_data, 
                          top_k=3, min_similarity=0.0, normalize_scores=True):
    # Preprocess and encode the job description
    processed_job = preprocess_text(job_text)
    job_embedding = model.encode(processed_job)
    
    # Convert to tensor
    job_emb_tensor = torch.from_numpy(np.array(job_embedding)).unsqueeze(0).float()
    resume_embs_tensor = torch.from_numpy(np.array(resume_embeddings)).float()
    
    # Compute cosine similarity
    cosine_scores = util.pytorch_cos_sim(job_emb_tensor, resume_embs_tensor)[0]
    
    # Get top k matches
    top_k_actual = min(top_k, len(resume_embeddings))
    top_results = torch.topk(cosine_scores, k=top_k_actual)
    
    # Filter and format results
    matches = []
    for score, resume_idx in zip(top_results.values, top_results.indices):
        score_val = score.item()
        if score_val >= min_similarity:
            if normalize_scores:
                score_normalized = score_val * 100
            else:
                score_normalized = score_val
            
            resume_idx_int = resume_idx.item()
            resume_row = resume_data.iloc[resume_idx_int]
            
            matches.append({
                'resume_index': resume_idx_int,
                'similarity_score': score_normalized,
                'match_quality': 'Excellent' if score_normalized >= 80 else 'Very Good' if score_normalized >= 75 else 'Good' if score_normalized >= 65 else 'Fair',
                'category': resume_row.get('Category', 'N/A') if 'Category' in resume_row.index else 'N/A',
                'resume_preview': str(resume_row.get('Resume_str', 'N/A'))[:150] if 'Resume_str' in resume_row.index else 'N/A'
            })
    
    return {
        'total_matches': len(matches),
        'best_match_score': matches[0]['similarity_score'] if matches else 0,
        'top_matches': matches
    }

# Example usage: Search for resumes matching a single job
print("=" * 80)
print("SEARCH RESUMES FOR A SINGLE JOB")
print("=" * 80)

# Get a sample job description
sample_job = read_file("example_job.txt")

# Search for matching resumes
job_results = search_resumes_by_job(
    job_text=sample_job,
    model=model,
    resume_embeddings=resume_embeddings,
    resume_data=Resume_data,
    top_k=3,
    min_similarity=0.0,  # No filter - show all top matches
    normalize_scores=True
)

print(f"\nFound {job_results['total_matches']} relevant resumes")
print(f"Best Match Score: {job_results['best_match_score']:.1f}%\n")

for idx, match in enumerate(job_results['top_matches'], 1):
    print(f"{idx}. Resume {match['resume_index']}: {match['similarity_score']:.1f}% ({match['match_quality']})")
    print(f"   Category: {match['category']}")
    print(f"   Preview: {match['resume_preview']}")
    print()

SEARCH RESUMES FOR A SINGLE JOB

Found 3 relevant resumes
Best Match Score: 89.6%

1. Resume 81: 89.6% (Excellent)
   Category: HR
   Preview:          FIELD HR ASSOCIATE           Summary    Reliable HR Field Associate with a Master's of science in Human Resource management emphasis as a Gen

2. Resume 10: 88.9% (Excellent)
   Category: HR
   Preview:          HR ASSISTANT       Summary    Highly motivated, and a dynamic Human Resources professional with diverse credentials seeking a position with a

3. Resume 35: 85.3% (Excellent)
   Category: HR
   Preview:          HR MANAGER         Summary     Human Resources Professional with practical understanding of business needs. Areas of expertise include confli



In [138]:
def match_job_with_resume(job_text, resume_text, model, normalize_scores=True):
    # Preprocess both texts
    processed_job = preprocess_text(job_text)
    processed_resume = preprocess_text(resume_text)
    
    # Encode both texts
    job_embedding = model.encode(processed_job)
    resume_embedding = model.encode(processed_resume)
    
    # Convert to tensors
    job_emb_tensor = torch.from_numpy(np.array(job_embedding)).unsqueeze(0).float()
    resume_emb_tensor = torch.from_numpy(np.array(resume_embedding)).unsqueeze(0).float()
    
    # Compute cosine similarity
    cosine_score = util.pytorch_cos_sim(job_emb_tensor, resume_emb_tensor)[0][0].item()
    
    if normalize_scores:
        similarity_score = cosine_score * 100
    else:
        similarity_score = cosine_score
    
    # Determine match quality
    if normalize_scores:
        match_quality = 'Excellent' if similarity_score >= 80 else 'Very Good' if similarity_score >= 75 else 'Good' if similarity_score >= 65 else 'Fair' if similarity_score >= 50 else 'Poor'
    else:
        match_quality = 'Excellent' if similarity_score >= 0.8 else 'Very Good' if similarity_score >= 0.75 else 'Good' if similarity_score >= 0.65 else 'Fair' if similarity_score >= 0.5 else 'Poor'
    
    return {
        'similarity_score': similarity_score,
        'match_quality': match_quality,
        'match_percentage': f"{similarity_score:.1f}%" if normalize_scores else f"{similarity_score:.4f}"
    }

# Example usage: Match a specific job with a specific resume
print("=" * 80)
print("MATCH INPUT JOB WITH INPUT RESUME")
print("=" * 80)

# Get sample job and resume

sample_job = read_file("example_job.txt")
sample_resume = read_file("example_resume.txt")
    
print(f"\nJob Description Preview:\n{sample_job[:150]}...\n")
print(f"Resume Preview:\n\n{sample_resume[:150]}...\n")
    
# Calculate match score
match_result = match_job_with_resume(
        job_text=sample_job,
        resume_text=sample_resume,
        model=model,
        normalize_scores=True
    )
    
print("=" * 80)
print("MATCH RESULT")
print("=" * 80)
print(f"Similarity Score: {match_result['match_percentage']}")
print(f"Match Quality: {match_result['match_quality']}")
print(f"Raw Score: {match_result['similarity_score']:.4f}")
print("=" * 80)
    

MATCH INPUT JOB WITH INPUT RESUME

Job Description Preview:
Job Title: HR Administrator

Location: [City, Country or “Hybrid/Remote”]
Employment Type: Full-time
Department: Human Resources
Reports To: HR Manage...

Resume Preview:

Hayden Smith
214 Mitre Avenue, Park Hill, 3045 | 04501 123 456 | haydensmith@email.com

(Tip: Name, mobile number and email address are essential. Cur...

MATCH RESULT
Similarity Score: 65.3%
Match Quality: Good
Raw Score: 65.2736


# Resume Screening Using NLP - Notebook Summary

## Project Overview
This notebook implements an **NLP-based Resume Screening System** using semantic similarity matching. It leverages transformer models and cosine similarity to automatically match resumes with job descriptions.

---

## 1. Data Loading & Preparation

### Datasets Used:
- **Resume Dataset**: `snehaanbhawal/resume-dataset` from Kaggle
- **Job Dataset**: `ravindrasinghrana/job-description-dataset` from Kaggle

### Data Processing:
- Loaded both datasets using `kagglehub`
- Removed unnecessary columns (location, salary range in raw format, etc.)
- Parsed salary ranges and experience requirements
- Generated data profiling reports (HTML format)

---

## 2. Text Preprocessing

### Preprocessing Pipeline:
The `preprocess_text()` function applies the following transformations:

1. **Text Normalization**: Convert to lowercase
2. **Contraction Expansion**: Fix contractions (e.g., "don't" → "do not")
3. **Special Character Removal**: Remove HTML tags, punctuation
4. **Number Conversion**: Convert numbers to words
5. **Tokenization**: Break text into word tokens using NLTK
6. **Stopword Removal**: Remove common English words
7. **POS Tagging**: Identify part-of-speech tags
8. **Lemmatization**: Reduce words to base form based on POS tag

**Example**: "Looking for 5+ years of Python experience" → "look five plus year python experience"

---

## 3. Embedding Generation

### Model: `all-mpnet-base-v2`
- **Library**: Sentence Transformers
- **Dimensions**: 768D embeddings
- **Advantage**: Better semantic understanding than lighter models

### Embedding Process:
```
Preprocessed Text → SentenceTransformer → 768D Vector
```

- Generated embeddings for first 100 job descriptions
- Generated embeddings for first 100 resumes

---

## 4. Core Matching Functions

### Function 1: `search_resumes_by_similarity()`
**Purpose**: Find best job matches for each resume

**Parameters**:
- `resume_embeddings`: List of resume vectors
- `job_embeddings`: List of job vectors
- `top_k`: Number of top matches (default: 5)
- `similarity_threshold`: Minimum similarity score

**Output**: For each resume, returns top K job matches with cosine similarity scores

---

### Function 2: `search_jobs_by_similarity()`
**Purpose**: Find best resume matches for each job

**Parameters**:
- `job_embeddings`: List of job vectors
- `resume_embeddings`: List of resume vectors
- `top_k`: Number of top matches (default: 5)
- `similarity_threshold`: Minimum similarity score

**Output**: For each job, returns top K resume matches with scores

---

### Function 3: `search_jobs_by_resume()`
**Purpose**: Find best jobs for a specific input resume text

**Parameters**:
- `resume_text`: Raw resume text (from file or string)
- `model`: SentenceTransformer model instance
- `job_embeddings`: Pre-computed job embeddings
- `job_data`: Job DataFrame with metadata
- `top_k`: Number of top matches (default: 3)
- `min_similarity`: Minimum similarity threshold (default: 0.0)
- `normalize_scores`: Convert to 0-100 scale (default: True)

**Returns**:
- Total matches found
- Best match score
- List of matches with:
  - Job index & similarity score
  - Job title, company, description preview
  - Salary range, experience required
  - Work type, qualifications, company profile

**Use Case**: Upload a resume → Get top relevant job recommendations

---

### Function 4: `search_resumes_by_job()`
**Purpose**: Find best resumes for a specific input job description

**Parameters**:
- `job_text`: Raw job description text
- `model`: SentenceTransformer model instance
- `resume_embeddings`: Pre-computed resume embeddings
- `resume_data`: Resume DataFrame with metadata
- `top_k`: Number of top matches (default: 3)
- `min_similarity`: Minimum similarity threshold (default: 0.0)
- `normalize_scores`: Convert to 0-100 scale (default: True)

**Returns**:
- Total matches found
- Best match score
- List of matches with:
  - Resume index & similarity score
  - Resume category
  - Resume text preview

**Use Case**: Post a job → Get best matching candidates

---

### Function 5: `match_job_with_resume()`
**Purpose**: Direct one-to-one matching between a specific job and resume

**Parameters**:
- `job_text`: Job description text
- `resume_text`: Resume text
- `model`: SentenceTransformer model instance
- `normalize_scores`: Convert to 0-100 scale (default: True)

**Returns**:
- Similarity score (0-100%)
- Match quality (Excellent/Very Good/Good/Fair/Poor)
- Raw score (0-1)

**Use Case**: Quick compatibility check for a single job-resume pair

---

## 5. Similarity Score Interpretation

### Match Quality Thresholds (Normalized):
| Score Range | Quality | Interpretation |
|---|---|---|
| 80-100% | Excellent | Highly relevant match |
| 75-79% | Very Good | Strong match |
| 65-74% | Good | Relevant match |
| 50-64% | Fair | Somewhat relevant |
| 0-49% | Poor | Not relevant |

### Raw Scores (0-1 scale):
| Range | Meaning |
|---|---|
| 0.7-1.0 | Excellent match |
| 0.5-0.7 | Good match |
| 0.3-0.5 | Moderate match |
| 0.0-0.3 | Poor match |

---

## 6. How It Works - Step by Step

### Example Workflow:

1. **User uploads resume** → `search_jobs_by_resume(resume_text, ...)`
2. **Preprocess** → Remove noise, lemmatize, standardize
3. **Encode** → Convert to 768D vector using SentenceTransformer
4. **Compare** → Calculate cosine similarity with all job embeddings
5. **Rank** → Get top 3-10 matches
6. **Return** → Display jobs with scores and metadata

### Cosine Similarity Calculation:
```
similarity = dot_product(resume_vec, job_vec) / (||resume_vec|| × ||job_vec||)
```
Result: 0 (completely different) to 1 (identical semantic meaning)

---
