In [139]:
from sentence_transformers import SentenceTransformer

# Embedding model: load the pretrained "all-MiniLM-L6-v2" SentenceTransformer once,
# at notebook level, so that get_embedding() reuses the same instance on every call.
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(text: str):
    """Encode ``text`` with the notebook-level ``model`` and return the result.

    Args:
        text: The raw text (e.g. a resume or a job description) to embed.

    Returns:
        Whatever ``model.encode(text)`` produces for the given text.
    """
    embedding = model.encode(text)
    return embedding

In [140]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#match the numbers
def match_score(resume_emb, job_emb):
    """Return the cosine similarity of two embeddings as a percentage.

    Args:
        resume_emb: Embedding vector of the resume (NumPy array or array-like).
        job_emb: Embedding vector of the job description (same length).

    Returns:
        The cosine similarity multiplied by 100 and rounded to two decimals,
        as a built-in ``float``.
    """
    # cosine_similarity compares rows of 2-D inputs, so present each vector as
    # a single row; np.asarray also lets plain lists be passed in.
    resume_row = np.asarray(resume_emb).reshape(1, -1)
    job_row = np.asarray(job_emb).reshape(1, -1)
    similarity = cosine_similarity(resume_row, job_row)[0][0]
    # Convert the NumPy scalar to a built-in float before rounding: a float32
    # scalar would otherwise leak out (printed as np.float32(...) by NumPy 2
    # and not JSON-serializable).
    return round(float(similarity) * 100, 2)


A simple case is presented below: one resume vs. one job.

In [141]:
#with open("data/resume.txt", "r") as f: #get resumes from inside data folder
#    resume_text = f.read()

#with open("define/jobdesc.txt", "r") as f: #get job description defined inside folder
#    job_text = f.read()
    
    
#Embedding
#resume_emb = get_embedding(resume_text)  
#job_emb = get_embedding(job_text)

#Match the numbers
#score = match_score(resume_emb, job_emb)

#Qualify
#print(f"Resume ↔ Job Match Score: {score}%")


What if there was more than one resume? What if everyone submitted their resumes with their names and the code provided the candidates ordered from best fit to worst fit?

In [142]:
# Get the job description defined inside the "define" folder. The encoding is
# explicit (as for the resumes) so the text is decoded the same way on every
# platform instead of relying on the locale's default encoding.
with open("define/jobdesc.txt", "r", encoding="utf-8") as f:
    job_text = f.read()
print(job_text)

We are looking for a Machine Learning Intern with experience in Python,
data preprocessing, model evaluation, and basic NLP.


In [143]:
from pathlib import Path

# Map each resume's file name to its text. The paths are sorted because the
# order in which glob() yields files depends on the filesystem; sorting keeps
# the dictionary order (and everything printed from it) reproducible.
resumes = {
    resume_path.name: resume_path.read_text(encoding="utf-8")
    for resume_path in sorted(Path("data").glob("*.txt"))
}

# Example
print(resumes)

{'joaquina.txt': 'Skills: Python, R, Machine Learning, Data Analyis, PyTorch, MatplotLib\nExperience: Python developer with experience in ML, including libraries such as PyTorch\n', 'maria.txt': 'Skills: Python, Machine Learning, Data Analysis, Pandas, Scikit-learn\nExperience: Built ML models for regression and classification problems.\n', 'miguel.txt': 'Skills: SQL, Data Analysis, Pandas, Azure\nExperience: Used SQL to analysi big data sets, interested in ML\n', 'paulo.txt': 'Skills: Python, ML, Data Analysis\nExperience: Built ML models \n'}


In [144]:
# The job description is the same for every resume, so it is embedded once and
# the result is kept as a constant.
# If there were multiple job descriptions (or candidates applied to different
# positions), another loop over the jobs could be added for the evaluation.
job_emb = get_embedding(job_text)

In [145]:
def comp_embs(job_emb, res):
    """Embed every resume and rank the resumes by their match with a job.

    Args:
        job_emb: Embedding of the job description, passed to ``match_score``.
        res: Dictionary mapping a resume identifier (e.g. its file name) to
            the resume text.

    Returns:
        A tuple ``(emb, ranked)`` where ``emb`` maps each identifier to its
        embedding and ``ranked`` maps each identifier to its match score,
        ordered from the highest score to the lowest.
    """
    emb = {}
    match = {}
    # Iterate over the `res` argument, not the notebook-level `resumes`
    # variable, so the function scores exactly what the caller passes in.
    for filename, text in res.items():
        emb[filename] = get_embedding(text)
        match[filename] = match_score(emb[filename], job_emb)

    ranked = dict(sorted(match.items(), key=lambda item: item[1], reverse=True))
    return emb, ranked

#this function returns the embeddings as well as the match scores, sorted from the best fit to the worst
        

In [146]:
# Embed all resumes and score them against the job description:
# emb_res holds the embeddings, matches the scores sorted from highest to lowest.
emb_res, matches = comp_embs(job_emb,resumes)

In [147]:
# Show the file name -> score mapping, ordered from the highest score to the lowest.
print(matches)

{'paulo.txt': 62.13, 'joaquina.txt': 55.64, 'maria.txt': 54.9, 'miguel.txt': 47.63}


In [148]:
# Summarize the ranking for any number of resumes; the original indexed
# positions 0-3 directly, which fails with fewer than four resumes and
# silently drops any beyond the fourth.
ranking = list(matches.items())
if not ranking:
    print("No resumes were scored.")
else:
    best_name, best_score = ranking[0]
    runner_ups = [name for name, _ in ranking[1:]]
    summary = f"The resume with the highest score, {best_score}%, was {best_name}"
    if len(runner_ups) == 1:
        summary += f", followed by {runner_ups[0]}"
    elif runner_ups:
        # "a, b and c" style list, matching the original four-resume wording.
        summary += f", followed by {', '.join(runner_ups[:-1])} and {runner_ups[-1]}"
    print(summary + ".")


The resume with the highest score, 62.13%, was paulo.txt, followed by joaquina.txt, maria.txt and miguel.txt.
