In [12]:
#import packages
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import spacy

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain.prompts import ChatPromptTemplate
import os
from langchain_groq import ChatGroq
import re
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

Helper Functions

In [13]:
def clean_text(text):
    """Flatten *text* into a single line of space-separated tokens.

    Form-feed characters are removed outright (no gap is left behind); every
    other run of whitespace, newlines included, collapses to one space and
    leading/trailing whitespace disappears.
    """
    without_form_feeds = text.replace('\x0c', '')
    tokens = without_form_feeds.split()
    return ' '.join(tokens)

In [14]:
from data_prep import load_data, read_docx
# Job posting identifier; it names both the data folder and the description file.
job_code = 'JPC-55662'
job_desc_path = f'../data/{job_code}/{job_code} Job Description.docx'
# Read the .docx with the data_prep helper, then flatten it to one
# whitespace-normalized line with clean_text.
job_description = clean_text(read_docx(job_desc_path))

In [15]:
# `load_data` and `job_code` are defined in the job-description cell above.
# They are deliberately not re-imported / re-assigned here, so the job code
# lives in exactly one place and the two paths cannot drift apart.
resumes_path = f'../data/{job_code}/resume_data'
resumes = load_data(resumes_path)

In [16]:
# Inspect the loaded resumes: one row per candidate with an `id` and the raw `body` text.
# NOTE(review): the bodies include contact details (emails, phone numbers) —
# clear this cell's output before sharing or committing the notebook.
resumes

Unnamed: 0,id,body
0,Sowjanya,Sowjanya\nEmail ID:- Sowjanyabejawada92@gmail....
1,Abheesta,Abheesta Vemuru\nPhone: (646)-981-3281\nEmail:...
2,Rajya,Rajya Lakshmi Bathula\n316 821 1345\nlakshmira...
3,Uday,Uday Reddy\nPython Developer\nPhone: 732-377-4...
4,Rithvik,Rithvik P\nEmail:rithvikrao222@gmail.com\nPH: ...
5,Rafael,\nRafael Alvarez Gaucin\n(925)-310-8234\nrafae...
6,Bharath,Bharath Kumar P\t\tPython Developer\nBharathku...
7,Krishnaroopa,KRISHNAROOPA\nEmail ID: krishnaroopa773@gmail....
8,Devendra,Devendra\nSr. Python Developer\n\nE-mail: dev...
9,ShanthiRatna,shanthiRatna Siliveri\nPython Developer\n51097...


In [17]:
# Pipelines already built by _load_skill_pipeline, keyed by (model name, patterns path).
_SKILL_PIPELINES = {}


def _load_skill_pipeline(model_name, patterns_path):
    """Return a spaCy pipeline carrying the skill entity ruler, building it on first use."""
    cache_key = (model_name, str(patterns_path))
    nlp = _SKILL_PIPELINES.get(cache_key)
    if nlp is None:
        nlp = spacy.load(model_name)
        # The ruler is inserted ahead of the model's own "ner" component.
        ruler = nlp.add_pipe("entity_ruler", before="ner")
        ruler.from_disk(patterns_path)
        _SKILL_PIPELINES[cache_key] = nlp
    return nlp


def get_skills(resume_txt, patterns_path="jz_skill_patterns.jsonl", model_name="en_core_web_sm"):
    """Extract the distinct skills mentioned in a piece of text.

    The text is run through a spaCy pipeline extended with an entity ruler
    loaded from ``patterns_path``; every entity labelled ``SKILL`` is collected.
    The pipeline is built once per (model, patterns file) pair and reused, so
    calling this for many documents does not reload the model each time.

    Args:
        resume_txt: Text to scan (a resume or a job description).
        patterns_path: Entity-ruler pattern file; resolved relative to the
            current working directory when not absolute.
        model_name: Name of the spaCy model to load.

    Returns:
        A set of lower-cased skill strings; empty when the text is empty or
        ``None`` or contains no ``SKILL`` entity.
    """
    if not resume_txt:
        return set()

    nlp = _load_skill_pipeline(model_name, patterns_path)
    doc = nlp(resume_txt)
    return {ent.text.lower() for ent in doc.ents if ent.label_ == "SKILL"}

In [18]:
# Skills the spaCy entity ruler finds in the job description; they are later
# handed to the LLM prompt as a reference list.
job_skills = get_skills(job_description)
job_skills

{'css',
 'database',
 'design',
 'django',
 'flask',
 'git',
 'hibernate',
 'html',
 'java',
 'javascript',
 'nosql',
 'python',
 'scalability',
 'software',
 'spring',
 'sql'}

In [19]:
# Raw prompt text for the skill-weighting request.
# Placeholders: {job_description} and {job_skills}.
_JOB_SKILL_WEIGHTS_TEMPLATE = """
        I need your help to analyze the following job description and extract the skills mentioned. For each skill, please assign a weightage between 0 and 1, where 1 indicates the skill is most critical for the job, and 0 indicates it is least important.
        Skills should mostly include tech stacks and tools but can also include concepts and procedures that need to be known.
        Here is the job description:
        "{job_description}"

        Here is a list of skills identified by a spaCy model. Use this as a reference. Add to this list or remove unapplicable skills. The final list should not exceed 10 skills:
        {job_skills}

        Skills should not be too specific. If you identify related skills that can be logically combined into a broader category, group them together and provide a single weightage for the broader category. Ensure that the weightages reflect the importance of each skill or category based on the job description. Make sure the skills are commonly found in resumes.

        Please provide the extracted skills along with their corresponding weightages in the following format:
        - Skill: [skill_name], Weightage: [weightage]

        Make sure to rank the skills based on their importance to the job.

        """


def get_job_skill_weights(llm, job_description, job_skills):
    """Ask the LLM for a ranked, weighted skill list for a job description.

    Args:
        llm: Chat model exposing ``invoke(prompt)`` whose result has ``.content``.
        job_description: Job description text inserted into the prompt.
        job_skills: Reference skills (e.g. from ``get_skills``) inserted into the prompt.

    Returns:
        The model's reply text, which is also printed.
    """
    template = ChatPromptTemplate.from_template(_JOB_SKILL_WEIGHTS_TEMPLATE)
    filled_prompt = template.invoke(
        {"job_skills": job_skills, "job_description": job_description}
    )
    answer = llm.invoke(filled_prompt).content
    print(answer)
    return answer
    
def parse_resume_skill_weights(input_string):
    """Parse ``Skill: <name>, Weightage: <number>`` pairs out of an LLM reply.

    Lines that do not follow that shape (explanations, headings, a truncated
    last line) are ignored. Weights may be written as ``0.95``, ``1``, ``0`` or
    ``.5``; square brackets copied from the prompt's format line
    (``Skill: [Python], Weightage: [0.9]``) are tolerated.

    Args:
        input_string: Raw reply text; ``None`` or empty yields an empty frame.

    Returns:
        A DataFrame with columns ``skill`` (str) and ``weightage`` (float),
        one row per match, in order of appearance.
    """
    # The skill name is matched lazily up to the ", Weightage:" on the same
    # line, so names that themselves contain commas ("Databases (SQL, NoSQL)")
    # stay intact. The number no longer requires a decimal point; requiring one
    # silently dropped skills weighted exactly 1 or 0.
    pattern = r"Skill:\s*(.*?),\s*Weightage:\s*\[?\s*(\d+(?:\.\d+)?|\.\d+)"

    rows = []
    for skill, weight in re.findall(pattern, input_string or ""):
        skill = skill.strip()
        # Drop brackets only when they wrap the whole name.
        if skill.startswith('[') and skill.endswith(']'):
            skill = skill[1:-1].strip()
        rows.append((skill, weight))

    df = pd.DataFrame(rows, columns=['skill', 'weightage'])
    # Always float, including for an empty frame or all-integer weights.
    df['weightage'] = df['weightage'].astype(float)

    return df


In [30]:
# Read the Groq API key from the environment instead of hard-coding it;
# this raises KeyError when GROQ_API_KEY is not set.
groq_api_key=os.environ['GROQ_API_KEY']
# Chat model passed to both the skill-weighting and the resume-scoring calls below.
llm=ChatGroq(groq_api_key=groq_api_key,
         model_name="mixtral-8x7b-32768")

In [31]:
# Raw prompt text for scoring one skill against resume excerpts.
# Placeholders: {skill} and {context}.
_SKILL_SCORING_TEMPLATE = """
    You are a job recruiter evaluating a candidate's resume. Based on the resume content provided, 
    please evaluate the candidate's proficiency in {skill} on a scale of 1 to 5, where:
    1 - No proficiency or mention
    2 - Basic understanding 
    3 - Intermediate proficiency 
    4 - Proficient
    5 - Expert-level proficiency

    Do not use information that is not related to {skill}. 
    Resume Content: {context}

    Score for {skill}: 

    Please provide it in the following format:
    score: (integer from 1-5)
    reason: (short explanation for your score)
    """


def prompt_llm(llm, retriever, skill):
    """Have the LLM rate a candidate's proficiency in one skill.

    The retriever is queried with the skill name; the returned chunks'
    ``page_content`` is joined with single spaces and used as the resume
    context in the prompt.

    Args:
        llm: Chat model exposing ``invoke(prompt)`` whose result has ``.content``.
        retriever: Object exposing ``invoke(query)`` that returns documents
            with a ``page_content`` attribute.
        skill: Skill name to evaluate.

    Returns:
        The model's reply text, which is also printed.
    """
    template = ChatPromptTemplate.from_template(_SKILL_SCORING_TEMPLATE)
    resume_chunks = retriever.invoke(skill)
    context = " ".join(chunk.page_content for chunk in resume_chunks)
    filled_prompt = template.invoke({"skill": skill, "context": context})
    answer = llm.invoke(filled_prompt).content
    print(answer)
    return answer

def parse_score_reason(input_string):
    """Extract the numeric score and the explanation from an LLM scoring reply.

    The reply is expected to contain ``score: <int>`` and ``reason: <text>``.
    Labels are matched case-insensitively, because models frequently
    capitalize them (``Score: 4``) even when the prompt asks for lowercase.

    Args:
        input_string: Raw reply text, or ``None``.

    Returns:
        ``(score, reason)``. ``score`` is an ``int`` and ``reason`` a stripped
        string containing everything after the first ``reason:`` label (it may
        span several lines). Either element is ``None`` when its label is
        missing; both are ``None`` for ``None`` input.
    """
    if input_string is None:
        return None, None

    score_match = re.search(r'score:\s*(\d+)', input_string, re.IGNORECASE)
    # DOTALL lets the explanation run across line breaks to the end of the reply.
    reason_match = re.search(r'reason:\s*(.*)', input_string, re.DOTALL | re.IGNORECASE)

    score = int(score_match.group(1)) if score_match else None
    reason = reason_match.group(1).strip() if reason_match else None

    return score, reason

In [32]:
# Ask the LLM for the weighted skill list; the raw reply is printed by the helper.
temp_string = get_job_skill_weights(llm, job_description, job_skills)
# Turn the "Skill: ..., Weightage: ..." lines of the reply into a DataFrame
# with `skill` and `weightage` columns.
skill_df = parse_resume_skill_weights(temp_string)

Based on the job description and the list of skills provided, here are the top 10 skills for the Python Developer with Java position, along with their weightages:

1. Skill: Python, Weightage: 0.95
Develop, test, and deploy high-quality software solutions using Python and Java. Proven experience as a Python Developer with a strong understanding of Python frameworks (e.g., Django, Flask).
2. Skill: Java, Weightage: 0.9
Develop, test, and deploy high-quality software solutions using Python and Java. Solid experience with Java development, including familiarity with Java frameworks (e.g., Spring, Hibernate).
3. Skill: Object-Oriented Programming (OOP), Weightage: 0.85
Strong understanding of object-oriented programming (OOP) principles and design patterns.
4. Skill: RESTful APIs and web services, Weightage: 0.8
Experience with RESTful APIs and web services.
5. Skill: Version Control Systems (Git), Weightage: 0.75
Experience with version control systems (e.g., Git).
6. Skill: Databases (SQ

In [33]:
# The splitter and the embedding model are identical for every resume, so they
# are created once here rather than being re-instantiated on each iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

#loading the embedding model from huggingface
embedding_model_name = "BAAI/bge-small-en-v1.5"
# model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    # model_kwargs=model_kwargs
)

# One weighted total per resume, in the same row order as `resumes`.
resume_scores = []
for i in range(resumes.shape[0]):
    print("----------------", resumes['id'].iloc[i])

    # Index this resume on its own: small overlapping chunks in a per-resume
    # FAISS store, so retrieval for a skill only sees this candidate's text.
    single_document = Document(page_content=clean_text(resumes['body'].iloc[i]))
    docs = text_splitter.split_documents([single_document])
    vectorstore = FAISS.from_documents(docs, embeddings)

    # At most 3 chunks per skill query; the retriever filters by the 0.3
    # score threshold, so a skill may get fewer chunks or none.
    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 3, "score_threshold": 0.3},
    )

    scores = []
    reasons = []

    # Loop through each skill, call the function, and parse the result
    for skill in skill_df['skill']:
        input_string = prompt_llm(llm, retriever, skill)
        score, reason = parse_score_reason(input_string)
        scores.append(score)
        reasons.append(reason)

    # Create a DataFrame. Scores are forced to float so that unparseable
    # replies (None) become NaN; otherwise a resume with no parseable score
    # yields an object column of None and the multiplication below fails.
    data = {
        'skill': list(skill_df['skill']),
        'score': pd.Series(scores, dtype='float64'),
        'reason': reasons,
    }
    df = pd.DataFrame(data)
    # Join on the skill name explicitly instead of relying on shared column names.
    merged_df = pd.merge(df, skill_df, how='inner', on='skill')
    merged_df['weighted_Score'] = merged_df['score'] * merged_df['weightage']

    # Calculate the weighted sum (NaN entries are skipped by sum()).
    final_score = merged_df['weighted_Score'].sum()
    resume_scores.append(final_score)

---------------- Sowjanya
score: 4
reason: The candidate has strong experience in developing software using Python, and has mentioned using several Python libraries such as Beautiful Soup, NumPy, and SciPy for analysis and design. They also have experience with Python packages Matplotlib and SciPy. This level of experience and use of libraries and packages suggests a proficient level of Python knowledge.
score: 4
reason: The candidate has demonstrated proficient use of Java in their experience, implementing and monitoring solutions using JDBC and working with Java-based tools and IDEs. They also have experience in developing and designing automation frameworks using Java-related scripting languages like Shell and Bash. However, there is no mention of expert-level proficiency such as leading a team or working on complex Java projects.
score: 4
reason: The candidate has demonstrated proficiency in Object-Oriented Programming (OOP) by having experience in ORM programming for converting da

In [None]:
# Attach each resume's weighted total (resume_scores follows the row order of
# `resumes`) and show the candidates ranked best-first.
resumes['final_score'] = resume_scores
resumes.sort_values(by='final_score', ascending=False)

Unnamed: 0,id,body,final_score
8,Devendra,Devendra\nSr. Python Developer\n\nE-mail: dev...,25.0
6,Bharath,Bharath Kumar P\t\tPython Developer\nBharathku...,24.1
4,Rithvik,Rithvik P\nEmail:rithvikrao222@gmail.com\nPH: ...,23.9
9,ShanthiRatna,shanthiRatna Siliveri\nPython Developer\n51097...,23.7
2,Rajya,Rajya Lakshmi Bathula\n316 821 1345\nlakshmira...,23.5
3,Uday,Uday Reddy\nPython Developer\nPhone: 732-377-4...,23.2
7,Krishnaroopa,KRISHNAROOPA\nEmail ID: krishnaroopa773@gmail....,22.7
0,Sowjanya,Sowjanya\nEmail ID:- Sowjanyabejawada92@gmail....,22.6
1,Abheesta,Abheesta Vemuru\nPhone: (646)-981-3281\nEmail:...,22.6
5,Rafael,\nRafael Alvarez Gaucin\n(925)-310-8234\nrafae...,18.7
