In [1]:
#import packages
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import spacy

import warnings
warnings.filterwarnings("ignore")

Helper Functions

In [2]:
def clean_text(text):
    """Flatten ``text`` onto a single line with single-space separation.

    Newlines are turned into spaces, form-feed characters (``\\x0c``) are
    removed outright (so the text on either side of one is joined), and every
    remaining run of whitespace collapses to one space. The result has no
    leading or trailing whitespace.
    """
    without_newlines = text.replace('\n', ' ')
    without_form_feeds = without_newlines.replace('\x0c', '')
    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as a single separator.
    tokens = without_form_feeds.split()
    return ' '.join(tokens)

### Data Retrieval

Job Description

In [3]:
from data_prep import load_data, read_docx
# Job whose description the resume is scored against.
# NOTE(review): the resume cell further down rebinds `job_code` to
# 'JPC-55662' — confirm the description and the resumes are meant to come
# from different jobs.
job_code = 'JPC-52176'
# Path layout: ../data/<job_code>/<job_code> Job Description.docx
job_desc_path = f'../data/{job_code}/{job_code} Job Description.docx'
# read_docx comes from data_prep (presumably returns the document's text);
# clean_text then collapses it to single-spaced, single-line text.
job_description = clean_text(read_docx(job_desc_path))

Resume

In [4]:
from data_prep import load_data, read_docx
# NOTE(review): this rebinds `job_code`; the job-description cell above used
# 'JPC-52176'. Confirm that scoring these resumes against that description is
# intended.
job_code = 'JPC-55662'
# Directory holding the resume files for this job code
resumes_path = f'../data/{job_code}/resume_data'
# load_data comes from data_prep; the displayed result has 'id' and 'body'
# columns, one row per resume.
resumes = load_data(resumes_path)

In [9]:
# NOTE(review): the rendered output of this cell shows candidates' names,
# e-mail addresses and phone numbers — clear or redact it before sharing.
resumes

Unnamed: 0,id,body
0,Sowjanya,Sowjanya\nEmail ID:- Sowjanyabejawada92@gmail....
1,Abheesta,Abheesta Vemuru\nPhone: (646)-981-3281\nEmail:...
2,Rajya,Rajya Lakshmi Bathula\n316 821 1345\nlakshmira...
3,Uday,Uday Reddy\nPython Developer\nPhone: 732-377-4...
4,Rithvik,Rithvik P\nEmail:rithvikrao222@gmail.com\nPH: ...
5,Rafael,\nRafael Alvarez Gaucin\n(925)-310-8234\nrafae...
6,Bharath,Bharath Kumar P\t\tPython Developer\nBharathku...
7,Krishnaroopa,KRISHNAROOPA\nEmail ID: krishnaroopa773@gmail....
8,Devendra,Devendra\nSr. Python Developer\n\nE-mail: dev...
9,ShanthiRatna,shanthiRatna Siliveri\nPython Developer\n51097...


In [None]:
# Positional row index into `resumes` (used with .iloc) selecting the resume
# that the rest of the notebook scores.
resume_id = 0

In [8]:
# Earlier approach, kept for reference only: load a single resume straight
# from a PDF with PyPDFLoader and join its cleaned pages.
# file_path = '../data/JPC-52176/resume_data/Bhavana Karwar Automation Developer.pdf'
# loader = PyPDFLoader(file_path)
# documents = loader.load()
# full_text = " ".join([clean_text(doc.page_content) for doc in documents])
# Current approach: wrap the cleaned 'body' text of the selected resume row
# in a single Document so it can be chunked and embedded below.
single_document = Document(page_content=clean_text(resumes.iloc[resume_id]['body']))

Retrieving Skills from job description

In [88]:
def get_skills(resume_txt, patterns_path="jz_skill_patterns.jsonl", model_name="en_core_web_sm"):
    """Return the set of skills that the entity ruler tags in a text.

    Despite the parameter name, any text can be passed; this notebook calls
    the function with the job description.

    Args:
        resume_txt: Text to scan for skills.
        patterns_path: Entity-ruler pattern file loaded with
            ``ruler.from_disk``. Defaults to the file this notebook has
            always used.
        model_name: Name of the spaCy pipeline to load.

    Returns:
        A ``set`` of lower-cased entity texts whose label is ``"SKILL"``
        (so differently-cased mentions of one skill collapse to one entry).
    """
    # NOTE: the pipeline is loaded and the ruler re-read from disk on every
    # call; acceptable for a handful of calls, costly inside a loop.
    nlp = spacy.load(model_name)

    # Insert the ruler ahead of the statistical NER component.
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.from_disk(patterns_path)

    doc = nlp(resume_txt)
    return {ent.text.lower() for ent in doc.ents if ent.label_ == "SKILL"}

In [89]:
# Skills the spaCy entity ruler finds in the job description; later handed to
# the LLM as a reference list when weighting skills.
job_skills = get_skills(job_description)
job_skills

{'communications',
 'core network',
 'javascript',
 'python',
 'support',
 'testing',
 'wireless'}

### Data Prep

In [90]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

Chunking resume and adding to vectorstore

In [91]:
# Split the resume into overlapping chunks (chunk_size=200, chunk_overlap=50)
# so that retrieval can return short, skill-focused passages.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
# `docs` is the list of chunk Documents derived from the single resume
docs = text_splitter.split_documents([single_document])

In [92]:
#loading the embedding model from huggingface
embedding_model_name = "BAAI/bge-small-en-v1.5"
# model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
  model_name=embedding_model_name,
  # model_kwargs=model_kwargs
)

In [93]:
# Embed the resume chunks and index them in a FAISS vector store
vectorstore = FAISS.from_documents(docs, embeddings)

### Prompting and Retrieval

In [94]:
from langchain.prompts import ChatPromptTemplate
import os
from langchain_groq import ChatGroq
import re
import pandas as pd

In [95]:
def get_job_skill_weights(llm, job_description, job_skills):
    """Ask the LLM for a weighted skill list for a job description.

    Args:
        llm: Chat model exposing ``invoke(prompt)`` whose reply has a
            ``content`` attribute.
        job_description: Job description text inserted into the prompt.
        job_skills: Reference skills (here, the spaCy-derived set) inserted
            into the prompt.

    Returns:
        The raw text of the model's reply. The reply is also printed.
    """
    instructions = """
        I need your help to analyze the following job description and extract the skills mentioned. For each skill, please assign a weightage between 0 and 1, where 1 indicates the skill is most critical for the job, and 0 indicates it is least important.
        Skills should mostly include tech stacks and tools but can also include concepts and procedures that need to be known.
        Here is the job description:
        "{job_description}"

        Here is a list of skills identified by a spaCy model. Use this as a reference. Add to this list or remove unapplicable skills. The final list should not exceed 10 skills:
        {job_skills}

        Skills should not be too specific. If you identify related skills that can be logically combined into a broader category, group them together and provide a single weightage for the broader category. Ensure that the weightages reflect the importance of each skill or category based on the job description. Make sure the skills are commonly found in resumes.

        Please provide the extracted skills along with their corresponding weightages in the following format:
        - Skill: [skill_name], Weightage: [weightage]

        Make sure to rank the skills based on their importance to the job.

        """
    template = ChatPromptTemplate.from_template(instructions)
    filled_prompt = template.invoke(
        {"job_skills": job_skills, "job_description": job_description}
    )
    answer = llm.invoke(filled_prompt).content
    print(answer)
    return answer
    
def parse_resume_skill_weights(input_string):
    """Parse ``Skill: <name>, Weightage: <number>`` entries into a DataFrame.

    In this notebook the input is the LLM reply listing the job's skills and
    their weightages (the function name notwithstanding). Text that does not
    match the pattern, such as the trailing explanation, is ignored.

    Args:
        input_string: Text containing zero or more
            ``Skill: ..., Weightage: ...`` entries.

    Returns:
        A DataFrame with columns ``skill`` (str) and ``weightage`` (numeric),
        one row per entry in order of appearance. Empty if nothing matches.
    """
    # The weightage may be an integer ("1", "0") or a decimal ("0.9"). The
    # prompt allows the full 0..1 range, so a decimal-only pattern would
    # silently drop the most and least important skills.
    pattern = r"Skill:\s*(.*?),\s*Weightage:\s*(\d+(?:\.\d+)?)"

    # Each match is a (skill, weightage) tuple of strings
    matches = re.findall(pattern, input_string)

    df = pd.DataFrame(matches, columns=['skill', 'weightage'])

    # Weightages are captured as text; convert them for arithmetic later on
    df['weightage'] = pd.to_numeric(df['weightage'])

    return df


In [96]:
# Build the Groq chat client before its first use so this cell also works on
# a fresh kernel (Restart & Run All); `os` and `ChatGroq` are imported above.
# The API key is read from the GROQ_API_KEY environment variable.
groq_api_key = os.environ['GROQ_API_KEY']
llm = ChatGroq(groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768")

# Raw LLM reply listing the job's skills and their weightages
temp_string = get_job_skill_weights(llm, job_description, job_skills)

Based on the job description and the list of skills provided, here are the extracted skills with their corresponding weightages:

1. Skill: Python, Weightage: 0.9
2. Skill: JavaScript, Weightage: 0.9
3. Skill: Test Automation Development, Weightage: 0.9
4. Skill: 5G or other wireless technologies, Weightage: 0.8
5. Skill: Network protocols, Weightage: 0.5
6. Skill: Test Script Design and Development, Weightage: 0.7
7. Skill: Manual Testing, Weightage: 0.2
8. Skill: Test Case Analysis, Weightage: 0.6
9. Skill: Test Coverage Determination, Weightage: 0.5
10. Skill: Test Report Preparation, Weightage: 0.5

Explanation:

* Python and JavaScript are essential skills for this job, with a weightage of 0.9 each.
* Test Automation Development is the most critical skill for this position, with a weightage of 0.9.
* Experience with 5G or other wireless technologies is crucial, with a weightage of 0.8.
* Network protocols understanding is a plus, with a weightage of 0.5.
* Test Script Design and D

In [97]:
# Parse the LLM's "Skill: ..., Weightage: ..." lines into a DataFrame with
# columns 'skill' and 'weightage'.
skill_df = parse_resume_skill_weights(temp_string)
skill_df

Unnamed: 0,skill,weightage
0,Python,0.9
1,JavaScript,0.9
2,Test Automation Development,0.9
3,5G or other wireless technologies,0.8
4,Network protocols,0.5
5,Test Script Design and Development,0.7
6,Manual Testing,0.2
7,Test Case Analysis,0.6
8,Test Coverage Determination,0.5
9,Test Report Preparation,0.5


In [98]:
# Retriever over the resume chunks, configured for similarity-score-threshold
# search with k=3 and score_threshold=0.3.
# prompt_llm reads this module-level `retriever`, so this cell must run
# before any call to it.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.3})

In [99]:
def prompt_llm(llm, skill):
    """Ask the LLM to rate the resume's proficiency in one skill (1-5).

    The resume passages used as context come from the module-level
    ``retriever``, queried with the skill name; their text is joined with
    single spaces.

    Args:
        llm: Chat model exposing ``invoke(prompt)`` whose reply has a
            ``content`` attribute.
        skill: Skill name, used both as the retrieval query and in the prompt.

    Returns:
        The raw text of the model's reply. The reply is also printed.
    """
    rubric = """
    You are a job recruiter evaluating a candidate's resume. Based on the resume content provided, 
    please evaluate the candidate's proficiency in {skill} on a scale of 1 to 5, where:
    1 - No proficiency or mention
    2 - Basic understanding 
    3 - Intermediate proficiency 
    4 - Proficient
    5 - Expert-level proficiency

    Do not use information that is not related to {skill}. 
    Resume Content: {context}

    Score for {skill}: 

    Please provide it in the following format:
    score: (integer from 1-5)
    reason: (short explanation for your score)
    """
    template = ChatPromptTemplate.from_template(rubric)

    # Fetch the resume chunks that best match the skill
    matching_chunks = retriever.invoke(skill)
    context = " ".join(chunk.page_content for chunk in matching_chunks)

    filled_prompt = template.invoke({"skill": skill, "context": context})
    answer = llm.invoke(filled_prompt).content
    print(answer)
    return answer

def parse_score_reason(input_string):
    """Extract the score and reason from an LLM reply.

    The reply is expected to contain ``score: <integer>`` and
    ``reason: <text>``. The labels are matched case-insensitively so that a
    reply written as ``Score: 4`` / ``Reason: ...`` is still parsed.

    Args:
        input_string: Reply text, or ``None``.

    Returns:
        A ``(score, reason)`` tuple. ``score`` is an ``int`` and ``reason`` a
        stripped ``str``; either is ``None`` when its label is not found, and
        both are ``None`` when ``input_string`` is ``None``.
    """
    if input_string is None:
        return None, None

    score_match = re.search(r'score:\s*(\d+)', input_string, re.IGNORECASE)
    # DOTALL lets the reason span several lines, up to the end of the reply
    reason_match = re.search(r'reason:\s*(.*)', input_string, re.DOTALL | re.IGNORECASE)

    score = int(score_match.group(1)) if score_match else None
    reason = reason_match.group(1).strip() if reason_match else None

    return score, reason

In [100]:
# Groq chat client used for all LLM calls. The API key is read from the
# GROQ_API_KEY environment variable (KeyError if it is not set).
# NOTE(review): `llm` is also required by the get_job_skill_weights call
# earlier in the notebook — confirm a definition runs before that cell on a
# fresh kernel.
groq_api_key=os.environ['GROQ_API_KEY']
llm=ChatGroq(groq_api_key=groq_api_key,
         model_name="mixtral-8x7b-32768")

In [101]:
# result_string = prompt_llm(llm, "python")


In [104]:
# Score the resume on every job skill: one LLM call per skill, each reply
# parsed into a (score, reason) pair.
parsed_replies = [
    parse_score_reason(prompt_llm(llm, skill)) for skill in skill_df['skill']
]
scores = [score for score, _ in parsed_replies]
reasons = [reason for _, reason in parsed_replies]

# Collect the per-skill results, row-aligned with skill_df
data = {'skill': list(skill_df['skill']), 'score': scores, 'reason': reasons}
df = pd.DataFrame(data)


score: 4
reason: The candidate has demonstrated proficient use of Python by implementing an embedded system setup and writing scripts to test encoders. Additionally, Python is listed as one of their custom build languages, indicating they have experience using it for a variety of tasks.
score: 3
reason: The candidate has listed JavaScript as one of their programming languages, which indicates an intermediate proficiency. However, there are no specific projects or accomplishments related to JavaScript that would suggest a higher level of expertise.
score: 4
reason: The candidate has extensive experience as a Senior Test Automation Engineer, developing test automation for UI, API services, and database validations using Groovy scripting language or built-in functions of Ready API or other API testing tools. They have worked with architecture teams to identify appropriate test solutions and demonstrated the ability to exercise all aspects of application code. Additionally, they have exper

In [105]:
# Attach each skill's weightage to its score row. 'skill' is the only column
# the two frames have in common, so it is the join key.
merged_df = df.merge(skill_df, on='skill', how='inner')
merged_df

Unnamed: 0,skill,score,reason,weightage
0,Python,4,The candidate has demonstrated proficient use ...,0.9
1,JavaScript,3,The candidate has listed JavaScript as one of ...,0.9
2,Test Automation Development,4,The candidate has extensive experience as a Se...,0.9
3,5G or other wireless technologies,3,The candidate has an intermediate proficiency ...,0.8
4,Network protocols,3,The candidate has demonstrated intermediate pr...,0.5
5,Test Script Design and Development,4,The candidate has extensive experience in test...,0.7
6,Manual Testing,2,The candidate's resume shows a strong focus on...,0.2
7,Test Case Analysis,4,The candidate has extensive experience in soft...,0.6
8,Test Coverage Determination,3,The candidate has experience with various prog...,0.5
9,Test Report Preparation,3,The candidate has experience as a Test Enginee...,0.5


In [107]:
# Per-skill contribution: the LLM's proficiency score scaled by the skill's
# weightage.
merged_df['weighted_Score'] = merged_df['score'].mul(merged_df['weightage'])

# Overall resume score: the sum of the weighted per-skill scores
final_score = merged_df['weighted_Score'].sum()
final_score

22.4