# Evaluation

Let's compare my workflow against a baseline: feeding the entire job description and the entire resume (without parsing) to an LLM and asking it to predict a score.

In [7]:
import json
import sys
import os
from pprint import pprint
from pydantic import BaseModel, RootModel, Field
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from resume_scanner.utils.with_structured_output import with_structured_output

In [3]:
def evaluate_score(predicted_score: int, gt_score: int) -> bool:
    """Report a prediction next to its ground truth and say whether they match.

    Args:
        predicted_score: Score produced by the model.
        gt_score: Ground-truth score for the same sample.

    Returns:
        True when the predicted score equals the ground-truth score.
    """
    report = f"Ground truth: {gt_score}\tPredicted: {predicted_score}"
    print(report)
    is_exact_match = predicted_score == gt_score
    return is_exact_match

In [21]:
# Prompt template for the final scoring step. It is rendered with
# str.format() and expects two placeholders: `resume_info` and `job_info`.
# Literal braces in the example output are doubled so that format() leaves
# them intact. "Reasoning" is shown as a JSON object (not an array) so the
# example agrees with the ResumeEvaluation / Reasoning schema.
FINAL_SCORING_TEMPLATE = """
You are an expert resume evaluator. Given a parsed resume and a parsed job description (both as JSON strings), evaluate how well the resume aligns with and exceeds the job requirements. Assume all candidates meet the minimum qualifications. Focus on identifying standout qualities and experiences that set candidates apart.

#### Evaluation Criteria
Evaluate based on the following dimensions with stricter standards for higher scores:

1. **Skills**  
   - Check for strong alignment between the resume's skills and the job description's "Required Skills" and "Preferred Skills." Skill alignment is **very important.**
   - While general skills like Python, C++, Java, etc. are a must, you must also assess whether the candidate possesses more domain-specific skills that align with the all of the job requirements.
   - If the resume primarily contains domain-specific skills that are not relevant to the job, the resume **CANNOT** receive a high score for skill alignment.
   - In-context skill usage is favored over listing a skill in isolation. For example, "Implemented data migration pipeline in Python" is much stronger than "Python" by itself.

2. **Experience**  
   - Award higher scores for directly relevant professional, internship, or project experiences. Evaluate depth, relevance, and outcomes achieved.  
   - Measurable outcomes, leadership roles, or significant contributions (e.g., "Improved system efficiency by 25%" or "Led a team of 5 engineers to develop a critical feature") should be weighted more heavily than vague descriptions.  
   - Deduct points for generic or irrelevant experience, even if impressive in other contexts.

3. **Education**
   - Award more points for candidates whose education matches any preferred education requirements
   - If a GPA is listed, high GPAs (>= 3.8) should be given more weight

---

#### Scoring Guidelines
Assign a score between 1 and 5 using the following scale:  
- **1 (Meets minimum requirements)** – Fulfills basic qualifications but lacks distinguishing factors.  
- **2 (Slightly exceeds requirements)** – Shows some additional strengths but is not particularly notable.  
- **3 (Clearly exceeds requirements)** – Demonstrates clear strengths in one or more dimensions with solid alignment to the role.
- **4 (Greatly exceeds requirements)** – Candidate is exceptional in key areas, exceeding most expectations with significant value-add potential.  
- **5 (Outstanding candidate)** – Candidate far surpasses expectations with extraordinary qualifications and achievements across multiple dimensions.

---

### Output Format
```json
{{
    "Reasoning": {{
        "Skills": "<brief justification>",
        "Experience": "<brief justification>",
        "Education": "<brief justification>"
    }},
    "Overall Assessment": "<brief summary of why this candidate may be a strong/weak fit; if the score is less than 4, this section must include the sentence \"The candidate did not receive a <score> because ...\">",
    "Score": <integer from 1 to 5>
}}
```

### Notes for Scoring
- Reserve a score of 5 for resumes that demonstrate exceptional alignment with the job description, clear differentiation from typical candidates, and significant potential value to the role or organization.
- **Only candidates who receive a 4 or 5 will be pursued for the next stage of the hiring process.**
- Ensure all reasoning and feedback clearly justify the assigned score.

---

### Input

Resume:
{resume_info}

Job Description:
{job_info}

Output:
"""

In [9]:
# Per-dimension justifications expected from the scoring model.
# Documented with comments rather than a docstring so that the JSON schema
# generated from this model stays exactly as before.
class Reasoning(BaseModel):
    # Aliases mirror the capitalised keys shown in the prompt's output format.
    # All three fields are required (no default is supplied).
    skills: str = Field(alias="Skills")
    experience: str = Field(alias="Experience")
    education: str = Field(alias="Education")

# Full structured evaluation expected from the scoring model.
# Documented with comments rather than a docstring so that the JSON schema
# generated from this model stays exactly as before.
class ResumeEvaluation(BaseModel):
    # Nested per-dimension justifications.
    reasoning: Reasoning = Field(alias="Reasoning")
    # Free-text summary; the alias contains a space to match the prompt's key.
    overall_assessment: str = Field(alias="Overall Assessment")
    # Integer score; the prompt asks for a value from 1 to 5 (not enforced here).
    score: int = Field(alias="Score")

In [11]:
# Load the evaluation samples. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's locale default.
with open("../data/input/dataset.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)

In [7]:
# Number of samples in the evaluation dataset.
len(dataset)

9

In [14]:
# Load the previously parsed resume and job description. The encoding is pinned
# to UTF-8 so that decoding does not depend on the platform's locale default.
with open("../data/output/parsed_resume_info.json", "r", encoding="utf-8") as file:
    resume = json.load(file)

with open("../data/output/parsed_job_desc.json", "r", encoding="utf-8") as file:
    job_desc = json.load(file)

## Baseline

In [22]:
def predict(resume: dict, job_desc: dict):
    """Score one resume against one job description and print the result.

    The scoring prompt is rendered from FINAL_SCORING_TEMPLATE and passed to
    with_structured_output together with the ResumeEvaluation schema. Nothing
    is returned; the inputs and the evaluation are pretty-printed.

    Args:
        resume: Resume content inserted into the prompt.
        job_desc: Job description content inserted into the prompt.
    """
    scoring_prompt = FINAL_SCORING_TEMPLATE.format(resume_info=resume, job_info=job_desc)
    evaluation = with_structured_output(prompt=scoring_prompt, schema=ResumeEvaluation)

    # Same three sections, in the same order: job description, resume, evaluation.
    sections = (
        ("\nJob Description:", job_desc),
        ("\nResume:", resume),
        ("\nEvaluation:", evaluation),
    )
    for heading, payload in sections:
        print(heading)
        pprint(payload)

In [12]:
# Score the single parsed resume / job description pair.
# NOTE: `resume` and `job_desc` must already be loaded in the session; the
# recorded NameError below comes from running this cell before they were defined.
predict(resume, job_desc)

NameError: name 'resume' is not defined

In [13]:
def predict(sample: dict):
    """Score one dataset sample and print the inputs and the evaluation.

    Args:
        sample: Mapping with at least the keys "job_description" and "resume".
    """
    job_desc, resume = sample["job_description"], sample["resume"]

    scoring_prompt = FINAL_SCORING_TEMPLATE.format(resume_info=resume, job_info=job_desc)
    evaluation = with_structured_output(prompt=scoring_prompt, schema=ResumeEvaluation)

    # Print each section heading followed by its pretty-printed payload.
    for heading, payload in (
        ("\nJob Description:", job_desc),
        ("\nResume:", resume),
        ("\nEvaluation:", evaluation),
    ):
        print(heading)
        pprint(payload)

In [16]:
# Score every sample in the dataset using a pool of worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Executor.map re-raises a worker's exception only when its result is
    # retrieved, so drain the iterator; otherwise a failing sample is
    # silently dropped.
    list(executor.map(predict, dataset))


Job Description:
('Marketing Content Manager\n'
 '\n'
 "We're looking for a Marketing Content Manager to lead our content strategy "
 'and create compelling marketing materials across multiple channels.\n'
 '\n'
 'Required Skills:\n'
 '- 5+ years of content marketing experience\n'
 '- Excellent writing and editing skills\n'
 '- Experience managing content calendars\n'
 '- SEO and content optimization expertise\n'
 '- Project management experience\n'
 '\n'
 'Preferred Qualifications:\n'
 '- Experience with video content production\n'
 '- Knowledge of Adobe Creative Suite\n'
 '- Social media marketing experience\n'
 '- Marketing automation tool expertise\n'
 '\n'
 'Education:\n'
 "- Bachelor's degree in Marketing, Communications, or related field\n"
 '\n'
 'Responsibilities:\n'
 '- Develop and execute content strategy\n'
 '- Create and edit marketing content\n'
 '- Manage editorial calendar\n'
 '- Oversee content team and freelancers\n'
 '- Track and report content performance')

Resume

## GPT 4o

In [18]:
from openai import OpenAI

In [19]:
# Response schema with a single field: a list of strings.
# Its JSON schema is passed as the `response_format` of the sample request.
class Output(BaseModel):
    output: list[str]

In [20]:
# Client constructed with no explicit arguments.
client = OpenAI()

# Sample request: ask for a story and constrain the reply to the JSON schema
# generated from the Output model.
response = client.chat.completions.create(
    model="gpt-4o-2024-11-20",
    messages=[{"role": "user", "content": "Tell me a story in JSON list format"}],
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "ListOfStrings", "schema": Output.model_json_schema()},
    },
)

In [84]:
def predict_gpt(resume: dict, job_desc: dict):
    """Score a resume against a job description with GPT-4o and print the result.

    The scoring prompt is sent as a single user message, and the reply is
    constrained to the JSON schema generated from ResumeEvaluation. Nothing is
    returned; the inputs and the decoded evaluation are pretty-printed.

    Args:
        resume: Resume content inserted into the prompt.
        job_desc: Job description content inserted into the prompt.
    """
    scoring_prompt = FINAL_SCORING_TEMPLATE.format(resume_info=resume, job_info=job_desc)
    schema_spec = {"name": "ResumeScore", "schema": ResumeEvaluation.model_json_schema()}

    response = client.chat.completions.create(
        model="gpt-4o-2024-11-20",
        messages=[{"role": "user", "content": scoring_prompt}],
        response_format={"type": "json_schema", "json_schema": schema_spec},
    )

    print("\nJob Description:")
    pprint(job_desc)
    print("\nResume:")
    pprint(resume)
    print("\nEvaluation:")
    # The message content is a JSON string; decode it before pretty-printing.
    raw_reply = response.choices[0].message.content
    pprint(json.loads(raw_reply))

In [17]:
def predict_gpt(sample: dict) -> int:
    """Score one dataset sample with GPT-4o and return the predicted score.

    The scoring prompt is sent as a single user message, and the reply is
    constrained to the JSON schema generated from ResumeEvaluation. The inputs
    and the decoded evaluation are pretty-printed, then the prediction is
    reported against the sample's ground-truth score via evaluate_score.

    Args:
        sample: Mapping with the keys "job_description", "resume" and "score"
            (the ground-truth score).

    Returns:
        The value of the "Score" key in the model's JSON reply.

    Raises:
        KeyError: If the sample lacks a required key or the reply has no "Score".
    """
    job_desc = sample["job_description"]
    resume = sample["resume"]
    gt_score = sample["score"]

    response = client.chat.completions.create(
        model="gpt-4o-2024-11-20",
        messages=[
            {
                "role": "user",
                "content": FINAL_SCORING_TEMPLATE.format(resume_info=resume, job_info=job_desc)
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "ResumeScore",
                "schema": ResumeEvaluation.model_json_schema()
            }
        }
    )

    print("\nJob Description:")
    pprint(job_desc)
    print("\nResume:")
    pprint(resume)
    print("\nEvaluation:")
    evaluation = json.loads(response.choices[0].message.content)
    pprint(evaluation)

    # Previously the function fell off the end and returned None despite the
    # `-> int` annotation, and the ground-truth score was read but never used.
    predicted_score = evaluation["Score"]
    evaluate_score(predicted_score, gt_score)
    return predicted_score

In [23]:
# Score every sample in the dataset with GPT-4o using a pool of worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Executor.map re-raises a worker's exception only when its result is
    # retrieved, so drain the iterator; otherwise a failing request is
    # silently dropped.
    list(executor.map(predict_gpt, dataset))


Job Description:
('Systems Administrator\n'
 '\n'
 'We are seeking a Systems Administrator to maintain and optimize our IT '
 'infrastructure and systems.\n'
 '\n'
 'Required Skills:\n'
 '- 4+ years of systems administration experience\n'
 '- Strong knowledge of Windows Server and Linux\n'
 '- Experience with virtualization (VMware, Hyper-V)\n'
 '- Network administration skills\n'
 '- Security and compliance knowledge\n'
 '\n'
 'Preferred Qualifications:\n'
 '- Cloud platform experience (AWS/Azure)\n'
 '- PowerShell scripting expertise\n'
 '- Experience with backup and DR solutions\n'
 '- ITIL certification\n'
 '\n'
 'Education:\n'
 "- Bachelor's degree in IT, Computer Science, or related field\n"
 '\n'
 'Responsibilities:\n'
 '- Maintain server infrastructure and systems\n'
 '- Manage user access and security\n'
 '- Implement backup and recovery procedures\n'
 '- Monitor system performance\n'
 '- Provide technical support when needed')

Resume:
('ROBERT CHEN\n'
 'rchen@email.com | (5

## Parsing -> scoring

In [1]:
import sys
import os
from pprint import pprint

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from resume_scanner.parsing.resume_parsing import parse_resume
from resume_scanner.parsing.job_parsing import parse_job_desc
from resume_scanner.scoring.initial_filtering import filter_min_reqs
from resume_scanner.scoring.final_scoring import score_resume

Fully sequential parsing: 1:25

Parallel parsing: 1:15

In [2]:
# Run the project's resume parser on a single PDF resume.
parsed_resume = parse_resume("../data/input/resumes/Kevin_resume.pdf")

In [3]:
# Read the job description as raw text (it is not parsed here).
with open("../data/input/jobs/google_swe_senior.txt", "r") as file:
    job_text = file.read()

In [4]:
# Score the parsed resume against the job-description text; as the last
# expression in the cell, its return value is displayed below.
score_resume(parsed_resume, job_text)

{'Reasoning': {'Skills': 'The candidate has a strong background in software development with experience in multiple programming languages. They have also worked on data structures and algorithms, which is a requirement for the position.',
  'Experience': "The candidate's experience in testing, maintaining, or launching software products aligns with the job requirements. However, they lack experience with state-of-the-art GenAI techniques and ML infrastructure, which are preferred qualifications.",
  'Education': "The candidate has a bachelor's degree, which meets the minimum qualification requirement. However, they do not have a master's degree or PhD in Computer Science or related technical field, which is a preferred qualification."},
 'Overall Assessment': 'The candidate has some relevant skills and experience for the position but lacks some of the preferred qualifications. They may be considered for an entry-level position or further training to meet the requirements.',
 'Score': 3