# Evaluation

Let's compare my workflow to a baseline: feeding the entire job description and the entire resume (without parsing) to an LLM and asking it to predict a score.

In [1]:
import json
import sys
import os
from pprint import pprint
from pydantic import BaseModel, RootModel, Field
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from resume_scanner.utils.decode import decode_with_ollama, decode_with_openai
from resume_scanner.parsing.parsing import parse_resume
from resume_scanner.scoring.scoring import score_resume

In [2]:
# Parse the sample resume PDF with the project's parse_resume helper; the
# result is reused by every scoring cell below.
parsed_resume = parse_resume("../data/input/resumes/Kareem_resume.pdf")

In [3]:
# Inspect the parsed structure before feeding it to the scoring prompt.
pprint(parsed_resume)

ResumeInfo(education=Education(root=[School(name='Texas A&M University', majors=['Bachelor of Science in Computer Science'], minors=[], gpa=None, grad_year=2026, honors=['Engineering Academic Excellence Award', "Jeffrey John Becker '82 Scholarship"], coursework=['Software Engineering', 'Data Structures & Algorithms', 'Computer Systems', 'Computer Architecture', 'Discrete Math', 'Linear Algebra', 'Calculus III'])]), experience=Experiences(roles=[Experience(company='Qatar Computing Research Institute', role='Artificial Intelligence Intern', contributions=['Optimized machine learning models (Inception v3, EfficientNet, ResNet) on a dataset of 60,000+ images for DNA Enhancer Sequence Prediction, achieving 98.8% testing accuracy on unseen data.', 'Conducted extensive experiments comparing the accuracy of different models and tuning various hyperparameters to enhance overall model performance.', 'Applied advanced image processing techniques (movement probability matrix, wave transforms) to e

In [4]:
# Load the raw job description text that resumes are scored against.
# No encoding is passed, so the platform default text encoding applies.
with open("../data/input/jobs/spacex_hardware.txt", "r") as file:
    job_desc = file.read()

In [5]:
# Prompt used to score a parsed resume against a job description.
#
# Filled with str.format(); the named placeholders are work_experience,
# education, projects, leadership, research, skills and job_desc. Literal
# braces in the JSON example are doubled ("{{" / "}}") so that str.format()
# emits them as single braces instead of treating them as placeholders.
#
# The JSON example separates every key/value pair with a comma so the model is
# shown a well-formed object layout.
FINAL_SCORING_TEMPLATE = """
### Instruction

You are an expert at evaluating resumes for a job opening. Your goal is to score each resume based on its alignment with the provided job description. Use the scoring criteria below to evaluate each section and calculate an overall score. Provide concise, explainable feedback for each score. 

NOTE: since you are an automated system for screening candidates, you must be brutally honest and cold. If a resume does not closely match a niche skillset required by a job, you MUST score the resume lower.

For example, if a job is looking for hardware experience and a resume only has software experience, the experience section should score at most a 2.

#### Scoring Criteria

For each section, score based on the following criteria:
1. **Relevance (0-5)**: How well does the content align with the job description?
2. **Depth (0-5)**: How substantial and well-developed is the content?
3. **Impact (0-5)**: Does the content demonstrate measurable outcomes or achievements?

If a section is missing, assign a score of 0 for that section.

#### Scoring Rubric

| Score | Meaning |
|-------|---------|
| **0** | **Not Applicable / Missing**: No content provided, or the section is irrelevant to the job. |
| **1** | **Poor**: Content exists but is highly generic, irrelevant, or underdeveloped. Little to no measurable impact is demonstrated. |
| **2** | **Below Average**: Somewhat relevant but lacks depth or specificity. Minimal impact or achievements are demonstrated. |
| **3** | **Average**: Content is moderately relevant, with adequate detail. Some measurable impact or effort is evident, but not exceptional. |
| **4** | **Good**: Content is highly relevant, well-detailed, and demonstrates meaningful contributions or achievements. Could be improved slightly to reach exceptional quality. |
| **5** | **Excellent**: Content is exceptionally relevant, detailed, and impactful, showcasing strong alignment with job requirements and significant measurable outcomes. |

---

### Output Format

```json
{{
   "experience": {{
      "relevance": [Score],
      "depth": [Score],
      "impact": [Score],
      "comment": "[Explanation for score that mentions experience requirements (e.g. "5+ years experience with...") from job description]"
   }},
   "education": {{
      "alignment": [Score],
      "comment": "[Explanation for score that mentions education requirements from job description]"
   }},
   "projects": {{
      "relevance": [Score],
      "depth": [Score],
      "impact": [Score],
      "comment": "[Explanation for score that mentions specific requirements from job description]"
   }},
   "leadership": {{
      "relevance": [Score],
      "depth": [Score],
      "impact": [Score],
      "comment": "[Explanation for score that mentions specific requirements from job description]"
   }},
   "research": {{
      "relevance": [Score],
      "depth": [Score],
      "impact": [Score],
      "comment": "[Explanation for score that mentions specific requirements from job description]"
   }},
   "skills": {{
      "alignment": [Score],
      "comment": "[Explanation for score that mentions specific requirements from job description]"
   }},
   "overall_comment": "[General comments about the candidate's resume, including strengths, weaknesses, and alignment with the job description]"
}}
```

---

### Input

Work Experience:
```{work_experience}```

Education:
```{education}```

Projects:
```{projects}```

Leadership:
```{leadership}```

Research:
```{research}```

Skills:
```{skills}```

Job description:
```{job_desc}```

Output:
"""

In [6]:
class SectionScore(BaseModel):
    # Scores for a resume section that is graded on three separate criteria,
    # plus the model's written justification.
    # The 0-5 range is stated only in the field descriptions; nothing here
    # enforces it.
    relevance: int = Field(description="Relevance score (0-5)")
    depth: int = Field(description="Depth score (0-5)")
    impact: int = Field(description="Impact score (0-5)")
    comment: str = Field(description="Explanation for the scores in this section")
    
class ReducedSectionScore(BaseModel):
    # Score for a resume section that is graded with a single alignment value
    # instead of the relevance/depth/impact triple.
    alignment: int = Field(description="Alignment score (0-5)")
    comment: str = Field(description="Explanation for the score in this section")

class ResumeEvaluation(BaseModel):
    # Complete evaluation of one resume: one entry per resume section followed
    # by a free-text summary. Education and skills carry a single alignment
    # score; the remaining sections carry relevance/depth/impact.
    experience: SectionScore
    education: ReducedSectionScore
    projects: SectionScore
    leadership: SectionScore
    research: SectionScore
    skills: ReducedSectionScore
    overall_comment: str = Field(
        description="General comments about the resume, including strengths and weaknesses"
    )

In [10]:
from openai import OpenAI

client = OpenAI()

def score_resume_gpt(resume: "ResumeInfo", job_desc: str) -> "ResumeEvaluation":
    """Score a parsed resume against a job description with GPT-4o.

    Args:
        resume: Parsed resume object exposing ``experience``, ``education``,
            ``projects``, ``leadership``, ``research`` and ``skills``
            attributes (the notebook passes the ``ResumeInfo`` returned by
            ``parse_resume``). It is read by attribute, so a plain ``dict``
            does not work.
        job_desc: Full text of the job description.

    Returns:
        The ``ResumeEvaluation`` parsed from the model's structured output.

    Raises:
        ValueError: If the response carries no parsed evaluation (for example
            when the model refuses), instead of silently returning ``None``.
    """
    # Placeholder shown to the model for any resume section that is empty/falsy.
    DEFAULT = "Unavailable."

    prompt = FINAL_SCORING_TEMPLATE.format(
        job_desc=job_desc,
        work_experience=resume.experience or DEFAULT,
        education=resume.education or DEFAULT,
        projects=resume.projects or DEFAULT,
        leadership=resume.leadership or DEFAULT,
        research=resume.research or DEFAULT,
        skills=resume.skills or DEFAULT,
    )

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-11-20",
        messages=[{"role": "user", "content": prompt}],
        response_format=ResumeEvaluation,
    )

    message = response.choices[0].message
    if message.parsed is None:
        # Fail here with the reason rather than letting a later
        # `.model_dump()` blow up on None.
        raise ValueError(
            f"Model returned no parsed resume evaluation (refusal: {message.refusal!r})"
        )
    return message.parsed

In [3]:
# Placeholder shown to the model for any resume section that is empty/falsy.
DEFAULT = "Unavailable."

# Score the parsed resume through the project's decode_with_openai helper,
# passing ResumeEvaluation as the schema.
# NOTE: `eval` shadows the Python builtin; the name is kept because the
# scoring cells further down read the result from it.
eval = decode_with_openai(
    prompt=FINAL_SCORING_TEMPLATE.format(
        job_desc=job_desc,
        work_experience=parsed_resume.experience or DEFAULT,
        education=parsed_resume.education or DEFAULT,
        projects=parsed_resume.projects or DEFAULT,
        leadership=parsed_resume.leadership or DEFAULT,
        research=parsed_resume.research or DEFAULT,
        skills=parsed_resume.skills or DEFAULT
    ),
    schema=ResumeEvaluation
)

NameError: name 'FINAL_SCORING_TEMPLATE' is not defined

In [409]:
# Score the resume with GPT and keep the evaluation as a plain dict so the
# final-scoring cell can index it by section name.
eval = score_resume_gpt(parsed_resume, job_desc).model_dump()
pprint(eval)

{'education': {'alignment': 3,
               'comment': 'The obtained degree in Computer Science is '
                          'technically relevant, though not directly aligned '
                          'with the preferred engineering disciplines.'},
 'experience': {'comment': 'The candidate has a strong background in software '
                           'development and machine learning, which is less '
                           'aligned with the required thermal systems '
                           'experience.',
                'depth': 3,
                'impact': 3,
                'relevance': 2},
 'leadership': {'comment': 'Leadership experiences indicate organizational '
                           'effectiveness but do not relate closely to thermal '
                           'systems engineering challenges.',
                'depth': 2,
                'impact': 2,
                'relevance': 1},
 'overall_comment': 'The candidate demonstrates considerable technical '

## Assign weights to the sections

Let's ask the LLM to generate a weight for each resume section based on the job description.

In [268]:
# Prompt that asks the model to weight the six resume sections for a given job.
#
# Filled with str.format(); the only placeholder is job_desc. Literal braces in
# the JSON example are doubled so str.format() emits them as single braces.
# The validation instructions consistently refer to "weights" (the quantity
# being summed) rather than "scores".
SECTION_WEIGHT_TEMPLATE = """
### Instruction

You are an expert at evaluating job descriptions. Given a job description, your job is to assign weights to resume sections that will be used to score resumes against the job description.

There are six resume sections:
1. Education
2. Experience
3. Projects
4. Leadership
5. Research
6. Skills

Assign a percentage weight between 0 and 1 that determines how important each resume section is when evaluating a resume's fit to a provided job description.

Notes on weight assignments:
- In general, work experience should be given substantially more weight than other sections.
- For non-research roles, research should be given a weight of 0.
- Senior-level roles should have a project weight of 0.

**ALL WEIGHTS MUST SUM TO 1**

Please think step-by-step and output your reasoning in the "Reasoning" section before assigning weights. **You must explicitly sum up the weights you provided and validate that they add up to 1. If they do not, please re-compute the weights.**

### Output Format

```json
{{
    "reasoning": "<reasoning leading to weight assignments>",
    "validation": "<step-by-step calculations that ensure weights add up to 1>",
    "education": <weight between 0 and 1>,
    "experience": <weight between 0 and 1>,
    "projects": <weight between 0 and 1>,
    "leadership": <weight between 0 and 1>,
    "research": <weight between 0 and 1>,
    "skills": <weight between 0 and 1>
}}
```

---

### Input

Job Description:
{job_desc}

Output:
"""

In [217]:
class ResumeWeights(BaseModel):
    # Structured-output schema for the section weights requested by
    # SECTION_WEIGHT_TEMPLATE.
    # The two free-text fields are declared before the numeric weights,
    # mirroring the key order of the prompt's output format, which asks for the
    # reasoning before the weights are assigned.
    reasoning: str
    validation: str
    # One weight per resume section; the prompt asks for values between 0 and 1
    # that sum to 1, but nothing in this model enforces either constraint.
    education: float
    experience: float
    projects: float
    leadership: float
    research: float
    skills: float

In [89]:
# Request section weights for this job description through the project's
# decode_with_ollama helper, passing ResumeWeights as the schema.
weights = decode_with_ollama(
    prompt=SECTION_WEIGHT_TEMPLATE.format(job_desc=job_desc),
    schema=ResumeWeights
)

In [161]:
def get_weights_gpt(job_desc: str) -> "ResumeWeights":
    """Ask GPT-4o to weight the resume sections for a job description.

    Args:
        job_desc: Full text of the job description.

    Returns:
        The ``ResumeWeights`` parsed from the model's structured output.

    Raises:
        ValueError: If the response carries no parsed weights (for example
            when the model refuses), instead of silently returning ``None``.
    """
    prompt = SECTION_WEIGHT_TEMPLATE.format(job_desc=job_desc)

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-11-20",
        messages=[{"role": "user", "content": prompt}],
        response_format=ResumeWeights,
    )

    message = response.choices[0].message
    if message.parsed is None:
        # Fail here with the reason rather than letting a later
        # `.model_dump()` blow up on None.
        raise ValueError(
            f"Model returned no parsed section weights (refusal: {message.refusal!r})"
        )
    return message.parsed

In [410]:
# Fetch the GPT-generated section weights and keep them as a plain dict so the
# final-scoring cell can index them by section name.
weights = get_weights_gpt(job_desc).model_dump()
pprint(weights)

{'education': 0.1,
 'experience': 0.5,
 'leadership': 0.1,
 'projects': 0.2,
 'reasoning': 'Based on the Starship Thermal Hardware Engineer job '
              'description, with the primary focus being technical expertise, '
              'practical experience, collaboration capabilities, and design '
              'innovation, weights have been assigned prioritizing these '
              'aspects.',
 'research': 0.0,
 'skills': 0.1,
 'validation': 'Summing the weights: 0.1 (Education) + 0.5 (Experience) + 0.2 '
               '(Projects) + 0.1 (Leadership) + 0.0 (Research) + 0.1 (Skills) '
               '= 1.0. The total is 1, meeting the requirements.'}


## Final Scoring

In [411]:
def _criteria_score(section, weight):
    # Weighted score for a section graded on depth/impact/relevance.
    # Impact is scaled by relevance / 5 before being added to depth, so the
    # bracketed term peaks at 10 and the division by 10 maps it into [0, 1].
    # NOTE(review): depth still contributes when relevance is 0 - confirm that
    # this is intended.
    return (section["depth"] + section["impact"] * section["relevance"] / 5) / 10 * weight


def _alignment_score(section, weight):
    # Weighted score for a section graded with a single 0-5 alignment value.
    return section["alignment"] / 5 * weight


edu_score = _alignment_score(eval["education"], weights["education"])
exp_score = _criteria_score(eval["experience"], weights["experience"])
proj_score = _criteria_score(eval["projects"], weights["projects"])
research_score = _criteria_score(eval["research"], weights["research"])
leadership_score = _criteria_score(eval["leadership"], weights["leadership"])
skills_score = _alignment_score(eval["skills"], weights["skills"])

# Summed with explicit `+` in the original order so the floating-point result
# is bit-for-bit the same.
final_score = (
    edu_score
    + exp_score
    + proj_score
    + research_score
    + leadership_score
    + skills_score
)
final_score

0.43800000000000006