# Fine-grained Parsing

After the initial parsing pass, let's extract the details of each resume section individually.

In [26]:
import json
import os
import sys
from pprint import pprint
from typing import Optional

from pydantic import BaseModel, Field, RootModel

# Put the parent of the current working directory on the import path.
# Presumably that is the repository root containing `resume_scanner` --
# confirm the notebook is always launched from a direct subdirectory of it.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from resume_scanner.utils.with_structured_output import with_structured_output
from resume_scanner.parsing.initial_parsing import parse_resume_sections

In [55]:
# Run the initial section-level parse on the sample resume PDF. The cells
# below index the result by section name (e.g. parsed_resume["Education"]).
parsed_resume = parse_resume_sections("../data/input/resumes/Kevin_resume.pdf")

In [None]:
# Pretty-print the section-level parse for manual inspection.
pprint(parsed_resume)

## Parse Education

In [10]:
# One school attended. Field aliases are the JSON keys requested by
# EDUCATION_EXTRACTION_PROMPT.
class School(BaseModel):
    name: str             = Field(..., alias="Name")
    majors: list[str]     = Field(..., alias="Majors")
    minors: list[str]     = Field(..., alias="Minors")
    # GPA is optional: resumes frequently omit it, and the prompt tells the
    # model to leave it empty in that case. With a plain `float` annotation an
    # explicit null fails validation, so the type has to admit None.
    gpa: Optional[float]  = Field(None, alias="GPA")
    grad_year: int        = Field(..., alias="Graduation Year")

# Root model: the education section is parsed into a bare JSON list of schools.
class Education(RootModel[list[School]]):
    pass

In [11]:
# Prompt template for extracting education entries. `{resume_text}` is filled
# in with str.format, which is why the literal braces of the JSON example are
# doubled. Missing values are described as JSON `null` (not Python `None`)
# because the model is asked to produce JSON.
EDUCATION_EXTRACTION_PROMPT = """
You are an expert resume parser. Given some resume text, your job is to parse education information as a list of JSON objects representing each school attended. Follow this format for each school:
    {{
        "Name": "<Name of School>",
        "Majors": ["list", "of", "majors"],
        "Minors": ["list", "of", "minors"],
        "GPA": <GPA>,
        "Graduation Year": <Graduation Year>
    }},

Notes:
1. If there are no minors, set "Minors" to an empty list.
2. If there is no GPA listed, set "GPA" to null.
3. If any school does not have a graduation year listed, omit the school from the output.
4. Output the full name of all degrees, e.g., "BS in Computer Science", "M.S. in Information Science". Note that the resume may contain a double major. If so, output all degrees with their full names, making sure to include the type of degree for each major ("BS," "MS," etc.). Please note that some schools offer emphasis areas or modifiers to the major that are not themselves considered majors, e.g. "Computer Science with statistics emphasis" is equivalent to "Computer Science."
5. If the resume does not contain information for one of the sections, return an empty list for that section.

Extracted information must be **explicitly contained in the resume.**

Resume text:
{resume_text}

Output:
"""

In [12]:
# Fill the education prompt with the "Education" section of the parsed resume
# and request output that conforms to the Education schema.
education_info = with_structured_output(
    prompt=EDUCATION_EXTRACTION_PROMPT.format(
        resume_text=parsed_resume["Education"],
    ),
    schema=Education,
)

In [None]:
# Display the parsed education result.
education_info

## Parse Experience

In [4]:
# A single work-experience entry. Each alias is the JSON key used in
# EXPERIENCE_EXTRACTION_PROMPT; every field is required.
class Experience(BaseModel):
    company: str = Field(alias="Company")
    role: str = Field(alias="Role")
    contributions: list[str] = Field(alias="Contributions")
    # Dates stay as strings; the prompt requests the MM-yyyy format.
    start_date: str = Field(alias="Start Date")
    end_date: str = Field(alias="End Date")
    skills: list[str] = Field(alias="Skills")

# Top-level result for the experience section: all roles plus the total
# years of experience.
class Experiences(BaseModel):
    roles: list[Experience] = Field(alias="Roles")
    yoe: float = Field(alias="YOE")

In [8]:
# Prompt template for extracting work experience. `{resume_text}` is filled in
# with str.format, so the literal braces of the JSON example are doubled. The
# requested output is a single JSON object ("Roles" + "YOE"), matching the
# Experiences schema.
EXPERIENCE_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's work experience and format it as a JSON object with the following fields:
    {{
        "Roles": [
            {{
                "Company": "<company>",
                "Role": "<applicant's role at the company>",
                "Contributions": ["list", "of", "contributions", "in", "the", "role"],
                "Start Date": "<start date, formatted as MM-yyyy>",
                "End Date": "<end date, formatted as MM-yyyy>",
                "Skills": ["list", "of", "relevant", "skills"]
            }},
            ...
        ],
        "YOE": <Total Years of Experience>
    }}

The extracted information must be **explicitly contained in the resume.**

Calculate "YOE" by summing up the duration of all experiences, rounded to the nearest quarter-year. **Note that overlapping timeframes should not be double-counted.**

When extracting "Skills" for each role, please extract specific technical terms AND niche, domain-specific skills.

Resume text:
{resume_text}

Output:
"""

In [11]:
# Fill the experience prompt with the "Experience" section of the parsed
# resume and request output that conforms to the Experiences schema.
experience_info = with_structured_output(
    EXPERIENCE_EXTRACTION_PROMPT.format(
        resume_text=parsed_resume["Experience"],
    ),
    Experiences,
)

In [None]:
# Pretty-print the parsed work-experience result.
pprint(experience_info)

## Parse Projects

In [17]:
# A single project entry. Each alias is the JSON key used in
# PROJECTS_EXTRACTION_PROMPT; every field is required.
class Project(BaseModel):
    name: str = Field(alias="Name")
    contributions: list[str] = Field(alias="Contributions")
    skills: list[str] = Field(alias="Skills")

# Root model: the projects section is parsed into a bare JSON list of projects.
class Projects(RootModel[list[Project]]):
    pass

In [18]:
# Prompt template for extracting project entries as a list of JSON objects.
# `{resume_text}` is filled in with str.format, so the literal braces of the
# JSON example are doubled.
PROJECTS_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's project experience and format it as a list of JSON objects, where each object has the following format:
    {{
        "Name": "<name of project>",
        "Contributions": ["list", "of", "contributions", "in", "the", "project"],
        "Skills": ["list", "of", "relevant", "skills"]
    }}
    
The extracted information must be **explicitly contained in the resume.**

When extracting "Skills" for each project, please extract specific technical terms AND niche, domain-specific skills.

Resume text:
{resume_text}

Output:
"""

In [33]:
# Fill the projects prompt with the "Projects" section of the parsed resume
# and request output that conforms to the Projects schema.
project_info = with_structured_output(
    PROJECTS_EXTRACTION_PROMPT.format(
        resume_text=parsed_resume["Projects"],
    ),
    Projects,
)

In [None]:
# Pretty-print the parsed projects result.
pprint(project_info)

## Parse Leadership

In [22]:
# A single leadership position. Each alias is the JSON key used in
# LEADERSHIP_EXTRACTION_PROMPT.
class LeadershipRole(BaseModel):
    org: str = Field(..., alias="Organization")
    role: str = Field(..., alias="Role")
    contributions: list[str] = Field(..., alias="Contributions")

# LEADERSHIP_EXTRACTION_PROMPT asks for a *list* of positions, so the schema
# passed to with_structured_output has to be a list root model (the same shape
# as Education and Projects) rather than a single object. The name
# `Leadership` is kept so the extraction call keeps working unchanged.
class Leadership(RootModel[list[LeadershipRole]]):
    pass

In [21]:
# Prompt template for extracting leadership positions as a list of JSON
# objects. `{resume_text}` is filled in with str.format, so the literal braces
# of the JSON example are doubled.
LEADERSHIP_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's leadership experience and format it as a list of JSON objects, where each object has the following format:
    {{
        "Organization": "<name of organization>",
        "Role": "<applicant's role at organization>",
        "Contributions": ["list", "of", "contributions", "in", "the", "position"]
    }}
    
The extracted information must be **explicitly contained in the resume.**

Resume text:
{resume_text}

Output:
"""

In [40]:
# Fill the leadership prompt with the "Leadership" section of the parsed
# resume and request output that conforms to the Leadership schema.
leadership_info = with_structured_output(
    LEADERSHIP_EXTRACTION_PROMPT.format(
        resume_text=parsed_resume["Leadership"],
    ),
    Leadership,
)

In [None]:
# Display the parsed leadership result.
leadership_info

## Parse Research

In [44]:
# A single research position. Each alias is the JSON key used in
# RESEARCH_EXTRACTION_PROMPT; every field is required.
class ResearchRole(BaseModel):
    institution: str = Field(alias="Institution")
    role: str = Field(alias="Role")
    contributions: list[str] = Field(alias="Contributions")
    # Dates stay as strings; the prompt requests the MM-yyyy format.
    start_date: str = Field(alias="Start Date")
    end_date: str = Field(alias="End Date")
    skills: list[str] = Field(alias="Skills")

# Top-level result for the research section: the roles held plus a flat list
# of publications.
class Research(BaseModel):
    roles: list[ResearchRole] = Field(alias="Roles")
    publications: list[str] = Field(alias="Publications")

In [45]:
# Prompt template for extracting research roles and publications as a single
# JSON object, matching the Research schema. `{resume_text}` is filled in with
# str.format, so the literal braces of the JSON example are doubled.
RESEARCH_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's research experience and format it as a JSON object with the following fields:
    {{
        "Roles": [
            {{
                "Institution": "<name of research institution>",
                "Role": "<applicant's research role>",
                "Contributions": ["list", "of", "contributions", "in", "the", "role"],
                "Start Date": "<start date, formatted as MM-yyyy>",
                "End Date": "<end date, formatted as MM-yyyy>",
                "Skills": ["list", "of", "relevant", "skills"]
            }},
            ...
        ],
        "Publications": ["list", "of", "publications"]
    }}

The extracted information must be **explicitly contained in the resume.**

When extracting "Skills" for each research role, please extract specific technical terms AND niche, domain-specific skills.

Resume text:
{resume_text}

Output:
"""

In [48]:
# Extract research roles/publications with the *research* prompt. The earlier
# version formatted LEADERSHIP_EXTRACTION_PROMPT with the "Projects" section,
# which cannot yield output matching the Research schema.
# NOTE(review): assumes parse_resume_sections emits a "Research" key, like the
# other section names used in this notebook -- confirm against its output.
research_info = with_structured_output(
    RESEARCH_EXTRACTION_PROMPT.format(resume_text=parsed_resume["Research"]),
    Research)

In [None]:
# Pretty-print the parsed research result.
pprint(research_info)

## Parse Skills

In [57]:
# Root model: the skills section is parsed into a bare JSON list of strings.
class Skills(RootModel[list[str]]):
    pass

In [None]:
# Display the raw "Skills" section before extraction.
parsed_resume["Skills"]

In [51]:
# Prompt template for extracting technical skills as a JSON list of strings,
# matching the Skills schema. `{resume_text}` is filled in with str.format.
SKILL_EXTRACTION_TEMPLATE = """
You are an expert at parsing resumes. Given a resume, your job is to parse the specific technical skills.

Format your output as a JSON list of strings.

Resume text:
{resume_text}

Output:
"""

In [59]:
# Fill the skills prompt with the "Skills" section of the parsed resume and
# request output that conforms to the Skills schema.
skills_info = with_structured_output(
    prompt=SKILL_EXTRACTION_TEMPLATE.format(
        resume_text=parsed_resume["Skills"],
    ),
    schema=Skills,
)

In [None]:
# Display the parsed skills result.
skills_info

### Putting it all together

In [None]:
# Collect the per-section results into one mapping and print it.
# NOTE(review): project_info, leadership_info and research_info are computed
# above but not included here -- confirm whether that omission is intentional.
parsed_info = {
    "Education": education_info,
    "Work Experience": experience_info,
    "Skills": skills_info
}
pprint(parsed_info)

In [23]:
# Write the combined result to disk as indented JSON.
# NOTE(review): json.dump only handles plain JSON types. If
# with_structured_output returns pydantic model instances rather than
# dicts/lists, they need converting (e.g. model_dump) first -- confirm.
with open("../output/parsed_resume_info.json", "w") as file:
    json.dump(parsed_info, file, indent=4)