# Fine-grained Parsing

After initial parsing, let's parse out details from each resume section individually

In [26]:
import sys
import os
import json
from pprint import pprint
from pydantic import BaseModel, RootModel, Field

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from resume_scanner.utils.with_structured_output import with_structured_output
from resume_scanner.parsing.initial_parsing import parse_resume_sections

In [55]:
parsed_resume = parse_resume_sections("../data/input/resumes/Kevin_resume.pdf")

In [56]:
pprint(parsed_resume)

{'Education': 'Texas A&M University May 2026\n'
              'BS in Computer Science, Minor in Statistics and Math College '
              'Station, TX\n'
              'Cumulative GPA: 4.0/4.0\n'
              'Honors: Dean’s Honor Roll, Engineering Honors (EH), Dean’s '
              'Excellence Award Semi-finalist\n'
              'Coursework: Data Structures & Algorithms, Software Engineering, '
              'Computer Systems, Discrete Math, Linear Algebra',
 'Experience': 'AI/ML Intern Aug 2024 – Dec 2024\n'
               '• Developed knowledge graph (KG) generation pipeline with '
               'internal LLM microservices to allow multi-hop\n'
               'reasoning in 3-stage retrieval augmented generation (RAG) '
               'pipeline\n'
               '• Extracted 30+ domain-specific seed topics from text corpus '
               'with BERTopic for KG subgraph creation\n'
               '• Achieved100%schema-compliantLLMoutputsviaprompt engineering '
               'a

## Parse Education

In [10]:
class School(BaseModel):
    name: str           = Field(..., alias="Name")
    majors: list[str]   = Field(..., alias="Majors")
    minors: list[str]   = Field(..., alias="Minors")
    gpa: float          = Field(None, alias="GPA")
    grad_year: int      = Field(..., alias="Graduation Year")

class Education(RootModel[list[School]]):
    pass

In [11]:
EDUCATION_EXTRACTION_PROMPT = """
You are an expert resume parser. Given some resume text, your job is to parse education information as a list of JSON objects representing each school attended. Follow this format for each school:
    {{
        "Name": "<Name of School>",
        "Majors": ["list", "of", "majors"],
        "Minors": ["list", "of", "minors"],
        "GPA": <GPA>,
        "Graduation Year": <Graduation Year>
    }},

Notes:
1. If there are no minors, set "Minors" to an empty list.
2. If there is no GPA listed, set "GPA" to None.
3. If any school does not have a graduation year listed, omit the school from the output.
4. Output the full name of all degrees, e.g., "BS in Computer Science", "M.S. in Information Science". Note that the resume may contain a double major. If so, output all degrees with their full names, making sure to incldue the type of degree for each major ("BS," "MS," etc.). Please note that some schools offer emphasis areas or modifiers to the major that are not themselves considered majors, e.g. "Computer Science with statistics emphasis" is equivalent to "Computer Science."
5. If the resume does not contain information for one of the sections, return an empty list for that section.

Extracted information must be **explicitly contained in the resume.**

Resume text:
{resume_text}

Output:
"""

In [12]:
education_info = with_structured_output(
    prompt=EDUCATION_EXTRACTION_PROMPT.format(resume_text=parsed_resume["Education"]),
    schema=Education)

In [13]:
education_info

[{'Name': 'Texas A&M University',
  'Majors': ['BS in Computer Science'],
  'Minors': ['Statistics', 'Math'],
  'Graduation Year': 2026,
  'GPA': 4.0}]

## Parse Experience

In [4]:
class Experience(BaseModel):
    company: str = Field(..., alias="Company")
    role: str = Field(..., alias="Role")
    contributions: list[str] = Field(..., alias="Contributions")
    start_date: str = Field(..., alias="Start Date")
    end_date: str = Field(..., alias="End Date")
    skills: list[str] = Field(..., alias="Skills")
    
class Experiences(BaseModel):
    roles: list[Experience] = Field(..., alias="Roles")
    yoe: float = Field(..., alias="YOE")

In [8]:
EXPERIENCE_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's work experience and format it as a list of JSON objects:
    {{
        "Roles": [
            {{
                "Company": "<company>",
                "Role": "<applicant's role at the company>",
                "Contributions": ["list", "of", "contributions", "in", "the", "role"],
                "Start Date": "<start date, formatted as MM-yyyy>",
                "End Date": "<end date, formatted as MM-yyyy>", 
                "Skills": ["list", "of", "relevant", "skills"]
            }},
            ...
        ],
        "YOE": <Total Years of Experience> 
    }}
    
The extracted information must be **explicitly contained in the resume.**

Calculate "YOE" by summing up the duration of all experiences, rounded to the nearest quarter-year. **Note that overlapping timeframes should not double-counted.**

When extracting "Skills" for each role, please extract specific technical terms AND niche, domain-specific skills.

Resume text:
{resume_text}

Output:
"""

In [11]:
experience_info = with_structured_output(
    EXPERIENCE_EXTRACTION_PROMPT.format(resume_text=parsed_resume["Experience"]),
    Experiences)

In [12]:
pprint(experience_info)

{'Roles': [{'Company': 'Internship',
            'Contributions': ['Developed knowledge graph (KG) generation '
                              'pipeline with internal LLM microservices',
                              'Extracted 30+ domain-specific seed topics from '
                              'text corpus with BERTopic for KG subgraph '
                              'creation',
                              'Achieved100%schema-compliantLLMoutputsviaprompt '
                              'engineering andgrammar-contrained decoding',
                              'Packaged KG generation logic into reusable, '
                              'object-oriented Python modules used by 30 '
                              'developers'],
            'End Date': '12-2024',
            'Role': 'AI/ML Intern',
            'Skills': ['Knowledge Graph Generation',
                       'LLM Microservices',
                       'BERTopic',
                       'Prompt Engineering',
               

## Parse Projects

In [17]:
class Project(BaseModel):
    name: str = Field(..., alias="Name")
    contributions: list[str] = Field(..., alias="Contributions")
    skills: list[str] = Field(..., alias="Skills")
    
class Projects(RootModel[list[Project]]):
    pass

In [18]:
PROJECTS_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's project experience and format it as a list of JSON objects, where each object has the following format:
    {{
        "Name": "<name of project>",
        "Contributions": ["list", "of", "contributions", "in", "the", "project"],
        "Skills": ["list", "of", "relevant", "skills"]
    }}
    
The extracted information must be **explicitly contained in the resume.**

When extracting "Skills" for each project, please extract specific technical terms AND niche, domain-specific skills.

Resume text:
{resume_text}

Output:
"""

In [33]:
project_info = with_structured_output(
    PROJECTS_EXTRACTION_PROMPT.format(resume_text=parsed_resume["Projects"]),
    Projects)

In [34]:
pprint(project_info)

[{'Contributions': ['Built AI bots to compete in Pop Tac Toe',
                    'Made bots using heuristics, minimax with alpha-beta '
                    'pruning, and reinforcement learning',
                    'Created a bot combining heuristics and minimax'],
  'Name': 'Pop Tac Toe AI Bots',
  'Skills': ['heuristics',
             'minimax',
             'alpha-beta pruning',
             'reinforcement learning']},
 {'Contributions': ['Designed an algorithm using K-means clustering to analyze '
                    'and cluster the embeddings of news articles',
                    'Implemented the algorithm to find the most relevant topic '
                    'label by comparing the cosine similarity of two articles',
                    'Used the algorithm to predict 5 mystery articles topic '
                    'labels and content'],
  'Name': 'Embedding Crackers',
  'Skills': ['K-means clustering', 'cosine similarity']},
 {'Contributions': ['Created an app to encourage saf

## Parse Leadership

In [22]:
class Leadership(BaseModel):
    org: str = Field(..., alias="Organization")
    role: str = Field(..., alias="Role")
    contributions: list[str] = Field(..., alias="Contributions")

In [21]:
LEADERSHIP_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's leadership experience and format it as a list of JSON objects, where each object has the following format:
    {{
        "Organization": "<name of organization>",
        "Role": "<applicant's role at organization>",
        "Contributions": ["list", "of", "contributions", "in", "the", "position"]
    }}
    
The extracted information must be **explicitly contained in the resume.**

Resume text:
{resume_text}

Output:
"""

In [40]:
leadership_info = with_structured_output(
    LEADERSHIP_EXTRACTION_PROMPT.format(resume_text=parsed_resume["Leadership"]),
    Leadership)

In [41]:
leadership_info

{'Organization': 'Texas A&M AIAA CanSat Team',
 'Role': 'Electrical Team Lead',
 'Contributions': ['Designed electrical power and avionics subsystems of a 300 cubic inch satellite for AIAA’s annual CanSat design-build-launch competition with a team of two other students.',
  'Created schedules for electrical system progress and assigned tasks to team members.',
  'Utilized EasyEDA to create preliminary PCB design, fitting over 15 components into a 63 mm radius space.',
  'Performed trade studies of air pressure sensors, pitot tubes, microcontrollers, GPS units, XBee radios, gyroscopes, and cameras with a $1000 budget and 700g mass budget.',
  'Presented 10 slides covering the electrical power subsystem, microcontroller, and inertial measurement unit to competition judges as part of the Preliminary Design Review and Cumulative Design Review.',
  'Soldered over 20 electrical components and 2 independent electrical systems on satellite, including avionics and power for each system.']}

## Parse Research

In [44]:
class ResearchRole(BaseModel):
    institution: str = Field(..., alias="Institution")
    role: str = Field(..., alias="Role")
    contributions: list[str] = Field(..., alias="Contributions")
    start_date: str = Field(..., alias="Start Date")
    end_date: str = Field(..., alias="End Date")
    skills: list[str] = Field(..., alias="Skills")
    
class Research(BaseModel):
    roles: list[ResearchRole] = Field(..., alias="Roles")
    publications: list[str] = Field(..., alias="Publications")

In [45]:
RESEARCH_EXTRACTION_PROMPT = """
You are an expert at parsing resumes. Given some resume text, your job is to extract information about the candidate's project experience and format it as a JSON object with the following fields:
    {{
        "Roles": [
            {{
                "Institution": "<name of research institution>",
                "Role": "<applicant's research role>",
                "Contributions": ["list", "of", "contributions", "in", "the", "role"],
                "Start Date": "<start date, formatted as MM-yyyy>",
                "End Date": "<end date, formatted as MM-yyyy>", 
                "Skills": ["list", "of", "relevant", "skills"]
            }},
            ...
        ],
        "Publications": ["list", "of", "publications"]
    }}
    
The extracted information must be **explicitly contained in the resume.**

When extracting "Skills" for each research role, please extract specific technical terms AND niche, domain-specific skills.

Resume text:
{resume_text}

Output:
"""

In [48]:
research_info = with_structured_output(
    LEADERSHIP_EXTRACTION_PROMPT.format(resume_text=parsed_resume["Projects"]),
    Research)

In [49]:
pprint(research_info)

{'Publications': [],
 'Roles': [{'Contributions': ['Designed electrical power and avionics '
                              'subsystems of a 300 cubic inch satellite for '
                              'AIAA’s annual CanSat design-build-launch '
                              'competition with a team of two other students.',
                              'Created schedules for electrical system '
                              'progress and assigned tasks to team members.',
                              'Utilized EasyEDA to create preliminary PCB '
                              'design, fitting over 15 components into a 63 mm '
                              'radius space.',
                              'Performed trade studies of air pressure '
                              'sensors, pitot tubes, microcontrollers, GPS '
                              'units, XBee radios, gyroscopes, and cameras '
                              'with a $1000 budget and 700g mass budget.',
                  

## Skill Parsing 

In [57]:
class Skills(RootModel[list[str]]):
    pass

In [58]:
parsed_resume["Skills"]

'Languages: Java, Python, C/C++, TypeScript/JavaScript, C#, SQL, HTML/CSS, MATLAB, Bash\nFrameworks: React, Bootstrap, Flask, JUnit, ASP.NET, Entity Framework, Spring Boot\nLibraries: Numpy, Pandas, Matplotlib, LangChain, OpenAI, Pydantic, TensorFlow\nDeveloper Tools: Linux, Git (GitHub, Gerrit), Anaconda, Jupyter, Azure DevOps, Jenkins, Vim, Docker\nDatabases: PostgreSQL, MySQL, Neo4J'

In [51]:
SKILL_EXTRACTION_TEMPLATE = """
You are an expert are parsing resumes. Given a resume, your job is to parse the specific technical skills.

Format your output as a JSON list of strings.

Resume text:
{resume_text}

Output:
"""

In [59]:
skills_info = with_structured_output(
    prompt=SKILL_EXTRACTION_TEMPLATE.format(resume_text=parsed_resume["Skills"]),
    schema=Skills)

In [60]:
skills_info

['Java',
 'Python',
 'C/C++',
 'TypeScript/JavaScript',
 'C#',
 'SQL',
 'HTML/CSS',
 'MATLAB',
 'Bash',
 'React',
 'Bootstrap',
 'Flask',
 'JUnit',
 'ASP.NET',
 'Entity Framework',
 'Spring Boot',
 'Numpy',
 'Pandas',
 'Matplotlib',
 'LangChain',
 'OpenAI',
 'Pydantic',
 'TensorFlow',
 'Linux',
 'Git',
 'Anaconda',
 'Jupyter',
 'Azure DevOps',
 'Jenkins',
 'Vim',
 'Docker',
 'PostgreSQL',
 'MySQL',
 'Neo4J']

### Putting it all together

In [22]:
parsed_info = {
    "Education": education_info,
    "Work Experience": experience_info,
    "Skills": skills_info
}
pprint(parsed_info)

{'Education': [{'GPA': 4.0,
                'Graduation Year': 2026,
                'Majors': ['BS in Computer Science'],
                'Minors': ['Statistics', 'Math'],
                'Name': 'Texas A&M University'}],
 'Skills': {'Domain-Specific Skills': ['Knowledge Graph (KG) generation '
                                       'pipeline',
                                       'LLM microservices',
                                       'Multi-hop reasoning in RAG pipeline',
                                       'BERTopic for KG subgraph creation',
                                       'Prompt engineering and '
                                       'grammar-constrained decoding',
                                       'Object-oriented Python modules',
                                       'Java data analysis suite architecture '
                                       'redesign',
                                       'MATLAB profiler for bottleneck '
                         

In [23]:
with open("../output/parsed_resume_info.json", "w") as file:
    json.dump(parsed_info, file, indent=4)