# Initial Parsing

Before doing more fine-grained parsing on individual resume sections (Education, Experience, etc.), we first need to split the resume into those sections.

A couple of options:

1. Regex-based
2. Text-only LLM
3. Vision LLM (for PDFs that contain no extractable text, e.g. scanned images)
4. OCR followed by a text-only LLM

In [None]:
import re
import json
from pprint import pprint
from collections import defaultdict
from pydantic import BaseModel, RootModel, Field
from pdf2image import convert_from_path
from PIL import Image
from pytesseract import pytesseract

# `sys` and `os` are used below but are not imported anywhere before this
# line, so import them here.
import os
import sys

# Add the parent of the current working directory to the module search path
# (presumably so the `resume_scanner` import that follows can be resolved).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from resume_scanner.utils.with_structured_output import with_structured_output

## Parsing Key Information With Regex

1. Phone number
2. Email
3. LinkedIn
4. GitHub
5. Location

### Phone number

In [4]:
def parse_phone_number(text: str) -> str:
    """Extract the first US-style phone number found in ``text``.

    Accepts an optional ``+1`` country code, an optionally parenthesised area
    code, and ``.``, ``-`` or a single space as separators, for example
    ``(832) 416-3570``, ``832.416.3570`` or ``+1 832 416 3570``.

    Args:
        text: Arbitrary text to search.

    Returns:
        The number normalised to ``AAA-PPP-LLLL`` (the country code is
        dropped), or an empty string when no phone number is found.
    """
    phone_pattern = re.compile(r"""
        (?:\+(?P<country_code>1)[\s.-]?)?   # Optional +1 country code
        \(?                                 # Optional opening parenthesis
        (?P<area_code>\d{3})                # Area code (first 3 digits)
        (?:\)\s?|[ .-])                     # Closing parenthesis or a separator
        (?P<prefix>\d{3})                   # Prefix (second 3 digits)
        [ .-]                               # Separator
        (?P<line_number>\d{4})              # Line number (last 4 digits)
    """, re.VERBOSE)
    match = phone_pattern.search(text)
    if match is None:
        return ""
    # Single-quoted group names keep this f-string valid on Python < 3.12.
    return f"{match.group('area_code')}-{match.group('prefix')}-{match.group('line_number')}"

In [None]:
# Sample input: parenthesised area code, then a space and a hyphen as separators.
phone_number = "(832) 416-3570"

# Last expression of the cell, so the notebook displays the parsed result.
parse_phone_number(phone_number)

### Email

In [5]:
def parse_email(text: str) -> str:
    """Return the first email address found in ``text``.

    Args:
        text: Arbitrary text to search.

    Returns:
        The matched address exactly as it appears in ``text``, or an empty
        string when there is no match.
    """
    email_regex = re.compile(r"""
        \b
        [A-Za-z0-9._%+-]+   # Local part
        @
        [A-Za-z0-9.-]+      # Domain 
        \.[A-Za-z]{2,}
        \b
    """, re.VERBOSE)
    found = email_regex.search(text)
    return found.group() if found else ""

In [None]:
# Sample input: an address surrounded by unrelated leading and trailing text.
email = "Some leading text: kevzhang2022@tamu.edu and trailing text"

# Last expression of the cell, so the notebook displays the parsed result.
parse_email(email)

### LinkedIn

In [6]:
def parse_linkedin(text: str) -> str:
    """Extract the first LinkedIn profile link found in ``text``.

    The scheme (``http://`` or ``https://``) and the ``www.`` prefix are
    optional. The profile id must be 5-30 letters, digits or hyphens.

    Args:
        text: Arbitrary text to search.

    Returns:
        The canonical ``https://linkedin.com/in/<profile_id>`` URL, or an
        empty string when no profile link is found.
    """
    # Dots are escaped so that only a literal "linkedin.com" host matches.
    linkedin_pattern = (
        r"(?:https?://)?(?:www\.)?linkedin\.com/in/"
        r"(?P<profile_id>[A-Za-z0-9-]{5,30})/?\b"
    )
    match = re.search(linkedin_pattern, text)
    if match is None:
        return ""
    # Single-quoted group name keeps this f-string valid on Python < 3.12.
    return f"https://linkedin.com/in/{match.group('profile_id')}"

In [None]:
# Sample input: a scheme-less profile link with a label and trailing text.
linkedin = "LinkedIn: linkedin.com/in/kevinkz some trailing text"

# Last expression of the cell, so the notebook displays the parsed result.
parse_linkedin(linkedin)

### GitHub

In [7]:
def parse_github(text: str) -> str:
    """Extract the first GitHub profile link found in ``text``.

    The scheme (``http://`` or ``https://``) and the ``www.`` prefix are
    optional. The username must be 1-39 letters, digits or hyphens; anything
    after it (such as a repository path) is ignored.

    Args:
        text: Arbitrary text to search.

    Returns:
        The canonical ``https://github.com/<username>`` URL, or an empty
        string when no GitHub link is found.
    """
    # Dots are escaped so that only a literal "github.com" host matches.
    github_pattern = (
        r"(?:https?://)?(?:www\.)?github\.com/"
        r"(?P<username>[A-Za-z0-9-]{1,39})/?\b"
    )
    match = re.search(github_pattern, text)
    if match is None:
        return ""
    # Single-quoted group name keeps this f-string valid on Python < 3.12.
    return f"https://github.com/{match.group('username')}"

In [None]:
# Sample input: a scheme-less profile link with a label and trailing text.
github = "GitHub: github.com/n1v3x2 some trailing text"

# Last expression of the cell, so the notebook displays the parsed result.
parse_github(github)

### Location

In [8]:
def parse_location(text: str) -> str:
    """Extract the first ``City, ST`` location found in ``text``.

    The city is one or more alphanumeric words joined by a space, ``.``,
    ``'``, ``-`` or ``". "`` (as in ``St. Louis``). The state is exactly two
    upper-case letters that form a whole word.

    Args:
        text: Arbitrary text to search.

    Returns:
        ``"<city>, <state>"``, or an empty string when no location is found.
        Words directly preceding the city on the same line are captured as
        part of the city, because a city may consist of several words.
    """
    location_pattern = (
        # "\. " is escaped: an unescaped "." would let the city run across
        # commas and other punctuation followed by a space.
        r"(?P<city>\b[A-Za-z0-9]+(?:(?:[ .'-]|\. )[A-Za-z0-9]+)*)"
        # Trailing \b stops the state from matching the first two letters of
        # a longer upper-case word.
        r",\s?(?P<state>[A-Z]{2})\b"
    )
    match = re.search(location_pattern, text)
    if match is None:
        return ""
    # Single-quoted group names keep this f-string valid on Python < 3.12.
    return f"{match.group('city')}, {match.group('state')}"

In [None]:
# Sample input: a two-word city followed by a two-letter state code.
location = "College Station, TX"

# Last expression of the cell, so the notebook displays the parsed result.
parse_location(location)

### Tying it all together

In [10]:
# Pull the raw text out of the sample resume, then run each contact-detail
# parser over it and collect the results under a display label.
resume_text = extract_pdf_text("../input/resumes/Kareem_resume.pdf")
parsed_info = {
    label: parser(resume_text)
    for label, parser in (
        ("Phone", parse_phone_number),
        ("Email", parse_email),
        ("LinkedIn", parse_linkedin),
        ("GitHub", parse_github),
    )
}

In [None]:
# Display the dictionary of parsed contact details.
parsed_info

## Option 1: Purely Regex-Based Parsing

In [4]:
# Maps a regex describing raw resume headings to the normalized section name.
# Patterns are tried in insertion order (see `normalize_heading`).
heading_map = {
    # Experience
    r"(Work|Relevant|Professional)?\s*(Experience|History)": "Experience",
    r"(Employment|Career)\s*(History|Experience)": "Experience",
    r"(Internship|Internships|Intern Experiences?)": "Experience",
    r"(Freelance|Contract)\s*(Work|Experience)": "Experience",
    r"Work": "Experience",

    # Education
    r"(Education|Educational Background|Academic History|Academic Background)": "Education",
    r"(Certifications|Courses|Licenses|Trainings|Accreditations)": "Certifications",
    r"(Professional Development|Learning)": "Certifications",

    # Skills
    r"(Skills|Technical Skills|Key Competencies|Core Competencies|Abilities)": "Skills",
    r"(Technical Proficiencies|Technical Expertise|Expertise|Proficiencies)": "Skills",
    r"(Languages|Programming Languages)": "Skills",

    # Projects
    r"(Projects|Key Projects|Personal Projects|Side Projects)": "Projects",
    r"(Freelance Projects|Independent Projects|Portfolio)": "Projects",

    # Achievements and Awards
    r"(Achievements?|Awards?|Honors?|Accolades?|Recognitions?)": "Achievements",
    r"(Accomplishments|Milestones)": "Achievements",

    # Volunteer Work
    r"(Volunteer|Volunteering|Community( Service)?|Volunteer Experience)": "Volunteer Work",
    r"(Social Work|Non-Profit Work)": "Volunteer Work",

    # Leadership
    r"(Leadership|Leadership Experience|Leadership Roles|Positions of Responsibility)": "Leadership",
    r"(Managerial Experience|Team Leadership|Organizational Roles)": "Leadership",

    # Publications and Research
    r"(Publications?|Research|Academic Papers|Articles|Journals?)": "Publications",
    r"(Research Projects|Thesis|Dissertation)": "Research",

    # Interests and Hobbies
    r"(Interests?|Hobbies?|(Extracurricular|Collegiate) Activities)": "Interests",
    r"(Passions?|Leisure Activities)": "Interests",

    # Objective or Summary
    r"(Objective|Career Objective|Professional Objective)": "Summary",
    r"(Summary|Professional (Highlights|Profile|Summary)|Career Summary)": "Summary",

    # References
    r"(References?|Professional References|Referees?)": "References",
}


def normalize_heading(heading: str) -> str:
    """Map a raw resume heading to its normalized section name.

    Matching is case-insensitive and done in two passes over `heading_map`:

    1. A pattern that matches the *whole* heading wins. This stops a generic
       pattern from shadowing a specific one, e.g. "Volunteer Experience" is
       "Volunteer Work" rather than "Experience".
    2. Otherwise the first pattern found *anywhere* in the heading wins, so
       decorated headings such as "Technical Skills & Tools" still resolve.

    Args:
        heading: Heading text; surrounding whitespace and a trailing colon
            are ignored.

    Returns:
        The normalized section name, or "Miscellaneous" when nothing matches.
    """
    cleaned = heading.strip().rstrip(":").strip()

    # Pass 1: exact (whole-heading) matches take priority.
    for pattern, normalized_heading in heading_map.items():
        if re.fullmatch(pattern, cleaned, re.IGNORECASE):
            return normalized_heading

    # Pass 2: fall back to a substring match, first pattern wins.
    for pattern, normalized_heading in heading_map.items():
        if re.search(pattern, cleaned, re.IGNORECASE):
            return normalized_heading

    return "Miscellaneous"

In [5]:
def extract_sections_by_heading(resume_text: str) -> dict[str, str]:
    """Split resume text into sections keyed by normalized heading.

    A heading is a line consisting solely of one or two Capitalized words, or
    of ALL-CAPS words optionally followed by a colon. Each section runs from
    the end of its heading to the start of the next heading (or the end of
    the text). Text that precedes the first heading is not returned.

    Args:
        resume_text: The full plain text of a resume.

    Returns:
        A dict from normalized heading (see `normalize_heading`) to the
        stripped section text. When several headings normalize to the same
        name, their texts are joined with a blank line instead of the later
        one overwriting the earlier one. The "Miscellaneous" entry, if
        present, is a list holding one string per unrecognised heading.
    """
    heading_regex = re.compile(r"""
        ^(
            [A-Z][a-z]+(?:\ [A-Z][a-z]+)?(?:[\s]*\n)    # Matches Capitalized headings
            |
            [A-Z]{3,}(?:\ [A-Z]{2,})*(?::?[\s]*\n)      # Matches ALL CAPS headings
        )
    """, re.VERBOSE | re.MULTILINE)
    matches = list(heading_regex.finditer(resume_text))

    sections = {}
    for i, match in enumerate(matches):
        # The section starts at the end of the heading
        start = match.end()
        # The section ends at the start of the next heading or the end of the resume
        end = matches[i + 1].start() if i + 1 < len(matches) else len(resume_text)
        body = resume_text[start:end].strip()

        normalized_heading = normalize_heading(match.group(1).strip())

        if normalized_heading == "Miscellaneous":
            sections.setdefault(normalized_heading, []).append(body)
        elif normalized_heading in sections:
            # Same section seen again (e.g. "Work Experience" and
            # "Internships"): keep both parts rather than dropping the first.
            parts = (sections[normalized_heading], body)
            sections[normalized_heading] = "\n\n".join(part for part in parts if part)
        else:
            sections[normalized_heading] = body

    return sections

In [None]:
# Run the heading-based splitter on a sample resume and pretty-print the result.
# NOTE(review): this path uses "../sample-data/" while later cells use
# "../sample_data/" — confirm which directory name is correct.
resume_text = extract_pdf_text("../sample-data/Ben_Resume.pdf")
resume_sections = extract_sections_by_heading(resume_text)
pprint(resume_sections)

Regex-based parsing is too inflexible for handling a wide variety of resume formats... maybe an LLM-based solution would work better

## Option 2: LLM with Structured Output

In [None]:
# Schema for the structured LLM output: one required string field per resume
# section. Each field's alias is the capitalized key used in the prompt's
# example JSON.
class Resume(BaseModel):
    experience: str = Field(..., alias="Experience")
    education: str  = Field(..., alias="Education")
    skills: str     = Field(..., alias="Skills")
    projects: str   = Field(..., alias="Projects")
    leadership: str = Field(..., alias="Leadership")
    research: str   = Field(..., alias="Research")

In [10]:
# Prompt template for text-based section extraction. It is filled in with
# `str.format(resume_text=...)`, which is why the literal braces of the example
# JSON are doubled ({{ and }}).
INITIAL_EXTRACTION_PROMPT = """
You are an expert at parsing resume information. Given resume text, your job is to parse individual sections based on resume heading. Follow this format:
    {{
        "Experience": "<Experience>",
        "Education": "<Education>",
        "Skills": "<Skills>",
        "Projects": "<Projects>",
        "Leadership": "<Leadership Experience>",
        "Research": "<Research Experience>"
    }}
    
Your job is very simple: simply copy everything under each resume section into the output format; do not worry about formatting. 

If a resume does not contain one of the sections, output an empty string for that section. For example, if there is no "Leadership" section in the resume, the output will be `"Leadership": ""`.

In the experience section, **ensure that you include company names**, which are usually listed beside or under position names.

**The parsed information must be explicitly contained in the resume.**

**Do not exclude any information from the resume.**

Resume:
{resume_text}

Output:
"""

In [11]:
# Extract the resume text, insert it into the prompt template and request
# output that follows the `Resume` schema from the "llama3.1" model.
resume_text = extract_pdf_text("../input/resumes/Kevin_resume.pdf")
parsed_resume = with_structured_output(
    prompt=INITIAL_EXTRACTION_PROMPT.format(resume_text=resume_text),
    schema=Resume,
    model="llama3.1")

Save the regex-parsed contact details to a separate output file; they are kept out of the parsed resume output for privacy reasons

In [12]:
# Run each contact-detail parser over the resume text and collect the results
# under a display label.
parsed_info = {
    label: parser(resume_text)
    for label, parser in (
        ("Phone", parse_phone_number),
        ("Email", parse_email),
        ("LinkedIn", parse_linkedin),
        ("GitHub", parse_github),
    )
}

# The contact details go into their own JSON file.
with open("../output/parsed_personal_info.json", "w") as file:
    json.dump(parsed_info, file, indent=4)

In [13]:
# Write the LLM-parsed resume sections to a JSON file with 4-space indentation.
with open("../output/parsed_resume.json", "w") as file:
    json.dump(parsed_resume, file, indent=4)

### Trying out HuggingFace

In [None]:
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# The Hugging Face token is read from the HF_API_KEY environment variable.
HF_API_KEY = os.getenv("HF_API_KEY")
client = InferenceClient(token=HF_API_KEY)

# Small smoke test: a trivial prompt, capped at 50 new tokens.
response = client.text_generation(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="Answer concisely: What is 10 x 10?",
    max_new_tokens=50)

# Last expression of the cell, so the notebook displays the response.
response

In [None]:
# Same extraction prompt as the earlier text-LLM cell, sent to a Hugging Face
# hosted model. The JSON schema of `Resume` is passed as a "json" grammar.
response = client.text_generation(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt=INITIAL_EXTRACTION_PROMPT.format(resume_text=resume_text),
    grammar={
        "type": "json",
        "value": Resume.model_json_schema()
    }
)

Need HF Pro subscription to access some Llama models; I think Ollama might be my best option after all

## Option 3: Vision Model Parsing

In [None]:
# Run plain text extraction on the "_img" sample PDF; the notebook displays
# whatever text (if any) comes back.
extract_pdf_text("../sample_data/Kevin_resume_img.pdf")

Convert the PDF to an image

In [None]:
# Render every page of the PDF to an image at 300 DPI.
images = convert_from_path("../sample_data/Ben_resume.pdf", dpi=300)

for page_number, image in enumerate(images, start=1):
    # Page 1 keeps the plain "output/Ben_resume.jpg" name that the vision-model
    # cell reads. Later pages get a numbered suffix; previously every page was
    # written to the same file, so each one overwrote the last.
    suffix = "" if page_number == 1 else f"_page{page_number}"
    image.save(f"output/Ben_resume{suffix}.jpg", "JPEG")
    print(f"Saved page {page_number}")

Run the vision model

In [8]:
# Prompt for image-based section extraction. It has no placeholders; the
# example JSON is written with doubled braces ({{ and }}), the same escaping
# used by the text prompt template.
IMAGE_EXTRACTION_PROMPT = """
You are an expert at parsing resume information from an image. Given an image of a resume, your job is to parse individual sections based on resume heading. Follow this format:
    {{
        "Experience": "<Experience>",
        "Education": "<Education>",
        "Skills": "<Skills>",
        "Projects": "<Projects>",
        "Leadership": "<Leadership Experience>",
        "Research": "<Research Experience>"
    }}
    
Your job is very simple: simply copy everything under each resume section into the output format; do not worry about formatting. 

If a resume does not contain one of the sections, output an empty string for that section. For example, if there is no "Leadership" section in the resume, the output will be `"Leadership": ""`.

In the experience section, **ensure that you include company names**, which are usually listed beside or under position names.

**The parsed information must be explicitly contained in the resume.**

**Do not exclude any information from the resume.**

Output:
"""

In [19]:
from ollama import chat

# Send the rendered resume image to the vision model, constraining the reply
# to the JSON schema of `Resume`.
response = chat(
    model="llama3.2-vision",
    messages=[
        {
            "role": "user",
            # The prompt escapes its braces as {{ }} like a format template;
            # .format() collapses them so the model sees ordinary JSON braces.
            "content": IMAGE_EXTRACTION_PROMPT.format(),
            "images": ["output/Ben_resume.jpg"]
        }
    ],
    format=Resume.model_json_schema())

In [None]:
# The model reply is a JSON string; decode it and pretty-print the result.
parsed_resume = json.loads(response.message.content)
pprint(parsed_resume)

Downsides of VLMs:
1. Most state-of-the-art VLMs cannot process very large images (e.g., Llama3.2-vision can only process images up to 1120x1120)
2. llama3.2-vision runs pretty slowly on my computer, probably because it has 11B parameters vs. only 8B for llama3.1
3. llama3.2-vision isn't very good at parsing a lot of textual information from resumes; it's missing a lot of information from the "Experience" section

## Option 4: OCR -> LLM parsing

In [13]:
# Render the PDF pages to images at 300 DPI and save the first page as a PNG.
pdf_images = convert_from_path("../sample_data/Kevin_resume.pdf", dpi=300)
pdf_images[0].save("output/Kevin_resume.png", "PNG")

In [14]:
# Run OCR on the first page image only.
extracted_pdf_text = pytesseract.image_to_string(pdf_images[0])

In [None]:
# Inspect the OCR output.
pprint(extracted_pdf_text)

In [16]:
# Redefinition of the `Resume` schema with the same fields as the earlier cell:
# one required string field per resume section, aliased to the capitalized
# key used in the prompt's example JSON.
class Resume(BaseModel):
    experience: str = Field(..., alias="Experience")
    education: str  = Field(..., alias="Education")
    skills: str     = Field(..., alias="Skills")
    projects: str   = Field(..., alias="Projects")
    leadership: str = Field(..., alias="Leadership")
    research: str   = Field(..., alias="Research")
    
# Redefinition of the text extraction prompt template. It is filled in with
# `str.format(resume_text=...)`, which is why the literal braces of the example
# JSON are doubled ({{ and }}).
INITIAL_EXTRACTION_PROMPT = """
You are an expert at parsing resume information. Given resume text, your job is to parse individual sections based on resume heading. Follow this format:
    {{
        "Experience": "<Experience>",
        "Education": "<Education>",
        "Skills": "<Skills>",
        "Projects": "<Projects>",
        "Leadership": "<Leadership Experience>",
        "Research": "<Research Experience>"
    }}
    
Your job is very simple: simply copy everything under each resume section into the output format; do not worry about formatting. 

If a resume does not contain one of the sections, output an empty string for that section. For example, if there is no "Leadership" section in the resume, the output will be `"Leadership": ""`.

In the experience section, **ensure that you include company names**, which are usually listed beside or under position names.

**The parsed information must be explicitly contained in the resume.**

**Do not exclude any information from the resume.**

Resume:
{resume_text}

Output:
"""

In [17]:
# Feed the OCR text (rather than text extracted directly from the PDF) into
# the extraction prompt and request `Resume`-shaped output from "llama3.1".
parsed_resume = with_structured_output(
    prompt=INITIAL_EXTRACTION_PROMPT.format(resume_text=extracted_pdf_text),
    schema=Resume,
    model="llama3.1")

In [None]:
# Inspect the sections parsed from the OCR text.
pprint(parsed_resume)