# Parse text from resumes in PDF format

**TODO**
- [ ] Implement **OCR-based parsing methods** for PDFs that were created from images

In [45]:
import pdfplumber
import re
from pprint import pprint
from collections import defaultdict

### Text Extraction
Extract text with pdfplumber (works only when the PDF contains machine-readable text, i.e. it is not a scanned image)

In [6]:
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Pages are joined with a newline so the last line of one page is not
    fused with the first line of the next. Pages that yield no text are
    skipped.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated page text; an empty string when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None or "" for a page without a text layer
        # (e.g. a scanned image); normalise both to "" instead of crashing
        # on `str + None`.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

In [37]:
# Extract the raw text of the sample resume (path is relative to the
# notebook's working directory) and pretty-print it for inspection.
resume_text = extract_pdf_text("../resume-samples/sample.pdf")
pprint(resume_text)

('Kevin Zhang\n'
 '(832) 416-3570 | kevzhang2022@gmail.com | linkedin.com/in/kevinkz | '
 'github.com/n1v3x2\n'
 'Education\n'
 'Texas A&M University May 2026\n'
 'BS in Computer Science, Minor in Statistics and Math College Station, TX\n'
 'Cumulative GPA: 4.0/4.0\n'
 'Honors: Dean’s Honor Roll, Engineering Honors (EH), Dean’s Excellence Award '
 'Semi-finalist\n'
 'Coursework: Data Structures & Algorithms, Software Engineering, Computer '
 'Systems, Discrete Math, Linear Algebra\n'
 'Experience\n'
 'AI/ML Intern Aug 2024 – Dec 2024\n'
 'Sandia National Laboratories Remote\n'
 '• Developed knowledge graph (KG) generation pipeline with internal LLM '
 'microservices to allow multi-hop\n'
 'reasoning in 3-stage retrieval augmented generation (RAG) pipeline\n'
 '• Extracted 30+ domain-specific seed topics from text corpus with BERTopic '
 'for KG subgraph creation\n'
 '• Achieved100%schema-compliantLLMoutputsviaprompt engineering '
 'andgrammar-contrained decoding\n'
 '• Packaged KG gene

### Identify Section Headings
Education, Experience, Skills, etc.

#### Normalize headings

In [31]:
# Maps regex patterns for raw resume section headings to canonical section
# names. Keys are regular-expression source strings; values are the
# normalized heading returned by normalize_heading.
# Entry order is significant: normalize_heading iterates this mapping in
# insertion order and matches case-insensitively.
heading_map = {
    # Experience
    r"(Work|Relevant|Professional)?\s*(Experience|History)": "Experience",
    r"(Employment|Career)\s*(History|Experience)": "Experience",
    r"(Internship|Internships|Intern Experiences?)": "Experience",
    r"(Freelance|Contract)\s*(Work|Experience)": "Experience",
    r"Work": "Experience",

    # Education
    r"(Education|Educational Background|Academic History|Academic Background)": "Education",
    # Certifications and training (canonical name "Certifications")
    r"(Certifications|Courses|Licenses|Trainings|Accreditations)": "Certifications",
    r"(Professional Development|Learning)": "Certifications",

    # Skills
    r"(Skills|Technical Skills|Key Competencies|Core Competencies|Abilities)": "Skills",
    r"(Technical Proficiencies|Technical Expertise|Expertise|Proficiencies)": "Skills",
    r"(Languages|Programming Languages)": "Skills",
    
    # Projects
    r"(Projects|Key Projects|Personal Projects|Side Projects)": "Projects",
    r"(Freelance Projects|Independent Projects|Portfolio)": "Projects",

    # Achievements and Awards
    r"(Achievements?|Awards?|Honors?|Accolades?|Recognitions?)": "Achievements",
    r"(Accomplishments|Milestones)": "Achievements",

    # Volunteer Work
    r"(Volunteer|Volunteering|Community( Service)?|Volunteer Experience)": "Volunteer Work",
    r"(Social Work|Non-Profit Work)": "Volunteer Work",

    # Leadership
    r"(Leadership|Leadership Experience|Leadership Roles|Positions of Responsibility)": "Leadership",
    r"(Managerial Experience|Team Leadership|Organizational Roles)": "Leadership",

    # Publications and Research
    # NOTE: two distinct canonical names are produced here ("Publications" and "Research").
    r"(Publications?|Research|Academic Papers|Articles|Journals?)": "Publications",
    r"(Research Projects|Thesis|Dissertation)": "Research",

    # Interests and Hobbies
    r"(Interests?|Hobbies?|(Extracurricular|Collegiate) Activities)": "Interests",
    r"(Passions?|Leisure Activities)": "Interests",

    # Objective or Summary
    r"(Objective|Career Objective|Professional Objective)": "Summary",
    r"(Summary|Professional (Highlights|Profile|Summary)|Career Summary)": "Summary",

    # References
    r"(References?|Professional References|Referees?)": "References",
}


In [32]:
def normalize_heading(heading):
    """Map a raw section heading to its canonical name from ``heading_map``.

    Matching is case-insensitive and done in two passes over ``heading_map``
    (each in insertion order):

    1. Whole-heading match: a pattern must match the entire heading (ignoring
       surrounding whitespace and a trailing colon). This lets specific
       entries such as "Social Work" or "Leadership Experience" win over
       generic patterns like "Work" or "Experience" that merely occur
       inside them.
    2. Substring match: the first pattern found anywhere in the heading wins.

    Args:
        heading: Raw heading text, e.g. "Work Experience" or "SKILLS:".

    Returns:
        The canonical heading, or "Miscellaneous" when no pattern matches.
    """
    # Drop decoration that would defeat a whole-string match.
    cleaned = heading.strip().rstrip(":").strip()

    # Pass 1: prefer patterns that account for the whole heading.
    for pattern, normalized_heading in heading_map.items():
        if re.fullmatch(pattern, cleaned, re.IGNORECASE):
            return normalized_heading

    # Pass 2: fall back to the first pattern found anywhere in the heading.
    for pattern, normalized_heading in heading_map.items():
        if re.search(pattern, heading, re.IGNORECASE):
            return normalized_heading

    return "Miscellaneous"

In [48]:
def extract_sections_by_heading(resume_text: str) -> dict[str, str]:
    """Split resume text into sections keyed by normalized heading.

    A heading is a line consisting only of one or two Capitalized words, or
    of ALL-CAPS words (optionally followed by a colon). The body of a section
    is the text between its heading and the next heading (or the end of the
    text), stripped of surrounding whitespace. Text before the first heading
    is not included in the result.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        A dict mapping each normalized heading to its section body. When
        several headings normalize to the same name, their bodies are joined
        with a newline in document order. The "Miscellaneous" key, if
        present, holds a *list* of bodies (one per unrecognized heading)
        rather than a single string.
    """
    heading_pattern = r"""
        ^(
            [A-Z][a-z]+(?:\ [A-Z][a-z]+)?(?:[\s]*\n)    # Matches Capitalized headings
            |
            [A-Z]{3,}(?:\ [A-Z]{2,})*(?::?[\s]*\n)      # Matches ALL CAPS headings
        )
    """
    heading_regex = re.compile(heading_pattern, re.VERBOSE | re.MULTILINE)
    matches = list(heading_regex.finditer(resume_text))

    sections = {}
    for i, match in enumerate(matches):
        # The section starts at the end of the heading
        start = match.end()
        # The section ends at the start of the next heading or the end of the resume
        end = matches[i + 1].start() if i + 1 < len(matches) else len(resume_text)

        heading = match.group(1).strip()
        normalized_heading = normalize_heading(heading)
        body = resume_text[start:end].strip()

        if normalized_heading == "Miscellaneous":
            # Unrecognized headings are collected as separate entries.
            sections.setdefault(normalized_heading, []).append(body)
        elif normalized_heading in sections:
            # Several raw headings can normalize to the same name (e.g.
            # "Experience" and "Work History"); merge instead of overwriting
            # the earlier section.
            sections[normalized_heading] = f"{sections[normalized_heading]}\n{body}".strip()
        else:
            sections[normalized_heading] = body

    return sections

In [49]:
# Split the extracted resume text into sections keyed by normalized heading
# and pretty-print the result for inspection.
resume_sections = extract_sections_by_heading(resume_text)
pprint(resume_sections)

{'Education': 'Texas A&M University May 2026\n'
              'BS in Computer Science, Minor in Statistics and Math College '
              'Station, TX\n'
              'Cumulative GPA: 4.0/4.0\n'
              'Honors: Dean’s Honor Roll, Engineering Honors (EH), Dean’s '
              'Excellence Award Semi-finalist\n'
              'Coursework: Data Structures & Algorithms, Software Engineering, '
              'Computer Systems, Discrete Math, Linear Algebra',
 'Experience': 'AI/ML Intern Aug 2024 – Dec 2024\n'
               'Sandia National Laboratories Remote\n'
               '• Developed knowledge graph (KG) generation pipeline with '
               'internal LLM microservices to allow multi-hop\n'
               'reasoning in 3-stage retrieval augmented generation (RAG) '
               'pipeline\n'
               '• Extracted 30+ domain-specific seed topics from text corpus '
               'with BERTopic for KG subgraph creation\n'
               '• Achieved100%schema-compl

### Important features for resume scoring
1. Keywords match job description
2. Validate that skills are used in context ("built Flask app with Python" vs "Python" on its own)
3. Match sentences under "Projects" and "Experience" to the job description
4. Match job titles with job description
5. Determine number of years of relevant experience
6. Education: check degree & GPA
7. Check that resume has essential sections (contact info, skills, experience, education)
8. Penalize resumes with typos