# Resume parser using NLP

### Niam Bashambu

In [69]:
#imports

import pdfplumber
import docx
import spacy
import re
from openai import OpenAI
import os

from dotenv import load_dotenv


In [4]:
#text extraction functions
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text.strip()

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs]).strip()

def extract_text(file_path):
    if file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    else:
        return None

In [5]:
nlp = spacy.load("en_core_web_sm")

def extract_contact_info(text):
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

    email = re.findall(email_pattern, text)
    phone = re.findall(phone_pattern, text)

    return email[0] if email else None, phone[0] if phone else None

def extract_name(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text
    return None



In [6]:
def extract_education(text):
    doc = nlp(text)
    education = []
    for ent in doc.ents:
        if ent.label_ in ["ORG"]:  # Universities often tagged as ORG
            education.append(ent.text)
    return list(set(education))  # Remove duplicates

def extract_experience(text):
    doc = nlp(text)
    experience = []
    for ent in doc.ents:
        if ent.label_ in ["ORG", "GPE"]:  # Companies and locations often tagged as ORG/GPE
            experience.append(ent.text)
    return list(set(experience))

In [7]:
def extract_skills(text):
    doc = nlp(text)
    skills = []
    for ent in doc.ents:
        if ent.label_ in ["PRODUCT", "WORK_OF_ART"]:  # Often used for tools, tech names
            skills.append(ent.text)
    
    # Add general nouns (common for skills)
    skills.extend([token.text for token in doc if token.pos_ == "NOUN" and len(token.text) > 1])

    return list(set(skills))  # Remove duplicates

In [8]:
def parse_resume(file_path):
    text = extract_text(file_path)
    if not text:
        return {"error": "Unsupported file format"}

    name = extract_name(text)
    email, phone = extract_contact_info(text)
    education = extract_education(text)
    experience = extract_experience(text)
    skills = extract_skills(text)

    parsed_resume = {
        "Name": name,
        "Email": email,
        "Phone": phone,
        "Education": education,
        "Experience": experience,
        "Skills": skills
    }

    return parsed_resume


In [9]:
#testing
file_path = "example_resumes/NiamBashambuResumecopy.pdf"
parsed_data = parse_resume(file_path)
parsed_data

{'Name': 'Niam Bashambu',
 'Email': 'niambashambu@icloud.com',
 'Phone': '415-999-9281',
 'Education': ['• Collaborated',
  '• Conducted',
  'CSS',
  'StudyPlanGPT',
  'AI',
  'Saint Louis University',
  '• Implemented',
  'Khoury College of Computer Sciences',
  '• Trained',
  'NJ Jul',
  'Bay Takes',
  'OptiRun',
  'SQL',
  'API',
  'Business Statistics',
  'HTML',
  'Bachelor of Sciences: Data Science and Business Administration GPA',
  '• Contributed',
  'Northeastern University',
  'Financial Accounting',
  'Algorithms and Data, Discrete Structures',
  'Advanced Programming with Data',
  'Project Intern\n• Worked',
  'MA Sep',
  'Strava',
  'TikTok'],
 'Experience': ['• Collaborated',
  '• Conducted',
  'Node.js',
  'Boston',
  'Oakland',
  'CSS',
  'StudyPlanGPT',
  'Madrid',
  'AI',
  'Saint Louis University',
  '• Implemented',
  'Khoury College of Computer Sciences',
  '• Trained',
  'NJ Jul',
  'Bay Takes',
  'OptiRun',
  'Keras',
  'SQL',
  'Flask',
  'Camden',
  'API',
  'B

Obviously not very good, didn't get much meaning besides the name and email. 

In [20]:
#train data
file_path = "example_resumes/NiamBashambuResumecopy.pdf"
text = extract_text(file_path)
print(text)

Niam Bashambu
niambashambu@icloud.com • 415-999-9281 • GitHub • LinkedIn • niambashambu.com
EDUCATION
Northeastern University | Boston, MA Aug 2023 - Present
Khoury College of Computer Sciences Expected May 2027
Bachelor of Sciences: Data Science and Business Administration GPA: 3.5/4.0
Relevant course work: Advanced Programming with Data, Database Design, Foundations of Data Science,
Algorithms and Data, Discrete Structures, Financial Accounting, Business Statistics
Saint Louis University | Madrid, ES Aug 2023 - Dec 2023
Semester Study Abroad
Activities: Men’s Soccer
TECHNICAL SKILLS
Programming languages: Java, JavaScript, Python, SQL, HTML, CSS, Swift
Applications: IntelliJ IDEA, VS-Code, PyCharm, Git, Xcode, Jupyter Notebook, Docker
Frameworks and Libraries: React.js, TensorFlow, Node.js, NumPy, Pandas, Flask, Django, Keras, scikit-learn, Matplotlib
PROFESSIONAL EXPERIENCE
Belvidere Labs LLC | Boston, MA Sep 2024 - Oct 2024
Founder
• Created StudyPlanGPT, an app available on the Ap

## custom spaCy NER model

In [None]:

# Load the pre-trained SpaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_with_spacy(text):
    # Process the text to create a doc object
    doc = nlp(text)
    
    # Extract named entities
    entities = []
    for ent in doc.ents:
        entities.append({
            'text': ent.text,
            'label': ent.label_
        })
    
    return entities

# Preprocess the text
entities = preprocess_with_spacy(text)
entities

[{'text': 'Niam Bashambu', 'label': 'PERSON'},
 {'text': '415', 'label': 'CARDINAL'},
 {'text': 'Northeastern University', 'label': 'ORG'},
 {'text': 'Boston', 'label': 'GPE'},
 {'text': 'MA Aug', 'label': 'PERSON'},
 {'text': '2023', 'label': 'DATE'},
 {'text': 'Khoury College of Computer Sciences', 'label': 'ORG'},
 {'text': 'May 2027', 'label': 'DATE'},
 {'text': 'Bachelor of Sciences: Data Science and Business Administration GPA',
  'label': 'ORG'},
 {'text': '3.5/4.0', 'label': 'CARDINAL'},
 {'text': 'Advanced Programming with Data', 'label': 'ORG'},
 {'text': 'Algorithms and Data, Discrete Structures', 'label': 'ORG'},
 {'text': 'Financial Accounting', 'label': 'ORG'},
 {'text': 'Business Statistics', 'label': 'ORG'},
 {'text': 'Saint Louis University', 'label': 'ORG'},
 {'text': 'Madrid', 'label': 'GPE'},
 {'text': '2023 - Dec 2023', 'label': 'DATE'},
 {'text': 'Java', 'label': 'PERSON'},
 {'text': 'JavaScript', 'label': 'PERSON'},
 {'text': 'SQL', 'label': 'ORG'},
 {'text': 'HT

doesn't properly label everyting, could probably work on this to make it do that. 

In [78]:
api_key = os.environ.get("OPENAI_API_KEY")
load_dotenv()
client = OpenAI(api_key=api_key)

# Function to query GPT for contextual refinement
def refine_entities_with_gpt(entities, text):
    prompt = [{"role": "user", "content": f"Parse the following resume text and provide only the important information, labeling each section appropriately (e.g., name, skills, education, experience, etc.). Ensure the response is concise, with minimal detail, and does not exceed the 500-character limit. The response should be in JSON format, containing only relevant information. There will also be entities provided that could help: {entities}. Here is the resume text: {text}"}]
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Use the appropriate model
        messages=prompt,
        max_tokens=500,
    )

    response = response.choices[0].message.content.strip()
    return response

# Refining the extracted entities
refined_entities = refine_entities_with_gpt(entities, text)
print("Refined Entities:", refined_entities)

Refined Entities: ```json
{
  "name": "Niam Bashambu",
  "contact": {
    "email": "niambashambu@icloud.com",
    "phone": "415-999-9281"
  },
  "education": [
    {
      "institution": "Northeastern University",
      "location": "Boston, MA",
      "start_date": "Aug 2023",
      "expected_graduation": "May 2027",
      "degree": "Bachelor of Sciences: Data Science and Business Administration",
      "GPA": "3.5/4.0"
    },
    {
      "institution": "Saint Louis University",
      "location": "Madrid, ES",
      "start_date": "Aug 2023",
      "end_date": "Dec 2023"
    }
  ],
  "skills": {
    "programming_languages": ["Java", "JavaScript", "Python", "SQL", "HTML", "CSS", "Swift"],
    "applications": ["IntelliJ IDEA", "VS-Code", "PyCharm", "Git", "Xcode", "Jupyter Notebook", "Docker"],
    "frameworks": ["React.js", "TensorFlow", "Node.js", "NumPy", "Pandas", "Flask", "Django", "Keras", "scikit-learn", "Matplotlib"]
  },
  "experience": [
    {
      "position": "Founder",
      

used openai API, and it did a way better job to orgnaize everything in a json format as well. 