In [1]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


## Extract text from the Resume(pdf)

In [2]:
import pdfplumber

def extract_text(path):
    text = ""
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

## Clean Extracted Text

In [3]:
import re

def clean_text(text):
    # Remove bullet symbols like \x7f
    text = re.sub(r'\x7f', ' ', text)
    
    # Replace newlines with space
    text = re.sub(r'\n', ' ', text)
    
    # Remove standalone numbers (list artifacts)
    text = re.sub(r'\b\d+\b', ' ', text)
    
    # Normalize all whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


## Extract Email & Number

In [4]:
def extract_email(text):
    emails= re.findall(r'\S+@+\S+', text)
    return emails[0] if emails else None
    

def extract_number(text):
    phone_pattern = r'(\+?\d{1,3}[-\s]?)?\d{5}[-\s]?\d{5}'
    phones = re.findall(phone_pattern, text)
    return phones[0] if phones else "Not found"


## Extract Name

In [5]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [6]:
def extract_name(text):
    doc=nlp(text)
    for ent in doc.ents:
        if ent.label_=='PERSON':
            return ent.text
    return None

## Extract Skills

In [7]:
def extract_skills(text):
    with open('skills.txt') as f:
        skills=[s.strip().lower() for s in f.readlines()]

    text=text.lower()
    return [skill for skill in skills if skill in text]

In [8]:
def resume_parser(path):
    text = extract_text(path)
    text = clean_text(text)

    return {
        "Name": extract_name(text),
        "Email": extract_email(text),
        "Phone": extract_number(text),
        "Skills": extract_skills(text),
    }


In [9]:
resume_parser('sample_resume_ml_training.pdf')

{'Name': 'John Doe Data',
 'Email': 'johndoe@email.com',
 'Phone': 'Not found',
 'Skills': ['python', 'machine learning', 'sql', 'nlp']}