In [11]:
import pdfplumber
import re 
import spacy

def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text

def extract_name(doc: spacy.tokens.Doc) -> str:
    """Extract the person's name using NER (Named Entity Recognition)."""
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text
    # Fallback: assume name is at the top of the resume (first line)
    first_line = doc.text.split('\n')[0].strip()
    return first_line if first_line else "Name not found"

def extract_email(text: str) -> str:
    """Extract email address using regex."""
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    match = re.search(email_pattern, text)
    return match.group(0) if match else "Email not found"

def extract_phone(text: str) -> str:
    """Extract phone number using regex."""
    phone_pattern = r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    match = re.search(phone_pattern, text)
    return match.group(0) if match else "Phone not found"

def extract_skills(doc, skills_list):
    """Extract skills by matching against a predefined list."""
    found_skills = set()
    text = doc.text.lower()
    for skill in skills_list:
        if skill.lower() in text:
            found_skills.add(skill)
    return list(found_skills) if found_skills else ["No skills matched"]

def extract_education(doc):
    """Extract education details."""
    education_keywords = {"university", "college", "institute", "school", "bachelor", "master", "phd", "degree"}
    education = []
    for ent in doc.ents:
        if ent.label_ in {"ORG", "DATE"} and any(keyword in ent.text.lower() for keyword in education_keywords):
            education.append(ent.text)
    return education if education else ["Education not found"]

def parse_resume(model, pdf_path, skills_list):
    """Main function to parse resume and extract information."""
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)
    if not text:
        return {"error": "No text extracted from resume"}

    # Process text with spaCy
    doc = model(text)

    # Extract information
    resume_data = {
        "name": extract_name(doc),
        "email": extract_email(text),
        "phone": extract_phone(text),
        "skills": extract_skills(doc, skills_list),
        "education": extract_education(doc)
    }
    return resume_data

In [2]:
path = '/Users/hainguyen/Desktop/Harry_Nguyen_Resume.pdf'

text = extract_text_from_pdf(path)
text

'Harry Nguyen\n§ hainguyen2903.github.io/gitprofile | (cid:239) linkedin.com/in/nguyenphuchai | # hainguyen29031412@gmail.com\nEDUCATION\nUniversity of Technology Sydney (UTS) August 2023 - August 2025\nMaster’s Degree in Data Science (Postgraduate Excellence International Scholarship)\nUniversity of Engineering and Technology, Vietnam National University August 2018 - August 2022\nBachelor’s Degree in Computer Science\nGPA: 3.51/4.0 Thesis score: 9.1/10\nSKILLS AND KNOWLEDGE\nBackground Knowledge Machine Learning, Computer Vision, Natural Language Processing\nData Analysis Skills Tableau, Seaborn, Matplotlib, AWS\nProgram Languages Python, SQL\nData Enigineer Skills Azure, Databrick, Airflow, dbt\nSoft Skills Teamwork, Self-Studying, Leadership, Problem Solving\nWORKING EXPERIENCE\nMachine Learning Engineer (SiliconCube Company) July 2022 - June 2023\n• Smart Parking System - Car Detection and License Plate Recognition\nAn automatic parking monitoring system, which includes license pl

In [12]:
model = spacy.load('en_core_web_sm')

# processed_text = model(text)
# processed_text

sample_skills = [
        "Python", "Java", "Machine Learning", "SQL", "JavaScript",
        "Project Management", "Data Analysis", "C++", "React", "AWS"
    ]


# Parse the resume
result = parse_resume(model, path, sample_skills)

# Print results
print("Extracted Resume Information:")
for key, value in result.items():
    print(f"{key.capitalize()}: {value}")

Extracted Resume Information:
Name: Harry Nguyen
§
Email: hainguyen29031412@gmail.com
Phone: Phone not found
Skills: ['SQL', 'Machine Learning', 'Data Analysis', 'Python', 'AWS']
Education: ['University of Technology Sydney', 'University of Engineering and Technology', 'Vietnam National University', 'Bachelor’s Degree']


In [13]:
path = '/Users/hainguyen/Downloads/Henry CV.pdf'

result = parse_resume(model, path, sample_skills)

# Print results
print("Extracted Resume Information:")
for key, value in result.items():
    print(f"{key.capitalize()}: {value}")

Extracted Resume Information:
Name: HENRY LE
Email: nhathoangle1312@gmail.com
Phone: Phone not found
Skills: ['SQL', 'Machine Learning', 'Data Analysis', 'Python', 'AWS']
Education: ['UNIVERSITY OF TECHNOLOGY SYDNEY', 'KENT INSTITUTE UNIVERSITY', 'Bachelor of Accounting\n• Focus', 'CURTIN UNIVERSITY']


In [1]:
import pandas as pd
df = pd.read_csv('datasets/linkedin-jobs-2023-2024/postings.csv')
df.head(3)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0


In [3]:
df.columns

Index(['job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips'],
      dtype='object')

In [6]:
df['description']

116       Request: Data ArchitectLocation: San Francisco...
134       This opportunity is joining an innovation driv...
165       The Enterprise Data Infrastructure and Analyti...
283       Data Engineer with Kafka (W2 Only)💯% Remote\nM...
348       Company DescriptionPB Built is a residential c...
                                ...                        
123475    About This Featured Opportunity\n\nWe are look...
123580    Role Title: Data Engineering Lead for a global...
123727    Overview\n\nThe Credit Risk & Decision Science...
123770    Overview\n\nManage Navy Federal's BSA/AML and ...
123845    About Pinterest:\n\nMillions of people across ...
Name: description, Length: 2720, dtype: object

In [2]:
# all_titles = df['title'].unique()

# titles = ['Data Analyst', 'Data Scientist', 'Data Engineer', 'ML Engineer', 'Machine Learning Engineer', 'AI Engineer',
#           'AI Researcher']
# df_filter = df[df.title.isin(titles)]

title_keywords = ['Data ', 'AI ', 'ML ', 'Machine Learning ']

df = df[df.title.str.contains('|'.join(title_keywords), case=False, na=False)]
print(len(df))

2720


In [7]:
df.to_csv('job_descs.csv', index=False)