In [1]:
import pytesseract

from pdf2image import convert_from_path

from pytesseract import Output

from PIL import Image

import pandas as pd

import spacy

import re

import nltk
 
import string

In [2]:
def pdftoimage(path):

    try:
        pages = convert_from_path(path, 600)
        for i in range(len(pages)) :
            pages[i].save('page'+ str(i) +'.jpg', 'JPEG')
        # Simple image to string
        text_data = ''
        for i in range(len(pages)):
            text=pytesseract.image_to_string(Image.open(f'page{i}.jpg'))
            text_data += text + '/n'

        return text_data

    except:
        return 'could not run correctly'

In [3]:
def clean_text(text):

    try:
        text=re.sub(r'/n+','/n',text)
        text=re.sub(r' +',' ',text)
        text=re.sub(r'\+',' ',text)
        text=re.sub(r'[#^&[*()|«!>”{¢‘“}]', '', text)

        return text.strip()
    
    except:
        return 'error raised'

In [4]:
def extract_mail(text):

        match=re.search(r'[A-Za-z]+\S+@\S+',text)
        return str(match[0]) if match else 'no email found'

In [5]:
def extract_phone(text):
        match=re.search(r'\+?\d[\d\s\-()]{8,15}\S{0}',text)
        return str(match[0]) if match else 'no phone number found'


In [6]:
def extract_name(text):

    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)
    for ent in doc.ents:
        if ent.label_=='PERSON':
            return ent.text
    return 'no name found'


In [7]:
def extract_location(text):

    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)

    country,city='',''

    for ent in doc.ents:
        if ent.label_=='GPE'and not city:
            city=ent.text
        if ent.label_=='NORP'and not country :
            country=ent.text

    if city and country:
        return ', '.join([city,country])
    elif city :
        return city
    elif country :
        return country
    else :
        return 'no location found'

In [27]:
SKILLS_DB = [
    'machine learning',
    'data science',
    'python',
    'word',
    'excel',
    'English',
    'SQL',
    'ux designer',
    'Power BI',
    'data analysis',
    'spanish',
    'sales growth',
    'networking',
    'strategic planning',
    'talent development',
    'HTML',
    'CSS',
    'Javascript',
    'milling',
    'autocad',
    'SAP',
    'solidworks',
    'cat grooming',
    'dog grooming',
    'customer servie',
    'dog training',
    'sales and marketing',
    'visual design',
    'customer relationshop',
    'cleaning',
    'mathematics'
    
]
 

def extract_skills(text,SKILLS_DB):
    try :
        found_skills=[skill for skill in SKILLS_DB if skill.lower() in text.lower()]
        return found_skills 
    except :
        return 'no skills found'

In [28]:
edu_keys = [
    'school',
    'college',
    'univers',
    'academy',
    'faculty',
    'institute',
    'bachelor',
    'masters',
    'master',
    'licence',
    'high school',
    'university',
    'lycee',
    'institut',
    'doctorat',
    'PhD',
    'diploma',
    'diplome',
    'certificate',
    'certification',
    'certified',
    'certifie',
    'associetes degree'
]


def extract_education(text,edu_keys):
    try:
        lines=text.split('\n')
        education=[]
        for line in lines:
            for word in edu_keys:
                if word.lower() in line.lower() and line not in education:
                    education.append(line.strip())
        return education

    except :
        return 'no education history found'

In [None]:
keys = [
    'account manager',
    'ux designer',
    'junior ux designer',
    'senior ux designer',
    'civil engineer',
    'project engineer',
    'pet groomer',
    'event coordinator'

]


def extract_exp(text,keys):

    try:
        lines=text.split('\n')
        exp=[]
        for line in lines:
            for word in keys:
                if word.lower() in line.lower() and line not in exp:
                    exp.append(line.strip())
        return exp
    except:
        return 'no experience history found'

# here is an example of execution

In [None]:
text_data=pdftoimage("Example-1-PDF.pdf")

In [25]:
cleaned=clean_text(text_data)
cleaned

'Emmet Brickowski\nebrickowski@lcu.edu  123 456 - 7890  Bricksburg, DK  linkedin.com/emmetbrickowsk1\n\nOBJECTIVE: Fourth-year Civil Engineering student seeking a job as a Project Engineer at Bricksburg Construction\n\nEDUCATION\nLEGO City University LEGO City, DK\nBachelor of Science in Civil Engineering  GPA: 3.8/4.0 Expected Graduation: June 2024\n\nRelevant Coursework: Statics, Principles of Computer-Aided Design\n\nSKILLS\ne Manufacturing: CNC and Manual Milling, Drill Press, Lathe, Band Saw, Laser Cutting, 3D Printing\ne Software: AutoCAD, SAP2000, SolidWorks, Microsoft Suite, Google Suite\ne Languages: Fluent in English, Conversational Robot\n\nWORK EXPERIENCE\n\nCivil Engineer Intern, Bricksburg Construction, Bricksburg, DK June 2022 - September 2022\ne Created pre-construction civil engineering design plans using multiple technical software programs\n\nDeveloped familiarity with federal, state, and local regulation and ensured compliance by filing for permits\n\nRead and inter

In [24]:
extract_exp(cleaned,keys)

[]

In [13]:
extract_skills(cleaned,SKILLS_DB)

['excel',
 'English',
 'spanish',
 'sales growth',
 'networking',
 'strategic planning',
 'talent development']

In [14]:
extract_education(cleaned,edu_keys)

['Bachelor Degree in Advertising and Marketing',
 'University of Denver, Denver',
 'Advanced Diploma in Global Customer Relationship Management',
 'Business College of New York, Brooklyn',
 'Certified Key Accounts Manager CKAM, Udemy, Online, Udemy, Online',
 'Certificate in Project Management, Certified Institute of Project Managers']

In [15]:
extract_location(cleaned)

'Los Angeles, American'

In [16]:
extract_mail(cleaned)

'email@email.com'

In [17]:
extract_phone(cleaned)

'541 754-3010 '

In [18]:
extract_name(cleaned)

'Michelle Smith'

# the main function

In [20]:
import os

def data_extraction(folder,exp,edu,skills):
    results=[]

    for file in os.listdir(folder):
        if file.endswith('.pdf'):
            path=os.path.join(folder,file)
            text=pdftoimage(path)
            text=clean_text(text)
            parsed={
                'file':file,
                'name':extract_name(text),
                'location':extract_location(text),
                'email':extract_mail(text),
                'phone':extract_phone(text),
                'skills':extract_skills(text,skills),
                'education':extract_education(text,edu),
                'experience':extract_exp(text,exp)
            }

            results.append(parsed)
            
    return pd.DataFrame(results)

In [30]:
data_extraction('../phase1',keys,edu_keys,SKILLS_DB)

Unnamed: 0,file,name,location,email,phone,skills,education,experience
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,"[English, networking, milling, autocad, SAP, s...","[LEGO City University LEGO City, DK, Bachelor ...",[OBJECTIVE: Fourth-year Civil Engineering stud...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,"[python, excel]",[],[]
2,Example-1-PDF.pdf,Michelle Smith,"Los Angeles, American",email@email.com,541 754-3010,"[excel, English, spanish, sales growth, networ...","[Bachelor Degree in Advertising and Marketing,...",[Sep 2022 - Apr 2025 m Global Key Account Mana...
3,resume-example-12.pdf,Patty,"Orlando, American",example@email.com,890-555-0401\n\n,"[cat grooming, dog grooming, dog training, cle...",[],"[PET GROOMER, Professional and personable Pet ..."
4,Resume-Template-Modern.pdf,Templates Build,"Los Angeles, French",email@email.com,3868683442\n\n,"[English, sales and marketing, visual design]","[A in Hotel Event Management, University of N...",[Practical Event Coordinator with 5 years exp...
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,"[excel, ux designer, HTML, CSS, Javascript]",[Masters Degree in Human Computer Interaction ...,"[John Huber, UX Designer, Proactive and detail..."
6,Stockholm-Resum.pdf,Jason Miller,"Warehouse Sanitation, Spanish",email@email.com,3868683442\n,"[English, spanish, cleaning, mathematics]",[on packing records. Completed a certificate i...,[]
