In [1]:
import pytesseract

from pdf2image import convert_from_path

#from pytesseract import Output

from PIL import Image

import pandas as pd

import spacy

import re

import nltk
 
import string

In [2]:
def pdftoimage(path):

    try:
        pages = convert_from_path(path, 600)
        for i in range(len(pages)) :
            pages[i].save('page'+ str(i) +'.jpg', 'JPEG')
        # Simple image to string
        text_data = ''
        for i in range(len(pages)):
            text=pytesseract.image_to_string(Image.open(f'page{i}.jpg'))
            text_data += text + '/n'

        return text_data

    except:
        return 'could not run correctly'

In [3]:
def clean_text(text):

    try:
        text=re.sub(r'/n+','/n',text)
        text=re.sub(r' +',' ',text)
        text=re.sub(r'\+',' ',text)
        text=re.sub(r'[#^&[*()|«!>”{¢‘“}^_`:;~]', '', text)

        return text.strip()
    
    except:
        return 'error raised'

In [4]:
def extract_mail(text):
    
    nlp = spacy.load("en_core_web_sm")
    
    doc = nlp(text)

    for ent in doc:
        if ent.like_email:
            return ent.text
        
    return 'no email found'

In [5]:
def extract_URL(text):
    
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    links=[]
    for ent in doc:
        if ent.like_url:
            links.append(ent.text)
    
    return links if links else 'no URL found'

In [6]:
def extract_phone(text):
        match=re.search(r'\+?\d[\d\s\-()]{8,15}\S{0}',text)
        return str(match[0]) if match else 'no phone number found'


In [7]:
def extract_name(text):

    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)

    for ent in doc.ents:
        if ent.label_=='PERSON':
            return ent.text
            
    return 'no name found'

In [8]:
def extract_location(text):

    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)

    country,city='',''

    for ent in doc.ents:
        if ent.label_=='GPE'and not city:
            city=ent.text
        #if ent.label_=='NORP'and not country :
           #country=ent.text

    if city and country:
        return ', '.join([city,country])
    elif city :
        return city
    elif country :
        return country
    else :
        return 'no location found'

In [None]:
SKILLS_DB = [
    'machine learning',
    'data science',
    'python',
    'word',
    'excel',
    'english',
    'SQL',
    'ux',
    'french',
    'power bi',
    'data analysis',
    'spanish',
    'sales growth',
    'networking',
    'strategic planning',
    'talent development',
    'html',
    'css',
    'javascript',
    'milling',
    'autocad',
    'sap',
    'solidworks',
    'cat grooming',
    'dog grooming',
    'customer servie',
    'dog training',
    'sales and marketing',
    'visual design',
    'customer relationshop',
    'cleaning',
    'mathematics'
    
]
 

def extract_skills(text,skills):
    stop_words = nltk.corpus.stopwords.words('english')
    word_tokens = nltk.tokenize.word_tokenize(text)
 
    # remove the stop words
    filtered_tokens = [w for w in word_tokens if w.lower() not in stop_words and w.lower() not in string.punctuation and w.isalpha()]
 
    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens)))

    # we create a set to keep the results in.
    found_skills = []
 
    # we search for each token in our skills database
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB and token.lower() not in found_skills:
            found_skills.append(token.lower())
 
    # we search for each bigram and trigram in our skills database
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB and ngram.lower() not in found_skills:
            found_skills.append(ngram.lower())
 
    return set(found_skills)

In [10]:
def extract_education(text):

    edu_keys = ['school','college','univers','academy','faculty','institute','bachelor','masters','master','licence','high school',
    'university','lycee', 'institut', 'doctorat','PhD', 'diploma', 'diplome', 'certificate', 'certification','certified', 'certifie',
    'associetes degree','Process Engineering','GED']
    

    lines=text.split('\n')
    education=[]
    for line in lines:
        for word in edu_keys:
            if word.lower() in line.lower() and line not in education:
                education.append(line.strip())
    if education:
        return education
    else :
        return 'no education history found'

In [11]:
keys = [
    'account manager',
    'ux designer',
    'junior ux designer',
    'senior ux designer',
    'civil engineer',
    'project engineer',
    'pet groomer',
    'event coordinator',
    'researcher',
    'Assistant'

]


def extract_exp(text,keys):

    lines=text.split('\n')
    exp=[]
    for line in lines:
        for word in keys:
            if word.lower() in line.lower() and line not in exp:
                exp.append(line.strip())
    if exp:
        return exp
    else :
        return 'no experience history found'

# the main function

In [12]:
import os

def data_extraction(folder,exp,skills):
    results=[]

    for file in os.listdir(folder):
        if file.endswith('.pdf'):
            path=os.path.join(folder,file)
            text=pdftoimage(path)
            text=clean_text(text)
            parsed={
                'file':file,
                'name':extract_name(text),
                'location':extract_location(text),
                'email':extract_mail(text),
                'phone':extract_phone(text),
                'Links':extract_URL(text),
                'skills':extract_skills(text,skills),
                'education':extract_education(text),
                'experience':extract_exp(text,exp)
            }
            
            results.append(parsed)
            
    return pd.DataFrame(results)

In [13]:
data_extraction('../phase1',keys,SKILLS_DB)

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,[linkedin.com/emmetbrickowsk1],"{networking, solidworks, milling, autocad}","[LEGO City University LEGO City, DK, Bachelor ...",[OBJECTIVE Fourth-year Civil Engineering stude...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,"{excel, python}",[Process Engineering Intern June 2021 - Septem...,[Undergraduate Researcher — Shah Lab September...
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,"[resumeviking.com/templates, xeOrlukd.com/24QS...","{talent development, spanish, networking, sale...","[Bachelor Degree in Advertising and Marketing,...",[Sep 2022 - Apr 2025 m Global Key Account Mana...
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401\n\n,no URL found,"{cat grooming, dog grooming, dog training, cle...","[GED, Nashville High, Nashville, TN]","[PET GROOMER, Professional and personable Pet ..."
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442\n\n,no URL found,"{french, visual design}",[e Managed and coordinated 160 events per yea...,[Practical Event Coordinator with 5 years exp...
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,{ux},[Masters Degree in Human Computer Interaction ...,"[John Huber, UX Designer, Proactive and detail..."
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442\n,no URL found,"{mathematics, cleaning, spanish}",[on packing records. Completed a certificate i...,[Laboratory Inventory Assistant at Dunrea Labo...
