In [51]:
import pytesseract

from pdf2image import convert_from_path

#from pytesseract import Output

from PIL import Image

import pandas as pd

import spacy

import re

import nltk
 
import string

In [52]:
def pdftoimage(path):

    try:
        pages = convert_from_path(path, 600)
        for i in range(len(pages)) :
            pages[i].save('page'+ str(i) +'.jpg', 'JPEG')
        # Simple image to string
        text_data = ''
        for i in range(len(pages)):
            text=pytesseract.image_to_string(Image.open(f'page{i}.jpg'))
            text_data += text + '/n'

        return text_data

    except:
        return 'could not run correctly'

In [3]:
def clean_text(text):

    try:
        text=re.sub(r'/n+','/n',text)
        text=re.sub(r' +',' ',text)
        text=re.sub(r'\+',' ',text)
        text=re.sub(r'[#^&[*()|«!>”{¢‘“}^_`:;~]', '', text)

        return text.strip()
    
    except:
        return 'error raised'

In [4]:
def extract_mail(text):
    
    nlp = spacy.load("en_core_web_sm")
    
    doc = nlp(text)

    for ent in doc:
        if ent.like_email:
            return ent.text
        
    return 'no email found'

In [5]:
def extract_URL(text):
    
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    links=[]
    for ent in doc:
        if ent.like_url:
            links.append(ent.text)
    
    return links if links else 'no URL found'

In [None]:
def extract_phone(text):
        match=re.search(r'\+?\d[\d\s\-()]{8,15}\S{0}',text)
        return str(match[0]) if match else 'no phone number found'

In [7]:
def extract_name(text):

    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)

    for ent in doc.ents:
        if ent.label_=='PERSON':
            return ent.text
            
    return 'no name found'

In [8]:
def extract_location(text):

    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)

    #country,city='',''
    city=''

    for ent in doc.ents:
        if ent.label_=='GPE'and not city:
            city=ent.text
        #if ent.label_=='NORP'and not country :
           #country=ent.text

    if city :
        return city
    else :
        return 'no location found'

In [9]:
SKILLS_DB = [
    'machine learning',
    'data science',
    'python',
    'word',
    'excel',
    'english',
    'SQL',
    'ux',
    'french',
    'power bi',
    'data analysis',
    'spanish',
    'sales growth',
    'networking',
    'strategic planning',
    'talent development',
    'html',
    'css',
    'javascript',
    'milling',
    'autocad',
    'sap',
    'solidworks',
    'cat grooming',
    'dog grooming',
    'customer servie',
    'dog training',
    'sales and marketing',
    'visual design',
    'customer relationshop',
    'cleaning',
    'mathematics'
    
]
 

def extract_skills(text,skills):
    stop_words = nltk.corpus.stopwords.words('english')
    word_tokens = nltk.tokenize.word_tokenize(text)
 
    # remove the stop words
    filtered_tokens = [w for w in word_tokens if w.lower() not in stop_words and w.lower() not in string.punctuation and w.isalpha()]
 
    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens)))

    # we create a set to keep the results in.
    found_skills = []
 
    # we search for each token in our skills database
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB and token.lower() not in found_skills:
            found_skills.append(token.lower())
 
    # we search for each bigram and trigram in our skills database
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB and ngram.lower() not in found_skills:
            found_skills.append(ngram.lower())
 
    return set(found_skills)

In [17]:
def extract_education(text):

    edu_keys = ['school','college','univers','academy','faculty','institute','bachelor','masters','master','licence','high school',
    'university','lycee', 'institut', 'doctorat','PhD', 'diploma', 'diplome', 'certificate', 'certification','certified', 'certifie',
    'associetes degree','Process Engineering','GED']
    

    lines=text.split('\n')
    education=[]
    for line in lines:
        for word in edu_keys:
            if word.lower() in line.lower() and line not in education:
                education.append(line.strip())
    if education:
        return set(education)
    else :
        return 'no education history found'

In [18]:
keys = [
    'account manager',
    'ux designer',
    'junior ux designer',
    'senior ux designer',
    'civil engineer',
    'project engineer',
    'pet groomer',
    'event coordinator',
    'researcher',
    'Assistant'

]


def extract_exp(text,keys):

    lines=text.split('\n')
    exp=[]
    for line in lines:
        for word in keys:
            if word.lower() in line.lower() and line not in exp:
                exp.append(line.strip())
    if exp:
        return set(exp)
    else :
        return 'no experience history found'

# the main function

In [70]:
import os

def data_extraction(folder,exp,skills):
    results=[]

    for file in os.listdir(folder):
        if file.endswith('.pdf'):
            path=os.path.join(folder,file)
            text=pdftoimage(path)
            text=clean_text(text)
            parsed={
                'file':file,
                'name':extract_name(text),
                'location':extract_location(text),
                'email':extract_mail(text),
                'phone':extract_phone(text).strip(),
                'Links':extract_URL(text),
                'skills':extract_skills(text,skills),
                'education':extract_education(text),
                'experience':extract_exp(text,exp)
            }
            
            results.append(parsed)
            
    return pd.DataFrame(results)

In [71]:
res=data_extraction('../phase1',keys,SKILLS_DB)
res

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,[linkedin.com/emmetbrickowsk1],"{networking, english, autocad, solidworks, mil...",{Bachelor of Science in Civil Engineering GPA...,{Bachelor of Science in Civil Engineering GPA...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,"{excel, python}",{e Managed upstream strategies of semiconducto...,{Undergraduate Researcher September 2019 — Pre...
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,"[resumeviking.com/templates, xeOrlukd.com/24QS...","{sales growth, networking, spanish, strategic ...","{University of Denver, Denver, Certified Key A...",{Dec 2018 - Aug 2022 m Regional Account Manage...
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401,no URL found,"{cleaning, dog grooming, cat grooming, dog tra...","{GED, Nashville High, Nashville, TN}",{Patty's Pet Salon is a boutique pet grooming ...
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442,no URL found,"{english, french, visual design}","{Business Management, Hospitality Tourism Cer...","{Event Coordinator, Bright Event Rentals, Flor..."
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,"{css, javascript, ux}","{Mastered web and mobile technologies HTMLS, C...",{Status is an open source discovery tool for t...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442,no URL found,"{cleaning, spanish, english, mathematics}",{on packing records. Completed a certificate i...,"{Full-time lab assistant in a small, regional ..."


In [72]:
res['skills']=res['skills'].apply(lambda x: ' | '.join(x) if 'found' not in x else x)
res['Links']=res['Links'].apply(lambda x: ' | '.join(x) if 'found' not in x else x)
res['education']=res['education'].apply(lambda x: ' | '.join(x) if 'found' not in x else x)
res['experience']=res['experience'].apply(lambda x: ' | '.join(x) if 'found' not in x else x)

In [73]:
res

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,networking | english | autocad | solidworks | ...,Bachelor of Science in Civil Engineering GPA ...,Bachelor of Science in Civil Engineering GPA ...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,excel | python,e Managed upstream strategies of semiconductor...,Undergraduate Researcher September 2019 — Pres...
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,resumeviking.com/templates | xeOrlukd.com/24QS...,sales growth | networking | spanish | strategi...,"University of Denver, Denver | Certified Key A...",Dec 2018 - Aug 2022 m Regional Account Manager...
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401,no URL found,cleaning | dog grooming | cat grooming | dog t...,"GED, Nashville High, Nashville, TN",Patty's Pet Salon is a boutique pet grooming s...
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442,no URL found,english | french | visual design,"Business Management, Hospitality Tourism Cert...","Event Coordinator, Bright Event Rentals, Flori..."
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,css | javascript | ux,"Mastered web and mobile technologies HTMLS, CS...",Status is an open source discovery tool for th...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442,no URL found,cleaning | spanish | english | mathematics,on packing records. Completed a certificate in...,"Full-time lab assistant in a small, regional l..."


In [74]:
res.to_csv('AllResumeParsed.csv',index=False)

In [75]:
data=pd.read_csv('AllResumeParsed.csv')

data

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,networking | english | autocad | solidworks | ...,Bachelor of Science in Civil Engineering GPA ...,Bachelor of Science in Civil Engineering GPA ...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,excel | python,e Managed upstream strategies of semiconductor...,Undergraduate Researcher September 2019 — Pres...
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,resumeviking.com/templates | xeOrlukd.com/24QS...,sales growth | networking | spanish | strategi...,"University of Denver, Denver | Certified Key A...",Dec 2018 - Aug 2022 m Regional Account Manager...
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401,no URL found,cleaning | dog grooming | cat grooming | dog t...,"GED, Nashville High, Nashville, TN",Patty's Pet Salon is a boutique pet grooming s...
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442,no URL found,english | french | visual design,"Business Management, Hospitality Tourism Cert...","Event Coordinator, Bright Event Rentals, Flori..."
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,css | javascript | ux,"Mastered web and mobile technologies HTMLS, CS...",Status is an open source discovery tool for th...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442,no URL found,cleaning | spanish | english | mathematics,on packing records. Completed a certificate in...,"Full-time lab assistant in a small, regional l..."


# creating a custom NER model

In [25]:
# Load a spacy model and chekc if it has ner

import spacy
nlp=spacy.load('en_core_web_sm')

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [26]:
ner=nlp.get_pipe("ner")

# training data
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was working at Jumia", {"entities": [(17,23, "ORG")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Altran can be ordered in Amazon ", {"entities": [(0,6, "ORG")]}),
              ("I was born in Washinton", {"entities": [(14,22, "GPE")]}),
              ("I was born in New York", {"entities": [(14,22, "GPE")]}),
              ("I bought a present from Japan", {"entities": [(24,29, "GPE")]}),
              ("I work in Sony company", {"entities": [(10,14, "ORG")]}),
              ("I rented a tent in Agadir", {"entities": [(19,25, "GPE")]}),
              ("I worked before in Innovation Insitute", {"entities": [(19,38, "ORG")]}),
              ("I graduated from Engineering University", {"entities": [(17,39, "ORG")]}),
              ("I got certificated from GoMyCode", {"entities": [(25,33, "ORG")]}),
              ("certified data scientist From Udemy", {"entities": [(30,35, "ORG")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),

              ("Master Degree on Industrial engineering from EMSI",{"entities": [(46,49, "ORG")]}),
              ("Amazon Warehouse Associate at Amazon",{"entities": [(30,36, "ORG")]}),
              ("Los Angeles, CA 90291",{"entities": [(0,12, "GPE")]}),
              ("1515 Pacific Ave, Los Angeles, CA 90291, United",{"entities": [(18,29, "GPE")]}),
              ("Event Coordinator, Bright Event Rentals, Florida",{"entities": [(19,39, "ORG")]}),
              ("A in Hotel & Event Management, University of New York, Brooklyn",{"entities": [(31,53, "ORG")]}),
              ("Amazon Warehouse Associate at Amazon, Miami Gardens",{"entities": [(38,51, "GPE")]}),
              ("Laboratory Inventory Assistant at Dunrea Laboratories",{"entities": [(34,53, "ORG")]}),
              ("Associates Degree in Logistics and Supply Chain Fundamentals,Atlanta Technical College, Atlanta",{"entities": [(61,86, "ORG")]}),
              ("UX Designer, Real Vision Group",{"entities": [(13,30, "ORG")]}),
              ("Masters Degree in Human Computer Interaction (HCI), Pratt Institute",{"entities": [(52,67, "ORG")]}),
              ("Civil Engineer Intern, Bricksburg Construction, Bricksburg, DK",{"entities": [(23,46, "ORG")]}),

              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

In [29]:
for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [30]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [31]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training.example import Example

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  epochs = 30
  # Training for 30 iterations
  for epoch in range(epochs):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}

    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        
        #texts, annotations = zip(*batch)

        for text, annotations in  batch:
                examples = []
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                examples.append(example)
                nlp.update(examples, drop = 0.15, losses = losses)
        
        #nlp.update(
                    #texts,  # batch of texts
                    #annotations,  # batch of annotations
                    #drop=0.5,  # dropout - make it harder to memorise data
                    #losses=losses,)
        print("Losses", losses)

Losses {'ner': np.float32(11.649366)}




Losses {'ner': np.float32(16.906828)}




Losses {'ner': np.float32(21.440039)}




Losses {'ner': np.float32(28.281784)}




Losses {'ner': np.float32(34.698456)}
Losses {'ner': np.float32(36.61091)}




Losses {'ner': np.float32(40.70984)}




Losses {'ner': np.float32(42.341454)}
Losses {'ner': np.float32(1.2701459)}
Losses {'ner': np.float32(2.7882957)}
Losses {'ner': np.float32(3.681367)}
Losses {'ner': np.float32(3.9350429)}
Losses {'ner': np.float32(5.035109)}
Losses {'ner': np.float32(7.727957)}
Losses {'ner': np.float32(10.348422)}
Losses {'ner': np.float32(10.348431)}
Losses {'ner': np.float32(1.6771953)}
Losses {'ner': np.float32(1.6772122)}
Losses {'ner': np.float32(3.6832511)}
Losses {'ner': np.float32(4.712706)}
Losses {'ner': np.float32(4.9273963)}
Losses {'ner': np.float32(5.4029584)}
Losses {'ner': np.float32(6.0799685)}
Losses {'ner': np.float32(6.08406)}
Losses {'ner': np.float32(1.2819985e-05)}
Losses {'ner': np.float32(0.0024706991)}
Losses {'ner': np.float32(0.01316647)}
Losses {'ner': np.float32(0.26266888)}
Losses {'ner': np.float32(0.30017796)}
Losses {'ner': np.float32(0.300178)}
Losses {'ner': np.float32(2.3429966)}
Losses {'ner': np.float32(2.342998)}
Losses {'ner': np.float32(2.1295186e-06)}
Losses

In [17]:
# saving model

output_dir='NERScapy_model'
nlp.to_disk(output_dir)

In [19]:
# Test data with expected entities

TEST_DATA = [
    ("Marjane is a big place to shopping", {"entities": [(0,7,"ORG")]}),
    ("I visited Korea this year", {"entities": [(11, 16, "GPE")]}),
    ("I bought some new gadgets from Aliexpress", {"entities": [(39, 49, "ORG")]}),
    ("I am driving a Toyota", {"entities": [(21, 27, "ORG")]}),
    ("I purchased a new shoes from Cotsco", {"entities": [(36, 42, "ORG")]}),
    ("washing machine are available in Netflix", {"entities": [(30, 37, "ORG")]}),
    ("I work at Infotel", {"entities": [(10, 17, "ORG")]}),
    ("I worked before in Segula", {"entities": [(20, 26, "ORG")]}),
    ("I bought a fancy dress from ZARA", {"entities": [(36, 41, "ORG")]}),
    ("I received as a present Nikon camera", {"entities": [(23, 28, "ORG")]}),
]

# creating a function to evaluate the model

In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score
    
def evaluate_model(test_data):

    nlp= spacy.load('NERScapy_model')

    true_entities = []
    pred_entities = []
    
    for text, annotations in test_data:
        doc = nlp(text)  # Use the full nlp object to obtain a Doc
        true_entities.extend(annotations["entities"])

        # Extract predicted entities
        pred_entities_in_text = [(ent.text, ent.label_) for ent in doc.ents]
        pred_entities.extend(pred_entities_in_text)

    x1=len(true_entities)
    x2=len(pred_entities)

    for i in range((x1-x2)-1,len(true_entities)-1):
        pred_entities.append('Nan')
    

    # Prepare labels for metrics
    y_true_labels = [ent[2] for ent in true_entities]
    #y_pred_labels = [ent[1] for ent in pred_entities]
    y_pred_labels = [ent[2] if idx < len(pred_entities) else 'Nan' for idx in range(len(true_entities))]



    # Handle cases with no predictions
    if not y_pred_labels:
        print("No predictions made. Precision, recall, and F1 score cannot be defined.")
        return

    # Calculate metrics
    precision = precision_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    recall = recall_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

In [23]:
# Evaluating the model performance

evaluate_model(TEST_DATA)

Precision: 0.81
Recall: 0.90
F1 Score: 0.85


In [24]:
# Save the  model to directory
print("Saved model as : ", output_dir)

Saved model as :  NERScapy_model


# using the model to categorize dataset entities

In [61]:
#using model with our parsed CVs

def extract_entities(text):

    nlp = spacy.load('NERScapy_model')

    list_org=[]

    try :
        doc = nlp(text)
        list_org=[ent.text for ent in doc.ents if ent.label_=='ORG']
        return ' | '.join(list_org)

    except:
        return list_org

In [76]:
data['edu_organisations']=[extract_entities(i) for i in data['education']]
data['exp_organisations']=[extract_entities(i) for i in data['experience']]

data

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience,edu_organisations,exp_organisations
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,networking | english | autocad | solidworks | ...,Bachelor of Science in Civil Engineering GPA ...,Bachelor of Science in Civil Engineering GPA ...,Model Bridge | Principles of Computer Aided De...,Bricksburg Construction | LEGO City University...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,excel | python,e Managed upstream strategies of semiconductor...,Undergraduate Researcher September 2019 — Pres...,Process Engineering Intern,
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,resumeviking.com/templates | xeOrlukd.com/24QS...,sales growth | networking | spanish | strategi...,"University of Denver, Denver | Certified Key A...",Dec 2018 - Aug 2022 m Regional Account Manager...,University of Denver | Certified Institute of ...,Manage | Presidion Petroleum | TX
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401,no URL found,cleaning | dog grooming | cat grooming | dog t...,"GED, Nashville High, Nashville, TN",Patty's Pet Salon is a boutique pet grooming s...,,Pet Groomer | Petco
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442,no URL found,english | french | visual design,"Business Management, Hospitality Tourism Cert...","Event Coordinator, Bright Event Rentals, Flori...","Wedding Coordination Planning, | University o...",Bright Event Rentals | Classic Party Rentals
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,css | javascript | ux,"Mastered web and mobile technologies HTMLS, CS...",Status is an open source discovery tool for th...,HTMLS | Pratt Institute New York | Pratt | Ins...,Status | Ethereum | Status New York | Real Vis...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442,no URL found,cleaning | spanish | english | mathematics,on packing records. Completed a certificate in...,"Full-time lab assistant in a small, regional l...",Southern New Hampshire University | Graduate S...,Dunrea Laboratories


In [77]:
data.to_csv('processNER.csv',na_rep='NaN',index=False)

In [78]:
df=pd.read_csv('processNER.csv')

df.fillna('unknown', inplace=True)

df

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience,edu_organisations,exp_organisations
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,networking | english | autocad | solidworks | ...,Bachelor of Science in Civil Engineering GPA ...,Bachelor of Science in Civil Engineering GPA ...,Model Bridge | Principles of Computer Aided De...,Bricksburg Construction | LEGO City University...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,excel | python,e Managed upstream strategies of semiconductor...,Undergraduate Researcher September 2019 — Pres...,Process Engineering Intern,unknown
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,resumeviking.com/templates | xeOrlukd.com/24QS...,sales growth | networking | spanish | strategi...,"University of Denver, Denver | Certified Key A...",Dec 2018 - Aug 2022 m Regional Account Manager...,University of Denver | Certified Institute of ...,Manage | Presidion Petroleum | TX
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401,no URL found,cleaning | dog grooming | cat grooming | dog t...,"GED, Nashville High, Nashville, TN",Patty's Pet Salon is a boutique pet grooming s...,unknown,Pet Groomer | Petco
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442,no URL found,english | french | visual design,"Business Management, Hospitality Tourism Cert...","Event Coordinator, Bright Event Rentals, Flori...","Wedding Coordination Planning, | University o...",Bright Event Rentals | Classic Party Rentals
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,css | javascript | ux,"Mastered web and mobile technologies HTMLS, CS...",Status is an open source discovery tool for th...,HTMLS | Pratt Institute New York | Pratt | Ins...,Status | Ethereum | Status New York | Real Vis...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442,no URL found,cleaning | spanish | english | mathematics,on packing records. Completed a certificate in...,"Full-time lab assistant in a small, regional l...",Southern New Hampshire University | Graduate S...,Dunrea Laboratories


# practice relationship extraction

In [37]:
def extract_relationship(text):

    import spacy

    nlp_NER = spacy.load('NERScapy_model')

    doc = nlp_NER(text)

    # Create a list to store extracted relationships
    relationships = []

    # Iterate through the sentences in the document
    for sent in doc.sents:
        # Iterate through the named entities (people, organizations etc.) in the sentence
        for ent in sent.ents:
            # Check if the entity has a known label and is a person or organization
            if ent.label_ in ["ORG"]:
                # Extract the relationship
                for token in sent:
                    if token.dep_ in ["attr", "nsubj", "dobj",'pobj','ROOT','compound','conj','auxpass','nummod','agent','prep','punct'] and token.text not in ent.text:
                        relationships.append((ent.text, token.text))


    lst=[]
    lst_=[]

    for i,j in relationships:
        dic={}
        dic[i]=[]
        for y,z in relationships:
            if i==y:
                dic[i].append(z)
        dic[i]=' '.join(dic[i])
        dic[i]=dic[i].split(',')
        if dic not in lst:
            lst.append(dic)

    
    for z in lst:
        for k,j in z.items(): 
            z[k]=[i.strip() for i in j ]
            z[k]=[i for i in z[k] if bool(i) == True]
            z[k]=list(set(z[k]))
        lst_.append(k+' '+' '.join(z[k]))


    relationships=[j[0] for j in relationships]

    return {
        'relation': set(relationships),
        'result': lst_
    } 

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score

def validate_relations(predicted_entities, true_entities):

    true_entities=[item.strip() for item in true_entities.split(',')]
    
    # Convert lists to sets for comparison
    predicted_set = set(predicted_entities['relation'])
    true_set = set(true_entities)

    
    # Calculate True Positives, False Positives, and False Negatives
    true_positives = predicted_set.intersection(true_set)
    false_positives = predicted_set - true_positives
    false_negatives = true_set - true_positives

    # Calculate Precision, Recall, and F1 Score
    precision = len(true_positives) / (len(true_positives) + len(false_positives)) if (len(true_positives) + len(false_positives)) > 0 else 0
    recall = len(true_positives) / (len(true_positives) + len(false_negatives)) if (len(true_positives) + len(false_negatives)) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }


# testing the validation function on a sample

In [16]:
predected_edu=extract_relationship(df.loc[0,'education'])

predected_exp=extract_relationship(df.loc[2,'experience'])

print(predected_edu['relation'])

print(predected_exp['relation'])

{'Bachelor of Science in Civil Engineering', 'LEGO City University', 'Model Bridge', 'Principles of Computer Aided Design', 'LEGO City University Civil Engineers Chapter'}
{'Manage', 'TX', 'Presidion Petroleum'}


In [17]:
# i will take edu_organization as our true values

true_edu=df.loc[0,'edu_organisations']

true_exp=df.loc[2,'exp_organisations']

print(true_edu)

print(true_exp)

LEGO City University LEGO City, Principles of Computer Aided Design, LEGO City University, LEGO City University Civil Engineers Chapter
Presidion Petroleum, TX


# here the assess result

In [19]:
# Assess performance
performance_edu = validate_relations(predected_edu, true_edu)
performance_exp = validate_relations(predected_exp, true_exp)


# Print the results
print("Performance Metrics, education:")
print(f"Precision: {performance_edu['precision']:.2f}")
print(f"Recall: {performance_edu['recall']:.2f}")
print(f"F1 Score: {performance_edu['f1_score']:.2f}")

print()

print("Performance Metrics, experience :")
print(f"Precision: {performance_exp['precision']:.2f}")
print(f"Recall: {performance_exp['recall']:.2f}")
print(f"F1 Score: {performance_exp['f1_score']:.2f}")

Performance Metrics, education:
Precision: 0.60
Recall: 0.75
F1 Score: 0.67

Performance Metrics, experience :
Precision: 0.67
Recall: 1.00
F1 Score: 0.80


# let's apply the function in our dataset

In [5]:
df

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience,edu_organisations,exp_organisations
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,"solidworks, english, networking, autocad, milling","LEGO City University LEGO City, DK, Bachelor o...",OBJECTIVE Fourth-year Civil Engineering studen...,"LEGO City University LEGO City, Principles of ...","Bricksburg Construction, Bachelor of Science i..."
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,"python, excel",Process Engineering Intern June 2021 - Septemb...,Undergraduate Researcher — Shah Lab September ...,unknown,unknown
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,"resumeviking.com/templates, xeOrlukd.com/24QSB...","english, strategic planning, spanish, networki...","Bachelor Degree in Advertising and Marketing, ...",Sep 2022 - Apr 2025 m Global Key Account Manag...,"University of Denver, Business College of New ...","Presidion Petroleum, TX"
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401\n\n,no URL found,"cleaning, dog training, cat grooming, dog groo...","GED, Nashville High, Nashville, TN","PET GROOMER, Professional and personable Pet G...",unknown,"Pet Groomer, Pet Groomer, Pet Groomer, Pet Gro..."
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442\n\n,no URL found,"french, english, visual design",e Managed and coordinated 160 events per year...,Practical Event Coordinator with 5 years expe...,"University of New York, Business Management","Bright Event Rentals, Classic Party Rentals"
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,"css, ux, javascript",Masters Degree in Human Computer Interaction H...,"John Huber, UX Designer, Proactive and detaile...","Pratt Institute New York, HTMLS","Real Vision Group New York, SP Global New York..."
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442\n,no URL found,"cleaning, english, spanish, mathematics",on packing records. Completed a certificate in...,Laboratory Inventory Assistant at Dunrea Labor...,"Atlanta Technical College, Southern New Hampsh...",Dunrea Laboratories


In [79]:
df['edu_relations_org']=[' | '.join(extract_relationship(i)['result']) for i in df['education']]

df['exp_relations_org']=[' | '.join(extract_relationship(i)['result']) for i in df['experience']]

In [66]:
df

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience,edu_organisations,exp_organisations,edu_relations_org,exp_relations_org
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,networking | english | autocad | solidworks | ...,Bachelor of Science in Civil Engineering GPA ...,Bachelor of Science in Civil Engineering GPA ...,Model Bridge | Principles of Computer Aided De...,Bricksburg Construction | LEGO City University...,Model Bridge LEGO City DK 2022 - Bachelor of S...,Bricksburg Construction LEGO City 2022 - 2022 ...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,excel | python,e Managed upstream strategies of semiconductor...,Undergraduate Researcher September 2019 — Pres...,Process Engineering Intern,unknown,Process Engineering Intern strategies of semic...,
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,resumeviking.com/templates | xeOrlukd.com/24QS...,sales growth | networking | spanish | strategi...,"University of Denver, Denver | Certified Key A...",Dec 2018 - Aug 2022 m Regional Account Manager...,University of Denver | Certified Institute of ...,Manage | Presidion Petroleum | TX,University of Denver Udemy Brooklyn | Bachelor...,Manage Dec 2018 - Aug 2022 Regional Account | ...
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401\n\n,no URL found,cleaning | dog grooming | cat grooming | dog t...,"GED, Nashville High, Nashville, TN",Patty's Pet Salon is a boutique pet grooming s...,unknown,Pet Groomer | Petco,,Pet Groomer TN | grooming training . Petco Nas...
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442\n\n,no URL found,english | french | visual design,"Business Management, Hospitality Tourism Cert...","Event Coordinator, Bright Event Rentals, Flori...","Wedding Coordination Planning, | University o...",Bright Event Rentals | Classic Party Rentals,"Wedding Coordination Planning, Business Manag...",Bright Event Rentals Classic Party Coordinator...
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,css | javascript | ux,"Mastered web and mobile technologies HTMLS, CS...",Status is an open source discovery tool for th...,HTMLS | Pratt Institute New York | Pratt | Ins...,Status | Ethereum | Status New York | Real Vis...,HTMLS technologies Pratt Institute New York | ...,Status is source discovery tool for Ethereum b...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442\n,no URL found,cleaning | spanish | english | mathematics,on packing records. Completed a certificate in...,"Full-time lab assistant in a small, regional l...",Southern New Hampshire University | Graduate S...,Dunrea Laboratories,Southern New Hampshire University | Management...,Dunrea Laboratories laboratory tasked with - |...


In [80]:
df.to_csv('RelationExtracted.csv',na_rep='NaN',index=False)

In [81]:
data_=pd.read_csv('RelationExtracted.csv')

data_.fillna('unknown', inplace=True)

data_

Unnamed: 0,file,name,location,email,phone,Links,skills,education,experience,edu_organisations,exp_organisations,edu_relations_org,exp_relations_org
0,EPS-Civil-Engineering.pdf,DK,Bricksburg,ebrickowski@lcu.edu,123 456 - 7890,linkedin.com/emmetbrickowsk1,networking | english | autocad | solidworks | ...,Bachelor of Science in Civil Engineering GPA ...,Bachelor of Science in Civil Engineering GPA ...,Model Bridge | Principles of Computer Aided De...,Bricksburg Construction | LEGO City University...,Model Bridge LEGO City DK 2022 - Bachelor of S...,Bricksburg Construction LEGO City 2022 - 2022 ...
1,EPSExamples.pdf,no name found,no location found,no email found,no phone number found,no URL found,excel | python,e Managed upstream strategies of semiconductor...,Undergraduate Researcher September 2019 — Pres...,Process Engineering Intern,unknown,Process Engineering Intern strategies of semic...,unknown
2,Example-1-PDF.pdf,Michelle Smith,Los Angeles,email@email.com,541 754-3010,resumeviking.com/templates | xeOrlukd.com/24QS...,sales growth | networking | spanish | strategi...,"University of Denver, Denver | Certified Key A...",Dec 2018 - Aug 2022 m Regional Account Manager...,University of Denver | Certified Institute of ...,Manage | Presidion Petroleum | TX,University of Denver Udemy Brooklyn | Bachelor...,Manage Dec 2018 - Aug 2022 Regional Account | ...
3,resume-example-12.pdf,Edmond Connor,Orlando,example@email.com,890-555-0401,no URL found,cleaning | dog grooming | cat grooming | dog t...,"GED, Nashville High, Nashville, TN",Patty's Pet Salon is a boutique pet grooming s...,unknown,Pet Groomer | Petco,unknown,Pet Groomer TN | grooming training . Petco Nas...
4,Resume-Template-Modern.pdf,Templates Build,Los Angeles,email@email.com,3868683442,no URL found,english | french | visual design,"Business Management, Hospitality Tourism Cert...","Event Coordinator, Bright Event Rentals, Flori...","Wedding Coordination Planning, | University o...",Bright Event Rentals | Classic Party Rentals,"Wedding Coordination Planning, Business Manag...",Bright Event Rentals Classic Party Coordinator...
5,resume.pdf,John Huber,New York,email@email.com,890-555-0401,no URL found,css | javascript | ux,"Mastered web and mobile technologies HTMLS, CS...",Status is an open source discovery tool for th...,HTMLS | Pratt Institute New York | Pratt | Ins...,Status | Ethereum | Status New York | Real Vis...,HTMLS technologies Pratt Institute New York | ...,Status is source discovery tool for Ethereum b...
6,Stockholm-Resum.pdf,Jason Miller,Warehouse Sanitation,email@email.com,3868683442,no URL found,cleaning | spanish | english | mathematics,on packing records. Completed a certificate in...,"Full-time lab assistant in a small, regional l...",Southern New Hampshire University | Graduate S...,Dunrea Laboratories,Southern New Hampshire University | Management...,Dunrea Laboratories laboratory tasked with - |...


In [82]:
data_.to_csv('FinalExtraction.csv',na_rep='NaN',index=False,sep=";")