## How to screen a pdf CV ?

In [1]:
# importing required modules
from PyPDF2 import PdfReader
import re
import spacy
from spacy.matcher import Matcher

In [2]:
def get_text_pdf(my_pdf):
    # creating a pdf reader object
    reader = PdfReader(my_pdf)
    
    output=[]
    for i in range(len(reader.pages)):

        # getting a specific page from the pdf file
        page = reader.pages[i]

        # extracting text from page
        text = page.extract_text()
        output.append(text)
    return output

In [3]:
get_text_pdf('Profile.pdf')[0]

"\xa0 \xa0\nCoordonnées\nwww.linkedin.com/in/\ncarolinemathius  (LinkedIn)\nPrincipales compétences\nAudit\nHuman Resources\nNegotiation\nLanguages\nAnglais  (Full Professional)\nItalien  (Full Professional)\nAlbanais  (Elementary)\nFrançais  (Native or Bilingual)Caroline Mathius\nEn formation Data Science\nAvignon, Provence-Alpes-Côte d’Azur, France\nRésumé\nVous êtes sur la page d'un profil atypique, bienvenue !\nJ'ai 30 ans et un parcours original : Tout commence en 2009,\nlorsque j’intègre l’IÉSEG School of\nManagement à Lille, avec un projet professionnel flou mais axé vers\nl’international. C’est en 1ère année de master que j’ai l’opportunité\nde partir en ERASMUS en Italie pendant un an, et c'est là que\nl'aventure commence !\nDepuis, je bouge, découvre le monde, et différents métiers. Je vous\nlaisse découvrir mon parcours ...\nPS : j'ai un fort intérêt pour le Développement Durable !\nExpérience\nLeroy Merlin\n4 ans 5 mois\nResponsable logistique\nmars 2020\xa0-\xa0Present\xa0

In [4]:
def get_features(my_text):
    
    cv_feat_dict={}

    cv_feat_dict['page_numbers']=len(my_text)
    
    my_text='\n'.join(my_text)
    
    cv_feat_dict['line_numbers']=my_text.count('\n')
    
    my_text_ok=my_text.replace('\n',' ')
    my_text_ok=re.sub(r' +', ' ', my_text_ok)
    
    #count_1
    count_1=len([1 for my_word in my_text_ok.split() if len(my_word)==1])
    pc_1=count_1/len(my_text_ok.split())
    if pc_1>0.5:
        my_text_ok=my_text_ok.replace(' ','')

    cv_feat_dict['word_numbers']=len([s for s in re.split("[() ,|;\W]+", my_text_ok)])
    cv_feat_dict['unique_upper_words']=list({i for i in [my_word for my_word in my_text_ok.split() if my_word.isupper()]})

    #get name
    match_name= re.search("[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]",my_text)
    if match_name:
        cv_feat_dict['name']=match_name.group()
    else:
        cv_feat_dict['name']='name not found'
    
    my_text=my_text_ok.lower()

    #remove accents
    repl = str.maketrans("àâéèêëûôöïç","aaeeeeuooic")
    my_text_ok=my_text.translate(repl)
    

    # get email
    match_email= re.search('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',my_text_ok)
    if match_email:
        cv_feat_dict['email']=match_email.group()
    else:
        cv_feat_dict['email']='email not found'

    # get phone number
    match_fr_phone= re.search('(?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})',my_text_ok)
    if match_fr_phone:
        cv_feat_dict['french_phone']=match_fr_phone.group()
    else:
        cv_feat_dict['french_phone']='french phone not found'
        
    match_any_phone= re.search('[\+]?[\(]?[0-9]{2,3}[)]?[-\s\.]?[0-9]{2,3}[-\s\.]?[0-9]{3,6}[-\s\.]?[0-9]{3,6}',my_text_ok)
    if match_any_phone:
        cv_feat_dict['other_phone']=match_any_phone.group()
    else:
        cv_feat_dict['other_phone']='other phone not found'

    # get github account
    if 'github' in my_text_ok:
        cv_feat_dict['has_github']='github'
        match_github= re.search('https://github.com+[/a-zA-Z0-9]+',my_text_ok)
        if match_github:
            cv_feat_dict['github_account']=match_github.group()
        else:
            cv_feat_dict['github_account']='github account not found'
    else:
        cv_feat_dict['has_github']='github not mentionned'
        cv_feat_dict['github_account']='github account not found'

    # get linkedin account
    if 'linkedin' in my_text_ok:
        cv_feat_dict['has_linkedin']='linkedin'
    else:
        cv_feat_dict['has_linkedin']='linkedin not mentionned'

    #count key words from a competence list
    list_keycomp=['ia ','ai ','data','datascience','data scienc','datascient','data eng','python',' r ','sql','docker','cloud','aws', 'azure','ml', 'algorithm', 'algo','statisti','keras','pytorch','machine learning','tensorflow','opencv','computer vision','pandas','numpy','nlp', 'dl ', 'deeplearning','deep learn','neural net','neurone','time serie']
    cv_feat_dict['the_data_comp']=[my_comp for my_comp in list_keycomp if my_comp in my_text_ok]

    #count key words from a diploma list
    list_keydiploma=['phd','docteur','master','iut','dut','ingenie','msc','bac','license','maitrise','master2', 'ecole','école','superieu','reconvers']
    cv_feat_dict['the_data_diploma']=[my_dipl for my_dipl in list_keydiploma if my_dipl in my_text_ok]
    
    #count key words from a language list
    list_keylang=['francais','french','anglais','english','allemand','german','indien','indian','arabe','arabic','espagnol','spanish','italien','italian','chinois','chinese']
    cv_feat_dict['the_data_lang']=[my_lang for my_lang in list_keylang if my_lang in my_text_ok]
    
    #count manager experience
    list_keymgt=['management','manageur','manager','team','equipe','mgr ']
    cv_feat_dict['the_data_mgt']=[my_mgt for my_mgt in list_keymgt if my_mgt in my_text_ok]
                
    return my_text_ok, cv_feat_dict

In [5]:
my_text_ok,feat=get_features(get_text_pdf('Profile.pdf'))

In [6]:
my_text_ok

'\xa0 \xa0 coordonnees www.linkedin.com/in/ carolinemathius (linkedin) principales competences audit human resources negotiation languages anglais (full professional) italien (full professional) albanais (elementary) francais (native or bilingual)caroline mathius en formation data science avignon, provence-alpes-cote d’azur, france resume vous etes sur la page d\'un profil atypique, bienvenue ! j\'ai 30 ans et un parcours original : tout commence en 2009, lorsque j’integre l’ieseg school of management a lille, avec un projet professionnel flou mais axe vers l’international. c’est en 1ere annee de master que j’ai l’opportunite de partir en erasmus en italie pendant un an, et c\'est la que l\'aventure commence ! depuis, je bouge, decouvre le monde, et differents metiers. je vous laisse decouvrir mon parcours ... ps : j\'ai un fort interet pour le developpement durable ! experience leroy merlin 4 ans 5 mois responsable logistique mars 2020\xa0-\xa0present\xa0 (3 ans 1 mois) avignon, prove

In [7]:
feat

{'page_numbers': 4,
 'line_numbers': 133,
 'word_numbers': 666,
 'unique_upper_words': ['ITA',
  'LEROY',
  'MERLIN',
  'IÉSEG',
  'IDL,',
  'GROUPE',
  'CV,',
  'IFRS',
  'SS/13',
  'BLACHERE',
  'ERASMUS',
  'RH)',
  'RH',
  'GAAP,',
  'PS',
  'HRM'],
 'name': 'Audit\nHuman Re',
 'email': 'email not found',
 'french_phone': 'french phone not found',
 'other_phone': 'other phone not found',
 'has_github': 'github not mentionned',
 'github_account': 'github account not found',
 'has_linkedin': 'linkedin',
 'the_data_comp': ['ia ', 'ai ', 'data', 'data scienc'],
 'the_data_diploma': ['master', 'bac', 'ecole'],
 'the_data_lang': ['francais', 'anglais', 'italien'],
 'the_data_mgt': ['management', 'equipe']}

### essai avec Spacy - recherche d'entités

In [8]:
nlp = spacy.load("fr_core_news_sm")

In [9]:
text,feat_cv=get_features(get_text_pdf('Ahmed_Farjallah_CV.pdf'))

In [10]:
print(text), print(feat_cv)

ahmed farjallah data scientist etudiant en science des donnees enthousiaste et motive a la recherche d'un stage de ﬁn d'etudes qui me permettra d'utiliser et d'ameliorer les competences et les forces que j'ai acquises en tant qu'etudiant. ahmed.farjallah@esprit.tn +216 23 277 171 tunis, tunisie ahmedfarjallah.webipie.me/ linkedin.com/in/ahmed-farjallah- datascientist competences python r keras tensorﬂow opencv flask django powerbi git azure hadoop plotly dash nlp time series analysis dl ml langues arabe anglais francais interets jeu d'echec lire cinema sport education ingenierie informatique specialisee en datascience ecole superieure privee d'ingenierie et de technologies 09/2018 - 01/2023 , tunis,tunisia experience professionnelle data scientist avaxia consulting group 07/2022 - 09/2022 , tunis,tunisie analyser de l'ensemble des donnees a l'aide de seaborn, matplotlib, et plotly. utiliser la bibliotheque de tableaux de bord pour aﬃcher les resultats de l'analyse dans un tableau de bo

(None, None)

In [11]:
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

ahmed farjallah data scientist PER
ahmed.farjallah@esprit.tn LOC
tunisie LOC
python r keras tensorﬂow MISC
opencv flask django powerbi git azure hadoop ORG
time series analysis dl ml langues arabe anglais francais MISC
tunis LOC
avaxia consulting group 07/2022 PER
tunis LOC
tunisie LOC
seaborn PER
matplotlib MISC
data scientist avaxia consulting group MISC
tunis LOC
tunisie LOC
data scientist MISC
tuni LOC
tunisie LOC
cybersecurity MISC
tunis LOC
tunisie LOC
windows MISC
cis ORG
spark streaming PER


In [34]:
text = get_text_pdf('CV - Flandin.pdf')[0]
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

EXPERIENCE ORG
Paris LOC
Réalisation des études OA ORG
Eurocodes MISC
DQE MISC
l’ LOC
Suivi ORG
d’ MISC
VISA MISC
d’ MISC
d’ MISC
l’autoroute MISC
A3 LOC
Préparation ORG
Conduite PER
EXE ORG
l’ LOC
Janvier PER
Méthodes probabilistes MISC
Polytechnique Montréal ORG
Méthodes LOC
Méthodes LOC
l’ LOC
l’ LOC
Monte-Carlo LOC
Janvier 2018 MISC
Analyse de 
Structures MISC
Conception Béton MISC
d’ MISC
Présentation des TD en Analyse de structures MISC
l’ LOC
EF ORG
Présentation des TD MISC
Conception Béton Armé MISC
FORMATION ORG
Applied MSc in PER
Science & Artificial 
 ORG
France LOC
Polytechnique Montréal   
Maîtrise ORG
Spécialisation PER
Montréal LOC
Canada LOC
Métiers ParisTech ORG
Formation MISC
CPGE PSI ORG
Lycée Joffre LOC
Montpellier LOC
France LOC
Solides PER
Connaissances en Machine learning 
appliqué MISC
Structural Health Monitoring ORG
Software 
Programmation ORG
Python MISC
VBA MISC
Structure: Sofisitk MISC
ST1 MISC
Robot SA LOC
Dessin LOC
AutoCAD MISC
Revit PER
Pack office MISC

In [19]:
# load pre-trained model
nlp = spacy.load("fr_core_news_sm")

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [[{'ENT_TYPE': 'PER'},{'ENT_TYPE': 'PER'}]]
    
    matcher.add('NAME',pattern)
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        name=span.text.split()
        if len(name)>1:
            match_name= re.search("^([a-zA-Z]{2,}\s[a-zA-Z]{3,}'?-?[a-zA-Z]{2,}\s?([a-zA-Z]{3,})?)",span.text)
            if match_name:
                the_name=span.text
            else:
                the_name='not found'
        else:
            the_name='not found'
    return the_name

In [12]:
def get_features_cv(my_text):
    
    cv_feat_dict={}

    cv_feat_dict['page_numbers']=len(my_text)
    
    my_text='\n'.join(my_text)
    
    cv_feat_dict['line_numbers']=my_text.count('\n')
    
    my_text_ok=my_text.replace('\n',' ')
    my_text_ok=re.sub(r' +', ' ', my_text_ok)
    
    #count_1
    count_1=len([1 for my_word in my_text_ok.split() if len(my_word)==1])
    pc_1=count_1/len(my_text_ok.split())
    if pc_1>0.5:
        my_text_ok=my_text_ok.replace(' ','')

    cv_feat_dict['word_numbers']=len([s for s in re.split("[() ,|;\W]+", my_text_ok)])
    cv_feat_dict['unique_upper_words']=list({i for i in [my_word for my_word in my_text_ok.split() if my_word.isupper()]})

    #get name
    cv_feat_dict['name'] = extract_name(my_text_ok)
    
    my_text=my_text_ok.lower()

    #remove accents
    repl = str.maketrans("àâéèêëûôöïç","aaeeeeuooic")
    my_text_ok=my_text.translate(repl)
    

    # get email
    match_email= re.search('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',my_text_ok)
    if match_email:
        cv_feat_dict['email']=match_email.group()
    else:
        cv_feat_dict['email']='email not found'

    # get phone number
    match_fr_phone= re.search('(?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})',my_text_ok)
    if match_fr_phone:
        cv_feat_dict['french_phone']=match_fr_phone.group()
    else:
        cv_feat_dict['french_phone']='french phone not found'
        
    match_any_phone= re.search('[\+]?[\(]?[0-9]{2,3}[)]?[-\s\.]?[0-9]{2,3}[-\s\.]?[0-9]{3,6}[-\s\.]?[0-9]{3,6}',my_text_ok)
    if match_any_phone:
        cv_feat_dict['other_phone']=match_any_phone.group()
    else:
        cv_feat_dict['other_phone']='other phone not found'

    # get github account
    if 'github' in my_text_ok:
        cv_feat_dict['has_github']='github'
        match_github= re.search('https://github.com+[/a-zA-Z0-9]+',my_text_ok)
        if match_github:
            cv_feat_dict['github_account']=match_github.group()
        else:
            cv_feat_dict['github_account']='github account not found'
    else:
        cv_feat_dict['has_github']='github not mentionned'
        cv_feat_dict['github_account']='github account not found'

    # get linkedin account
    if 'linkedin' in my_text_ok:
        cv_feat_dict['has_linkedin']='linkedin'
    else:
        cv_feat_dict['has_linkedin']='linkedin not mentionned'

    #count key words from a competence list
    list_keycomp=['ia ','ai ','data','datascience','data scienc','datascient','data eng','python',' r ','sql','docker','cloud','aws', 'azure','ml', 'algorithm', 'algo','statisti','keras','pytorch','machine learning','tensorflow','opencv','computer vision','pandas','numpy','nlp', 'dl ', 'deeplearning','deep learn','neural net','neurone','time serie']
    cv_feat_dict['the_data_comp']=[my_comp for my_comp in list_keycomp if my_comp in my_text_ok]

    #count key words from a diploma list
    list_keydiploma=['phd','docteur','master','iut','dut','ingenie','msc','bac','license','maitrise','master2', 'ecole','école','superieu','reconvers']
    cv_feat_dict['the_data_diploma']=[my_dipl for my_dipl in list_keydiploma if my_dipl in my_text_ok]
    
    #count key words from a language list
    list_keylang=['francais','french','anglais','english','allemand','german','indien','indian','arabe','arabic','espagnol','spanish','italien','italian','chinois','chinese']
    cv_feat_dict['the_data_lang']=[my_lang for my_lang in list_keylang if my_lang in my_text_ok]
    
    #count manager experience
    list_keymgt=['management','manageur','manager','team','equipe','mgr ']
    cv_feat_dict['the_data_mgt']=[my_mgt for my_mgt in list_keymgt if my_mgt in my_text_ok]
                
    return my_text_ok, cv_feat_dict

In [20]:
text,feat_cv=get_features_cv(get_text_pdf('CV - Flandin.pdf'))

In [21]:
feat_cv

{'page_numbers': 1,
 'line_numbers': 113,
 'word_numbers': 481,
 'unique_upper_words': ['TD',
  'EXPERIENCE',
  'ATOUTS',
  'FORMATION',
  'ARTELIA',
  'MS',
  'CPGE',
  'A3',
  'ST1;',
  '(VISA),',
  'B1',
  'OA',
  'SA;',
  'B2',
  'CAO',
  'INTERETS',
  'FLANDIN',
  'DQE,',
  'TOEIC',
  'EF',
  'PSI*',
  'VBA'],
 'name': 'Louis FLANDIN',
 'email': 'louis.flandin19@gmail.com',
 'french_phone': '06 75 83 48 31',
 'other_phone': 'other phone not found',
 'has_github': 'github not mentionned',
 'github_account': 'github account not found',
 'has_linkedin': 'linkedin not mentionned',
 'the_data_comp': ['ia ',
  'ai ',
  'data',
  'data scienc',
  'python',
  'machine learning'],
 'the_data_diploma': ['ingenie', 'msc', 'reconvers'],
 'the_data_lang': ['anglais', 'espagnol'],
 'the_data_mgt': []}