# 2 - Combination of Machine Learning and Rule-based NER

In [1]:
import re
import nltk
import spacy
import string
import pandas as pd
from nltk.corpus import stopwords
stop = stopwords.words('english')
from spacy.matcher import Matcher, PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor
import warnings
warnings.filterwarnings("ignore")

#python -m spacy download en_core_web_sm (cmd) 
#python -m spacy download en_core_web_md
#python -m spacy download en_core_web_lg

In [2]:
# convert the pdf to dataframe
import fitz
def pdf_to_text(document):
    doc = fitz.open(document)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import warnings
warnings.filterwarnings("ignore")

def getWordnetPos(words):
    tag=pos_tag([words])[0][1][0].upper()
    tag_dict={"J":wordnet.ADJ,
              "N":wordnet.NOUN,
              "V":wordnet.VERB,
              "R":wordnet.ADV
             }
    return tag_dict.get(tag,wordnet.NOUN)

def cv_preprocessing(cv_data):
    #Tokenization
    tokenized_text=word_tokenize(cv_data)

    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    filter_text=[]
    for token in tokenized_text:
        if token not in stop_words:
            filter_text.append(token)

    #POS and lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatizeResults=[lemmatizer.lemmatize(token,getWordnetPos(token)) for token in filter_text]
    return ' '.join(lemmatizeResults)

In [4]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [5]:
def extract_names(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and last name are always proper nouns
    # `'?'` for the key `'OP'` = meaning it’s optional, which it may or may not be present in the text being matched
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN', 'OP': '?'}]

    matcher.add('NAME', [pattern])
    
    matches = matcher(nlp_text)
    
    names = []
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        if len(span) == 1:
            names.append(span.text)
        else:
            names.append(span.text.title())
            
    if any(char in string.punctuation for char in names[1]):
            return names[0]
    else:
            return names[:2]

def extract_mobile_number(resume_text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)
    
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return number
        else:
            return number

def extract_email(resume_text):
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
    matches = re.findall(pattern, resume_text)
    # print(matches)
    return matches

degree_patterns = [
    [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"POS": "NOUN"}],
    [{"LOWER": "bachelor"}, {"LOWER": "degree"}],
    [{"LOWER": "bachelor"}, {"LOWER": "'s"}],
    [{"LOWER": "bs"}],
    [{"LOWER": "master"}, {"LOWER": "of"}, {"POS": "NOUN"}],
    [{"LOWER": "master"}, {"LOWER": "degree"}],
    [{"LOWER": "master"}, {"LOWER": "'s"}],
    [{"LOWER": "master's"}],
    [{"LOWER": "mba"}],
    [{"LOWER": "phd"}],
    [{"LOWER": "doctor"}, {"LOWER": "of"}, {"POS": "NOUN"}],
    [{"LOWER": "doctorate"}],
    [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"LOWER": "science"}, {"LOWER": "in"}, {"LOWER": "computer"}, {"LOWER": "science"}],
    [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"LOWER": "computer"}, {"LOWER": "science"}]
]

matcher.add("DEGREE", degree_patterns)

def extract_degree(resume_text):
    degree_matches = []
    nlp_text = nlp(resume_text)
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        degree_matches.append(nlp_text[start:end].text)
    
    valid_degrees = [degree for degree in degree_matches if degree.lower().startswith(('bachelor', 'master', 'doctor'))]
    return valid_degrees

def extract_grad_years(resume_text):
    doc = nlp(resume_text)
    grad_years = []
    for ent in doc.ents:
        if ent.label_ == 'DATE':
            grad_years.append(ent.text)
    return grad_years

def extract_locations(resume_text):
    doc = nlp(resume_text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            locations.append(ent.text)
    return locations 

def extract_organization(text):
    nlp = spacy.load('en_core_web_md')
    doc = nlp(text)
    orgs = []
    for ent in doc.ents:
        if ent.label_ == 'ORG':
            orgs.append(ent.text)
    return orgs

def extract_company(resume_text):
    resume_text = extract_organization(resume_text)
    corpus_file = open("company.txt", "r")
    corpus = corpus_file.read()
    corpus = corpus.split('\n')
    matches = []
    for text in corpus:
        if any(keyword.lower() == text.lower() for keyword in resume_text):
            matches.append(text)
    return matches

def extract_designations(resume_text):
    doc = nlp(resume_text)
    nouns = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            nouns.append(ent.text)
    corpus_file = open("job-titles.txt", "r")
    corpus = corpus_file.read()
    corpus = corpus.split('\n')
    matching_job_titles = []
    for title in corpus:
        if any(noun.lower() == title.lower() for noun in nouns):
            matching_job_titles.append(title)
    return matching_job_titles


In [6]:

def get_skills_and_scores(resume_text):
    nlp = spacy.load("en_core_web_lg")
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    annotations = skill_extractor.annotate(resume_text)
    skills_full = [match['doc_node_value'] for match in annotations['results']['full_matches']]
    skills_partial = [match['doc_node_value'] for match in annotations['results']['ngram_scored']]
    score_full = [match['score'] for match in annotations['results']['full_matches']]
    score_partial = [match['score'] for match in annotations['results']['ngram_scored']]
    skills = skills_full + skills_partial
    scores = score_full + score_partial
    return skills, scores

def get_sections(text):
    # Define regular expressions to match section headers
    summary_regex = r"(Professional Summary|Summary)"
    objective_regex = r"(Objective|Career Objective)"
    education_regex = r"(Education|Academic Background|Academic Qualifications)"
    work_experience_regex = r"(PROFESSIONAL EXPERIENCE|Work Experience|Professional Experience|(^|\n)([ \t]*)(EXPERIENCE)([ \t]*)(\n|$))"
    skills_regex = r"(Skills|Technical Skill(s|-set)?|Computer Skill(s|-set)?)"

    # Make section header regexes case insensitive
    summary_regex = re.compile(summary_regex, re.IGNORECASE)
    objective_regex = re.compile(objective_regex, re.IGNORECASE)
    education_regex = re.compile(education_regex, re.IGNORECASE)
    work_experience_regex = re.compile(work_experience_regex, re.IGNORECASE)
    skills_regex = re.compile(r"(?i)Skills|Technical Skills|Computer Skills|Technical skill-set")

    # Initialize current position and current section header
    current_position = 0
    current_header = 'Summary'

    # Initialize dictionary to hold extracted sections
    sections = {}

    # Loop through section headers and extract text between them
    for match in re.finditer('|'.join([summary_regex.pattern, objective_regex.pattern,
                                       education_regex.pattern, work_experience_regex.pattern,
                                       skills_regex.pattern]), text):
        section_text = text[current_position:match.start()].strip()
        sections[current_header] = section_text
        current_position = match.end()
        current_header = match.group(0)

    # Extract text for last section
    section_text = text[current_position:].strip()
    sections[current_header] = section_text
    
    return sections

def get_skills_section(resume_text):
    sections = get_sections(resume_text)
    skills_regex = r"(Skills|Technical Skill(s|-set)?|Computer Skill(s|-set)?)"
    skills_regex = re.compile(r"(?i)Skills|Technical Skills|Computer Skills|Technical skill-set")
    skill_sections=[]
    for key in sections.keys():
        if re.match(skills_regex, key):
            skill_sections.append(sections[key])
    return (' '.join(skill_sections))



In [7]:
import time

def ner_ml_rule(file_name, resume_text):
    start_time = time.time()
    # get each entites
    name = extract_names(resume_text)
    phone_num = extract_mobile_number(resume_text)
    email = extract_email(resume_text)
    qualifications = extract_degree(resume_text)
    graduated_year = extract_grad_years(resume_text)
    location = extract_locations(resume_text)
    skills, scores = get_skills_and_scores(get_skills_section(pdf_to_text(file_name)))
    university = extract_organization(resume_text)
    company = extract_company(resume_text)
    designation = extract_designations(resume_text)
    
    keywords = ["institution", "college", "university"]
    university = [item for item in university if any(keyword in item.lower() for keyword in keywords)]
    
    # print out the result
    print("=================================== RESULT OF ML+Rule-BASED NER ===================================")
    print("Name: " , name)
    print("\nPhone Number: " , phone_num)
    print("\nEmail: " , set(email))
    print("\nQualifications: " , qualifications)
    print("\nGraduation Year: " , set(graduated_year))
    print("\nLocation: " , set(location))
    print("\nSkills: " , set(skills))
    print("\nTotal Scores: " , sum(scores))
    print("\nUniversity: " , university)
    print("\nCompany: " , set(company))
    print("\nDesignation: " , set(designation))
    print("======================================== END OF RB+ML NER ========================================")
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(elapsed_time))
    # return the values
    return file_name,name,phone_num,email,qualifications,graduated_year,location, skills, university, company, designation

## 2 - Training 

In [8]:
# Create an empty DataFrame with the desired column names
train_df = pd.DataFrame(columns=['file_name','name', 'phone_num', 'email', 'qualifications', 'graduated_year', 'location', 'skills', 'university', 'company', 'designation'])

In [9]:
train_df.loc[0] = ner_ml_rule('resume\R1.pdf', pdf_to_text('resume\R1.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Amrinder', 'Amrinder Pelia']

Phone Number:  7037430795

Email:  {'Praveen@indiquesolutions.com', 'amirindersingh1234@gmail.com'}

Qualifications:  []

Graduation Year:  {'Around 10 years', 'weekly', '5010', 'Dec 2010', 'daily', '4010 to 5010', 'Windows 2008/', '4010 / 5010', 'May 2012 - April 2013', 'Jan 2011 - Apr 2012', '2015', 'May 2013- Feb 2015'}

Location:  {'MS Visio', 'Axure', 'MS Project', 'Specialty', 'Jackson', 'Engineering', 'Cincinnati', 'Norristown'}

Skills:  {'methodologies waterfall', 'functional specification', 'defect tracking', 'concept of operation', 'testing tools', 'scrum', 'software development methodology', 'data modeling', 'business requirement', 'microsoft sql server', 'quality center', 'test script', 'compatibility', 'gap analysis', 'agile', 'access', 'fault tolerance', 'requirement analysis', 'project plans', 'transl

In [10]:
train_df.loc[1] = ner_ml_rule('resume\R2.pdf', pdf_to_text('resume\R2.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Mounika', 'Mounika Surender']

Phone Number:  7175595134

Email:  {'smounika.ba@gmail.com'}

Qualifications:  ['Masters', 'Master']

Graduation Year:  {'weekly', 'More than 7 years', 'daily', '4010-5010', '4010', '2015', 'under 65', '7', 'July 2014', 'Jan 2010-Sep 2011 \nRole', 'Jan 2013 – June 2014 \nRole:', '2010', 'Jan 2016', '65', '2012', '5010', '2000/2003', '10.0', '11'}

Location:  {'Miami', 'Long Beach', 'USA', 'US', 'IA', 'Walnut Creek', 'SRS', 'SDLC', 'Hartford'}

Skills:  {'motivated self', 'self starter', 'methodology agile', 'c', 'testing tools', 'designing test', 'scrum', 'databases oracle', 'safe', 'business modeling', 'script', 'business tools', 'test script', 'quality center', 'test case', 'test plans test', 'HTML', 'windows xp', 'access', 'dos', 'pl sql', 'source', 'uml', 'server windows', 'sql', 'professional', 'collaboratively

In [11]:
train_df.loc[2] = ner_ml_rule('resume\R3.pdf', pdf_to_text('resume\R3.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Sarath', 'Sarath Kumar']

Phone Number:  4693249282

Email:  {'sarathm.java@gmail.com'}

Qualifications:  ['Bachelor']

Graduation Year:  {'spring', 'Struts, spring', 'Struts', '7+ Years', 'Spring'}

Location:  {'Grids', 'JBoss', 'Singleton', 'Messages', 'Build', 'Kansas', 'Created POJO', 'MO', 'Clover', 'JERSEY', 'Mockito', 'KY', 'Louisville', 'JDBC', 'Maven', 'India', 'JSON', 'Facade', 'Intellij', 'Multi', 'Angular.js', 'Marietta', 'Jersey', 'jQuery', 'Junit', 'Used Jersey', 'JSTL', 'Servlets', 'Toolbars', 'Button', 'JQUERY', 'Business', 'iBatis', 'Node.js', 'Backbone.js', 'PostgreSQL', 'Database', 'UI', 'Bridge', 'XSLT', 'Birmingham'}

Skills:  set()

Total Scores:  0

University:  []

Company:  {'IBM', 'Apple', 'Oracle', 'Microsoft'}

Designation:  set()
Execution time: 16.32 seconds


In [12]:
train_df.loc[3] = ner_ml_rule('resume\R4.pdf', pdf_to_text('resume\R4.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Ravi', 'Ravi Reddy']

Phone Number:  4025000041

Email:  {'rvk.reddy240@gmail.com'}

Qualifications:  []

Graduation Year:  {'2000', 'the day', '3.0/4.0', '2002', 'Oct 2011- July 2012 \n', '2005/2008', 'May 2007–June 2009', 'June 2009-', '9+ years'}

Location:  {'Zurich', 'Invoice', 'Singleton', 'Apache', 'Façade', 'Dallas', 'Chesterfield', 'New York City', 'CSS3', 'Windows', 'Maven', 'India', 'Linux', 'Ant', 'Log4j', 'New York', 'New Delhi', 'Junit', 'NoSQL', 'sudo', 'Solaris', 'Git', 'Preparation', 'Mercurial', 'XSLT', 'Design', 'Jenkins', 'Atlanta'}

Skills:  {'node js', 'eclipse', 'c++', 'database oracle', 'application server', 'systems windows', 'c', 'perforce', 'jbuilder', 'java', 'microsoft visio', 'php', 'IBM', 'subversion', 'JMS', 'build tool', 'web technologies', 'html5', 'maven', 'PVCS', 'web application', 'unix', 'Case', 'xpath', 'pl 

In [13]:
train_df.loc[4] = ner_ml_rule('resume\R5.pdf', pdf_to_text('resume\R5.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Suresh', 'Suresh Basetti']

Phone Number:  19259003354

Email:  {'sureshkumar.basetti@gmail.com'}

Qualifications:  ['Master', 'Master', 'Master', 'Master', 'Master', 'Master', 'Master', 'Master', 'Master', 'Master', 'Master', 'Master']

Graduation Year:  {'June 2004', 'Scala', 'Jan 2010 – June 2014', '16 years', 'July 2004 -  ', 'July 2014', '900 3354', 'every 3 weeks', 'daily', 'Dec 2009', 'Sept 2005', 'Feb 2017 - Present', 'Oct 2005'}

Location:  {'Japan', 'Cloudhub', 'Client', 'Zookeeper', 'Sablime', 'UK', 'Tuxedo', 'JDBC', 'Maven', 'India', 'Oak Brook', 'Ireland', 'Linux', 'Oozie', 'Red Hat', 'Sweden', 'Log4j', 'JDK', 'USA', 'Spark', 'PO', 'Impala', 'Ericsson', 'XSLT', 'SDLC', 'UNIX', 'Hungary'}

Skills:  {'physical', 'received', 'red hat linux', 'HTML', 'coordination', 'team management', 'certify scrum master', 'certified scrum', 'environme

In [14]:
train_df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R1.pdf,"[Amrinder, Amrinder Pelia]",7037430795,"[amirindersingh1234@gmail.com, Praveen@indique...",[],"[Around 10 years, weekly, Windows 2008/, 2015,...","[MS Visio, MS Project, Engineering, Cincinnati...","[business requirement, system requirement, sys...",[Michigan State University],[Microsoft],[system admin]
1,resume\R2.pdf,"[Mounika, Mounika Surender]",7175595134,"[smounika.ba@gmail.com, smounika.ba@gmail.com,...","[Masters, Master]","[More than 7 years, 4010-5010, 2000/2003, 7, 1...","[Long Beach, USA, IA, Miami, SRS, Hartford, SD...","[self starter, test script, test case, pl sql,...",[],"[HP, Molina Healthcare, Oracle, Humana]","[physician, scrum master]"
2,resume\R3.pdf,"[Sarath, Sarath Kumar]",4693249282,[sarathm.java@gmail.com],[Bachelor],"[7+ Years, Struts, spring, Struts, spring, Spr...","[Singleton, Bridge, Multi, Angular.js, Backbon...",[],[],"[Oracle, IBM, Apple, Microsoft]",[]
3,resume\R4.pdf,"[Ravi, Ravi Reddy]",4025000041,[rvk.reddy240@gmail.com],[],"[9+ years, 3.0/4.0, 2005/2008, 2002, 2000, the...","[Design, Singleton, Ant, Maven, NoSQL, Prepara...","[pl sql, web application, application server, ...",[],"[Oracle, Amazon, IBM]",[]
4,resume\R5.pdf,"[Suresh, Suresh Basetti]",19259003354,"[sureshkumar.basetti@gmail.com, sureshkumar.ba...","[Master, Master, Master, Master, Master, Maste...","[900 3354, 16 years, 900 3354, Scala, Feb 2017...","[SDLC, UNIX, Linux, Sweden, Hungary, Ireland, ...","[technical documentation, release note, resour...",[Osmania University],"[HP, Oracle, McDonald's, Microsoft, TCS]","[java developer, scrum master]"


## 3 - Testing and validate result

### 3.1 Without Preprocessing

In [15]:
# Create an empty DataFrame with the desired column names
df = pd.DataFrame(columns=['file_name','name', 'phone_num', 'email', 'qualifications', 'graduated_year', 'location', 'skills', 'university', 'company', 'designation'])

In [16]:
df.loc[0] = ner_ml_rule('resume\R6.pdf', pdf_to_text('resume\R6.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Abiral', 'Abiral Pandey']

Phone Number:  9402423303

Email:  {'abiral.pandey88@gmail.com'}

Qualifications:  ['Bachelor', 'Bachelor of Computer Science']

Graduation Year:  {'December 2015', 'January 2012', 'April 2016', 'November 2014', 'May 2013 – October 2014', 'December 2015 -  March 2016', '2005/2008', 'Struts, Spring', '6 years', 'Struts 1.x/2.x, Spring 2.5/3.0', 'Spring', 'the spring', 'April 2013'}

Location:  {'Denton', 'Singleton', 'Kansas City', 'Epsilon', 'Angular', 'US', 'Account Transfer', 'Woonsocket', 'Node', 'Mockito', 'Ogden', 'Irving', 'Texas', 'CSS3', 'JDBC', 'Maven', 'Axis-2', 'DAO Pattern', 'Linux', 'Log4j', 'Jersey', 'OOP', 'Utah', 'Web Framework', 'JSTL', 'NoSQL', 'Rhode Island', 'Ant', 'Missouri', 'Jenkins'}

Skills:  {'node js', 'eclipse', 'systems windows', 'solaris', 'design patterns', 'XHTML', 'javabeans', 'log4j', '

In [17]:
df.loc[1] = ner_ml_rule('resume\R7.pdf', pdf_to_text('resume\R7.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Adelina', 'Adelina Erimia']

Phone Number:  4693317851

Email:  {'erimia@msn.com'}

Qualifications:  ['Master']

Graduation Year:  {'10 years', 'weekly', 'daily'}

Location:  {'Legal', 'Hartford', 'Savannah', 'Bermuda', 'Romania', 'US'}

Skills:  {'smartsheet', 'agile', 'scrum', 'microsoft word', 'problem management', 'com', 'ppm'}

Total Scores:  6.549387753009796

University:  ['The George Washington University School of Business - Project Management \n \nLiceul Industrial']

Company:  set()

Designation:  {'scrum master', 'green belt'}
Execution time: 10.39 seconds


In [18]:
df.loc[2] = ner_ml_rule('resume\R8.pdf', pdf_to_text('resume\R8.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Jagan', 'Jagan S']

Phone Number:  12484121658

Email:  {'jagansrconsult@gmail.com'}

Qualifications:  ['Master', 'Master', 'Bachelor']

Graduation Year:  {'3 year', 'May 1994 to Jan 1996', 'Jan ‘05', 'May 1998', 'Mar 2003 to Dec 2004', 'day', '3 years', 'Nov 2011', 'May 2014', '2012', '2003', '2005-2015', '2014', 'Mar 1996', 'Feb 2010', 'Jan 2005', '90+', '20 YEARS'}

Location:  {'Early Career', 'CHICAGO', 'Charlotte', 'Madras', 'US', 'Riyadh', 'Saudi Arabia', 'Louisville', 'CICS', 'Detroit', 'India', 'Nashville', 'USA', 'New York', 'Farmington Hills', 'Peregrine', 'Center', 'Philadelphia', 'SQL', 'NC', 'Providedprogram', 'PA'}

Skills:  {'it portfolio management', 'collaborative', 'portfolio management', 'budget', 'leadership', 'information technology', 'banking', 'team building', 'manufacturing automotive', 'process improvement', 'management t

In [19]:
df.loc[3] = ner_ml_rule('resume\R9.pdf', pdf_to_text('resume\R9.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  Harshitha

Phone Number:  6505176475

Email:  {'harshithac511@gmail.com'}

Qualifications:  ['Bachelor']

Graduation Year:  {'8 years', 'July ‘16', 'Spring', 'daily'}

Location:  {'INDIA', 'Singleton', 'Knockout.js', 'Build', 'Spring JDBC', 'Charlotte', 'chicago', 'Bootstrap', 'CSS3', 'MySQL, PostgreSQL', 'JDBC', 'Maven', 'India', 'JSON', 'Linux', 'Cassandra, Groovy', 'Multi', 'Ant', 'Log4j', 'Collections', 'Junit', 'JSTL', 'Servlets', 'JAX_RS, JERSEY', 'Thyme', 'JQUERY', 'San Antonio', 'Git', 'NC', 'UI', 'Bridge', 'XML(SAX', 'Jenkins'}

Skills:  {'eclipse', 'github', 'application server', 'systems windows', 'java sql', 'design patterns', 'testing tools', 'log4j', 'jquery', 'jbuilder', 'apache cxf', 'web service', 'java', 'collections', 'script', 'apache tomcat', 'hibernate', 'JMS', 'build tool', 'html5', 'maven', 'web application', 'unix', 'postgr

In [20]:
df.loc[4] = ner_ml_rule('resume\R10.pdf', pdf_to_text('resume\R10.pdf'))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Mohid', 'Phone']

Phone Number:  8572284961

Email:  {'mohid0089300@gmail.com'}

Qualifications:  []

Graduation Year:  {'August', '2005', 'Struts', '10 to June ‘11', '8 years', 'Spring'}

Location:  {'Singleton', 'Oracle', 'Fargo', 'Client', 'Bootstrap', 'Mockito', 'Maven', 'Nordstrom', 'Linux', 'WEBLOGIC', 'HTML5', 'Log4j', 'JavaScript', 'ND', 'Seattle', 'Watertown', 'UI', 'Database', 'DB', 'Burlingame'}

Skills:  {'eclipse', 'limited', 'methodology agile', 'application server', 'c', 'business objects', 'design patterns', 'databases oracle', 'parse', 'java', 'web service', 'java classes', 'scripting', 'design documents', 'JMS', 'bootstrap', 'html5', 'web technologies', 'application development', 'query language', 'HTML', 'design experience', 'hibernate query language', 'pl sql', 'cobol', 'module', 'web server', 'web development', 'SOAP', 'clien

In [21]:
df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,"[Abiral, Abiral Pandey]",9402423303,[abiral.pandey88@gmail.com],"[Bachelor, Bachelor of Computer Science]","[6 years, Struts, Spring, Struts 1.x/2.x, Spri...","[Woonsocket, Rhode Island, US, Jersey, Singlet...","[pl sql, unix shell, shell script, web framewo...",[University of North Texas],"[Oracle, IBM, Microsoft]",[java developer]
1,resume\R7.pdf,"[Adelina, Adelina Erimia]",4693317851,"[erimia@msn.com, erimia@msn.com]",[Master],"[10 years, daily, weekly]","[Savannah, Bermuda, Savannah, Legal, US, Roman...","[microsoft word, problem management, com, ppm,...",[The George Washington University School of Bu...,[],"[green belt, scrum master]"
2,resume\R8.pdf,"[Jagan, Jagan S]",12484121658,"[jagansrconsult@gmail.com, jagansrconsult@gmai...","[Master, Master, Bachelor]","[20 YEARS, 2005-2015, 2014, Jan ‘05, 2012, May...","[US, US, CHICAGO, CICS, Philadelphia, PA, NC, ...","[process improvement, risk mitigation, it port...","[MS Central Michigan University, MSUniversity,...","[Oracle, General Motors, Ally Financial]",[]
3,resume\R9.pdf,Harshitha,6505176475,[harshithac511@gmail.com],[Bachelor],"[8 years, daily, July ‘16, Spring, Spring]","[Singleton, Bridge, Build, Collections, JDBC, ...","[pl sql, custom tag, application server, apach...",[],"[Oracle, Splunk, IBM, Wells Fargo]",[]
4,resume\R10.pdf,"[Mohid, Phone]",8572284961,[mohid0089300@gmail.com],[],"[8 years, Spring, Struts, Spring, Spring, Spri...","[Maven, HTML5, Bootstrap, Client, Singleton, O...","[web service, web service, application develop...",[],"[Oracle, IBM, Nordstrom]",[]


### 3.2 With Preprocessing

In [22]:
# Create an empty DataFrame with the desired column names
df2 = pd.DataFrame(columns=['file_name','name', 'phone_num', 'email', 'qualifications', 'graduated_year', 'location', 'skills', 'university', 'company', 'designation'])

In [23]:
df2.loc[0] = ner_ml_rule('resume\R6.pdf', cv_preprocessing(pdf_to_text('resume\R6.pdf')))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Abiral', 'Abiral Pandey']

Phone Number:  9402423303

Email:  set()

Qualifications:  ['Bachelor', 'Bachelor Computer']

Graduation Year:  {'Struts , Spring', '6 year', 'January 2012', 'April 2016', 'Struts 1.x/2.x , Spring 2.5/3.0', 'November 2014', 'May 2013 – October 2014', '2005/2008', 'December 2015 - March 2016', 'Spring', 'December 2015', 'April 2013'}

Location:  {'Denton', 'Singleton', 'Oracle Database', 'Kansas City', 'US', 'Account Transfer', 'Woonsocket', 'Node', 'Mockito', 'Ogden', 'Irving', 'Texas', 'Relational', 'Pennsylvania', 'JDBC', 'Maven', 'Log4j', 'Rhode Island Full', 'Jersey', 'OOP', 'Utah', 'Web Framework', 'JSTL', 'NoSQL', 'Spring AOP', 'Rhode Island', 'Ant', 'Missouri', 'Jenkins'}

Skills:  {'node js', 'eclipse', 'systems windows', 'solaris', 'design patterns', 'XHTML', 'javabeans', 'log4j', 'build automation', 'shell scr

In [24]:
df2.loc[1] = ner_ml_rule('resume\R7.pdf', cv_preprocessing(pdf_to_text('resume\R7.pdf')))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Adelina', 'Adelina Erimia']

Phone Number:  4693317851

Email:  set()

Qualifications:  ['Master']

Graduation Year:  {'weekly', '10 year', 'daily'}

Location:  {'Gulfstream', 'Legal', 'Hartford', 'Savannah', 'Romania', 'US'}

Skills:  {'smartsheet', 'agile', 'scrum', 'microsoft word', 'problem management', 'com', 'ppm'}

Total Scores:  6.549387753009796

University:  ['The George Washington University School Business - Project Management Liceul Industrial']

Company:  {'UnitedHealth Group'}

Designation:  {'green belt'}
Execution time: 9.81 seconds


In [25]:
df2.loc[2] = ner_ml_rule('resume\R8.pdf', cv_preprocessing(pdf_to_text('resume\R8.pdf')))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Jagan', 'Jagan S']

Phone Number:  12484121658

Email:  set()

Qualifications:  ['Master', 'Master Science', 'Master', 'Master Science', 'Bachelor', 'Bachelor Science']

Graduation Year:  {'Jan ‘ 05', '3 year', '1998 Feb 2003', 'May 1998', 'May 1994 Jan 1996', '2003', '2012 May 2014', '2005-2015', 'Mar 1996', 'Feb 2010', 'Jan 2005', '2004'}

Location:  {'CHICAGO', 'Client', 'Charlotte', 'US', 'Riyadh', 'Saudi Arabia', 'Louisville', 'CICS', 'Detroit', 'India', 'MI', 'Nashville', 'USA', 'New York', 'Farmington Hills', 'Peregrine', 'Philadelphia', 'SQL', 'NC', 'PA'}

Skills:  {'it portfolio management', 'collaborative', 'portfolio management', 'budget', 'leadership', 'information technology', 'banking', 'team building', 'manufacturing automotive', 'process improvement', 'management team', 'e', 'risk mitigation', 'timeline', 'innovative', 'value adde

In [26]:
df2.loc[3] = ner_ml_rule('resume\R9.pdf', cv_preprocessing(pdf_to_text('resume\R9.pdf')))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Harshitha', 'Harshitha Email']

Phone Number:  6505176475

Email:  set()

Qualifications:  ['Bachelor']

Graduation Year:  {'spring', '11-April', '8 year', 'daily', 'Spring'}

Location:  {'Singleton', 'Knockout.js', 'Charlotte', 'chicago', 'Cassandra , Groovy', 'Bootstrap', 'CSS3', 'JDBC', 'Maven', 'India', 'JSON', 'MySQL , PostgreSQL', 'Linux', 'Multi', 'Log4j', 'Collections', 'Junit', 'JAX_RS , JERSEY', 'JSTL', 'Servlets', 'Thyme', 'JQUERY', 'San Antonio', 'Git', 'NC', 'UI', 'Bridge', 'Ant'}

Skills:  {'eclipse', 'github', 'application server', 'systems windows', 'java sql', 'design patterns', 'testing tools', 'log4j', 'jquery', 'jbuilder', 'apache cxf', 'web service', 'java', 'collections', 'script', 'apache tomcat', 'hibernate', 'JMS', 'build tool', 'html5', 'maven', 'web application', 'unix', 'postgresql', 'jax ws', 'pl sql', 'xsd', 'uml too

In [27]:
df2.loc[4] = ner_ml_rule('resume\R10.pdf', cv_preprocessing(pdf_to_text('resume\R10.pdf')))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Mohid', 'Mohid Phone']

Phone Number:  8572284961

Email:  set()

Qualifications:  []

Graduation Year:  {'2005', '8 year', '10 June ‘ 11', '1.x 2.x ,', 'Spring'}

Location:  {'WEBLOGIC', 'XSLT', 'Log4j', 'Singleton', 'Mockito', 'Fargo', 'Maven', 'DB', 'Burlingame', 'Factory Patterns', 'Seattle', 'Nordstrom', 'Watertown', 'Linux'}

Skills:  {'eclipse', 'limited', 'methodology agile', 'application server', 'c', 'business objects', 'design patterns', 'databases oracle', 'parse', 'java', 'web service', 'java classes', 'scripting', 'design documents', 'JMS', 'bootstrap', 'html5', 'web technologies', 'application development', 'query language', 'HTML', 'design experience', 'hibernate query language', 'pl sql', 'cobol', 'module', 'web server', 'web development', 'SOAP', 'client server', 'javascript', 'angular', 'rational application developer', 'perl',

## 4 - Compare testing result

In [28]:
# before preprocessing
df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,"[Abiral, Abiral Pandey]",9402423303,[abiral.pandey88@gmail.com],"[Bachelor, Bachelor of Computer Science]","[6 years, Struts, Spring, Struts 1.x/2.x, Spri...","[Woonsocket, Rhode Island, US, Jersey, Singlet...","[pl sql, unix shell, shell script, web framewo...",[University of North Texas],"[Oracle, IBM, Microsoft]",[java developer]
1,resume\R7.pdf,"[Adelina, Adelina Erimia]",4693317851,"[erimia@msn.com, erimia@msn.com]",[Master],"[10 years, daily, weekly]","[Savannah, Bermuda, Savannah, Legal, US, Roman...","[microsoft word, problem management, com, ppm,...",[The George Washington University School of Bu...,[],"[green belt, scrum master]"
2,resume\R8.pdf,"[Jagan, Jagan S]",12484121658,"[jagansrconsult@gmail.com, jagansrconsult@gmai...","[Master, Master, Bachelor]","[20 YEARS, 2005-2015, 2014, Jan ‘05, 2012, May...","[US, US, CHICAGO, CICS, Philadelphia, PA, NC, ...","[process improvement, risk mitigation, it port...","[MS Central Michigan University, MSUniversity,...","[Oracle, General Motors, Ally Financial]",[]
3,resume\R9.pdf,Harshitha,6505176475,[harshithac511@gmail.com],[Bachelor],"[8 years, daily, July ‘16, Spring, Spring]","[Singleton, Bridge, Build, Collections, JDBC, ...","[pl sql, custom tag, application server, apach...",[],"[Oracle, Splunk, IBM, Wells Fargo]",[]
4,resume\R10.pdf,"[Mohid, Phone]",8572284961,[mohid0089300@gmail.com],[],"[8 years, Spring, Struts, Spring, Spring, Spri...","[Maven, HTML5, Bootstrap, Client, Singleton, O...","[web service, web service, application develop...",[],"[Oracle, IBM, Nordstrom]",[]


In [29]:
# after preprocessing
df2

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,"[Abiral, Abiral Pandey]",9402423303,[],"[Bachelor, Bachelor Computer]","[6 year, Struts , Spring, Struts 1.x/2.x , Spr...","[Woonsocket, Rhode Island, US, Jersey, Singlet...","[pl sql, unix shell, shell script, web framewo...",[University North Texas],"[Oracle, IBM]",[]
1,resume\R7.pdf,"[Adelina, Adelina Erimia]",4693317851,[],[Master],"[10 year, daily, weekly]","[Savannah, Legal, Gulfstream, US, Romania, Har...","[microsoft word, problem management, com, ppm,...",[The George Washington University School Busin...,[UnitedHealth Group],[green belt]
2,resume\R8.pdf,"[Jagan, Jagan S]",12484121658,[],"[Master, Master Science, Master, Master Scienc...","[2005-2015, Jan ‘ 05, 2012 May 2014, 3 year, J...","[US, US, CHICAGO, CICS, Philadelphia, PA, Nash...","[process improvement, risk mitigation, it port...",[MS Central Michigan University],"[Oracle, Ally Financial]",[]
3,resume\R9.pdf,"[Harshitha, Harshitha Email]",6505176475,[],[Bachelor],"[8 year, daily, spring, Spring, Spring, 11-Apr...","[Singleton, Bridge, Collections, JDBC, Multi, ...","[pl sql, custom tag, application server, apach...",[],"[Oracle, Splunk, IBM, Wells Fargo]",[]
4,resume\R10.pdf,"[Mohid, Mohid Phone]",8572284961,[],[],"[8 year, 1.x 2.x ,, Spring, Spring, 2005, 10 J...","[Maven, Singleton, WEBLOGIC, Watertown, Single...","[web service, web service, application develop...",[],"[Oracle, IBM, Nordstrom]",[java developer]


## Fine-tuned the pre-trained model

In [30]:
# import spacy
# from spacy.training import Example
# from spacy.util import minibatch, compounding

# # Load the pre-trained model
# nlp = spacy.load("en_core_web_sm")

# # Add your entity labels
# nlp.add_label("PRODUCT")
# nlp.add_label("BRAND")

# # Prepare your data
# train_data = [...]  # List of training examples in JSON format

# # Initialize the training
# optimizer = nlp.begin_training()
# for i in range(10):
#     losses = {}
#     batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
#     for batch in batches:
#         for text, annotations in batch:
#             example = Example.from_dict(nlp.make_doc(text), annotations)
#             nlp.update([example], sgd=optimizer, losses=losses)
#     print("Losses", losses)

# # Evaluate the model
# eval_data = [...]  # List of evaluation examples in JSON format
# scores = nlp.evaluate(eval_data)
# print(scores)
