# 1 - Rule-based NER

In [None]:
# convert the pdf to plain text
import fitz
def pdf_to_text(document):
    """Extract the text of every page of a PDF as a single string.

    Args:
        document: Path of the file to open with ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; previously the document was left open.
    with fitz.open(document) as doc:
        return "".join(page.get_text() for page in doc)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import warnings
warnings.filterwarnings("ignore")

def getWordnetPos(words):
    """Return the WordNet POS constant for a single token.

    The token is tagged on its own with ``pos_tag`` and the first letter of
    the resulting tag selects the WordNet category. Any tag that does not
    start with J, N, V or R falls back to ``wordnet.NOUN``.

    Args:
        words: The single token (string) to tag.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``.
    """
    tagged_token = pos_tag([words])[0]
    initial = tagged_token[1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

def cv_preprocessing(cv_data):
    """Tokenize, drop English stopwords, and lemmatize resume text.

    Args:
        cv_data: Raw resume text.

    Returns:
        The surviving lemmatized tokens joined by single spaces. Stopword
        filtering is case-sensitive, and the original line breaks are lost
        because the tokens are re-joined with spaces.
    """
    # Tokenization
    tokens = word_tokenize(cv_data)

    # Remove stopwords
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [tok for tok in tokens if tok not in english_stopwords]

    # POS-aware lemmatization: each token is tagged individually
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(tok, getWordnetPos(tok)) for tok in kept_tokens)

## 1.0 Rule based Functions

In [None]:
import re
from geotext import GeoText
import pandas as pd

def get_candidate_name(resume_text):
    name_regex = r'(?P<last_name>[A-Z][a-z]+)\s(?P<first_name>(?:[A-Z][a-z]+\.?\s?)+)|(?P<name>(?:[A-Z][a-z]+\.?\s?)+)'
    matches = re.findall(name_regex, resume_text)
    for match in matches:
        if match[0] != "":
            if match[0] not in {"He", "His", "She", "Her", "It", "They", "Their", "The", "A", "An"}:
                return(match[0] + " " +  match[1])
            
def get_phone_number(resume_text):
    """Collect the distinct phone numbers found in the text.

    Two layouts are recognised: ``(ddd) ddd-dddd`` and ``ddd-ddd-dddd``.

    Args:
        resume_text: Text to search.

    Returns:
        A set of the matched number strings (empty when none are found).
    """
    phone_re = re.compile(r"\(\d{3}\) \d{3}-\d{4}|\d{3}-\d{3}-\d{4}")
    return {hit.group(0) for hit in phone_re.finditer(resume_text)}


def get_email(resume_text):
    """Collect the distinct e-mail addresses found in the text.

    Args:
        resume_text: Text to search.

    Returns:
        A set of the matched address strings (empty when none are found).
    """
    # Inside a character class '|' is a literal pipe, not alternation, so the
    # previous class ``[A-Z|a-z]`` accepted '|' in the top-level domain.
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    return set(re.findall(pattern, resume_text))


def get_qualifications(resume_text):
    """Classify the highest education level mentioned in the resume.

    The text is searched case-insensitively for the keywords "Doctor",
    "Master", "Bachelor" and "Diploma". The keywords are matched anywhere,
    including inside longer words or phrases.

    Args:
        resume_text: Text to search.

    Returns:
        One of "PhD", "Master's degree", "Bachelor's degree and Diploma",
        "Bachelor's degree", "Diploma" or "Unknown", checked in that
        priority order.
    """
    degree_patterns = (
        ("bachelor", r"(Bachelor.*)"),
        ("master", r"(Master.*)"),
        ("phd", r"(Doctor.*)"),
        ("diploma", r"(Diploma.*)"),
    )
    found = {
        degree: re.search(regex, resume_text, re.IGNORECASE) is not None
        for degree, regex in degree_patterns
    }

    # Highest level wins; guard clauses mirror the priority order.
    if found["phd"]:
        return "PhD"
    if found["master"]:
        return "Master's degree"
    if found["bachelor"]:
        if found["diploma"]:
            return "Bachelor's degree and Diploma"
        return "Bachelor's degree"
    if found["diploma"]:
        return "Diploma"
    return "Unknown"


def get_graduation_year(resume_text):
    """Collect every distinct four-digit year (19xx or 20xx) in the text.

    A year only counts when it is not directly adjacent to another digit, so
    fragments of longer numbers are ignored. Every such year is returned, not
    only those tied to a graduation.

    Args:
        resume_text: Text to search.

    Returns:
        A set of the matched years as integers (empty when none are found).
    """
    year_re = re.compile(r"(?<!\d)(20\d{2}|19\d{2})(?!\d)")
    return {int(found.group(1)) for found in year_re.finditer(resume_text)}


def get_location(resume_text):
    """Return the distinct city names that GeoText reports for the text.

    Args:
        resume_text: Text to search.

    Returns:
        A set built from the ``cities`` attribute of ``GeoText(resume_text)``.
    """
    detected_cities = GeoText(resume_text).cities
    return {city for city in detected_cities}


def get_skills(resume_text):
    """Return the entries of ``skill_set.txt`` that occur in the resume.

    The corpus file is opened relative to the current working directory and
    is split on newlines, one entry per line.

    Args:
        resume_text: Text to search.

    Returns:
        The corpus entries, in file order, that appear as a case-sensitive
        substring of ``resume_text``.
    """
    # ``with`` closes the corpus file; previously the handle was leaked.
    with open("skill_set.txt", "r") as corpus_file:
        corpus = corpus_file.read().split('\n')
    # Skip empty lines: '' is a substring of every string, so an empty entry
    # (e.g. produced by a trailing newline) was always reported as a skill.
    # NOTE(review): matching is by plain substring, so short entries such as
    # 'R' or 'C' match almost any text - consider word-boundary matching.
    return [keyword for keyword in corpus if keyword and keyword in resume_text]


def get_university_name(resume_text):
    """Find phrases of capitalised words ending in "College" or "University".

    The words are separated by any whitespace, newlines included, so a match
    can span more than one line of the text.

    Args:
        resume_text: Text to search.

    Returns:
        A list of the distinct matched phrases, in no particular order.
    """
    campus_re = re.compile(
        r"(?<!\w)[A-Z][a-z']*(?:\s+[A-Z][a-z']*)*(?!\w)(?:\s+(?:College|University))"
    )
    distinct_names = {hit.group(0) for hit in campus_re.finditer(resume_text)}
    return list(distinct_names)


def get_company(resume_text):
    """Return the entries of ``company.txt`` that occur in the resume.

    The corpus file is opened relative to the current working directory and
    is split on newlines, one entry per line.

    Args:
        resume_text: Text to search.

    Returns:
        The corpus entries, in file order, that appear as a case-sensitive
        substring of ``resume_text``.
    """
    # ``with`` closes the corpus file; previously the handle was leaked.
    with open("company.txt", "r") as corpus_file:
        corpus = corpus_file.read().split('\n')
    # Skip empty lines: '' is a substring of every string, so an empty entry
    # (e.g. produced by a trailing newline) was always reported as a company.
    # NOTE(review): matching is by plain substring, so short entries such as
    # 'HP' can match inside unrelated words - consider word-boundary matching.
    return [keyword for keyword in corpus if keyword and keyword in resume_text]


def create_ngrams(text, n):
    """Build the word n-grams of a text after normalising it.

    The text is lower-cased and every character that is not an ASCII letter,
    a digit or whitespace is removed before it is split into words.

    Args:
        text: Input text.
        n: Number of words per n-gram.

    Returns:
        A list of space-joined n-grams in order of appearance; empty when the
        text has fewer than ``n`` words.
    """
    tokens = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower()).split()
    window_count = len(tokens) - n + 1
    return [' '.join(tokens[start:start + n]) for start in range(window_count)]

def get_occupation(resume_text):
    """Return the job titles from ``job-titles.txt`` found in the resume.

    The resume is reduced to its 2- and 3-word n-grams via ``create_ngrams``
    and each corpus title is compared, lower-cased, against them. Only titles
    that equal one of those n-grams can therefore match.

    Args:
        resume_text: Text to search.

    Returns:
        The matching titles, as written in the corpus and in file order.
    """
    # load job title corpus (one title per line, read from the working dir)
    with open('job-titles.txt') as f:
        job_titles = f.read().splitlines()

    # create n-grams of the resume text
    resume_ngrams = set(create_ngrams(resume_text, 2) + create_ngrams(resume_text, 3))

    # Hash lookup per title: the previous ``any(ngram == ...)`` scanned every
    # n-gram for every title even though the n-grams were already in a set.
    return [title for title in job_titles if title.lower() in resume_ngrams]

In [None]:
import time

def rule_based_ner(file_name, resume_text):
    """Run every rule-based extractor on a resume, print and return results.

    Args:
        file_name: Identifier of the resume; passed through unchanged as the
            first element of the result.
        resume_text: Text handed to each extractor.

    Returns:
        An 11-tuple: ``file_name`` followed by name, phone numbers, e-mails,
        qualifications, graduation years, locations, skills, universities,
        companies and designations.
    """
    started_at = time.time()

    # (label, value) pairs in both print order and return order
    extracted = [
        ("Name: ", get_candidate_name(resume_text)),
        ("\nPhone Number: ", get_phone_number(resume_text)),
        ("\nEmail: ", get_email(resume_text)),
        ("\nQualifications: ", get_qualifications(resume_text)),
        ("\nGraduation Year: ", get_graduation_year(resume_text)),
        ("\nLocation: ", get_location(resume_text)),
        ("\nSkills: ", get_skills(resume_text)),
        ("\nUniversity: ", get_university_name(resume_text)),
        ("\nCompany: ", get_company(resume_text)),
        ("\nDesignation: ", get_occupation(resume_text)),
    ]

    # report the extracted entities
    print("=================================== RESULT OF RULE-BASED NER ===================================")
    for label, value in extracted:
        print(label, value)
    print("======================================== END OF RB NER ========================================")

    # the measured time covers extraction and printing
    print("Execution time: {:.2f} seconds".format(time.time() - started_at))

    return (file_name,) + tuple(value for _, value in extracted)
    

In [None]:

def get_candidate_name(resume_text):
    name_regex = r'(?P<last_name>[A-Z][a-z]+)\s(?P<first_name>(?:[A-Z][a-z]+\.?\s?)+)|(?P<name>(?:[A-Z][a-z]+\.?\s?)+)|(?P<all_caps_name>[A-Z]+)'
    matches = re.findall(name_regex, resume_text)
    for match in matches:
        if match[0] != "":
            if match[0] not in {"He", "His", "She", "Her", "It", "They", "Their", "The", "A", "An"}:
                return(match[0] + " " +  match[1])

### Create dataframe to store the result 

In [None]:
# Empty results table: one column per value returned by rule_based_ner,
# in the same order.
df = pd.DataFrame(
    columns=[
        'file_name',
        'name',
        'phone_num',
        'email',
        'qualifications',
        'graduated_year',
        'location',
        'skills',
        'university',
        'company',
        'designation',
    ]
)

## 2 - Training set

In [None]:
# Empty table for the training resumes: one column per value returned by
# rule_based_ner, in the same order.
train_df = pd.DataFrame(
    columns=[
        'file_name',
        'name',
        'phone_num',
        'email',
        'qualifications',
        'graduated_year',
        'location',
        'skills',
        'university',
        'company',
        'designation',
    ]
)

In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
train_df.loc[0] = rule_based_ner(r'resume\R1.pdf', pdf_to_text(r'resume\R1.pdf'))

Name:  Amrinder Pelia 

Phone Number:  {'703-743-0795'}

Email:  {'Praveen@indiquesolutions.com', 'amirindersingh1234@gmail.com'}

Qualifications:  Bachelor's degree

Graduation Year:  {2003, 2007, 2008, 2010, 2011, 2012, 2013, 2015}

Location:  {'University', 'Eagan', 'Cincinnati', 'Norristown', 'Jackson', 'Austin'}

Skills:  ['R', 'Microsoft SQL Server', 'Go', 'C', 'J', 'Java', 'SQL', '']

University:  ['Michigan State University']

Company:  ['HP', 'Adobe', 'Oracle', 'Microsoft', '']

Designation:  ['business analyst', 'project management', 'project manager', 'quality assurance', 'risk analyst', 'senior business analyst', 'system admin', 'system analyst', 'system architect', 'team lead', 'technical lead', 'test director']
Execution time: 39.85 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
train_df.loc[1] = rule_based_ner(r'resume\R2.pdf', pdf_to_text(r'resume\R2.pdf'))

Name:  Mounika Surender 

Phone Number:  {'717-559-5134'}

Email:  {'smounika.ba@gmail.com'}

Qualifications:  Master's degree

Graduation Year:  {2016, 2000, 2003, 2010, 2011, 2012, 2013, 2014, 2015}

Location:  {'Long Beach', 'Walnut Creek', 'Hartford', 'Miami', 'Manage', 'Irvine'}

Skills:  ['Microsoft Office', 'Microsoft PowerPoint', 'Microsoft Word', 'R', 'Go', 'C', 'Microsoft Excel', 'J', 'Microsoft Visio', 'SQL', '']

University:  []

Company:  ['HP', 'Assurant', 'Molina Healthcare', 'Oracle', 'ATI', 'Gap', 'HNI', 'UnitedHealth Group', 'Humana', 'Microsoft', 'Advanta', '']

Designation:  ['business analyst', 'business systems analyst', 'change management', 'data management', 'director oracle', 'lead business analyst', 'management sme', 'primary care physician', 'project management', 'scrum master', 'systems analyst', 'test director']
Execution time: 50.50 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
train_df.loc[2] = rule_based_ner(r'resume\R3.pdf', pdf_to_text(r'resume\R3.pdf'))

Name:  Sarath Kumar 

Phone Number:  {'469-324-9282'}

Email:  {'sarathm.java@gmail.com'}

Qualifications:  Bachelor's degree

Graduation Year:  {2000, 2003}

Location:  {'Marietta', 'Mongo', 'Ajax', 'Kansas', 'Spring', 'Louisville', 'Birmingham'}

Skills:  ['MongoDB', 'JavaScript', 'Oracle Database', 'Node.js', 'IBM WebSphere', 'PostgreSQL', 'Spring Boot', 'Jenkins CI', 'Linux', 'Spring Framework', 'jQuery', 'AJAX', 'RESTful API', 'Git', 'MySQL', 'UNIX', 'R', 'IBM Rational', 'GitHub', 'React', 'TestNG', 'Eclipse IDE', 'C', 'Apache Tomcat', 'J', 'Microsoft Visio', 'Backbone.js', 'JUnit', 'Selenium', 'Bootstrap', 'C++', 'Java', 'SQL', '']

University:  []

Company:  ['Oracle', 'PPL', 'IBM', 'Intel', 'Pool', 'HNI', 'Microsoft', '']

Designation:  ['business analyst', 'business objects', 'full stack developer', 'java developer', 'project management', 'service provider', 'software developer']
Execution time: 67.95 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
train_df.loc[3] = rule_based_ner(r'resume\R4.pdf', pdf_to_text(r'resume\R4.pdf'))

Name:  Ravi Reddy 

Phone Number:  {'(402) 500-0041'}

Email:  {'rvk.reddy240@gmail.com'}

Qualifications:  Bachelor's degree

Graduation Year:  {2000, 2002, 2003, 2005, 2007, 2008, 2009, 2011, 2012, 2014}

Location:  {'Best', 'New York', 'Enterprise', 'Chesterfield', 'Ajax', 'Atlanta', 'Hudson', 'Dallas', 'Manage', 'New Delhi', 'Spring', 'Vista'}

Skills:  ['JavaScript', 'Python', 'Node.js', 'Linux', 'Spring Framework', 'NoSQL', 'Oracle PL/SQL', 'AJAX', 'PHP', 'Git', 'MySQL', 'UNIX', 'R', 'Apache Maven', 'Perl', 'Django', 'Scala', 'C', 'Apache Tomcat', 'J', 'Microsoft Visio', 'JUnit', 'Selenium', 'C++', 'Java', 'SQL', '']

University:  []

Company:  ['HP', 'Reinsurance Group of America', 'Oracle', 'Amazon', 'IBM', 'ATI', 'HNI', 'Microsoft', '']

Designation:  ['application developer', 'business analyst', 'development engineer', 'insurance agent', 'j2ee developer', 'java developer', 'order entry', 'project management', 'software development engineer', 'software engineer', 'sql develope

In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
train_df.loc[4] = rule_based_ner(r'resume\R5.pdf', pdf_to_text(r'resume\R5.pdf'))

Name:  Suresh Basetti 

Phone Number:  set()

Email:  {'sureshkumar.basetti@gmail.com'}

Qualifications:  Master's degree

Graduation Year:  {2017, 2001, 2002, 2004, 2005, 2009, 2010, 2014}

Location:  {'Vista'}

Skills:  ['JavaScript', 'Python', 'Dig', 'Linux', 'UNIX Shell', 'Ubuntu', 'Git', 'UNIX', 'R', 'GitHub', 'Go', 'Perl', 'Scala', 'C', 'J', 'JUnit', 'C#', 'C++', 'Java', 'SQL', '']

University:  ['Osmania University']

Company:  ['HP', 'Oracle', 'Broadcom', 'Amazon', "McDonald's", 'ATI', 'HNI', 'Microsoft', 'TCS', '']

Designation:  ['agile coach', 'agile project manager', 'certified scrum master', 'change management', 'delivery manager', 'hadoop administrator', 'java developer', 'network administrator', 'process owner', 'product owner', 'project management', 'project manager', 'scrum master', 'technical lead']
Execution time: 33.08 seconds


In [None]:
# some fields are [] because the original resume does not contain that information
train_df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R1.pdf,Amrinder Pelia,{703-743-0795},"{Praveen@indiquesolutions.com, amirindersingh1...",Bachelor's degree,"{2003, 2007, 2008, 2010, 2011, 2012, 2013, 2015}","{University, Eagan, Cincinnati, Norristown, Ja...","[R, Microsoft SQL Server, Go, C, J, Java, SQL, ]",[Michigan State University],"[HP, Adobe, Oracle, Microsoft, ]","[business analyst, project management, project..."
1,resume\R2.pdf,Mounika Surender,{717-559-5134},{smounika.ba@gmail.com},Master's degree,"{2016, 2000, 2003, 2010, 2011, 2012, 2013, 201...","{Long Beach, Walnut Creek, Hartford, Miami, Ma...","[Microsoft Office, Microsoft PowerPoint, Micro...",[],"[HP, Assurant, Molina Healthcare, Oracle, ATI,...","[business analyst, business systems analyst, c..."
2,resume\R3.pdf,Sarath Kumar,{469-324-9282},{sarathm.java@gmail.com},Bachelor's degree,"{2000, 2003}","{Marietta, Mongo, Ajax, Kansas, Spring, Louisv...","[MongoDB, JavaScript, Oracle Database, Node.js...",[],"[Oracle, PPL, IBM, Intel, Pool, HNI, Microsoft, ]","[business analyst, business objects, full stac..."
3,resume\R4.pdf,Ravi Reddy,{(402) 500-0041},{rvk.reddy240@gmail.com},Bachelor's degree,"{2000, 2002, 2003, 2005, 2007, 2008, 2009, 201...","{Best, New York, Enterprise, Chesterfield, Aja...","[JavaScript, Python, Node.js, Linux, Spring Fr...",[],"[HP, Reinsurance Group of America, Oracle, Ama...","[application developer, business analyst, deve..."
4,resume\R5.pdf,Suresh Basetti,{},{sureshkumar.basetti@gmail.com},Master's degree,"{2017, 2001, 2002, 2004, 2005, 2009, 2010, 2014}",{Vista},"[JavaScript, Python, Dig, Linux, UNIX Shell, U...",[Osmania University],"[HP, Oracle, Broadcom, Amazon, McDonald's, ATI...","[agile coach, agile project manager, certified..."


## 3 - Testing on Resume and validate the accuracy

Take a few resumes and run the rule-based functions on them to check whether they perform well and are able to extract the information.

### 3.1 Without Preprocessing

In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df.loc[0] = rule_based_ner(r'resume\R6.pdf', pdf_to_text(r'resume\R6.pdf'))

Name:  Abiral Pandey 

Phone Number:  {'940-242-3303'}

Email:  {'abiral.pandey88@gmail.com'}

Qualifications:  Bachelor's degree

Graduation Year:  {2016, 2000, 2005, 2008, 2012, 2013, 2014, 2015}

Location:  {'Woonsocket', 'Mongo', 'Texas', 'University', 'March', 'Ajax', 'Denton', 'Missouri', 'Ogden', 'Irving', 'Spring', 'Kansas City'}

Skills:  ['MongoDB', 'JavaScript', 'Oracle Database', 'IBM WebSphere', 'PostgreSQL', 'Linux', 'Spring Framework', 'UNIX Shell', 'NoSQL', 'jQuery', 'Ada', 'AJAX', 'Git', 'MySQL', 'UNIX', 'R', 'GitHub', 'Go', 'IBM DB2', 'Eclipse IDE', 'C', 'Route', 'Apache Tomcat', 'J', 'JUnit', 'Bootstrap', 'Java', 'SQL', '']

University:  []

Company:  ['HP', 'Oracle', 'Toll Brothers', 'IBM', 'HNI', 'Visa', 'Microsoft', '']

Designation:  ['business analyst', 'business objects', 'j2ee developer', 'java developer', 'junior java developer', 'software engineer']
Execution time: 41.08 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df.loc[1] = rule_based_ner(r'resume\R7.pdf', pdf_to_text(r'resume\R7.pdf'))

Name:  Adelina Erimia

Phone Number:  {'469-331-7851'}

Email:  {'erimia@msn.com'}

Qualifications:  Master's degree

Graduation Year:  {2016, 2018, 2002, 2008, 2009, 2011, 2012, 2015}

Location:  {'Hartford', 'Savannah'}

Skills:  ['Smartsheet', 'Microsoft Word', 'R', 'C', 'J', '']

University:  ['Six Sigma Green Belt \nThe George Washington University']

Company:  ['UnitedHealth Group', 'Microsoft', 'ITI', '']

Designation:  ['board of directors', 'business strategist', 'green belt', 'it project manager', 'management professional', 'project coordinator', 'project management', 'project management professional', 'project manager', 'scrum master', 'service provider', 'support manager']
Execution time: 19.05 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df.loc[2] = rule_based_ner(r'resume\R8.pdf', pdf_to_text(r'resume\R8.pdf'))

Name:  Special Accomplishments

Phone Number:  {'(248) 412-1658'}

Email:  {'jagansrconsult@gmail.com'}

Qualifications:  Master's degree

Graduation Year:  {1980, 1994, 1996, 1998, 2003, 2004, 2005, 1982, 2010, 2011, 2012, 2014, 2015}

Location:  {'Charlotte', 'New York', 'Enterprise', 'Detroit', 'University', 'Manage', 'Farmington Hills', 'Chennai', 'Riyadh', 'Louisville', 'Nashville', 'Philadelphia'}

Skills:  ['R', 'C', 'J', 'SQL', '']

University:  ['Central Michigan University']

Company:  ['Oracle', 'General Motors', 'Snap', 'TIAA', 'Comerica', 'Ally Financial', 'ATI', 'Humana', 'ITI', 'ACC', '']

Designation:  ['account manager', 'change management', 'delivery manager', 'development analyst', 'manager software', 'network architect', 'oracle dba', 'program manager', 'project management', 'project manager', 'senior project manager', 'test manager']
Execution time: 26.24 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df.loc[3] = rule_based_ner(r'resume\R9.pdf', pdf_to_text(r'resume\R9.pdf'))

Name:  Full Stack Java Developer 

Phone Number:  {'650-517-6475'}

Email:  {'harshithac511@gmail.com'}

Qualifications:  Bachelor's degree

Graduation Year:  set()

Location:  {'Mongo', 'Charlotte', 'March', 'Ajax', 'San Antonio', 'Hudson', 'Date', 'Spring'}

Skills:  ['MongoDB', 'JavaScript', 'IBM WebSphere', 'PostgreSQL', 'Spring Boot', 'Linux', 'Spring Framework', 'jQuery', 'AJAX', 'RESTful API', 'Git', 'MySQL', 'UNIX', 'R', 'IBM Rational', 'GitHub', 'React', 'Eclipse IDE', 'C', 'Apache Tomcat', 'J', 'Microsoft Visio', 'JUnit', 'Bootstrap', 'Java', 'SQL', '']

University:  []

Company:  ['Oracle', 'USAA', 'Splunk', 'IBM', 'APA', 'ATI', 'HNI', 'Wells Fargo', 'Microsoft', 'Symphony', '']

Designation:  ['business objects', 'full stack developer', 'java developer', 'project management', 'software developer']
Execution time: 51.02 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df.loc[4] = rule_based_ner(r'resume\R10.pdf', pdf_to_text(r'resume\R10.pdf'))

Name:  Java Developer 

Phone Number:  {'857-228-4961'}

Email:  {'mohid0089300@gmail.com'}

Qualifications:  Unknown

Graduation Year:  {2005}

Location:  {'Mongo', 'Fargo', 'Ajax', 'Seattle', 'Burlingame', 'Watertown', 'Spring'}

Skills:  ['JavaScript', 'Spring Boot', 'Linux', 'AJAX', 'RESTful API', 'MySQL', 'UNIX', 'R', 'Go', 'Perl', 'Eclipse IDE', 'C', 'J', 'Apache Struts', 'JUnit', 'Bootstrap', 'C++', 'Java', 'SQL', '']

University:  []

Company:  ['Oracle', 'PPL', 'Amazon', 'IBM', 'Nordstrom', 'ATI', '']

Designation:  ['application developer', 'business objects', 'healthcare management', 'java developer', 'linux system admin', 'oracle developer', 'system admin', 'systems analyst']
Execution time: 41.08 seconds


In [None]:
df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,Abiral Pandey,{940-242-3303},{abiral.pandey88@gmail.com},Bachelor's degree,"{2016, 2000, 2005, 2008, 2012, 2013, 2014, 2015}","{Woonsocket, Mongo, Texas, University, March, ...","[MongoDB, JavaScript, Oracle Database, IBM Web...",[],"[HP, Oracle, Toll Brothers, IBM, HNI, Visa, Mi...","[business analyst, business objects, j2ee deve..."
1,resume\R7.pdf,Adelina Erimia,{469-331-7851},{erimia@msn.com},Master's degree,"{2016, 2018, 2002, 2008, 2009, 2011, 2012, 2015}","{Hartford, Savannah}","[Smartsheet, Microsoft Word, R, C, J, ]",[Six Sigma Green Belt \nThe George Washington ...,"[UnitedHealth Group, Microsoft, ITI, ]","[board of directors, business strategist, gree..."
2,resume\R8.pdf,Special Accomplishments,{(248) 412-1658},{jagansrconsult@gmail.com},Master's degree,"{1980, 1994, 1996, 1998, 2003, 2004, 2005, 198...","{Charlotte, New York, Enterprise, Detroit, Uni...","[R, C, J, SQL, ]",[Central Michigan University],"[Oracle, General Motors, Snap, TIAA, Comerica,...","[account manager, change management, delivery ..."
3,resume\R9.pdf,Full Stack Java Developer,{650-517-6475},{harshithac511@gmail.com},Bachelor's degree,{},"{Mongo, Charlotte, March, Ajax, San Antonio, H...","[MongoDB, JavaScript, IBM WebSphere, PostgreSQ...",[],"[Oracle, USAA, Splunk, IBM, APA, ATI, HNI, Wel...","[business objects, full stack developer, java ..."
4,resume\R10.pdf,Java Developer,{857-228-4961},{mohid0089300@gmail.com},Unknown,{2005},"{Mongo, Fargo, Ajax, Seattle, Burlingame, Wate...","[JavaScript, Spring Boot, Linux, AJAX, RESTful...",[],"[Oracle, PPL, Amazon, IBM, Nordstrom, ATI, ]","[application developer, business objects, heal..."


### 3.2 With Preprocessing

In [None]:
# Empty table for the preprocessed runs: one column per value returned by
# rule_based_ner, in the same order.
df2 = pd.DataFrame(
    columns=[
        'file_name',
        'name',
        'phone_num',
        'email',
        'qualifications',
        'graduated_year',
        'location',
        'skills',
        'university',
        'company',
        'designation',
    ]
)

In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df2.loc[0] = rule_based_ner(r'resume\R6.pdf', cv_preprocessing(pdf_to_text(r'resume\R6.pdf')))

Name:  Abiral Pandey Email 

Phone Number:  {'940-242-3303'}

Email:  set()

Qualifications:  Bachelor's degree

Graduation Year:  {2016, 2000, 2005, 2008, 2012, 2013, 2014, 2015}

Location:  {'Woonsocket', 'Mongo', 'Texas', 'March', 'Ajax', 'Denton', 'Ogden', 'Irving', 'Spring', 'Kansas City'}

Skills:  ['MongoDB', 'JavaScript', 'Oracle Database', 'IBM WebSphere', 'PostgreSQL', 'Linux', 'Spring Framework', 'UNIX Shell', 'NoSQL', 'jQuery', 'Ada', 'AJAX', 'Git', 'MySQL', 'UNIX', 'R', 'GitHub', 'Go', 'IBM DB2', 'Eclipse IDE', 'C', 'Route', 'Apache Tomcat', 'J', 'JUnit', 'Bootstrap', 'Java', 'SQL', '']

University:  []

Company:  ['HP', 'Oracle', 'Toll Brothers', 'IBM', 'HNI', 'Visa', 'Microsoft', '']

Designation:  ['business analyst', 'j2ee developer', 'java developer', 'junior java developer', 'software engineer', 'web designer']
Execution time: 34.66 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df2.loc[1] = rule_based_ner(r'resume\R7.pdf', cv_preprocessing(pdf_to_text(r'resume\R7.pdf')))

Name:  Adelina Erimia 

Phone Number:  {'469-331-7851'}

Email:  set()

Qualifications:  Master's degree

Graduation Year:  {2016, 2018, 2002, 2008, 2009, 2011, 2012, 2015}

Location:  {'Savannah'}

Skills:  ['Smartsheet', 'Microsoft Word', 'R', 'C', 'J', '']

University:  ['Six Sigma Green Belt The George Washington University']

Company:  ['UnitedHealth Group', 'Microsoft', 'ITI', '']

Designation:  ['business strategist', 'client executive', 'green belt', 'it project manager', 'management professional', 'project coordinator', 'project management', 'project management professional', 'project manager', 'scrum master', 'service provider', 'support manager', 'support team member', 'team member']
Execution time: 12.98 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df2.loc[2] = rule_based_ner(r'resume\R8.pdf', cv_preprocessing(pdf_to_text(r'resume\R8.pdf')))

Name:  Special Accomplishments 

Phone Number:  set()

Email:  set()

Qualifications:  Master's degree

Graduation Year:  {1980, 1994, 1996, 1998, 2003, 2004, 2005, 1982, 2010, 2011, 2012, 2014, 2015}

Location:  {'Charlotte', 'New York', 'Enterprise', 'Detroit', 'University', 'Manage', 'Farmington Hills', 'Chennai', 'Riyadh', 'Louisville', 'Nashville', 'Philadelphia'}

Skills:  ['R', 'C', 'J', 'SQL', '']

University:  ['Central Michigan University']

Company:  ['Oracle', 'General Motors', 'Snap', 'TIAA', 'Comerica', 'Ally Financial', 'ATI', 'Humana', 'ITI', 'ACC', '']

Designation:  ['account manager', 'change management', 'delivery manager', 'development analyst', 'manager software', 'network architect', 'oracle dba', 'program manager', 'project management', 'project manager', 'senior project manager', 'service provider', 'team member', 'test manager']
Execution time: 23.42 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df2.loc[3] = rule_based_ner(r'resume\R9.pdf', cv_preprocessing(pdf_to_text(r'resume\R9.pdf')))

Name:  Harshitha Email 

Phone Number:  {'650-517-6475'}

Email:  set()

Qualifications:  Bachelor's degree

Graduation Year:  set()

Location:  {'Mongo', 'Charlotte', 'March', 'Ajax', 'San Antonio', 'Hudson', 'Spring'}

Skills:  ['MongoDB', 'JavaScript', 'IBM WebSphere', 'PostgreSQL', 'Spring Boot', 'Linux', 'Spring Framework', 'jQuery', 'AJAX', 'RESTful API', 'Git', 'MySQL', 'UNIX', 'R', 'IBM Rational', 'Hibernate ORM', 'GitHub', 'React', 'Eclipse IDE', 'C', 'Apache Tomcat', 'J', 'Microsoft Visio', 'JUnit', 'Bootstrap', 'Java', 'SQL', '']

University:  []

Company:  ['Oracle', 'USAA', 'Splunk', 'IBM', 'APA', 'ATI', 'HNI', 'Wells Fargo', 'Microsoft', 'Symphony', '']

Designation:  ['aws developer', 'business objects', 'full stack developer', 'java developer', 'project management', 'software developer', 'team member']
Execution time: 48.51 seconds


In [None]:
# Raw strings keep the backslash literal; '\R' is not a valid escape sequence.
df2.loc[4] = rule_based_ner(r'resume\R10.pdf', cv_preprocessing(pdf_to_text(r'resume\R10.pdf')))

Name:  Mohid Phone 

Phone Number:  {'857-228-4961'}

Email:  set()

Qualifications:  Unknown

Graduation Year:  {2005}

Location:  {'Mongo', 'Fargo', 'Ajax', 'Seattle', 'Burlingame', 'Watertown', 'Spring'}

Skills:  ['JavaScript', 'Spring Boot', 'Linux', 'AJAX', 'RESTful API', 'MySQL', 'UNIX', 'R', 'Hibernate ORM', 'Go', 'Perl', 'Eclipse IDE', 'C', 'Apache Tomcat', 'J', 'Apache Struts', 'JUnit', 'Bootstrap', 'C++', 'Java', 'SQL', '']

University:  []

Company:  ['Oracle', 'PPL', 'Amazon', 'IBM', 'Nordstrom', 'ATI', '']

Designation:  ['application developer', 'business analyst', 'healthcare management', 'java developer', 'linux system admin', 'oracle developer', 'system admin', 'systems analyst', 'team member']
Execution time: 41.13 seconds


In [None]:
df2

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,Abiral Pandey Email,{940-242-3303},{},Bachelor's degree,"{2016, 2000, 2005, 2008, 2012, 2013, 2014, 2015}","{Woonsocket, Mongo, Texas, March, Ajax, Denton...","[MongoDB, JavaScript, Oracle Database, IBM Web...",[],"[HP, Oracle, Toll Brothers, IBM, HNI, Visa, Mi...","[business analyst, j2ee developer, java develo..."
1,resume\R7.pdf,Adelina Erimia,{469-331-7851},{},Master's degree,"{2016, 2018, 2002, 2008, 2009, 2011, 2012, 2015}",{Savannah},"[Smartsheet, Microsoft Word, R, C, J, ]",[Six Sigma Green Belt The George Washington Un...,"[UnitedHealth Group, Microsoft, ITI, ]","[business strategist, client executive, green ..."
2,resume\R8.pdf,Special Accomplishments,{},{},Master's degree,"{1980, 1994, 1996, 1998, 2003, 2004, 2005, 198...","{Charlotte, New York, Enterprise, Detroit, Uni...","[R, C, J, SQL, ]",[Central Michigan University],"[Oracle, General Motors, Snap, TIAA, Comerica,...","[account manager, change management, delivery ..."
3,resume\R9.pdf,Harshitha Email,{650-517-6475},{},Bachelor's degree,{},"{Mongo, Charlotte, March, Ajax, San Antonio, H...","[MongoDB, JavaScript, IBM WebSphere, PostgreSQ...",[],"[Oracle, USAA, Splunk, IBM, APA, ATI, HNI, Wel...","[aws developer, business objects, full stack d..."
4,resume\R10.pdf,Mohid Phone,{857-228-4961},{},Unknown,{2005},"{Mongo, Fargo, Ajax, Seattle, Burlingame, Wate...","[JavaScript, Spring Boot, Linux, AJAX, RESTful...",[],"[Oracle, PPL, Amazon, IBM, Nordstrom, ATI, ]","[application developer, business analyst, heal..."


## 4 - Compare Testing Results

Get all the resumes in PDF format, perform preprocessing, and finally save the results into a dataframe.

In [None]:
# before preprocessing 
df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,Abiral Pandey,{940-242-3303},{abiral.pandey88@gmail.com},Bachelor's degree,"{2016, 2000, 2005, 2008, 2012, 2013, 2014, 2015}","{Woonsocket, Mongo, Texas, University, March, ...","[MongoDB, JavaScript, Oracle Database, IBM Web...",[],"[HP, Oracle, Toll Brothers, IBM, HNI, Visa, Mi...","[business analyst, business objects, j2ee deve..."
1,resume\R7.pdf,Adelina Erimia,{469-331-7851},{erimia@msn.com},Master's degree,"{2016, 2018, 2002, 2008, 2009, 2011, 2012, 2015}","{Hartford, Savannah}","[Smartsheet, Microsoft Word, R, C, J, ]",[Six Sigma Green Belt \nThe George Washington ...,"[UnitedHealth Group, Microsoft, ITI, ]","[board of directors, business strategist, gree..."
2,resume\R8.pdf,Special Accomplishments,{(248) 412-1658},{jagansrconsult@gmail.com},Master's degree,"{1980, 1994, 1996, 1998, 2003, 2004, 2005, 198...","{Charlotte, New York, Enterprise, Detroit, Uni...","[R, C, J, SQL, ]",[Central Michigan University],"[Oracle, General Motors, Snap, TIAA, Comerica,...","[account manager, change management, delivery ..."
3,resume\R9.pdf,Full Stack Java Developer,{650-517-6475},{harshithac511@gmail.com},Bachelor's degree,{},"{Mongo, Charlotte, March, Ajax, San Antonio, H...","[MongoDB, JavaScript, IBM WebSphere, PostgreSQ...",[],"[Oracle, USAA, Splunk, IBM, APA, ATI, HNI, Wel...","[business objects, full stack developer, java ..."
4,resume\R10.pdf,Java Developer,{857-228-4961},{mohid0089300@gmail.com},Unknown,{2005},"{Mongo, Fargo, Ajax, Seattle, Burlingame, Wate...","[JavaScript, Spring Boot, Linux, AJAX, RESTful...",[],"[Oracle, PPL, Amazon, IBM, Nordstrom, ATI, ]","[application developer, business objects, heal..."


In [None]:
# after preprocessing
df2

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,resume\R6.pdf,Abiral Pandey Email,{940-242-3303},{},Bachelor's degree,"{2016, 2000, 2005, 2008, 2012, 2013, 2014, 2015}","{Woonsocket, Mongo, Texas, March, Ajax, Denton...","[MongoDB, JavaScript, Oracle Database, IBM Web...",[],"[HP, Oracle, Toll Brothers, IBM, HNI, Visa, Mi...","[business analyst, j2ee developer, java develo..."
1,resume\R7.pdf,Adelina Erimia,{469-331-7851},{},Master's degree,"{2016, 2018, 2002, 2008, 2009, 2011, 2012, 2015}",{Savannah},"[Smartsheet, Microsoft Word, R, C, J, ]",[Six Sigma Green Belt The George Washington Un...,"[UnitedHealth Group, Microsoft, ITI, ]","[business strategist, client executive, green ..."
2,resume\R8.pdf,Special Accomplishments,{},{},Master's degree,"{1980, 1994, 1996, 1998, 2003, 2004, 2005, 198...","{Charlotte, New York, Enterprise, Detroit, Uni...","[R, C, J, SQL, ]",[Central Michigan University],"[Oracle, General Motors, Snap, TIAA, Comerica,...","[account manager, change management, delivery ..."
3,resume\R9.pdf,Harshitha Email,{650-517-6475},{},Bachelor's degree,{},"{Mongo, Charlotte, March, Ajax, San Antonio, H...","[MongoDB, JavaScript, IBM WebSphere, PostgreSQ...",[],"[Oracle, USAA, Splunk, IBM, APA, ATI, HNI, Wel...","[aws developer, business objects, full stack d..."
4,resume\R10.pdf,Mohid Phone,{857-228-4961},{},Unknown,{2005},"{Mongo, Fargo, Ajax, Seattle, Burlingame, Wate...","[JavaScript, Spring Boot, Linux, AJAX, RESTful...",[],"[Oracle, PPL, Amazon, IBM, Nordstrom, ATI, ]","[application developer, business analyst, heal..."
