In [1]:
## pdfminer for converting pdf files to html file
from pdfminer.layout import LAParams
from pdfminer.converter import HTMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
import io

import re
## bs4 for manipulating HTML tree
from bs4 import BeautifulSoup
## spacy for extraing keywords
import spacy
from spacy.matcher import Matcher
## for getting current time
from datetime import datetime
import calendar

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
model = spacy.load('en_core_web_lg')

In [4]:
# is_heading_upper_case = False
education_token = model('education')
skills_token = model('skills')
experience_token = model('experience')
achievement_token = model('achievement')

In [41]:
fname = '/path/resume.pdf'

In [6]:
def read_file(fname):
    rsrcmgr = PDFResourceManager(caching=False)
    retstr = io.StringIO()
    codec = False
    laparams = LAParams()
    scale = 1.0
    layoutmode='normal'
    imagewriter = None
    resume_html = ''
    with open(fname, "rb") as fp:
        device = HTMLConverter(fp, retstr, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, imagewriter=imagewriter)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            resume_html =  retstr.getvalue()
    parsed_html = BeautifulSoup(resume_html,"html.parser")
    return parsed_html.body

In [7]:
def create_dict(data):
    data = data.replace('"',"'")
    data = data.replace('; ','", "')
    data = data.replace(';','')
    data = data.replace(":",'":"')
    return eval('{"'+data+'"}')

In [8]:
def get_text_size_and_frequency(htmldata):
    font_size_frequency = {}
    for span_data in htmldata.find_all('span'):
        text_style = create_dict(span_data.attrs['style'])
        if 'font-size' in text_style:
            font_size = int(text_style['font-size'].replace('px',''))
            if font_size in font_size_frequency:
                font_size_frequency[font_size] = font_size_frequency[font_size]+1
            else:
                font_size_frequency[font_size] = 1
    return font_size_frequency

In [9]:
def get_maximum_frequency(font_size_dict):
    max_freq = 0
    max_freq_size = 0
    for key in font_size_dict:
        if max_freq < font_size_dict[key]:
            max_freq = font_size_dict[key]
            max_freq_size = key
    return max_freq, max_freq_size

In [10]:
def is_heading(text_style_dict, data, max_frequency_size_font):
    if 'font-size' in text_style_dict and ((max_frequency_size_font < int(text_style_dict['font-size'].replace('px',''))) or (data.text.isupper())):
        return True
    return False

In [11]:
def is_key_present(text,key_token):
    tokens = model(text.lower().replace('\n',' '))
    for token in tokens:
        if token.similarity(key_token)>0.65:
            return True
    return False

In [12]:
def is_resume_heading(text_style_dict, data, max_frequency_size_font):
    is_heading_upper_case = False
    if is_heading(text_style_dict,data, max_frequency_size_font) and (is_key_present(data.text,experience_token) or is_key_present(data.text,skills_token) or is_key_present(data.text,education_token)):
        if data.text.isupper():
            is_heading_upper_case = True
        return True, is_heading_upper_case
    return False, is_heading_upper_case

In [13]:
def get_all_possible_main_heading(htmldata,max_frequency_size_font):
    possible_heading_with_frequency = {}
    is_heading_upper_case = False
    for span_data in htmldata.find_all('span'):
        text_style = create_dict(span_data.attrs['style'])
        if is_resume_heading(text_style, span_data, max_frequency_size_font)[0]:
            is_heading_upper_case = is_resume_heading(text_style, span_data, max_frequency_size_font)[1]
            font_size = int(text_style['font-size'].replace('px',''))
            if font_size in possible_heading_with_frequency:
                possible_heading_with_frequency[font_size]+=1
            else:
                possible_heading_with_frequency[font_size]=1
    return possible_heading_with_frequency, is_heading_upper_case

In [14]:
def is_of_heading_size(text_style_dict, data, heading_size, is_heading_upper_case):
    if 'font-size' in text_style_dict and (heading_size == int(text_style_dict['font-size'].replace('px',''))) and ((is_heading_upper_case and data.text.isupper()) or (not is_heading_upper_case and not data.text.isupper())):
        return True
    return False

In [15]:
def is_name(text_style_dict, data, is_first_heading, max_frequency_size_font):
    if is_heading(text_style_dict, data, max_frequency_size_font):
        if is_first_heading:
            return True
        return False
    return False

In [16]:
def is_education(text_style_dict, data, heading_size, is_heading_upper_case):
    if is_of_heading_size(text_style_dict,data,heading_size,is_heading_upper_case) and is_key_present(data.text,education_token):
        return True
    return False

In [17]:
def is_skills(text_style_dict, data, heading_size, is_heading_upper_case):
    if is_of_heading_size(text_style_dict,data,heading_size,is_heading_upper_case) and is_key_present(data.text,skills_token):
        return True
    return False

In [18]:
def is_experienced(text_style_dict, data, heading_size, is_heading_upper_case):
    if is_of_heading_size(text_style_dict,data,heading_size,is_heading_upper_case) and is_key_present(data.text,experience_token):
        return True
    return False

In [19]:
def get_mail_id(data):
    match = re.search(r'[\w\.-]+@[\w\.-]+', data)
    return match.group(0).replace('\n',' ')

In [20]:
def get_contact_number(data):
    mobile_number = re.findall(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', data)
    return mobile_number[0].replace('\n',' ')

In [21]:
def experience_matching_pattern(year_keywords,year_synonyms):
    matcher = Matcher(model.vocab)
    matcher.add("month_year_punct_present", None, [{"LOWER": {"IN":year_keywords}}, {"LIKE_NUM": True}, {"IS_PUNCT":True}, {"LOWER": "present"}])
    matcher.add("month_year", None, [{"LOWER": {"IN":year_keywords}}, {"LIKE_NUM": True}])
    matcher.add("year_punct_month", None, [{"LIKE_NUM": True}, {"IS_PUNCT":True}, {"LOWER": {"IN":year_keywords}}])
    matcher.add("year_month", None, [{"LIKE_NUM": True}, {"LOWER": {"IN":year_keywords}}])
    matcher.add("year_exp", None, [{"LIKE_NUM": True}, {"LOWER": {"IN":year_synonyms}}])
    matcher.add("year_punct_exp", None, [{"LIKE_NUM": True}, {"IS_PUNCT":False}, {"LOWER": {"IN":year_synonyms}}])
    return matcher

In [22]:
def get_year_matched_tokens(years, matcher):
    matches = []
    for i in years:
        doc = model(i.replace('.',' '))
        matches.append(matcher(doc))
    return matches

In [23]:
def add_tokens_if_absent(experience, candidate_info):
    for i in candidate_info:
        tokens = model(i.replace('.',' '))
        for ent in tokens.ents:
            if ent.label_ == 'DATE':
                experience.append(ent.text)
    return experience

In [24]:
def get_candidates_experience_in_year(candidate_info, matcher):
    matches = get_year_matched_tokens(candidate_info,matcher)
    experience = []
    for i,match in enumerate(matches):
        if len(match)>0:
            experience.append(candidate_info[i])
    if len(experience) == 0:
        experience = add_tokens_if_absent(experience, candidate_info)

    return experience

In [25]:
def get_present_time():
    now = datetime.now()
    current_time = now.strftime("%b %Y")
    return current_time

In [26]:
def convert_year(year):
    month_val = dict((v.lower(),k) for k,v in enumerate(calendar.month_abbr))
    if year == 'present':
        month_year = get_present_time().strip().lower().split(' ')
    else:
        month_year = year.strip().lower().split(' ')
    if len(month_year)>1:
        return float(month_year[1])+0.001*float(month_val.get(month_year[0]))
    else:
        return float(month_year[0])+0.001*float(month_val.get(''))

In [27]:
def get_year_diff(years):
    time_diff = []
    for year in years:
        year = year.lower().split('-')
        if len(year)>1:
            start = convert_year(year[0].strip())
            end = convert_year(year[1].strip())
            time_diff.append((str(start)+'-'+str(end),str(round(end-start,3))))
        else:
            time_diff.append((str(year),str(0)))
    return time_diff

In [28]:
def get_years_from_education_and_experience(candidate_info, matcher):
    return get_year_diff(get_candidates_experience_in_year(candidate_info, matcher))

## not catered for cases:
<ul>
    <li>([('2018', '2018'),('2013', '2013')], [])</li>
    <li>([('2018.001-2019.009', '1.008'),('2013', '2013')], [])</li>
</ul>

In [29]:
def get_time_interval(experience):
    exp = 0.0
    for i in experience:
        if len(i[0].split('-'))>1:
            exp+=float(i[1])
    final_exp = int(str(exp).split('.')[0])+int(int(str(exp).split('.')[1])/12)+0.01*int(int(str(exp).split('.')[1])%12)
    return final_exp

In [30]:
def preprocess_skills_data(skill_set):
    skills = skill_set
    skill_set = []
    for skill in skills:
        skill_set.extend(skill.split(','))
    return skill_set

In [35]:
def preprocess_experience_data(data, matcher):
    data = get_years_from_education_and_experience(data,matcher)
    return get_time_interval(data)

In [36]:
def extract_candidate_details(htmldata, max_frequency_size_font, heading_size, is_heading_upper_case):
    candidate_info = {}
    is_first_heading = True
    heading_found = False
    key = None
    year_keywords = ["jan","feb","mar","apr","may","jun","jul","aug","sept","oct","nov","dec"]
    year_synonyms = ["yr","yrs","year","years"]
    matcher = experience_matching_pattern(year_keywords, year_synonyms)
    for span_data in htmldata.find_all('span'):
        text_style = create_dict(span_data.attrs['style'])
        if is_name(text_style, span_data, is_first_heading, max_frequency_size_font):
            candidate_info['name'] = span_data.text.replace('\n',' ').strip()
            is_first_heading = False
        elif is_education(text_style, span_data, heading_size, is_heading_upper_case):
            key = 'education'
            candidate_info[key] = []
        elif is_skills(text_style, span_data, heading_size, is_heading_upper_case):
            key = 'skills'
            candidate_info[key] = []
        elif is_experienced(text_style, span_data, heading_size, is_heading_upper_case):
            key = 'experience'
            candidate_info[key] = []
        elif is_of_heading_size(text_style, span_data, heading_size, is_heading_upper_case):
            key = None
        elif key is not None:
            if len(span_data.text.replace('\n',' ')) > 0:
                candidate_info[key].append(span_data.text.replace('\n',' ').strip())
    
    candidate_info['mail_id'] = get_mail_id(htmldata.get_text().replace('\n',' '))
    candidate_info['contact_number'] = get_contact_number(htmldata.get_text().replace('\n',' '))
    if 'skills' in candidate_info:
        candidate_info['skills'] = preprocess_skills_data(candidate_info['skills'])
    if 'experience' in candidate_info:
        candidate_info['experience'] = preprocess_experience_data(candidate_info['experience'], matcher)
    if 'education' in candidate_info:
        candidate_info['education'] = preprocess_experience_data(candidate_info['education'], matcher)
    return candidate_info

In [39]:
def extract_text(filePath):
    parsed_html = read_file(filePath)
    font_size_frequency = get_text_size_and_frequency(parsed_html)
    max_frequency_font, max_frequency_size_font = get_maximum_frequency(font_size_frequency)
    possible_heading_with_frequency, is_heading_upper_case = get_all_possible_main_heading(parsed_html,max_frequency_size_font)
    heading_size = get_maximum_frequency(possible_heading_with_frequency)[1]
    candidate_info = extract_candidate_details(parsed_html,max_frequency_size_font,heading_size,is_heading_upper_case)
    return candidate_info

In [40]:
extract_text(fname)

{'name': 'Kumar Mangalam',
 'experience': 1.09,
 'education': 4.0,
 'skills': ['Java',
  ' SQL',
  ' Python',
  'Bootstrap',
  ' jQuery',
  ' d3.js',
  '\xa0 HTML5',
  ' CSS3',
  ' Javascript',
  '\xa0 Kibana',
  'MongoDB',
  ' OrientDB',
  ' MySQL',
  '\xa0 ElasticSearch',
  'Docker',
  ' gUnicorn',
  ' Nginx',
  '\xa0 Airflow',
  'Pandas',
  ' Numpy',
  ' Matplotlib',
  '\xa0 Seaborn',
  ' Sklearn',
  ' OpenCV',
  '\xa0 Tensorflow',
  'RNN',
  ' LSTM',
  ' CNN',
  ' Attention',
  '\xa0 Transformer',
  ' YOLO',
  ' SNN',
  '\xa0 Transfer learning',
  ' Statistical\xa0 classification'],
 'mail_id': 'manguatwork@gmail.com',
 'contact_number': '7004329208'}