In [1]:
import tika
import re
from tika import parser
tika.initVM()


def extract_text_from_resume(file_path):
    results = parser.from_file(filename=file_path)
    document_text = results['content']
    
    document_text = remove_metadata(document_text)
    return document_text

def remove_metadata(text):
    # Define regular expressions to match metadata and unwanted generic strings
    # regex_list = []
    regex_list = [
        r'^Title:.*$',
        r'^Author:.*$',
        r'^CreationDate:.*$',
        r'^ModDate:.*$',
        r'^Producer:.*$',
        r'^Keywords:.*$',
        r'^Subject:.*$',
        r'^Content-Type:.*$',
        r'^Resume.*$',
        r'^CV.*$',
    ]
    
    # Remove matching patterns from the text
    for regex in regex_list:
        text = re.sub(regex, '', text, flags=re.MULTILINE)
    
    return text.strip()


In [2]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
import re
import pandas as pd

nlp = spacy.load('en_core_web_trf')

def extract_name(nlp, text):
    # Process the text with spaCy
    doc = nlp(text)

    # Initialize an empty dictionary to store the extracted names
    names = {
        'first_name': '',
        'middle_name': '',
        'last_name': ''
    }

    # Look for entities in the text that are labeled as a person (PERSON)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            # Split the entity text into tokens and determine the first, middle, and last names
            tokens = ent.text.split()
            if len(tokens) == 1:
                # If there is only one token, assume it is the first name
                names['first_name'] = tokens[0]
            elif len(tokens) == 2:
                # If there are two tokens, assume the first is the first name and the second is the last name
                names['first_name'] = tokens[0]
                names['last_name'] = tokens[1]
            elif len(tokens) == 3:
                # If there are three tokens, assume the first is the first name, the second is the middle name, and the third is the last name
                names['first_name'] = tokens[0]
                names['middle_name'] = tokens[1]
                names['last_name'] = tokens[2]
            else:
                # If there are more than three tokens, assume the last three are the middle and last name
                names['first_name'] = tokens[0]
                names['middle_name'] = ' '.join(tokens[1:-1])
                names['last_name'] = tokens[-1]

            break

    return names


def extract_contact_info(text):
    # Extract phone number using regular expression
    phone_regex = r'\b(?:\d[ -.]*){9,}\b'
    phone_number = re.findall(phone_regex, text)

    # Extract email address using regular expression
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    email = re.findall(email_regex, text)

    contact_info = {
        'phone': phone_number,
        'email': email
    }

    return contact_info


def extract_skills(nlp, resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]

    # reading the csv file
    data = pd.read_csv("../data/skills.csv")

    # extract values
    skills = list(data.columns.values)

    skillset = []

    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)

    # check for bi-grams and tri-grams (example: machine learning)
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(text.lower()) for text in skills]
    matcher.add('Skill', None, *patterns)
    matches = matcher(nlp_text)
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        skillset.append(span.text)

    return [i.capitalize() for i in set([i.lower() for i in skillset])]



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_resume(resume_text):
    # Extract name
    name = extract_name(nlp, resume_text)

    # Extract contact info
    contact_info = extract_contact_info(resume_text)

    # Extract skills
    skills = extract_skills(nlp, resume_text)

    # # Extract education
    # education = extract_education(resume_text)

    # # Extract work experience
    # work_experience = extract_work_experience(resume_text)

    # Create a dictionary containing all the extracted data fields
    resume_dict = {
        'name': name,
        'contact_info': contact_info,
        'skills': skills,
        # 'education': education,
        # 'work_experience': work_experience
    }

    return resume_dict


In [4]:
nlp = spacy.load('en_core_web_trf')
file_path = '../data/Resume/mb.pdf'
extracted_text = extract_text_from_resume(file_path)
doc = nlp(extracted_text)
text = " ".join(
    [token.text for token in doc if not token.is_stop and not token.is_punct])
clean_text = re.sub('\s+', ' ', text).strip()

In [5]:
resume_info = extract_resume(clean_text)

print(resume_info)

{'name': {'first_name': 'Manas', 'middle_name': '', 'last_name': 'Bhattarai'}, 'contact_info': {'phone': ['9779862304880 ', '2011 12 2013 14'], 'email': ['bhattaraimanas@gmail.com', 'bhattaraimanas@gmail.com']}, 'skills': ['Pillow', 'Keras', 'R', 'Analysis', 'Windows', 'Unix', 'Flask', 'Tkinter', 'Java', 'English', 'Css', 'Matplotlib', 'Linux', 'Javascript', 'Tableau', 'Numpy', 'Python', 'Tensorflow', 'Excel', 'International', 'Cloud', 'Mysql', 'Opencv', 'Engineering', 'System', 'Networking', 'C++', 'Technical', 'Research', 'Github', 'Programming', 'Database', 'C', 'Email', 'Html']}


In [7]:
from extract_resume_info import get_clean_txt

# define path to resume file
resume_file_path = "../data/Resume/dada.pdf"

# call get_clean_txt() function to extract and clean text from the resume file
resume_text = get_clean_txt(resume_file_path)

# print the cleaned text
print(resume_text)


b'Anish Dahal \n \n\n Address Sanepa Lalitpur \n Contact Number +977 9817535981 \n Email anishdahal441@gmail.com \n LinkedIn https://www.linkedin.com/in/anish-dahal/ \n\n \n\n SUMMARY \n \n Passionate learn new challenging things Hardworking dedicated motivated achieve \n perfection \n\n \n\n EDUCATION \n \n IOE Thapathali Campus Kathmandu \n Bachelors Electronics Communication Engineering \n Nov 2017 Present \n\n \n CCRC Kathmandu \n +2 \n Jun 2015 2017 \n\n \n Vishwa Jyoti Higher Secondary School \n SLC \n Apr 2003 2015 \n\n \n\n PROJECTS \n \n Minor Project Dynamic Maze Solving D*-Lite Dead End Exclusion Algorithm \n group project 4 people incorporated bot maze design Webots application \n C++ programming language implementing algorithms maze exploring \n maze solving \n\n \n Major Project Data Driven Approach Isolating Vocals Instruments Music \n year long project methods Signal processing approach \n Machine Learning approach Python Programming Language completing project \n\n \n\