In [179]:
from tika import parser
import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from spacy.matcher import Matcher


In [180]:
# Load spaCy's small English pipeline and build a token Matcher on its
# vocabulary at notebook scope.
# NOTE(review): extract_name() below creates its own local `nlp` and `matcher`,
# so these two objects are not used by it — confirm whether other cells rely
# on them.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)


In [181]:
def extract_text_from_resume(file_path):
    """Extract the text content of a resume file using Tika.

    Args:
        file_path: Path of the file handed to ``parser.from_file``.

    Returns:
        str: The value stored under the ``'content'`` key of the parse
        result, or an empty string when that key is missing or its value is
        ``None``/empty, so callers can always treat the result as text.
    """
    results = parser.from_file(filename=file_path)
    # The parse result may lack 'content' or carry None when no text could be
    # extracted; normalise both cases to '' instead of raising KeyError or
    # leaking None into the regex-based preprocessing.
    return results.get('content') or ''

In [182]:
def preprocess_text(extracted_text):
    """Clean raw resume text for downstream parsing.

    Steps, in order:
      1. Strip one leading list marker / punctuation character (plus the
         whitespace around it) from the start of each line.
      2. Tokenize, drop English stop words (case-insensitive) and rejoin the
         remaining tokens with single spaces.
      3. Remove whitespace around ``@`` in email-like strings.
      4. Remove periods that are neither preceded by ``@`` nor followed by a
         word character or whitespace.

    Args:
        extracted_text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        str: The cleaned text (``''`` for ``None``/empty input).
    """
    # Nothing to clean; also avoids a TypeError from re.sub on None.
    if not extracted_text:
        return ''

    # Define regex pattern to match list symbols and punctuation.
    # NOTE(review): the class contains a literal 'x', so a line starting with
    # a lone "x " is treated as a bullet — confirm this is intended.
    pattern = r'^[\s\t]*[•*+\-–>❖➢★■›»○✓✔✗✘x;:,�()\'\"“”‘’][\s\t]+'

    # Remove list symbols and punctuation using regex
    clean_text = re.sub(pattern, '', extracted_text, flags=re.MULTILINE)

    # Remove stop words and extra spaces
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(clean_text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    clean_text = ' '.join(filtered_words)

    # Remove extra spaces between email addresses.
    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
    # literal '|' because '|' has no alternation meaning inside brackets.
    email_pattern = r'(?<![^\s])\s*([A-Za-z0-9._%+-]+)\s*@\s*([A-Za-z0-9.-]+\.[A-Za-z]{2,})(?![^\s])'
    clean_text = re.sub(email_pattern, r'\1@\2', clean_text)

    # Remove periods that are not part of a sentence or email address
    clean_text = re.sub(r'(?<!@)\.(?!\w|\s)', '', clean_text)

    return clean_text

In [183]:
def extract_name(resume_text):
    """Print the full name and first name found in ``resume_text``.

    The name is taken to be the earliest run of two or more consecutive
    tokens tagged as proper nouns (``PROPN``); when several matches begin at
    that position, the longest one is used. Nothing is printed when the text
    is empty or no such run is found.

    Args:
        resume_text: Resume text to analyse; ``None``/empty is accepted.

    Returns:
        None. The result is written to stdout as two lines,
        ``Full name: ...`` and ``First name: ...``.
    """
    if not resume_text:
        return

    # NOTE(review): the model is loaded on every call, which is slow; consider
    # reusing a pipeline loaded once at notebook scope.
    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)
    # First name and last name are assumed to be tagged as proper nouns.
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN', 'OP': '+'}]
    matcher.add('FULL_NAME', [pattern])

    doc = nlp(resume_text)
    matches = matcher(doc)
    if not matches:
        return

    # The pattern always spans at least two tokens, so the previous
    # "skip if len > 1" filter discarded every match. The '+' operator also
    # yields overlapping sub-spans, so select a single one explicitly:
    # earliest start, then the longest span at that start.
    _, start, end = min(matches, key=lambda match: (match[1], -match[2]))
    full_name = doc[start:end]
    print('Full name:', full_name.text)
    # The first token of the span is the first name (not the whole span).
    print('First name:', full_name[0].text)

# def extract_name(resume_text):
#     nlp = spacy.load("en_core_web_sm")
#     matcher = Matcher(nlp.vocab)   
#     # First name and Last name are always Proper Nouns
#     pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN', 'OP': '?'}]
#     matcher.add('FULL_NAME', [pattern])


#     doc = nlp(resume_text)
#     matches = matcher(doc)
#     for match_id, start, end in matches:
#         full_name = doc[start:end]
#         print('Full name:', full_name.text)
#         if len(full_name) == 2:
#             first_name, last_name = full_name
#             middle_name = None
#         else:
#             first_name, middle_name, last_name = full_name
#         print('First name:', first_name.text)
#         if middle_name is None:
#             print('Middle name: None')
#         else:
#             print('Middle name:', middle_name.text)
#         print('Last name:', last_name.text)


In [184]:
# Run the pipeline on one sample resume: extract its text, clean it, and
# print the detected name. The path is relative to the notebook's working
# directory.
resume ='../data/raw/dada.pdf'
extracted_text = extract_text_from_resume(resume)
clean_text = preprocess_text(extracted_text)
# NOTE(review): extract_name() is given the raw extracted text, and
# `clean_text` is not used in this cell — confirm that is intended.
extract_name(extracted_text)
