In [274]:
import tika
import re
from tika import parser
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
tika.initVM()
import spacy

In [275]:
# Fetch the NLTK data packages this notebook relies on; packages that are
# already installed are reported as up-to-date and not downloaded again.
nltk.download('punkt')  # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('maxent_ne_chunker')  # model behind nltk.ne_chunk_sents
nltk.download('words')  # word list needed by the named-entity chunker

[nltk_data] Downloading package punkt to /Users/mr.jpu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mr.jpu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mr.jpu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/mr.jpu/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/mr.jpu/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [276]:
def extract_names(resume_text):
    """Return the PERSON entities NLTK's chunker finds in ``resume_text``.

    The text is split into sentences, each sentence is word-tokenized and
    POS-tagged, and the tagged sentences are handed to
    ``nltk.ne_chunk_sents(..., binary=False)``. Every subtree labelled
    ``PERSON`` is flattened into one space-joined string.

    Args:
        resume_text: Plain text of the resume. ``None`` and empty or
            whitespace-only strings are accepted and yield no names.

    Returns:
        list[str]: Detected person names in document order. Duplicates are
        kept, so a name mentioned twice appears twice.
    """
    # Nothing to analyse: return before calling into NLTK, whose tokenizer
    # cannot handle None.
    if not resume_text or not resume_text.strip():
        return []

    # Sentence-split first, then tokenize and POS-tag sentence by sentence;
    # the chunker expects one tagged token list per sentence.
    sentences = nltk.sent_tokenize(resume_text)
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]

    # binary=False keeps the entity type (PERSON, ORGANIZATION, GPE, ...)
    # as the subtree label instead of a generic NE label.
    chunked_sentences = nltk.ne_chunk_sents(tagged, binary=False)

    people = []
    for sentence_tree in chunked_sentences:
        for chunk in sentence_tree:
            # Tokens outside any entity are plain (word, tag) tuples; only
            # entity chunks are Tree nodes carrying a label.
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'PERSON':
                people.append(' '.join(leaf[0] for leaf in chunk))

    return people

In [277]:
def extract_text_from_resume(file_path):
    """Extract the plain-text content of a resume file via Tika.

    Args:
        file_path: Location of the document, passed as ``filename`` to
            ``parser.from_file``.

    Returns:
        str: The value stored under the ``'content'`` key of the parse
        result, or ``''`` when that entry is missing or ``None``.
    """
    results = parser.from_file(filename=file_path)

    # A parse result without usable text would otherwise propagate None into
    # the regex/tokenizer helpers, which only accept strings.
    return (results or {}).get('content') or ''

In [278]:
# Permissive, case-insensitive e-mail matcher: a run of letters, digits and
# ". - + _" on each side of "@", ending in a dot followed by letters.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)


def extract_emails(resume_text):
    """Return every substring of ``resume_text`` that matches ``EMAIL_REG``.

    Args:
        resume_text: Text to scan. ``None`` or ``''`` is accepted and gives
            an empty result instead of raising.

    Returns:
        list[str]: Matches in order of appearance; duplicates are kept.
    """
    if not resume_text:
        return []
    return EMAIL_REG.findall(resume_text)

In [279]:
def preprocess_text(extracted_text):
    """Clean extracted resume text for downstream regex/NER processing.

    Steps, in order:
      1. Strip a leading list marker or punctuation character (and the
         whitespace after it) from the start of each line.
      2. Tokenize with ``word_tokenize`` and drop English stop words
         (case-insensitive), then re-join the tokens with single spaces.
      3. Collapse whitespace around ``@`` so ``local @ domain.tld`` becomes
         ``local@domain.tld`` again.
      4. Delete any ``.`` that is neither preceded by ``@`` nor followed by
         a word character or whitespace (this includes a ``.`` at the very
         end of the text).

    Args:
        extracted_text: Raw document text. ``None`` or ``''`` is accepted.

    Returns:
        str: The cleaned text; ``''`` when there was nothing to clean.
    """
    # None cannot be fed to re.sub, and an empty document cleans to ''.
    if not extracted_text:
        return ''

    # Line-leading marker followed by whitespace. Note that a literal "x"
    # counts as a marker too, so a line starting with "x " loses that letter.
    bullet_pattern = r'^[\s\t]*[•*+\-–>❖➢★■›»○✓✔✗✘x;:,�()\'\"“”‘’][\s\t]+'
    clean_text = re.sub(bullet_pattern, '', extracted_text, flags=re.MULTILINE)

    # Drop stop words; joining with ' ' also normalises all whitespace.
    stop_words = set(stopwords.words('english'))
    kept_tokens = [
        token for token in word_tokenize(clean_text)
        if token.lower() not in stop_words
    ]
    clean_text = ' '.join(kept_tokens)

    # Re-attach address parts separated by spaces around "@". The TLD class
    # is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal "|".
    email_pattern = r'(?<![^\s])\s*([A-Za-z0-9._%+-]+)\s*@\s*([A-Za-z0-9.-]+\.[A-Za-z]{2,})(?![^\s])'
    clean_text = re.sub(email_pattern, r'\1@\2', clean_text)

    # Remove stray periods (see step 4 in the docstring).
    clean_text = re.sub(r'(?<!@)\.(?!\w|\s)', '', clean_text)

    return clean_text

In [280]:
# NOTE(review): `clean_text` is only assigned in the next cell (In [281]), so
# this cell depends on leftover kernel state; on a fresh Restart & Run All it
# would raise NameError unless the name is defined somewhere not shown here.
clean_text

'Customer Services Director – Jan 2009 Curriculum Vitae ASMITA UPRETY Mob : 971529615457 Email : asmi_kim07@yahoo.com Abudhabi , UAE CAREER OBJECTIVES associated progress continuously accepting higher responsibilities challenges . Looking towards assignment wherein utilize skills achieve set targets , organizational goals results thereof . Currently looking suitable position reputable ambitious company . KEY SKILLS & COMPETENCIES Excellent communication skills networking power good interpersonal skills . Strong negotiation power . Enthusiastic , self-motivated , active excellent soft skills . Great ability develop business networking skills . Customer Focused . Pleasing personality convincing traits . Ability handle offensive costumers . Excellent Communication skill . Proven ability undertake position responsibility leadership . Enthusiastic ambitious achieve career goals EDUCATIONAL QUALIFICATION [ High School Leaving Certificate Lalpani Secondary School , Nepal ] . [ Intermediate le

In [281]:
# Run the full pipeline on one sample resume (relative path).
resume ='../data/raw/cvcv.pdf'
extracted_text = extract_text_from_resume(resume)
clean_text = preprocess_text(extracted_text)
# Names are extracted twice: a_names from the raw extracted text, b_names from
# the preprocessed text (presumably to compare the effect of preprocessing).
a_names = extract_names(extracted_text)
b_names = extract_names(clean_text)
# E-mail addresses are searched in the preprocessed text only.
emails = extract_emails(clean_text)

In [282]:
# PERSON entities found in the raw extracted text.
a_names

['Customer',
 'Lalpani Secondary School',
 'Nepal',
 'Nepal',
 'Degree',
 'Tribhuvan University',
 'Nepal',
 'System',
 'Vista',
 'Excel',
 'Bernstien Marketing',
 'Beauty Advisor',
 'Katrina Fashion',
 'Dubai',
 'Cash Management',
 'Nepali',
 'Husband Visa',
 'Asmita Uprety',
 'Jyoti Prakash Uprety']

In [283]:
# PERSON entities found in the preprocessed text.
b_names

['Customer',
 'Customer',
 'School Leaving Certificate Lalpani',
 'Nepal',
 'Nepal',
 'Tribhuvan University',
 'Nepal',
 'System',
 'Vista',
 'Excel',
 'Bernstien Marketing',
 'Beauty Advisor Focus',
 'Handle',
 'Katrina Fashion',
 'Dubai',
 'Cash',
 'Nepali',
 'Husband Visa',
 'Asmita Uprety',
 'Jyoti Prakash Uprety']

In [284]:
# E-mail addresses matched in the preprocessed text.
emails

['asmi_kim07@yahoo.com']