In [None]:
# Ishan Chaudhary (20103059) -- B3



import nltk  #natural language toolkit
import re
import subprocess # spawn new process
from pdfminer.high_level import extract_text
import requests

#re: a way to work with patterns and text.
#pdfminer.high_level: A library for extracting text from PDF files.


nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

#NLTK data packages
# pre-trained models for tokenization.
# part-of-speech tagging - labeling each word
# NER -process of identifying and categorizing named entities 
# "a", "an", "the", "and", "or"



# Extracting text from the pdf

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

# Extracting the names from the text

def extract_names(txt):
    person_names = []

    for sent in nltk.sent_tokenize(txt): #to split the document into sentences.NE recognised at sentence level
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))): #sentence into individual words, then applies part-of-speech tagging, then ne of chunk
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':  # chunk represents a contiguous sequence of words that have been labeled with the same named entity tag
                person_names.append(
                    ' '.join(chunk_leave[0] for chunk_leave in chunk.leaves()) # leaves of the chunk (i.e., the individual words)
                )
    return person_names

#Extracting the phone number

PHONE_REG = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')
# may start with + or (
#start 1-9
# 8 characters
# end with 0-9



def extract_phone_number(resume_text):
    phone = re.findall(PHONE_REG, resume_text) #search the input text for all substrings that match the PHONE_REG pattern.

    if phone: #if phone no. was found
        number = ''.join(phone[0])#extracts the first number from the phone list, joins its individual components into a single string

        if resume_text.find(number) >= 0 and len(number) < 15:
            return number
    return None

#Extracting email address

EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+') #@ domain name , then . Top Level domain


def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)


def extract_emails(resume_text):
    return re.findall(EMAIL_REG, resume_text)


#Extracting skills

SKILLS_DB = [
    'machine learning',
    'data science',
    'python',
    'word',
    'excel',
    'English',
    'c',
    'sql',
    'matlab',
    'javascript',
    'office',
]




def extract_skills(input_text):
    # stop words list
    stop_words = set(nltk.corpus.stopwords.words('english'))  #stopwords from english txt file

    # tokenizing involves splitting sentences and words from the body of the text.
    word_tokens = nltk.tokenize.word_tokenize(input_text)


    filtered_tokens = [w for w in word_tokens if w not in stop_words] #remove the word token if it is in stop words

    # remove the punctuation
    filtered_tokens = [w for w in word_tokens if w.isalpha()]

    # generate bigrams and trigrams (such as artificial intelligence)
    #below line
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))
    # nltk.everygrams(sequence, min_len, max_len)
    # we create a set to keep the results in.
    found_skills = set()

    # we search for each token in our skills database
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB: #lowercase
            found_skills.add(token)

    # we search for each bigram and trigram in our skills database
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)
## Adding both tokens and ngrams to found skill set.
    return found_skills


#Extracting Education

RESERVED_WORDS = [
    'school',
    'college',
    'univers',
    'academy',
    'faculty',
    'institute',
    'faculdades',
    'Schola',
    'schule',
    'lise',
    'lyceum',
    'lycee',
    'polytechnic',
    'kolej',
    'Ã¼nivers',
    'okul',

]





def extract_education(input_text):
    organizations = []

    # first get all the organization names using nltk
    for sent in nltk.sent_tokenize(input_text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                organizations.append(' '.join(c[0] for c in chunk.leaves()))

    # we search for each bigram and trigram for reserved words
    # (college, university etc...)
    education = set()
    for org in organizations:
        for word in RESERVED_WORDS:
            if org.lower().find(word) >= 0: #returns a non-negative integer, which means that the reserved word was found in the organization string, 
                education.add(org)

    return education


if __name__ == '__main__': #to execute some code only if the file was run directly, and not imported.
    text = extract_text_from_pdf('resume.pdf')
    education_information = extract_education(text)
    skills = extract_skills(text)
    names = extract_names(text)
    phone_number = extract_phone_number(text)
    emails = extract_emails(text)


if emails:
        print(emails[0])

if names:
        print(names[0] + ' ' + names[1]) # first and second name

if skills:
    print('Skills: ' + str(skills))

if education_information:
    print(education_information)










[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [WinError 10060] A connection attempt failed because
[nltk_data]     the connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>
[nltk_data] Error loading maxent_ne_chunker: <urlopen error [WinError
[nltk_data]     10060] A connection attempt failed because the
[nltk_data]     connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>
[nltk_data] Error loading wo

Ryan Nelson Arjuvo Limited
Skills: {'Python', 'C', 'Machine learning', 'machine learning', 'Machine Learning'}
{'Stanford University'}


In [8]:
pip install pdfminer


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
pip install --upgrade pdfminer





[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [9]:
pip install pdfminer.six

Collecting pdfminer.six
  Using cached pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
Collecting cryptography>=36.0.0
  Using cached cryptography-40.0.2-cp36-abi3-win_amd64.whl (2.6 MB)
Installing collected packages: cryptography, pdfminer.six
  Attempting uninstall: cryptography
    Found existing installation: cryptography 3.4.8
    Uninstalling cryptography-3.4.8:
      Successfully uninstalled cryptography-3.4.8
Successfully installed cryptography-40.0.2 pdfminer.six-20221105




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
