In [1]:
# import module for extracting text from non-text files
import textract
# import modules for handling language data
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import pandas as pd

In [2]:
# create a function for converting a selected PDF file to text
def read_file(filepath):
    # use tesseract to read PDF files
    encoded_text = textract.process(filepath, method='tesseract', encoding='utf-8')
    text = encoded_text.decode('utf-8')
    
    return text

In [3]:
# create a function to parse text data, common_notes is a user defined list that contains commonly found
# trivial words/signs that need to be removed 
def get_words(text, common_notes = []):
    # tokenize text data
    tokens = word_tokenize(text)
    # define punctuations that need to be removed from tokenized data
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '-', '.', "â€™"]
    # define common english stop words that need to be removed from tokenized data
    stop_words = stopwords.words('english')
    # remove puncuations, stop words, user-defined common_notes and integers
    keywords = [word.lower() for word in tokens if word.lower() not in stop_words and word not in punctuations 
                and word.lower() not in common_notes and not str.isdigit(word)]
    # create a stemmer object and lemmatize words
    wnl = WordNetLemmatizer()
    stem_keywords = [wnl.lemmatize(word) for word in keywords]
    
    return keywords

In [4]:
# define file path, common_notes and call functions
filepath = 'GRE_Text_Completion.pdf'

common_notes = ['a.', 'b.', 'c.', 'd.', 'e.', 'f.', 'g.', 'h.', 'i.', 'i', 'ii', 'iii']

text = read_file(filepath)
keywords = get_words(text, common_notes)

In [5]:
# create a dataframe of found keywords
words_df = pd.DataFrame(keywords, columns = ['Keyword'])
# count the frequency of each keyword, store in a new dataframe
words_frequency = pd.DataFrame(words_df['Keyword'].value_counts())
words_frequency.rename({'Keyword':'Frequency'}, axis = 1, inplace  = True)
# export to excel
words_frequency.to_excel('Frequent_Words.xlsx')

In [6]:
words_frequency.head(25)

Unnamed: 0,Frequency
hla,245
grew,245
|,244
section,241
>,137
one,134
many,131
even,115
often,102
new,91
