# NLP Project - Stage 3

## The ask



## Programme Overview



In [16]:
# Import packages
import urllib.request
from io import BytesIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import re
import operator
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy

## Get Text from remote PDF

### Get text from PDF document

In [2]:
#PDF to text Function. 
def pdf_to_text(path):
    '''
        Returns list of text for each page. Length == # pages.
    '''
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = TextConverter(manager, retstr, laparams=layout)
    filepath = open(path, 'rb')
    interpreter = PDFPageInterpreter(manager, device)
    
    text_list = []
    for page in PDFPage.get_pages(filepath, check_extractable=True):
        interpreter.process_page(page)
        text_list.append(retstr.getvalue().decode('utf-8','ignore'))

    filepath.close()
    device.close()
    retstr.close()
    return text_list

### Get PDF from remote URL

In [3]:
# Fetch PDF from remote URL function
def get_pdf_from_url(url,filename=None,print_text=False):
    # Takes a URL and saves the data locally.
    
    if filename == None:
        filename = url.split('/')[-1]
    
    webFile = urllib.request.urlopen(url)
    with open(filename,'wb') as localFile:
        contents = webFile.read()
        localFile.write(contents)
        
    webFile.close()
    
    # get text from the pdf file
    if print_text:
        text = pdf_to_text(filename)
        print(text)
    return filename

## Pre-processing

### Tokenise the data

Extracting tokens from the data allows us to gain a better understanding and summarise the data.

#### Different tokens we can have:
    
- words
- phrases
- sentences

Previously, this function returned the chosen type of token, however, it may be better to add all types of tokens to a table that can be filtered as desired.

In [4]:
# Tokenise
def tokenise(text):
    if type(text) == list:
        text = ' '.join(text)
    
    # initiate pd DataFrame
    tokens = pd.DataFrame(columns=['token','token_type','frequency'])
    
    # word tokens first
    words_list = re.findall("[\w']+", text.lower())# return list of words
    words_bot = bag_of_tokens(words_list)
    words_df = pd.DataFrame(data={'token':list(words_bot.keys()),'token_type':'word','frequency':list(words_bot.values())})
    tokens = tokens.append(words_df,ignore_index=True)
    
    sentences_list = re.compile('[.!?\n]').split(text.lower())# return list of sentences
    sentences_bot = bag_of_tokens(sentences_list)
    sentences_df = pd.DataFrame(data={'token':list(sentences_bot.keys()),'token_type':'sentence','frequency':list(sentences_bot.values())})
    tokens = tokens.append(sentences_df)
    
    return tokens

In [5]:
# Bag of words
def bag_of_tokens(token_list,sort=True,reverse=False):
    # Returns the frequencies of each token
    bag_of_tokens = {} # dictionary that will contain each token and its frequency.
    
    for token in token_list:
        token = token.strip(' \t\n\r\f')
        if len(token) >= 1 and token != ' ':
            if token in bag_of_tokens.keys():
                bag_of_tokens[token] += 1
            else:
                bag_of_tokens[token] = 1
    
    if sort:
        bag_of_tokens = dict(sorted(bag_of_tokens.items(), key=operator.itemgetter(1),reverse=reverse))

    return bag_of_tokens

In [6]:
# url = 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf'
url = 'http://www.africau.edu/images/default/sample.pdf'
filename = get_pdf_from_url(url)
pages = pdf_to_text(filename)

In [7]:
tokens = tokenise(pages)
tokens.sort_values('frequency',ascending=False).head(10)

Unnamed: 0,token,token_type,frequency
41,more,word,70
40,text,word,67
39,and,word,64
20,and more text,sentence,54
19,text,sentence,7
18,and more,sentence,7
36,pdf,word,5
35,a,word,5
37,file,word,5
38,boring,word,5


## Further pre-processing

Additional to extracting the text from the document and forming a structured table of various types of tokens, other techniques can be applied to reduce the variance in the specific words seen while paying more attention to the meaning behind the text.

Such techniques are, 'Stemming' and 'Lemmatisation'. The ideas behind them is to reduce each word to their root meaning. This is done in the following ways...

- <b>Stemming</b>

    Stemming algorithms are a set of specific rules and actions that are applied to reduce the word to its root. This method does not always return a meaning full word in the relevant language, however, the root should be the same for similar words.

    The most common algorithm for the English language is "Porter's Algorithm". This algorithm is very complex but it essentially comprises of 5 phases of word reductions, applied sequentially.
    

- <b>Lemmatisation</b>

    This method involves performing lookups for the word, against a database that maps similar words together. The result of this technique are the base, or dictionary representation, of the word; known as the 'lemma'.

[(Stemming and Lemmatisation - Standford University)](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)

#### Remove high frequency words

It is also common to reduce high frequency words that often don't contribute to the underlying meaning. There are various ways to do this, which are subject to the aim of the project in hand. The most common practise is to remove 'stop-words' such as "and", "a", "the" etc. These are words that are required for the text to make sense linguistically, but contribute very little to the actual meaning.

### Remove stop words
Makes use of NLTK library

In [8]:
def remove_stop_words(token_df):
    for index, row in token_df.iterrows():
        if row.token in stopwords.words('english'):
            token_df.drop(index,inplace=True)
    return token_df

In [9]:
without_stops = remove_stop_words(tokens)
without_stops.sort_values('frequency',ascending=False).head(10)

Unnamed: 0,token,token_type,frequency
40,text,word,67
19,text,sentence,7
38,boring,word,5
37,file,word,5
36,pdf,word,5
17,more text,sentence,4
27,simple,word,3
33,2,word,3
32,page,word,3
31,continued,word,3


### Stemming and Lemmatisation

Makes use of NLTK library

In [10]:
def porter_stem(token_df):
    porter = PorterStemmer()
    stems = token_df[0:0]
    for index, row in token_df.iterrows():
        if row.token_type in ['sentence','phrase']:
            stem = ''
            for word in row.token.split(' '):
                stem += ' ' + porter.stem(word)
            stem = stem.strip()
        else:
            stem = porter.stem(row.token)
        if stem in stems.token:
            stems.at[stems[stems.token==stem].index[0],'frequency'] += row.frequency
        else:
            stems = stems.append({'token':stem,'token_type':row.token_type,'frequency':row.frequency},ignore_index=True)
    return stems


In [11]:
# Before applying the POS (Part-of-Speech) tag, some words, such as, 'watching' would be lemmatised to 'watching' - clearly this is not useful.
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatise(token_df):
    lemmatiser = WordNetLemmatizer()
    stems = token_df[0:0]
    for index, row in token_df.iterrows():
        if row.token_type in ['sentence','phrase']:
            stem = ''
            for word in row.token.split(' '):
                stem += ' ' + lemmatiser.lemmatize(word,get_wordnet_pos(word))
            stem = stem.strip()
        else:
            stem = lemmatiser.lemmatize(row.token,get_wordnet_pos(row.token))
        if stem in stems.token:
            stems.at[stems[stems.token==stem].index[0],'frequency'] += row.frequency
        else:
            stems = stems.append({'token':stem,'token_type':row.token_type,'frequency':row.frequency},ignore_index=True)
    return stems

In [12]:
pstems = porter_stem(tokens)
lstems = lemmatise(tokens)
pstems.head(10)

Unnamed: 0,token,token_type,frequency
0,1,word,1
1,yet,word,1
2,oh,word,1
3,type,word,1
4,stuff,word,1
5,watch,word,1
6,paint,word,1
7,dri,word,1
8,littl,word,1
9,end,word,1


In [13]:
lstems.head(10)

Unnamed: 0,token,token_type,frequency
0,1,word,1
1,yet,word,1
2,oh,word,1
3,type,word,1
4,stuff,word,1
5,watch,word,1
6,paint,word,1
7,dry,word,1
8,little,word,1
9,end,word,1


#### Evaluation

The above two stemming methods, on the whole return similar output. However, it is worth noting that in experimentation, words such as 'caring' will return 'car' when using Porter Stemming and 'care' when using Lemmatisation. Clearly the latter is the correct result. In this case, Porter Stemming actually risks changing the meaning of the text.

## Entity Extraction

### Using spaCy

In [14]:
pages[0]

' A Simple PDF File \n\n This is a small demonstration .pdf file - \n\n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n\n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n\n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\n\n\x0c'

In [15]:
nlp = spacy.load('en_core_web_sm')
text = nlp(pages[0])
for ent in text.ents:
    print(ent.text,ent.label_)

A Simple PDF File 

  ORG
2 CARDINAL


In [17]:
displacy.render(text,style="ent",jupyter=True)