In [1]:
###############################################################################
###   configuration setup
###############################################################################
import PyPDF2
import pandas as pd
import os
import pprint
pp = pprint.PrettyPrinter(indent=4)

# path = r'C:\Users\eight\Desktop\text_extraction'
# os.chdir(path)

In [2]:
###############################################################################
###   read in the PDF file
###############################################################################

### using PDFMiner
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from io import StringIO

#=== "open" opens a file in text format by default
#=== "rb" opens a file in binary for reading; "wb" opens a file in binary for writing
pdfFileObj = open('S_2020_151_E.pdf', 'rb') 

#=== create an output string
output_string = StringIO()

#=== use the PDFMiner functions
parser = PDFParser(pdfFileObj)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)

all_text = output_string.getvalue()

In [3]:
###############################################################################
###   text extraction + text cleaning
###############################################################################

#=== create function to clean text
import re

def clean_text(text_doc):
    """
    To remove
    x. newline characters
    x. extra spaces, extra punctuations
    x. bullet numbers, points
    x. footnotes
    x. reference to images, figures, tables
    x. remove websites
    
    Do not remove
    x. parentheses with texts included (may contain aliases)?
    x. non-English characters    
    """
    #=== character replacement
    cleaned_text_doc = text_doc.replace("_","")
    cleaned_text_doc = cleaned_text_doc.replace("\n","")
    cleaned_text_doc = cleaned_text_doc.replace("Source:","")
    cleaned_text_doc = cleaned_text_doc.replace("•", "")
    
    #=== regular expressions
    cleaned_text_doc = re.sub(r"\x0c(?:[\w]/[\w])*"," ",cleaned_text_doc) # remove such tags
    cleaned_text_doc = re.sub("[\d]{2,}[-/][\d]{2,}"," ",cleaned_text_doc) # remove instances like "123/234"
    cleaned_text_doc = re.sub("S/","",cleaned_text_doc) # remove such tags
    cleaned_text_doc = re.sub("[Ff]igure[s]*[\s][VIX]*"," ",cleaned_text_doc) # remove ref to figures
    cleaned_text_doc = re.sub("para(?:s)[.][\s]+[\d]+[-]*[\d]*"," ",cleaned_text_doc) # remove ref to figures
    cleaned_text_doc = re.sub("([\(\[]).*?([\)\]])","",cleaned_text_doc) # remove texts in brackets ("less greedy")
    cleaned_text_doc = re.sub("http(?:s).*?\s+","",cleaned_text_doc) # remove website links (http)
    cleaned_text_doc = re.sub("www.*?\s+","",cleaned_text_doc) # remove website links (www)
    cleaned_text_doc = re.sub("[\dVIX]+[\s]*[.]{1}\s+"," ",cleaned_text_doc) # remove bullet numbering/roman lettering
    cleaned_text_doc = re.sub("\s+[\W]+\s+"," ",cleaned_text_doc) # remove "floating punctuations
    cleaned_text_doc = re.sub("[\s]{2,}"," ",cleaned_text_doc) # remove consecutive spaces
    
    return(cleaned_text_doc)

all_text_cleaned = clean_text(all_text)

In [4]:
##################################################################################
###   extract sentence tokens (i.e. complete sentences) + NER for each sentence
##################################################################################

#=== extract complete sentences
import nltk
from nltk import tokenize
sentences = tokenize.sent_tokenize(all_text_cleaned)

#--- drop sentences that have length less than 3
selected_sentences = []
for sent in sentences:
    if len(sent) > 3:
        selected_sentences.append(sent)

In [5]:
##################################################################################
###   Named Entity Recognition on the sentences
##################################################################################
#=== create a function that tokenizes the words in the sentences, then perform NER
def token_tags(sentence):
    word_tokens = nltk.word_tokenize(sentence)
    word_tags = nltk.pos_tag(word_tokens)
    return word_tags

#=== we use SpaCy to process the sentences
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

doc = []
for sent in selected_sentences:
    doc.append(nlp(sent))

#=== we perform token-level entity annotation to describe the entity boundaries
"""
"B" --> token begins an entity
"I" --> token is inside an entity
"O" --> token is outside an entity
""  --> no entity tag is set
"""
pp.pprint([(X, X.ent_iob_, X.ent_type_) for X in doc[2]])

#=== perform actual NER
#--- we collapse the list into one whole "article"
article = ' '.join(x for x in selected_sentences)
nlp_article = nlp(article)
print("Total number of entities: ", len(nlp_article.ents))

[   (Accordingly, 'O', ''),
    (,, 'O', ''),
    (the, 'O', ''),
    (President, 'O', ''),
    (hereby, 'O', ''),
    (circulates, 'O', ''),
    (the, 'O', ''),
    (report, 'O', ''),
    (received, 'O', ''),
    (from, 'O', ''),
    (the, 'B', 'ORG'),
    (Panel, 'I', 'ORG'),
    (of, 'I', 'ORG'),
    (Experts, 'I', 'ORG'),
    (200420, 'O', ''),
    (*, 'O', ''),
    (2002046, 'B', 'DATE'),
    (*, 'O', ''),
    (Annex, 'B', 'PERSON'),
    (Letter, 'I', 'PERSON'),
    (dated, 'O', ''),
    (26, 'B', 'DATE'),
    (February, 'I', 'DATE'),
    (2020, 'I', 'DATE'),
    (from, 'O', ''),
    (the, 'B', 'ORG'),
    (Panel, 'I', 'ORG'),
    (of, 'I', 'ORG'),
    (Experts, 'I', 'ORG'),
    (established, 'O', ''),
    (pursuant, 'O', ''),
    (to, 'O', ''),
    (resolution, 'O', ''),
    (1874, 'B', 'DATE'),
    (addressed, 'O', ''),
    (to, 'O', ''),
    (the, 'O', ''),
    (President, 'O', ''),
    (of, 'O', ''),
    (the, 'B', 'ORG'),
    (Security, 'I', 'ORG'),
    (Council, 'I', 'ORG'),

In [6]:
# identify what labels would be of interest, e.g. PERSON, ORG
"""
ORG - companies, agencies, institutions
GPE - countries, cities, states
DATE - absolute or relatives dates or periods
CARDINAL - numerals that do not fail under another type
PERSON - people including fictional
NORP - nationalities, or religious or political groups
LOC - Non-GPE locations
MONEY - monetary values, including the units
ORDINAL - "first", "second"
PRODUCT - objects, vehicles, foods etc. (not services)
FAC - facilities, e.g. buildings, airports
WORK_OF_ART - titles of books, songs
QUANTITY - measurements as of weight or distance
LAW - named documents made into laws
TIME - times smaller than a day
PERCENT - includes "%"
LANGUAGE - language
EVENT - e.g. named hurricane, battles, wars, sports events
"""
labels = [x.label_ for x in nlp_article.ents]
print("Labels in the article: ", Counter(labels)) 

Labels in the article:  Counter({'ORG': 2011, 'CARDINAL': 1181, 'GPE': 1012, 'DATE': 999, 'PERSON': 605, 'NORP': 159, 'LOC': 81, 'PRODUCT': 74, 'MONEY': 73, 'QUANTITY': 59, 'FAC': 54, 'WORK_OF_ART': 37, 'TIME': 35, 'ORDINAL': 31, 'EVENT': 19, 'LAW': 12, 'PERCENT': 4, 'LANGUAGE': 3})


In [7]:
items = [x.text for x in nlp_article.ents]
print("Most common entities in article: ", Counter(items).most_common(10))

Most common entities in article:  [('Panel', 379), ('the Democratic People’s Republic of Korea', 256), ('China', 222), ('DPRK', 190), ('Nampo', 43), ('Chinese', 43), ('two', 39), ('Security Council', 37), ('2019', 33), ('Sierra Leone', 30)]


In [29]:
article_sentences = [x for x in nlp_article.sents]

#--- take a look at the entities in a sample sentence
displacy.render(nlp(str(article_sentences[280:300])), jupyter = True, style = 'ent')

In [9]:
#=== create a function to get the items of interest based on the label of interest
def quick_get(nlp_object, label_of_interest):
    """
    the "nlp_object" is to be an object that has been applied "nlp" method to, e.g.
    "nlp_article = nlp(article)"
    """
    list_of_objects = []
    for ent in nlp_object.ents:
        if ent.label_ == label_of_interest and ent.text not in list_of_objects:
            list_of_objects.append(ent.text)

    #--- convert the list_of_objects into a dictionary
    objects_dict = {"items": list_of_objects}
    df = pd.DataFrame(objects_dict)

    return df


In [13]:
df1 = quick_get(nlp_article, "ORG")
df1.head(30)

Unnamed: 0,items
0,United Nations Security Council Distr
1,the Security Council
2,the Panel of Experts
3,Council
4,Security Council
5,the Security Council Committee
6,the Committee
7,Panel
8,the Council.
9,2/266 Enclosure Letter
