In [1]:
# Load the blackcellmagic IPython extension.
# NOTE(review): presumably this registers a cell magic that formats cells with black — confirm against the extension's docs.
%load_ext blackcellmagic

In [2]:
import os
# Move the working directory one level up so that the relative paths used later
# (e.g. 'data/pdf_reports/...') are resolved from the parent folder.
# NOTE(review): the target is relative to the current directory, so re-running this
# cell without restarting the kernel moves up one additional level each time.
os.chdir("..")

In [3]:
import pandas as pd
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator, TextConverter
from io import StringIO
import re

In [57]:
# Lift pandas' display limits (rows, columns, cell width) by setting each option to None
for display_option in ("display.max_rows", 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

## 1. Extract text from pdf

In [5]:
# Relative path of the PDF report that is processed in the cells below
path_report = 'data/pdf_reports/0_888.L_2020.pdf'

## 1. Extract textboxes with coordinates from pdf and store in df

In [6]:
def convert_pdf_to_df(path_report):
    """Extract all text boxes of a PDF into a DataFrame, one row per text box.

    Parameters
    ----------
    path_report : str
        Path of the PDF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``LTTextBox`` found by pdfminer's layout analysis, with columns
        ``page`` (1-based counter of the processed pages),
        ``x0_page``, ``y0_page``, ``x1_page``, ``y1_page`` (the page's mediabox),
        ``x0``, ``y0``, ``x1``, ``y1`` (the text box's bounding box) and
        ``text`` (the text box's content as returned by ``get_text()``).
        The columns are present even when the PDF contains no text boxes.
    """
    columns = [
        "page",
        "x0_page", "y0_page", "x1_page", "y1_page",
        "x0", "y0", "x1", "y1",
        "text",
    ]

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    records = []

    # The context manager guarantees the file handle is closed, also when parsing fails.
    # The pages are iterated inside the block because they are read from the open file.
    with open(path_report, 'rb') as fp:
        # for each page, extract page number, page size and all textboxes with coordinates
        for page_number, page in enumerate(PDFPage.get_pages(fp), start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                # Only text boxes are kept; other layout objects are skipped
                if isinstance(lobj, LTTextBox):
                    x0_page, y0_page, x1_page, y1_page = page.mediabox
                    x0, y0, x1, y1 = lobj.bbox
                    records.append({
                        "page": page_number,
                        "x0_page": x0_page,
                        "y0_page": y0_page,
                        "x1_page": x1_page,
                        "y1_page": y1_page,
                        "x0": x0,
                        "y0": y0,
                        "x1": x1,
                        "y1": y1,
                        "text": lobj.get_text(),
                    })

    # df containing all textboxes with coordinates; explicit columns keep the
    # schema stable when no text box was found
    return pd.DataFrame(records, columns=columns)

In [51]:
# Parse the report into one row per textbox and display the resulting frame
df = convert_pdf_to_df(path_report=path_report)
df

Unnamed: 0,page,x0_page,y0_page,x1_page,y1_page,x0,y0,x1,y1,text
0,1,0.0,0.0,595.276,841.89,249.3422,57.0516,345.88720,67.0516,888 HOLDINGS PLC\n
1,1,0.0,0.0,595.276,841.89,224.6041,42.0069,370.71290,50.0069,ANNUAL REPORT & ACCOUNTS 2020\n
2,2,0.0,0.0,595.276,841.89,172.4043,677.7519,524.56430,760.8719,A YEAR OF \nSTRONG GROWTH \n
3,2,0.0,0.0,595.276,841.89,42.5197,541.8516,257.52570,627.8516,888 IS ONE OF THE \nWORLD’S LEADING \nONLINE BETTING AND \nGAMING COMPANIES. \n
4,2,0.0,0.0,595.276,841.89,42.5197,460.2010,204.76770,518.2010,"888 Holdings plc (“888” or the “Company”) \nand its subsidiaries (together, the “Group”) \noperate leading online gaming brands \nacross four key product verticals (Casino, \nSport, Poker and Bingo) with a presence \nacross multiple regulated markets. \n"
...,...,...,...,...,...,...,...,...,...,...
4614,155,0.0,0.0,595.276,841.89,45.3543,84.4276,110.52030,98.4256,Design and Production\nwww.carrkamasa.co.uk\n
4615,155,0.0,0.0,595.276,841.89,45.3543,20.3656,125.52545,28.8656,Corporate.888.com\n
4616,156,0.0,0.0,595.276,841.89,42.5197,702.2334,136.03090,740.2334,888 Holdings plc \nSuite 601/701 Europort \nEuroport Road \nGibraltar\n
4617,156,0.0,0.0,595.276,841.89,42.5197,666.5614,150.91010,694.5614,T +350 20049800 \nF +350 20048280 \nE info@888holdingsplc.com\n


## 2. Remove textboxes fully within 8% of page borders
These are likely to be headers, footers, or page numbers. We chose 8% based on the default margins in Word and PowerPoint and on some tests.

In [None]:
margin = 0.08

# Page dimensions derived from the page coordinates
page_height = df['y1_page'] - df['y0_page']
page_width = df['x1_page'] - df['x0_page']

# Top: bottom edge of the textbox lies in the highest margin% of the page height
in_top_margin = df['y0'] > (df['y1_page'] - page_height * margin)
# Bottom: top edge of the textbox lies in the lowest margin% of the page height
in_bottom_margin = df['y1'] < (df['y0_page'] + page_height * margin)
# Left: right edge of the textbox lies in the leftmost margin% of the page width
in_left_margin = df['x1'] < (df['x0_page'] + page_width * margin)
# Right: left edge of the textbox lies in the rightmost margin% of the page width
in_right_margin = df['x0'] > (df['x1_page'] - page_width * margin)

# Drop every textbox that sits completely inside one of the four margins
in_any_margin = in_top_margin | in_bottom_margin | in_left_margin | in_right_margin
df.drop(df[in_any_margin].index, inplace=True)

## 3. Clean text
Remove line breaks, extra spaces, unwanted characters, URLs, etc.

In [None]:
import string
import re

def clean_text(text):
    """Normalize the raw text of one textbox into a single cleaned line.

    The steps are applied in this order: line breaks become spaces, characters
    outside ``string.printable`` are dropped, a leading run of digits is removed,
    surrounding whitespace is stripped, whitespace around hyphens and before
    ``, : ; .`` is removed, digit runs of five or more and URL-like tokens are
    replaced by a space, and finally repeated whitespace and repeated dots are
    collapsed.

    Caveats that follow from the patterns used:
    - typographic characters such as curly quotes are dropped, not transliterated
      (e.g. "WORLD’S" becomes "WORLDS");
    - the leading-digit rule also removes a number that starts a sentence
      (e.g. "888 IS ONE OF ..." becomes "IS ONE OF ...");
    - the URL pattern matches any token containing a dot directly followed by
      at least two letters.

    Parameters
    ----------
    text : str
        Raw text of a textbox.

    Returns
    -------
    str
        The cleaned text.
    """
    # Build the lookup set once per call instead of once per character
    printable_chars = set(string.printable)

    # Remove linebreaks
    text = text.replace('\n', ' ')
    # Remove non ASCII characters
    text = ''.join(char for char in text if char in printable_chars)
    # Remove header numbers
    text = re.sub(r'^\s?\d+(.*)$', r'\1', text)
    # Remove trailing spaces
    text = text.strip()
    # Link words back together that are split between lines
    text = re.sub(r'\s?-\s?', '-', text)
    # Remove spaces prior to punctuation
    text = re.sub(r'\s?([,:;\.])', r'\1', text)
    # CSRs contain a lot of figures that are not relevant to grammatical structure
    text = re.sub(r'\d{5,}', r' ', text)
    # Remove URLs
    text = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove multiple dots
    text = re.sub(r'\.+', '.', text)
    return text

# Clean the text of every textbox
df['text'] = df['text'].map(clean_text)

## 4. Remove textboxes without any stopwords 
Capture tables, headers, footers, nonsense, ...

In [52]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Single letters that are removed from the English stop-word list (i and a are kept)
_SINGLE_LETTER_STOP_WORDS = ('s', 't', 'd', 'm', 'o', 'y')

# SDG related words that are treated as additional stop words
_SDG_STOP_WORDS = ['sdg','sustainable', 'development', 'goal', 'goals', 'poverty', 'zero', 'hunger', 'good', 'health', 'well-being', 'quality', 'education', 'gender', 'equality', 'clean', 'water', 'sanitation', 'affordable', 'energy', 'decent', 'work', 'economic', 'growth', 'industry', 'innovation', 'infrastructure', 'reduced', 'inequalities', 'cities', 'communities', 'responsible', 'consumption', 'production', 'climate', 'action', 'life', 'below', 'land', 'peace', 'justice', 'institutions', 'partnerships']

# Filled on first use so the stop-word corpus is loaded only once, not once per textbox
_STOP_WORD_CACHE = None


def _get_stop_words():
    """Return the stop-word set (English list minus single letters plus SDG words)."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        words = set(stopwords.words('english'))
        # Set difference does not fail when a letter is absent from the corpus
        words.difference_update(_SINGLE_LETTER_STOP_WORDS)
        words.update(_SDG_STOP_WORDS)
        _STOP_WORD_CACHE = frozenset(words)
    return _STOP_WORD_CACHE


def check_text_for_stopwords(input_str):
    """Tell whether a text contains at least one stop word.

    The text is lower-cased and tokenized with ``word_tokenize``; each token is
    looked up in the English stop-word list, from which the single letters
    s, t, d, m, o and y are removed and to which SDG related words are added.

    Parameters
    ----------
    input_str : str
        Text to check.

    Returns
    -------
    bool
        True if any token is a stop word (keep the text), False otherwise.
    """
    stop_words = _get_stop_words()

    # tokenize input_string and transform to lowercase
    word_tokens = word_tokenize(input_str.lower())

    # Stops at the first token that is a stop word
    return any(token in stop_words for token in word_tokens)

# Flag every row whose text contains at least one stop word ...
contains_stopword = df.apply(lambda row: check_text_for_stopwords(row['text']), axis=1)
# ... and drop all rows that are not flagged
df.drop(index=df[~contains_stopword].index, inplace=True)

## 5. Additional steps
- (optional) Aggregate consecutive textboxes if next line starts with a space or previous does not end with a dot
- Remove pages with more than x% characters other than letters

In [None]:
# df grouped per page
df_pages = df[['page', 'text']].groupby(['page'])['text'].apply(''.join).reset_index()
# pages with more than X% numbers/special characters/ spaces relative to the number of letters
df_pages[[(len(re.sub('[\w]+' ,'', x)) > 0.3*len(x.replace(' ',''))) for x in df_pages['text']]]

## 6. Join textboxes back together to one string

In [56]:
# Concatenate the remaining textboxes, separated by single spaces, and print the result
text = ' '.join(df['text'].tolist())
print(text)

A YEAR OF STRONG GROWTH IS ONE OF THE WORLDS LEADING ONLINE BETTING AND GAMING COMPANIES. Holdings plc (888 or the Company) and its subsidiaries (together, the Group) operate leading online gaming brands across four key product verticals (Casino, Sport, Poker and Bingo) with a presence across multiple regulated markets. S MISSION 888s mission is to develop state-of-the-art technology and products that provide fun, fair and safe digital gambling products to players globally. leverages its proprietary technology to provide to players and B2B partners an innovative and world-class online gambling experience. By doing this effectively, 888 is able to succeed in the fast-growing and dynamic online gambling industry and generate value for its shareholders. S FOCUS 888s primary strategic focus is on growing its strong brands in sustainable markets where there are regulatory frameworks that protect customers and provide clarity for operators. To achieve this, we focus on continuous investment 