In [4]:
import os
os.chdir("..")

In [5]:
import pandas as pd
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [6]:
pd.set_option("display.max_rows", None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Extract text from pdf

In [7]:
path_report = 'data/pdf_reports/13342_FY0_YUMC.N.pdf'

### Extract textboxes with coordinates and store in df

In [11]:
def convert_pdf_to_txt(path_report):
    fp = open(path_report, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    retstr = StringIO()
    codec = 'utf-8'
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    for page in pages:
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [12]:
text = convert_pdf_to_txt(path_report)

In [25]:
import re
import nltk

def preprocess_text(text):
    '''
    Preprocess a string.
    :parameter
        :param text: string - name of column containing text
    :return
        preprocessed text
    '''
    # Clean text
    # Remove linebreaks
    text = text.replace('\n',' ')
    # Remove all non-ascii characters
    text = text.encode("ascii", "ignore").decode('UTF-8')
    # Convert to lowercase, remove punctuations and characters and then strip
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    # Tokenize (convert from string to list)
    lst_text = text.split()
    
    # Remove Stopwords
    lst_text = [word for word in lst_text if word not in nltk.corpus.stopwords.words("english")]
                
    # Stemming (remove -ing, -ly, ...) - Not used because lemmatisation is used
    #ps = nltk.stem.porter.PorterStemmer()
    #lst_text = [ps.stem(word) for word in lst_text]
                
    # Lemmatisation (convert the word into root word)
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    lst_text = [lem.lemmatize(word) for word in lst_text]
            
    # Convert back to string from list
    text = " ".join(lst_text)
    
    return text

In [26]:
preprocess_text(text)

'creating creating responsible responsible ecosystem ecosystem 2020 sustainability report yum china people content remark yum china ceo report vision overview growth powered innovation sustainability highlight external recognition corporate governance business ethic sustainability management sustainability strategy contribution sdgs sustainability governance stakeholder engagement materiality assessment food building value chain protects food safety technology enabled food safety management multidimensional product innovation actively promote industry development environment honoring environmental commitment promoting climate action practicing green operation build green restaurant partnering supplier build sustainable upstream ecosystem promoting sustainable packaging innovative green design 02 02 04 06 06 07 08 09 10 11 12 12 14 15 16 17 20 24 32 34 38 40 44 47 54 56 accelerating employee growth achieve company vision customer mania service aienabled solution practicing responsible s

# Bag of Words

In [27]:
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

In [28]:
# Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [30]:
corpus = [text]
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [31]:
dic_vocabulary

{'creating': 1728,
 'responsible': 7141,
 'ecosystem': 2052,
 '2020': 138,
 'sustainability': 7697,
 'report': 7070,
 'yum': 9985,
 'china': 1331,
 'people': 5432,
 'remarks': 7061,
 'by': 1175,
 'ceo': 1248,
 'about': 363,
 'this': 8586,
 'our': 4811,
 'vision': 9557,
 'overview': 5065,
 'growth': 2769,
 'powered': 5937,
 'innovation': 3209,
 'highlights': 2887,
 'external': 2362,
 'recognition': 6952,
 'corporate': 1692,
 'governance': 2722,
 'business': 1161,
 'ethics': 2285,
 'management': 3562,
 'strategies': 7593,
 'contribution': 1659,
 'to': 8788,
 'the': 7940,
 'sdgs': 7362,
 'stakeholder': 7534,
 'engagement': 2195,
 'materiality': 3624,
 'assessment': 912,
 'food': 2455,
 'building': 1151,
 'value': 9518,
 'chain': 1261,
 'that': 7872,
 'protects': 6558,
 'safety': 7284,
 'technology': 7801,
 'enabled': 2169,
 'multi': 3773,
 'dimensional': 1924,
 'product': 6223,
 'actively': 412,
 'promote': 6422,
 'industry': 3161,
 'development': 1883,
 'environment': 2225,
 'honoring': 

In [33]:
pd.DataFrame(dic_vocabulary.items(), columns=['word', 'count']).sort_values(by='count', ascending=False)

Unnamed: 0,word,count
2033,zeus,9999
3570,zero value,9998
3623,zero deforestation,9997
318,zero,9996
8779,yunnan guizhou,9995
2387,yunnan,9994
3296,yumchina com,9993
586,yumchina,9992
8876,yumc cup,9991
7791,yumc college,9990
