# PDF Scraping for ESG

#### Test File: 
https://www.blackrock.com/corporate/literature/continuous-disclosure-and-important-information/blackrock-2020-sasb-disclosure.pdf

In [3]:
import glob
import re
import os
import fitz
import numpy as np
import pandas as pd
import math

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy

import gensim.corpora as corpora

from sklearn.feature_extraction.text import CountVectorizer

import collections

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 24.6 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.19.6


In [5]:
# mount google drive
# Colab-only step: exposes the user's Google Drive under /content/drive so
# that the PDF path configured further down can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Location of the report to analyse, inside the Google Drive mount point.
pdf_path = '/content/drive/My Drive/BT4222/ESG_Report.pdf'

In [9]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF; it is handed to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, with nothing
        inserted between pages.
    """
    page_texts = []
    with fitz.open(file_path) as doc:
        for page in doc:
            # ``Page.getText`` is the deprecated camelCase spelling (PyMuPDF
            # prints a deprecation notice for it). Prefer ``get_text`` and
            # fall back only when the page object does not offer it.
            extract = getattr(page, "get_text", None)
            if extract is None:
                extract = page.getText
            page_texts.append(extract())
    # Joining once avoids rebuilding a growing string on every page.
    return "".join(page_texts)

In [10]:
# function takes in a report and breaks it up into individual sentences
def convert_pdf_into_sentences(text, min_length=100):
    """Split raw report text into sentences and keep only the long ones.

    Sentence boundaries are every ``.``, ``?`` and ``!``. No attempt is made
    to recognise abbreviations or decimal numbers, so those also end a
    "sentence"; the length filter is what discards most such fragments.

    Args:
        text: The extracted report text. It is passed through ``str()``.
        min_length: A segment is kept only when its length, measured before
            surrounding whitespace is stripped, is strictly greater than this
            value. Defaults to 100.

    Returns:
        A list of whitespace-stripped sentences in document order. Whatever
        follows the last terminator in ``text`` is always discarded.
    """
    # Form feeds are deleted outright (not turned into spaces).
    text = str(text).replace("\x0c", "")
    # A newline that is immediately followed by a space is deleted together
    # with that space, so the words on either side are joined.
    text = text.replace("\n ", "")
    # Every remaining run of whitespace, newlines included, becomes one space.
    text = " ".join(text.split())
    text = " " + text + "  "

    # Move a terminator that sits inside a closing quote to just after it, so
    # the quote stays attached to the sentence it closes.
    for inside, outside in ((".”", "”."), (".\"", "\"."), ("!\"", "\"!"), ("?\"", "\"?")):
        text = text.replace(inside, outside)

    # Mark each terminator as a split point.
    for mark in ".?!":
        text = text.replace(mark, mark + "<stop>")
    # Nothing above inserts "<prd>"; this only rewrites input that already
    # contains that literal marker.
    text = text.replace("<prd>", ".")

    # The final piece is the padding or an unterminated tail; drop it.
    segments = text.split("<stop>")[:-1]

    # Length is checked on the unstripped segment (which usually carries one
    # leading space).
    return [segment.strip() for segment in segments if len(segment) > min_length]

In [11]:
# Extract the full text of the report, then split it into sentences
# (convert_pdf_into_sentences keeps only segments longer than 100 characters).
doc = read_pdf(pdf_path)
sentences = convert_pdf_into_sentences(doc)
# Preview the first 20 sentences.
sentences[:20]

Deprecation: 'getText' removed from class 'Page' after v1.19 - use 'get_text'.


['NETSCOUT Environmental, Social, and Governance Report 2021 EMPLOYEES GOVERNANCE PLANET CUSTOMERS COMMUNITIES NETSCOUT 2021 Environmental, Social, and Governance Report 1 THE DIGITIZED WORLD COMES WITH PROMISE AND PERIL.',
 'SINCE 1991, OUR CHOSEN MISSION HAS BEEN TO SOLVE THE TOUGHEST IT AND CYBERSECURITY PROBLEMS, AS GUARDIANS OF THE CONNECTED WORLD, SO THAT COLLECTIVELY WE CAN REALIZE THE PROMISE AND FEND OFF THE PERIL.',
 'This has been NETSCOUT’s mission since our beginning, to allow our customers – leading companies in telecommunications, government, critical infrastructure, and enterprises across the globe – to accelerate the benefits of the connected world with less disruption and risk to their businesses and to their customers.',
 'From our first product shipment in 1992, a remote monitoring hardware appliance, to our software- based and Software as a Service solutions today, our vision has been to create Visibility without Borders.',
 'Through persistence and focus on our co

In [12]:
# function for gensim preprocessing
def sent_to_words(sentences):
    """Lazily tokenise sentences with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of sentences; each item is converted with
            ``str()`` before tokenising.

    Yields:
        For every input item, the result of
        ``simple_preprocess(str(item), deacc=True)``.
    """
    yield from (simple_preprocess(str(raw), deacc=True) for raw in sentences)

In [13]:
# function for creating stopwords (english stopwords, context specific words, company names)
def create_stopwords(companyname):
    """Build the stop-word list: company names, English stop words, domain terms.

    Args:
        companyname: A company name, or an iterable of company names.
            ``None`` is treated as "no company names".

    Returns:
        A list without duplicates, ordered as: the company names as given,
        their lowercase forms, NLTK's English stop words, then the
        finance/report terms hard-coded below.
    """
    # A bare string cannot be concatenated with a list, so wrap it.
    if companyname is None:
        names = []
    elif isinstance(companyname, str):
        names = [companyname]
    else:
        names = list(companyname)

    candidates = list(names)
    # Also add each name in lowercase so a mixed-case name such as
    # 'BlackRock' is present in the form 'blackrock' as well.
    # NOTE(review): this assumes the tokens compared against the list are
    # lowercase - confirm against the tokeniser in use.
    candidates.extend(str(name).lower() for name in names)
    candidates.extend(stopwords.words('english'))
    candidates.extend(['accounting', 'active', 'income', 'adventure', 'allocation', 'shares', 'amortization', 'amplitude', 'annuity', 'appreciation', 'arbitrage', 'ask', 'asset', 'asset approach', 'aval'])
    candidates.extend(['plc', 'group', 'target', 'track', 'capital', 'holding', 'report', 'annualreport', 'esg', 'bank', 'report', 'annualreport', 'long', 'make', 'fy'])

    # dict.fromkeys drops repeated entries while keeping first-seen order.
    return list(dict.fromkeys(candidates))
    
# function for removing stopwords
def remove_stopwords(texts):
    """Re-tokenise each document and drop tokens found in ``stop_words``.

    Relies on the module-level ``stop_words`` collection, which must be
    defined before this function is called.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()``
            and passed through ``simple_preprocess`` again before filtering.

    Returns:
        A list with one filtered token list per document.
    """
    # Build the lookup once: set membership is constant-time, whereas the
    # stop-word list would otherwise be scanned for every single token.
    blocked = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]

In [14]:
def create_bigram_mod(data_words, min_count=5, threshold=100):
    """Fit a gensim bigram phrase model on tokenised sentences.

    Args:
        data_words: Tokenised sentences handed to ``gensim.models.Phrases``.
        min_count: Forwarded to ``Phrases`` as ``min_count``. Defaults to 5.
        threshold: Forwarded to ``Phrases`` as ``threshold``; a higher value
            yields fewer phrases. Defaults to 100.

    Returns:
        The ``gensim.models.phrases.Phraser`` built from the fitted model.
    """
    phrases = gensim.models.Phrases(data_words, min_count=min_count, threshold=threshold)
    return gensim.models.phrases.Phraser(phrases)

# function for making bigrams
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document.

    Args:
        texts: Iterable of tokenised documents.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

In [15]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Relies on the module-level ``nlp`` callable, which must be defined before
    this function is called.

    Args:
        texts: Iterable of token lists; each list is joined with single
            spaces and passed to ``nlp``.
        allowed_postags: A token is kept only if its ``pos_`` is in this
            collection.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [16]:
# Tokenise every sentence (sent_to_words is a generator, so materialise it).
data_words = list(sent_to_words(sentences))
# Fit the bigram model on the tokens as they are before stop-word removal.
bigram_mod = create_bigram_mod(data_words)
# remove_stopwords() reads this module-level `stop_words`, so it has to be
# assigned before the next line runs.
stop_words = create_stopwords(['BlackRock','blackrock'])
data_words_nostops = remove_stopwords(data_words)
# make_bigrams() reads the module-level `bigram_mod` fitted above.
data_words_bigrams = make_bigrams(data_words_nostops)
# lemmatization() reads the module-level `nlp`; the pipeline is loaded with
# the 'parser' and 'ner' components disabled.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Build the gensim dictionary from the lemmatised documents.
id2word = corpora.Dictionary(data_lemmatized) # create dictionary
# `texts` is another name for the same list object, not a copy.
texts = data_lemmatized
# One doc2bow() result per lemmatised document.
corpus = [id2word.doc2bow(text) for text in texts] # create corpus

In [None]:
# Preview the lemmatised tokens of the first five sentences.
data_lemmatized[:5]

[['sustainability',
  'disclosure',
  'report',
  'sustainability',
  'standard',
  'board',
  'standard',
  'management',
  'criterion',
  'new'],
 ['conduct',
  'stakeholder',
  'assessment',
  'identify',
  'key',
  'environmental',
  'social',
  'governance',
  'issue',
  'matter',
  'stakeholder'],
 ['result',
  'analysis',
  'several',
  'supplemental',
  'metric',
  'management',
  'criterion',
  'previous',
  'sustainability',
  'standard',
  'board',
  'metric',
  'add',
  'disclosure'],
 ['disclosure',
  'include',
  'communication',
  'progress',
  'regard',
  'incorporation',
  'principle',
  'ungc',
  'business',
  'operation',
  'strategy',
  'policy',
  'procedure'],
 ['engage',
  'touche',
  'llp',
  'perform',
  'review',
  'engagement',
  'management',
  'assertion',
  'relate',
  'specified_metric',
  'disclosure']]

In [None]:
# Creation of bag of words model
# Run generate_sentiment_score function below first, before running bag of words model
def bag_of_words(df, max_features=1000):
    """Build a bag-of-words frame from ``df['sentence']`` plus a sentiment column.

    Args:
        df: DataFrame with a 'sentence' column (the text to vectorise) and a
            'sentiment_score' column.
        max_features: Forwarded to ``CountVectorizer`` as ``max_features``.
            Defaults to 1000.

    Returns:
        A new DataFrame with one count column per vocabulary term and an
        'optimism' column holding ``df['sentiment_score']``. If the
        vocabulary itself contains the term 'optimism', that count column is
        overwritten by the sentiment values.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    counts = vectorizer.fit_transform(list(df['sentence'])).toarray()

    # `get_feature_names` is gone from current scikit-learn releases; prefer
    # `get_feature_names_out` and fall back only when it is not available.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    vocab = list(feature_names())

    features = pd.DataFrame(counts, columns=vocab)
    # .tolist() assigns by position, ignoring whatever index `df` carries.
    features['optimism'] = df['sentiment_score'].tolist()
    return features

In [None]:
# Flatten the per-sentence token lists into a single list of tokens,
# preserving order.
data_lemmatized_new = [data for sublist in data_lemmatized for data in sublist]
# Preview the first 15 tokens.
data_lemmatized_new[:15]

['sustainability',
 'disclosure',
 'report',
 'sustainability',
 'standard',
 'board',
 'standard',
 'management',
 'criterion',
 'new',
 'conduct',
 'stakeholder',
 'assessment',
 'identify',
 'key']

In [None]:
# Count how often each lemma occurs across the whole report.
counted = collections.Counter(data_lemmatized_new)
# With no argument, most_common() returns every (token, count) pair,
# most frequent first.
counted.most_common()

[('management', 174),
 ('investment', 171),
 ('employee', 148),
 ('risk', 137),
 ('company', 137),
 ('include', 132),
 ('disclosure', 119),
 ('sustainability', 109),
 ('fund', 85),
 ('information', 81),
 ('business', 78),
 ('policy', 74),
 ('standard', 67),
 ('engagement', 67),
 ('bis', 65),
 ('review', 63),
 ('provide', 60),
 ('criterion', 59),
 ('strategy', 56),
 ('client', 56),
 ('team', 54),
 ('support', 51),
 ('financial', 49),
 ('approach', 49),
 ('manage', 49),
 ('activity', 48),
 ('community', 47),
 ('relate', 46),
 ('issue', 45),
 ('firm', 45),
 ('practice', 45),
 ('product', 45),
 ('process', 45),
 ('human_right', 42),
 ('global', 42),
 ('portfolio', 42),
 ('environmental', 41),
 ('term', 41),
 ('effort', 41),
 ('sasb', 40),
 ('material', 40),
 ('well', 40),
 ('investor', 39),
 ('health', 39),
 ('report', 38),
 ('engage', 38),
 ('conduct', 37),
 ('impact', 36),
 ('market', 36),
 ('safety', 35),
 ('program', 35),
 ('metric', 32),
 ('available', 32),
 ('climate', 31),
 ('descri