In [None]:
!pip install fitz
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 7.2 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.19.6


In [None]:
import glob
import re
import os
import fitz
import numpy as np
import pandas as pd
import math
import json
import pprint

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
import spacy

import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import collections

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...





In [None]:
# --------------------------- Read a pdf into a large string of text ---------------------------
def read_pdf(file_path):
    pymupdf_text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            pymupdf_text += page.getText()
    return pymupdf_text


# --------------------------- Read a report and breaks it up into individual sentences ---------------------------
def convert_pdf_into_sentences(text):
    # Remove unnecessary spaces and line breaks
    text = re.sub(r'\x0c\x0c|\x0c', "", str(text))
    text = re.sub('\n ', '', str(text))
    text = re.sub('\n', ' ', str(text))
    text = ' '.join(text.split())
    text = " " + text + "  "
    text = text.replace("\n", " ")
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]

    # Filter for sentences with more than 100 characters
    sentences = [s.strip() for s in sentences if len(s) > 100]
    return sentences


# --------------------------- Retrieve the report name from the pdf ---------------------------
def reportName(path):
    name = path.split('/')[-1]
    company = name.split('.')[0]
    return company

In [None]:
# Read our database of ESG reports
path = '/Users/kevintanyuejun/Desktop/Reports 2.0'
# path = ''
esg_reports = glob.glob(path + '/*.pdf')
esg_corpus = {}
for report in esg_reports:
    esg_corpus[reportName(report)] = convert_pdf_into_sentences(read_pdf(report))
    print(f"Reading Report '{reportName(report)}'")

Deprecation: 'getText' removed from class 'Page' after v1.19 - use 'get_text'.


Reading Report 'Mobvista, Inc'
Reading Report 'NICE Ltd'
Reading Report 'Jack Henry _ Associates Inc 2020'
Reading Report 'Iflytek Co'
Reading Report 'Integrated Research Ltd'
Reading Report 'Meituan 2020'
Reading Report 'Logicom Public Ltd'
Reading Report 'PagSeguro Digital Ltd'
Reading Report 'ManTech International Corp'
Reading Report 'Hyundai Autoever Corp'
Reading Report 'NTT DATA Corp'
Reading Report 'network-2021-prelims_090322'
Reading Report 'NetEnt AB 2019'
Reading Report 'home24 SE 2018'
Reading Report 'Medallia, Inc'
Reading Report 'Salesforce'
Reading Report 'SII - AF_RA_Annuel_2020_2021_EN_0'
Reading Report 'Izertis SA 2021'
Reading Report 'Marvelous Inc'
Reading Report 'NetEase, Inc'
Reading Report 'Samsung SDS Co'
Reading Report 'Sabre Corp'
Reading Report 'Rovio Entertainment Oyj'
Reading Report 'IAC_InteractiveCorp 2020'
Reading Report 'Progress Publishes 2020 Corporate Social Responsibility Report'
Reading Report 'Sansan, Inc'
Reading Report 'SII SA'
Reading Report '

In [None]:
# Read the key words from our json file
f = open('keywords.json')
keywordBank = json.load(f)
f.close()


FileNotFoundError: ignored

In [None]:
keywordBank

{'Environment': {'Opportunities in Clean Tech': ['renewable',
   'energy efficiency',
   'smart',
   'efficient',
   'efficiency',
   'cleantech',
   'green technologies',
   'energy-saving',
   'energy',
   'green',
   'technology'],
  'Carbon Emissions': ['Climate Risk',
   'low-carbon',
   'carbon',
   'carbon footprint',
   'footprint',
   'co2e',
   'co2',
   'decarbonization',
   'decarbonisation',
   'low-carbon',
   'GHG emissions',
   'GHG',
   'dioxide',
   'emissions',
   'emission',
   'climate',
   'Climate',
   'pollution'],
  'Water Stress': ['water',
   'Water',
   'water-related',
   'Water-related',
   'water-stress',
   'water-intensity',
   'aquatic',
   'Aquatic',
   'aqua',
   'aquatic',
   'wastewater'],
  'Electronic Waste': ['electronic waste',
   'e-waste',
   'dispose',
   'disposal',
   'waste',
   'electric'],
  'Toxic Emissions & Waste': ['toxic',
   'hazardous substances',
   'toxic waste',
   'hazardous waste',
   'chemical',
   'bio-hazard',
   'bio-was

### Extract out features such as:

    - PDF Document Name
    - Sentence Count (i.e Length of PDF)
    - ESG Sentences + Percentage of Occurence
    - Env Sentence + Percentage of Occurence
    - Social Sentence + Percentage of Occurence
    - Gov Sentence + Percentage of Occurence
    - Env Sentiment Score 
    - Social Sentiment Score 
    - Gov Sentiment Score 
    - Averaged ESG Sentiment Score

In [None]:
# --------------------------- Sentiment Analysis ---------------------------
# This function calculates the sentiment score for the various sentences using VADER
# Sentence:
#   - The sentence to be inputted to the function, which will return the respective sentiment score
#   - If there are > 1 sentence, the average will be computed and returned
def averagedCompoundSentimentScore(sentences):
    sid = SentimentIntensityAnalyzer()
    score = 0
    for sentence in sentences:
        sentiment = sid.polarity_scores(sentence)
        score += sentiment['compound']
    try:
        return score / len(sentences)
    except ZeroDivisionError:
        return score





# --------------------------- Sentence Extraction ---------------------------
# This function extracts out the keywords from the given corpus
# corpus: 
#   - This refers to a document (i.e one company)
# subFeatureKeywords:
#   - A list containing all the keywords which we would like to identify from our sentence bank
def keySentences(corpus, subFeatureKeywords):
    sentencesCaptured = []
    for word in subFeatureKeywords:
        sentencesCaptured.extend([sentence for sentence in corpus if word in sentence])
    return sentencesCaptured





# --------------------------- Print all sentences (Debugging purposes only) ---------------------------
def printAllSentences(corpus, pillar, keywordBank):
    for subFeature, kewords in keywordBank[pillar].items():
        print('\n\n\n')
        print(f"======= Printing Sentences from: '{subFeature}' =======")
        sentences = keySentences(corpus, kewords)
        for sentence in sentences:
            print(sentence)
            print('\n\n')
        




# --------------------------- Subpillar Feature Statistics ---------------------------
# 4 options for pillar: 
#   - 'Environment'
#   - 'Social'
#   - 'Governance'
#   - 'ESG phrases'
# corpus: 
#   - A specific company report, and NOT the whole collection of reports from all companies!
# keywordBank: 
#   - All the keywords from the subpillar
def subPillar_featureStats(corpus, pillar, keywordBank):
    data = {}

    # Calculate the sentences, frequency of sentence occurence, sentiment score etc
    def summaryStatistics(corpus, subFeatureKeywords):
        temp = {
            # "Sentences": None,
            "NumOfSentences": None,
            "FrequencyOfOccurence": None,
            "SentimentScore": None
        }
        # temp['Sentences'] = subpillar_sentences(corpus, keywordBank[pillar])
        sentences = keySentences(corpus, subFeatureKeywords)
        temp['NumOfSentences'] = len(sentences)
        temp['FrequencyOfOccurence'] = round(len(sentences) / len(corpus), 5)
        temp['SentimentScore'] = averagedCompoundSentimentScore(sentences)
        return temp

    for subFeature, subFeatureKeywords in keywordBank[pillar].items():
        data[subFeature] = summaryStatistics(corpus, subFeatureKeywords)
    
    return data





# --------------------------- Complete Feature Statistics ---------------------------
# This combines all the data across the 3 pillars into a dictionary 
# esg_bank:
#   - Complete set of data processed from reading in all the companies
#   - Structure of esg_bank:
#       - Dictionary where
#           - key: company name
#           - value: [sentence1, sentence2, ..., sentenceN]
# companyName:
#   - The company we wish to explore
# keywordBank:
#   - Complete set of data from the keywords.json file
def featureStats(esg_bank, companyName, keywordBank):
    company = {
        companyName: []
    }
    for pillar in [*keywordBank][:-1]:
        temp = {}
        temp[pillar] = subPillar_featureStats(esg_bank[companyName], pillar, keywordBank)
        company[companyName].append(temp)
    return company
    


def processByLength(esg_bank, keywordBank, numberOfReports):
    print('\n\n === Generating feature statistic data from all companies === \n\n')
    companies = []

    def flatten_data(dictionary_data):
        new_data = {
            "Company Name": list(dictionary_data.keys())[0],
        }
        avgscore = []
        for subData in dictionary_data.values():
            for i in range(0, 3):
                for pillar, pillarValues in subData[i].items():
                    title = f"avg {pillar} Sentiment Score"
                    try:   
                        avg = round(sum([data['SentimentScore'] for title, data in pillarValues.items()]) / len(pillarValues), 5)
                        new_data[title] = avg
                        avgscore.append(avg)
                    except ZeroDivisionError:
                        new_data[title] = 0
                        avgscore.append(0)
        new_data['avg ESG Sentiment'] = round(sum(avgscore) / len(avgscore), 5)
        return new_data

    counter = 0
    for company, data in esg_bank.items():
        if counter == numberOfReports:
            break
        else:
            print(f"Processing data from --- {company}")
            company_data = featureStats(esg_bank, company, keywordBank)
            companies.append(flatten_data(company_data))
            counter += 1

    return pd.DataFrame(companies)



# def featureStatistic(pdf_corpus, *keywords):
#     keywordCorpusName = ['ESG', 'Env', 'Social', 'Gov']
#     data = {
#         'PDF Doc': [],
#         'Sentence Count (i.e Length of PDF)': [],
#         'ESG Sentence Count': [],
#         keywordCorpusName[0] + ' Sentence': [],
#         keywordCorpusName[1] + ' Sentence': [],
#         keywordCorpusName[2] + ' Sentence': [],
#         keywordCorpusName[0] + ' Sentiment Score': [],
#         keywordCorpusName[1] + ' Sentiment Score': [],
#         keywordCorpusName[2] + ' Sentiment Score': [],
#     }

#     for pdf_name, pdf_data in pdf_corpus.items():
#         print(f"Processing {pdf_name}...")

#         data['PDF Doc'].append(pdf_name)
#         data['Sentence Count (i.e Length of PDF)'].append(len(pdf_data))


#         for index, keywordList in enumerate(keywords):
#             sentences = keySentences(keywordList, pdf_data)
#             sentimentScore = []
#             if keywordCorpusName[index] == 'ESG':
#                 data['ESG Sentence Count'].append(len(sentences))
#             else:
#                 data[keywordCorpusName[index - 1] + ' Sentence'].append(len(sentences))
#                 try:
#                     score = averagedCompoundSentimentScore(sentences)
#                     sentimentScore.append(score)
#                     data[keywordCorpusName[index - 1] + ' Sentiment Score'].append(score)
#                 except ZeroDivisionError:
#                     data[keywordCorpusName[index - 1] + ' Sentiment Score'].append(0)
        
#     statistic = pd.DataFrame(data, columns=list(data.keys()))
#     statistic['ESG Average Sentiment'] = (statistic['ESG Sentiment Score'] + \
#         statistic['Env Sentiment Score'] + statistic['Social Sentiment Score']) / 3
#     return statistic

In [None]:
# --------------------------- Weights for each subpillar ---------------------------
# import from weights.json 
# The weights have been calibrated to sum up to 100%

f = open('weights.json')
weights = json.load(f)
pprint.pprint(weights)
f.close()

{'Environment': {'Carbon Emissions': 5.7,
                 'Electronic Waste': 0.6,
                 'Opportunities in Clean Tech': 11.4,
                 'Toxic Emissions & Waste': 0.3,
                 'Water Stress': 1.9},
 'Governance': {'Governance': 41.2},
 'Social': {'Human Capital Development': 20.8,
            'Labor Management': 5.8,
            'Privacy and Data Security': 12.3}}


In [None]:
processByLength(esg_corpus, keywordBank, 20)



 === Generating feature statistic data from all companies === 


Processing data from --- Mobvista, Inc
Processing data from --- NICE Ltd
Processing data from --- Jack Henry _ Associates Inc 2020
Processing data from --- Iflytek Co
Processing data from --- Integrated Research Ltd
Processing data from --- Meituan 2020
Processing data from --- Logicom Public Ltd
Processing data from --- PagSeguro Digital Ltd
Processing data from --- ManTech International Corp
Processing data from --- Hyundai Autoever Corp
Processing data from --- NTT DATA Corp
Processing data from --- network-2021-prelims_090322
Processing data from --- NetEnt AB 2019
Processing data from --- home24 SE 2018
Processing data from --- Medallia, Inc
Processing data from --- Salesforce
Processing data from --- SII - AF_RA_Annuel_2020_2021_EN_0
Processing data from --- Izertis SA 2021
Processing data from --- Marvelous Inc
Processing data from --- NetEase, Inc


Unnamed: 0,Company Name,avg Environment Sentiment Score,avg Social Sentiment Score,avg Governance Sentiment Score,avg ESG Sentiment
0,"Mobvista, Inc",0.06563,0.39254,0.40645,0.28821
1,NICE Ltd,0.2374,0.46952,0.52577,0.4109
2,Jack Henry _ Associates Inc 2020,0.29226,0.52483,0.51878,0.44529
3,Iflytek Co,-0.00183,0.52517,0.42056,0.31463
4,Integrated Research Ltd,0.37847,0.35297,0.35777,0.36307
5,Meituan 2020,0.17271,0.43422,0.4309,0.34594
6,Logicom Public Ltd,0.21378,0.36782,0.32991,0.30384
7,PagSeguro Digital Ltd,0.24411,0.38277,0.41574,0.34754
8,ManTech International Corp,0.33431,0.43215,0.41813,0.39486
9,Hyundai Autoever Corp,0.28896,0.36376,0.42687,0.35986


In [None]:
pprint.pprint(featureStats(esg_corpus, 'NTT DATA Corp', keywordBank))

{'NTT DATA Corp': [{'Environment': {'Carbon Emissions': {'FrequencyOfOccurence': 0.00621,
                                                         'NumOfSentences': 2,
                                                         'SentimentScore': 0.9915},
                                    'Electronic Waste': {'FrequencyOfOccurence': 0.00621,
                                                         'NumOfSentences': 2,
                                                         'SentimentScore': -0.6249},
                                    'Opportunities in Clean Tech': {'FrequencyOfOccurence': 0.17081,
                                                                    'NumOfSentences': 55,
                                                                    'SentimentScore': 0.3519509090909091},
                                    'Toxic Emissions & Waste': {'FrequencyOfOccurence': 0.0,
                                                                'NumOfSentences': 0,
                   

In [None]:
subPillar_featureStats(esg_corpus['IAC_InteractiveCorp 2020'], 'Environment', keywordBank)

{'Opportunities in Clean Tech': {'NumOfSentences': 9,
  'FrequencyOfOccurence': 0.05625,
  'SentimentScore': 0.8066444444444445},
 'Carbon Emissions': {'NumOfSentences': 20,
  'FrequencyOfOccurence': 0.125,
  'SentimentScore': 0.45719999999999994},
 'Water Stress': {'NumOfSentences': 1,
  'FrequencyOfOccurence': 0.00625,
  'SentimentScore': 0.9468},
 'Electronic Waste': {'NumOfSentences': 2,
  'FrequencyOfOccurence': 0.0125,
  'SentimentScore': 0.30679999999999996},
 'Toxic Emissions & Waste': {'NumOfSentences': 0,
  'FrequencyOfOccurence': 0.0,
  'SentimentScore': 0}}

In [None]:
subPillar_featureStats(esg_corpus['IAC_InteractiveCorp 2020'], 'Social', keywordBank)

{'Human Capital Development': {'NumOfSentences': 8,
  'FrequencyOfOccurence': 0.05,
  'SentimentScore': 0.8763625},
 'Privacy and Data Security': {'NumOfSentences': 50,
  'FrequencyOfOccurence': 0.3125,
  'SentimentScore': 0.48931800000000003},
 'Labor Management': {'NumOfSentences': 25,
  'FrequencyOfOccurence': 0.15625,
  'SentimentScore': 0.37038400000000005}}

In [None]:
# df = featureStatistic(esg_corpus, esg_keywords, environment_features, social_features, governance_features)
# df