In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 4.2 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.19.6


In [3]:
import glob
import re
import os
import fitz
import numpy as np
import pandas as pd
import math
import json
import pprint

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
import spacy

import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import collections

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...





In [None]:
# --------------------------- Read a pdf into a large string of text ---------------------------
def read_pdf(file_path):
    pymupdf_text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            pymupdf_text += page.getText()
    return pymupdf_text


# --------------------------- Read a report and breaks it up into individual sentences ---------------------------
def convert_pdf_into_sentences(text):
    # Remove unnecessary spaces and line breaks
    text = re.sub(r'\x0c\x0c|\x0c', "", str(text))
    text = re.sub('\n ', '', str(text))
    text = re.sub('\n', ' ', str(text))
    text = ' '.join(text.split())
    text = " " + text + "  "
    text = text.replace("\n", " ")
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]

    # Filter for sentences with more than 100 characters
    sentences = [s.strip() for s in sentences if len(s) > 100]
    return sentences


# --------------------------- Retrieve the report name from the pdf ---------------------------
def reportName(path):
    name = path.split('/')[-1]
    company = name.split('.')[0]
    return company

In [None]:
# Read our database of ESG reports
path = 'Reports 2.0'
esg_reports = glob.glob(path + '/*.pdf')
esg_corpus = {}
for report in esg_reports:
    esg_corpus[reportName(report)] = convert_pdf_into_sentences(read_pdf(report))
    print(f"Reading Report '{reportName(report)}'")

Deprecation: 'getText' removed from class 'Page' after v1.19 - use 'get_text'.


Reading Report 'Enea'
Reading Report 'TIS, Inc'
Reading Report 'Sopra Steria Group'
Reading Report 'Rackspace Technology Global, Inc'
Reading Report 'Infomedia Ltd'
Reading Report 'IGG Inc'
Reading Report 'Datamatics Global Services Ltd'
Reading Report 'Intuit Inc'
Reading Report 'Data Respons'
Reading Report 'POSCO ICT Co Ltd'
Reading Report 'ShotSpotter, Inc'
Reading Report 'Kape Technologies'
Reading Report 'Parsons Corp'
Reading Report 'Crayon Group Holding'
Reading Report 'Grace Technology, Inc'
Reading Report 'G5 Entertainment'
Reading Report 'Happiest Minds Technologies Ltd'
Reading Report 'iFAST Corp Ltd'
Reading Report 'Ceridian HCM Holding, Inc'
Reading Report 'Booking Holdings Inc'
Reading Report 'Nintendo Co Ltd'
Reading Report 'Nihon Unisys, Ltd'
Reading Report 'Autodesk Inc'
Reading Report 'Sykes Enterprises, Inc'
Reading Report 'Naspers Ltd'
Reading Report 'Linx'
Reading Report 'Accenture'
Reading Report 'Linedata Services'
Reading Report 'Lime Technologies'
Reading Repo

In [None]:
# Read the key words from our json file
f = open('keywords.json')
keywordBank = json.load(f)
f.close()

In [None]:
# keywordBank

### Extract out features such as:

    - PDF Document Name
    - Sentence Count (i.e Length of PDF)
    - ESG Sentences + Percentage of Occurence
    - Env Sentence + Percentage of Occurence
    - Social Sentence + Percentage of Occurence
    - Gov Sentence + Percentage of Occurence
    - Env Sentiment Score 
    - Social Sentiment Score 
    - Gov Sentiment Score 
    - Averaged ESG Sentiment Score

In [None]:
# --------------------------- Sentiment Analysis ---------------------------
# This function calculates the sentiment score for the various sentences using VADER
# Sentence:
#   - The sentence to be inputted to the function, which will return the respective sentiment score
#   - If there are > 1 sentence, the average will be computed and returned
def averagedCompoundSentimentScore(sentences):
    sid = SentimentIntensityAnalyzer()
    score = 0
    for sentence in sentences:
        sentiment = sid.polarity_scores(sentence)
        score += sentiment['compound']
    try:
        return score / len(sentences)
    except ZeroDivisionError:
        return score





# --------------------------- Sentence Extraction ---------------------------
# This function extracts out the keywords from the given corpus
# corpus: 
#   - This refers to a document (i.e one company)
# subFeatureKeywords:
#   - A list containing all the keywords which we would like to identify from our sentence bank
def keySentences(corpus, subFeatureKeywords):
    sentencesCaptured = []
    for word in subFeatureKeywords:
        sentencesCaptured.extend([sentence for sentence in corpus if word in sentence])
    return sentencesCaptured





# --------------------------- Print all sentences (Debugging purposes only) ---------------------------
def printAllSentences(corpus, pillar, keywordBank):
    for subFeature, kewords in keywordBank[pillar].items():
        print('\n\n\n')
        print(f"======= Printing Sentences from: '{subFeature}' =======")
        sentences = keySentences(corpus, kewords)
        for sentence in sentences:
            print(sentence)
            print('\n\n')
        




# --------------------------- Subpillar Feature Statistics ---------------------------
# 4 options for pillar: 
#   - 'Environment'
#   - 'Social'
#   - 'Governance'
#   - 'ESG phrases'
# corpus: 
#   - A specific company report, and NOT the whole collection of reports from all companies!
# keywordBank: 
#   - All the keywords from the subpillar
def subPillar_featureStats(corpus, pillar, keywordBank):
    data = {}

    # Calculate the sentences, frequency of sentence occurence, sentiment score etc
    def summaryStatistics(corpus, subFeatureKeywords):
        temp = {
            # "Sentences": None,
            "NumOfSentences": None,
            "FrequencyOfOccurence": None,
            "SentimentScore": None
        }
        # temp['Sentences'] = subpillar_sentences(corpus, keywordBank[pillar])
        sentences = keySentences(corpus, subFeatureKeywords)
        temp['NumOfSentences'] = len(sentences)
        temp['FrequencyOfOccurence'] = round(len(sentences) / len(corpus), 5)
        temp['SentimentScore'] = averagedCompoundSentimentScore(sentences)
        return temp

    for subFeature, subFeatureKeywords in keywordBank[pillar].items():
        data[subFeature] = summaryStatistics(corpus, subFeatureKeywords)
    
    return data





# --------------------------- Complete Feature Statistics ---------------------------
# This combines all the data across the 3 pillars into a dictionary 
# esg_bank:
#   - Complete set of data processed from reading in all the companies
#   - Structure of esg_bank:
#       - Dictionary where
#           - key: company name
#           - value: [sentence1, sentence2, ..., sentenceN]
# companyName:
#   - The company we wish to explore
# keywordBank:
#   - Complete set of data from the keywords.json file
def featureStats(esg_bank, companyName, keywordBank):
    company = {
        companyName: []
    }
    for pillar in [*keywordBank][:-1]:
        temp = {}
        temp[pillar] = subPillar_featureStats(esg_bank[companyName], pillar, keywordBank)
        company[companyName].append(temp)
    return company
    


def processByLength(esg_bank, keywordBank, numberOfReports):
    print('\n\n === Generating feature statistic data from all companies === \n\n')
    companies = []

    def flatten_data(dictionary_data):
        new_data = {
            "Companies": list(dictionary_data.keys())[0],
        }
        for subData in dictionary_data.values():
            for i in range(0, 3):
                for pillar, pillarValues in subData[i].items():
                    for title, data in pillarValues.items():
                        new_data[title + ' Sentiment Score'] = data['SentimentScore'] 
        return new_data

    counter = 0
    for company, data in esg_bank.items():
        if counter == numberOfReports:
            break
        else:
            print(f"Processing data from --- {company}")
            company_data = featureStats(esg_bank, company, keywordBank)
            companies.append(flatten_data(company_data))
            counter += 1

    return pd.DataFrame(companies)

In [None]:
# --------------------------- Weights for each subpillar ---------------------------
# import from weights.json 
# The weights have been calibrated to sum up to 100%

f = open('weights.json')
weights = json.load(f)
# pprint.pprint(weights)
f.close()

In [None]:
company_scores = processByLength(esg_corpus, keywordBank, len(esg_corpus))



 === Generating feature statistic data from all companies === 


Processing data from --- Enea
Processing data from --- TIS, Inc
Processing data from --- Sopra Steria Group
Processing data from --- Rackspace Technology Global, Inc
Processing data from --- Infomedia Ltd
Processing data from --- IGG Inc
Processing data from --- Datamatics Global Services Ltd
Processing data from --- Intuit Inc
Processing data from --- Data Respons
Processing data from --- POSCO ICT Co Ltd
Processing data from --- ShotSpotter, Inc
Processing data from --- Kape Technologies
Processing data from --- Parsons Corp
Processing data from --- Crayon Group Holding
Processing data from --- Grace Technology, Inc
Processing data from --- G5 Entertainment
Processing data from --- Happiest Minds Technologies Ltd
Processing data from --- iFAST Corp Ltd
Processing data from --- Ceridian HCM Holding, Inc
Processing data from --- Booking Holdings Inc
Processing data from --- Nintendo Co Ltd
Processing data from --- Nihon

In [None]:
company_scores

Unnamed: 0,Companies,Opportunities in Clean Tech Sentiment Score,Carbon Emissions Sentiment Score,Water Stress Sentiment Score,Electronic Waste Sentiment Score,Toxic Emissions & Waste Sentiment Score,Human Capital Development Sentiment Score,Privacy and Data Security Sentiment Score,Labor Management Sentiment Score,Governance Sentiment Score
0,Enea,0.425327,0.557400,0.072240,0.234367,0.000000,0.375837,0.355967,0.327428,0.284460
1,"TIS, Inc",0.521842,0.432322,0.996200,0.239570,0.000000,0.535148,0.416305,0.501125,0.564920
2,Sopra Steria Group,0.504519,0.281237,0.347221,0.022540,0.144443,0.452177,0.392583,0.459815,0.456827
3,"Rackspace Technology Global, Inc",0.544586,0.101990,0.229163,0.006082,0.542300,0.213250,0.465908,0.532997,0.347849
4,Infomedia Ltd,0.341683,-0.401900,0.000000,0.547075,0.000000,0.483650,0.396403,0.375560,0.378634
...,...,...,...,...,...,...,...,...,...,...
402,JoyCity Corp,0.732820,0.315017,0.149886,0.049583,-0.350000,0.049314,0.640739,0.688136,0.609192
403,iDreamSky Technology Holdings Ltd,0.256462,0.303830,0.339791,0.267332,-0.207264,0.467616,0.349031,0.337683,0.374027
404,Kunlun Tech Co Ltd,0.680855,0.527367,0.301933,0.066330,-0.005139,0.659550,0.494459,0.542107,0.405922
405,IVU Traffic Technologies,0.439241,0.246389,0.000000,0.009519,0.000000,0.323867,0.302686,0.244283,0.162826


In [None]:
# pprint.pprint(featureStats(esg_corpus, 'Enea', keywordBank))

In [None]:
# Merging Target values to the dataframe
corporate = pd.read_csv('Corporate2.0.csv')

# Filter out those that contains the report
corporate_new = corporate.loc[corporate['Annual Report'].notnull()].reset_index(drop=True)

# Select relevant columns
corporate_new = corporate_new[['Companies', 'ESG Risk Score', 'ESG Risk Rating']]

len(corporate_new)

417

In [None]:
# Merge with dataframe from above
corporate_new_merged = pd.merge(corporate_new, company_scores, how='inner', on = 'Companies').reset_index(drop=True)
corporate_new_merged

Unnamed: 0,Companies,ESG Risk Score,ESG Risk Rating,Opportunities in Clean Tech Sentiment Score,Carbon Emissions Sentiment Score,Water Stress Sentiment Score,Electronic Waste Sentiment Score,Toxic Emissions & Waste Sentiment Score,Human Capital Development Sentiment Score,Privacy and Data Security Sentiment Score,Labor Management Sentiment Score,Governance Sentiment Score
0,24SevenOffice Group,24.6,Medium ESG Risk,0.357958,0.000000,0.000000,0.000000,0.00000,0.471713,0.375589,0.218405,0.330786
1,"2U, Inc",19.8,Low ESG Risk,0.226180,-0.077200,-0.077200,-0.077200,0.00000,0.563257,0.406256,0.275578,0.390339
2,Absolute Software Corp,24.1,Medium ESG Risk,-0.279050,-0.229400,-0.421500,-0.421500,0.00000,0.000000,0.231411,0.827100,0.488943
3,Accenture,9.7,Negligible ESG Risk,0.632968,0.453108,-0.421500,0.744725,0.00000,0.610265,0.381267,0.518605,0.600880
4,"ACI Worldwide, Inc",20.8,Medium ESG Risk,0.399900,0.353917,0.470240,0.177225,0.00000,0.678200,0.290737,0.232244,0.263880
...,...,...,...,...,...,...,...,...,...,...,...,...
402,"TIS, Inc",19.3,Low ESG Risk,0.521842,0.432322,0.996200,0.239570,0.00000,0.535148,0.416305,0.501125,0.564920
403,Total System Services,18.0,Low ESG Risk,0.607006,0.389800,0.306450,-0.100911,0.00000,0.000000,0.380440,0.232475,0.535517
404,Totvs,19.9,Low ESG Risk,0.537422,0.156250,0.234375,-0.178937,0.00000,0.510184,0.503567,0.447113,0.468587
405,TradeDoubler,32.8,High ESG Risk,0.316283,0.000000,0.000000,0.209533,0.00000,0.348928,0.340593,0.274121,0.243602


In [None]:
# Check which columns do not match
coy_sorted = company_scores.sort_values(['Companies'])
coy_sorted = coy_sorted[['Companies']].reset_index(drop=True)

corporate_new_sorted = corporate_new.sort_values(['Companies'])
corporate_new_sorted = corporate_new_sorted[['Companies']].reset_index(drop=True)

count = 0
for company in list(coy_sorted['Companies']):
    if company not in list(corporate_new_sorted['Companies']):
        print(f"Company not found: {company}")
        count += 1
print(f"{count} discrepancies found!")

Company not found: Cars Inc
Company not found: NASDAQ_HCAT_2019
Company not found: Procore Technologies, Inc
Company not found: Schrödinger, Inc
4 discrepancies found!


In [None]:
# Output data to csv file
corporate_new_merged.to_csv('data.csv')

In [None]:
# pprint.pprint(featureStats(esg_corpus, 'NTT DATA Corp', keywordBank))

In [None]:
# subPillar_featureStats(esg_corpus['IAC_InteractiveCorp 2020'], 'Environment', keywordBank)

In [None]:
# subPillar_featureStats(esg_corpus['IAC_InteractiveCorp 2020'], 'Social', keywordBank)

In [None]:
# df = featureStatistic(esg_corpus, esg_keywords, environment_features, social_features, governance_features)
# df