In [1]:
import os

# Move the working directory one level up so the relative "data/..." paths used
# below resolve (presumably to the project root — confirm against the repo layout).
# NOTE: this is not idempotent — re-running the cell moves up one more directory.
os.chdir("..")

In [2]:
import pandas as pd
import numpy as np
import pickle
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

# Sentence splitting
import nltk
from nltk import tokenize
nltk.download('punkt')

# Universal Sentence Encoder
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
import re

[nltk_data] Downloading package punkt to /home/jbaumert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Notebook-wide pandas display settings: show at most 5 rows, every column,
# and cap the rendered width of a cell at 100 characters
display_options = {
    "display.max_rows": 5,
    'display.max_columns': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## 1. Load data

In [4]:
# Read the pre-processed reports; the 'CSR_Text_clean' column of this frame is
# what gets split into sentences further down.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle("data/processed/reports_tokenized.p")
df

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17,CSR_Filename,CSR_Text,CSR_Text_clean,CSR_Text_tokenized
0,0,888.L,888 Holdings PLC,GI000A0F6407,888,United Kingdom,2020,FY0,https://corporate.888.com/wp-content/uploads/2021/04/2020-Annual-Report.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False,0_888.L_2020.pdf,888 HOLDINGS PLC\n\nANNUAL REPORT & ACCOUNTS 2020\n\nA YEAR OF \nSTRONG GROWTH \n\n888 IS ONE ...,888 ORG ANNUAL REPORT & ACCOUNTS 2020 A YEAR OF STRONG GROWTH ORG IS ONE OF THE WORLDS LEADING O...,org annual report account year strong growth org one world leading online betting gaming company...
1,1,A.N,Agilent Technologies Inc,US00846U1016,A,United States of America,2020,FY0,https://www.agilent.com/about/companyinfo/sustainability/Agilent-Report-CSR-2020.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False,1_A.N_2020.pdf,Delivering on \nour Promises\n\n2020 Corporate Social Responsibility Report\n\n1\n\nLetter fro...,Delivering on our Promises 2020 Corporate ORG 1 Letter from the President Stakeholder engagement...,delivering promise corporate org letter president stakeholder engagement environment table conte...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8101,8101,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2016,FY-4,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2016.pdf,False,False,True,True,True,True,True,True,False,,False,True,False,,False,True,False,12676_ZBH.N_2016.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2016 T A B L E O...,sustainability report company profile corporate overview purpose mission value sustainability co...
8102,8102,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2015,FY-5,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2015.pdf,False,False,True,True,False,True,True,False,False,,False,True,False,,False,False,False,12677_ZBH.N_2015.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2015 Investing n...,sustainability report investing future world letter ceo stakeholder company profile corporate ov...


## 2. For each CSR
- Split CSR into sentences 
- Compute similarity of each sentence with a target sentence related to the SDGs using Universal Sentence Encoder embeddings
- Order the sentences by decreasing similarity
- Join the ordered sentences back together into a single text

In [5]:
# Load the Universal Sentence Encoder's TF Hub module and define function to embed a text

# Loaded once here and reused by every call to `embed`
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def embed(input):
    """Return whatever the loaded encoder produces for `input`.

    `input` is handed to the model unchanged. Everywhere in this notebook it is
    a list of strings; presumably the model returns one embedding per string —
    TODO confirm against the TF Hub model card.

    Note: the parameter name shadows the built-in `input` inside this function.
    """
    return model(input)

2022-04-05 12:19:53.028327: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-05 12:19:59.096140: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9014 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:01:00.0, compute capability: 8.6
2022-04-05 12:19:59.098065: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 11752 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:25:00.0, compute capability: 8.6
2022-04-05 12:19:59.100027: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 wi

In [6]:
# Define target words from SDGs and compute target embedding.
# All keywords are kept in ONE string and embedded as a single-element list, so
# every sentence is later compared against the same single target (presumably a
# one-row embedding — confirm the model's output shape).
target_words = 'poverty hunger health well-being education gender equality clean water sanitation affordable energy work economic growth industry innovation infrastructure inequality sustainable cities communities responsible consumption production climate action life land peace justice institutions partnership'
target_embedding = embed([target_words])

2022-04-05 12:20:40.756669: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [7]:
# Define helper functions

# Function to split a text into a dataframe of sentences
def split_text_into_sentences(text):
    """Tokenize `text` into sentences with NLTK.

    Returns a DataFrame with a single column 'Sentences' holding one
    sentence per row, in the order the tokenizer produced them.
    """
    return pd.DataFrame({'Sentences': tokenize.sent_tokenize(text)})

# Function to compute the similarity of each sentence in a df with the target embedding
def compute_similarity(df_sentences, target_embedding, batch_size=256):
    """Score every sentence against the target embedding and rank the sentences.

    Parameters
    ----------
    df_sentences : pandas.DataFrame
        Frame with a 'Sentences' column (as returned by
        `split_text_into_sentences`). A float column 'Similarities' is written
        onto this frame, as the previous implementation also did.
    target_embedding :
        Embedding of the target text. Expected to contain a single vector
        (it is built from a one-element list in this notebook), so that each
        sentence gets exactly one cosine-similarity score.
    batch_size : int, default 256
        Number of sentences sent to the encoder per call. Sentences used to be
        embedded one at a time, which costs one model invocation per sentence;
        batching keeps the scores the same up to floating-point noise while
        cutting the number of calls drastically.

    Returns
    -------
    pandas.DataFrame
        The sentences with their 'Similarities', sorted by decreasing
        similarity and re-indexed from 0.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = df_sentences['Sentences'].tolist()
    # Plain float array instead of an object column pre-filled with '' — keeps
    # the column numeric so sorting does not depend on object comparisons.
    similarities = np.empty(len(sentences), dtype=float)

    # An empty frame simply skips the loop (the encoder is never called on an empty batch)
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # cosine_similarity returns shape (len(batch), 1) for a single target vector
        scores = cosine_similarity(embed(batch), target_embedding)
        similarities[start:start + len(batch)] = np.asarray(scores).ravel()

    df_sentences['Similarities'] = similarities
    # Order by decreasing similarity
    df_sentences_ordered = df_sentences.sort_values(by='Similarities', ascending=False).reset_index(drop=True)
    return df_sentences_ordered

In [8]:
# Re-order every report: split its cleaned text into sentences, rank them by
# similarity to the target embedding, then join them back into one string
ordered_texts = []
for clean_text in tqdm(df['CSR_Text_clean'], total=len(df)):
    ranked_sentences = compute_similarity(split_text_into_sentences(clean_text), target_embedding)
    ordered_texts.append(' '.join(ranked_sentences['Sentences']))

df['CSR_Text_clean_ordered'] = ordered_texts

  0%|          | 0/8103 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
# Inspect the frame, which now carries the new 'CSR_Text_clean_ordered' column
df

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17,CSR_Filename,CSR_Text,CSR_Text_clean,CSR_Text_tokenized,CSR_Text_clean_ordered
0,0,888.L,888 Holdings PLC,GI000A0F6407,888,United Kingdom,2020,FY0,https://corporate.888.com/wp-content/uploads/2021/04/2020-Annual-Report.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False,0_888.L_2020.pdf,888 HOLDINGS PLC\n\nANNUAL REPORT & ACCOUNTS 2020\n\nA YEAR OF \nSTRONG GROWTH \n\n888 IS ONE ...,888 ORG ANNUAL REPORT & ACCOUNTS 2020 A YEAR OF STRONG GROWTH ORG IS ONE OF THE WORLDS LEADING O...,org annual report account year strong growth org one world leading online betting gaming company...,"Following the findings of the AVIV AMCG report, we have identified a series of actions to streng..."
1,1,A.N,Agilent Technologies Inc,US00846U1016,A,United States of America,2020,FY0,https://www.agilent.com/about/companyinfo/sustainability/Agilent-Report-CSR-2020.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False,1_A.N_2020.pdf,Delivering on \nour Promises\n\n2020 Corporate Social Responsibility Report\n\n1\n\nLetter fro...,Delivering on our Promises 2020 Corporate ORG 1 Letter from the President Stakeholder engagement...,delivering promise corporate org letter president stakeholder engagement environment table conte...,Occupational health and safety ORG's ORG (EHS) Policy outlines a fundamental corporate commitmen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8101,8101,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2016,FY-4,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2016.pdf,False,False,True,True,True,True,True,True,False,,False,True,False,,False,True,False,12676_ZBH.N_2016.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2016 T A B L E O...,sustainability report company profile corporate overview purpose mission value sustainability co...,"Our sustainability objectives focus on six key areas: Governance: Maintain oversight, systems an..."
8102,8102,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2015,FY-5,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2015.pdf,False,False,True,True,False,True,True,False,False,,False,True,False,,False,False,False,12677_ZBH.N_2015.pdf,E N V I R O N M E N T A L \n\n \n\nS O C I A L \n\nG O V E R N A N C E\n\nSustainability \nRep...,E N V I R O N M E N T A L S O C I A L G O V E R N A N C E Sustainability Report 2015 Investing n...,sustainability report investing future world letter ceo stakeholder company profile corporate ov...,"Our sustainability objectives focus on six key areas: Governance: Maintain oversight, systems, a..."


In [10]:
# Remove the raw, cleaned and tokenized text columns before the frame is written
# to disk; only the re-ordered text is kept alongside the metadata and SDG labels
df = df.drop(columns=['CSR_Text', 'CSR_Text_clean', 'CSR_Text_tokenized'])
df

Unnamed: 0,ID,Identifier,Company_Name,ISIN,Ticker,Country_of_Exchange,Financial_Period_Absolute,Financial_Period_Relative,CSR_URL,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17,CSR_Filename,CSR_Text_clean_ordered
0,0,888.L,888 Holdings PLC,GI000A0F6407,888,United Kingdom,2020,FY0,https://corporate.888.com/wp-content/uploads/2021/04/2020-Annual-Report.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,False,True,False,0_888.L_2020.pdf,"Following the findings of the AVIV AMCG report, we have identified a series of actions to streng..."
1,1,A.N,Agilent Technologies Inc,US00846U1016,A,United States of America,2020,FY0,https://www.agilent.com/about/companyinfo/sustainability/Agilent-Report-CSR-2020.pdf,False,False,True,True,True,True,True,True,False,,False,True,True,,True,True,False,1_A.N_2020.pdf,Occupational health and safety ORG's ORG (EHS) Policy outlines a fundamental corporate commitmen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8101,8101,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2016,FY-4,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2016.pdf,False,False,True,True,True,True,True,True,False,,False,True,False,,False,True,False,12676_ZBH.N_2016.pdf,"Our sustainability objectives focus on six key areas: Governance: Maintain oversight, systems an..."
8102,8102,ZBH.N,Zimmer Biomet Holdings Inc,US98956P1021,ZBH,United States of America,2015,FY-5,https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/z/NYSE_ZBH_2015.pdf,False,False,True,True,False,True,True,False,False,,False,True,False,,False,False,False,12677_ZBH.N_2015.pdf,"Our sustainability objectives focus on six key areas: Governance: Maintain oversight, systems, a..."


In [11]:
# Store to pickle (relative path — resolved against the current working directory)
df.to_pickle('data/processed/reports_ordered.p')

## 3. Investigation

In [140]:
pd.set_option("display.max_rows", 5)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

# 'CSR_Text_clean' has already been dropped from `df` by the time this cell runs
# in a top-to-bottom execution, so df['CSR_Text_clean'] would raise a KeyError.
# Re-read the cleaned text from the input file instead.
csr_text_clean = pd.read_pickle("data/processed/reports_tokenized.p")['CSR_Text_clean']

# Split the first CSR into sentences
df_sentences = split_text_into_sentences(csr_text_clean[0])
# Compute similarities and order sentences
df_sentences_ordered = compute_similarity(df_sentences, target_embedding)

In [141]:
# Show the ten sentences ranked closest to the SDG target embedding
df_sentences_ordered['Sentences'].head(10).tolist()

['Following the findings of the AVIV AMCG report, we have identified a series of actions to strengthen our environmental performance and reduce our impact on the climate crisis: Based on the 2020 AVIV AMCG findings, we will set a five-year strategy for reducing our environmental impact We will conduct an independent ORG emissions report on an annual basis, using a consistent methodology to track the efficacy of this environmental strategy We will also conduct a robust assessment of our broader environmental impacts, including waste disposal, water consumption and recycling rates, to ensure our strategy addresses the myriad ways the Company impacts both local ecosystems and the global climate We will explore ways in which flexible working and hot desking can be used to reduce our need for office space, reducing both costs to the business and our consumption of energy, water and other resources We will seek to improve the energy efficiency of our offices by upgrading to modern appliances

In [142]:
# Show the ten sentences ranked furthest from the SDG target embedding
df_sentences_ordered['Sentences'].tail(10).tolist()

['Given the nature of our engagement, some of these measures had been implemented, albeit to a lesser extent, in previous years, providing an appropriate base from which to expand these forms of interactions and facilitate our oversight of the component audit team.',
 'The ORG considered managements assessment of items included in the financial statements and the prominence given to them.',
 'As part of this process, ORG received updates from management and discussed follow-up actions in response to regulatory matters relating to customer activity in prior periods.',
 'The financial statements of subsidiaries are included in the consolidated financial statements using the purchase method of accounting.',
 'TRADE RECEIVABLES Trade receivables are initially recognised at fair value and subsequently measured at amortised cost and principally comprise amounts due from credit card companies and from e-payment companies.',
 'Mr. Dafna held a number of positions with ORG from 2003, taking an 