DAX ESG Dataset

Analysis
- Colab notebook: https://colab.research.google.com/drive/1TrBuB3BbI5oOskD4ht7aHaAYyporTR7n?usp=sharing
- Kaggle dataset: https://www.kaggle.com/datasets/equintel/dax-esg-media-dataset?resource=download
- Reference: [Detecting greenwashing signals through a comparison of ESG reports and public media](https://www.swisstext.org/wp-content/uploads/2023/09/Greenwashing.pdf)



In [None]:
from tqdm import tqdm
import torch
tqdm.pandas()
import pandas as pd
import numpy as np

In [None]:

# Read the pipe-delimited corpus, keep its first 100 rows and drop the "Unnamed: 0" column.
df = (
    pd.read_csv(
        "https://media.githubusercontent.com/media/JosPolfliet/vlerick-mai-nlp-2023/main/DATA/esg_documents_for_dax_companies.csv",
        delimiter="|",
    )
    .head(100)
    .drop(columns=["Unnamed: 0"])
)
# "wc" = number of pieces obtained by splitting the stringified content on single spaces.
df["wc"] = df["content"].progress_apply(lambda text: len(str(text).split(" ")))
df.head()

100%|██████████| 100/100 [00:01<00:00, 78.68it/s]


Unnamed: 0,company,content,datatype,date,domain,esg_topics,internal,symbol,title,url,wc
0,Beiersdorf AG,Sustainability Highlight Report CARE BEYOND SK...,sustainability_report,2021-03-31,,"['CleanWater', 'GHGEmission', 'ProductLiabilit...",1,BEI,BeiersdorfAG Sustainability Report 2021,,8637
1,Deutsche Telekom AG,Corporate Responsibility Report 2021 2 Content...,sustainability_report,2021-03-31,,"['DataSecurity', 'Iso50001', 'GlobalWarming', ...",1,DTE,DeutscheTelekomAG Sustainability Report 2021,,94088
2,Vonovia SE,VONOVIA SE SUSTAINABILITY REPORT 2021 =For a S...,sustainability_report,2021-03-31,,"['Whistleblowing', 'DataSecurity', 'Vaccine', ...",1,VNA,VonoviaSE Sustainability Report 2021,,68376
3,Merck KGaA,Sustainability Report 2021 TABLE OF CONTENTS S...,sustainability_report,2021-03-31,,"['DataSecurity', 'DataMisuse', 'DrugResistance...",1,MRK,MerckKGaA Sustainability Report 2021,,80152
4,MTU,Our ideas and concepts FOR A SUSTAINABLE FUTUR...,sustainability_report,2020-03-31,,"['WorkLifeBalance', 'Corruption', 'AirQuality'...",1,MTX,MTUAeroEngines Sustainability Report 2020,,38975


## Add actions to df

In [None]:
# Show the raw text of the second document (positional index 1).
df["content"].iat[1]



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# The tokenizer and the classifier are loaded from the same checkpoint name.
model_name = tokenizer_name = "ESGBERT/EnvironmentalBERT-action"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)







## Slow but readable

In [None]:
# Use CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Last expression of the cell: moves the model to `device` and displays it.
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [None]:
# Example document used for manual experiments (second row of df).
s = df["content"].iat[1]
# s="Failure to comply with legal or regulatory requirements and/or changes."

# Sliding-window settings: windows are MAX_SEQ_LENGTH tokens long and consecutive
# windows start CHUNK_SIZE tokens apart, so neighbours share OVERLAP tokens.
MAX_SEQ_LENGTH, OVERLAP = 512, 25
CHUNK_SIZE = MAX_SEQ_LENGTH - OVERLAP

def chunked_predict(s, action_threshold=0.8):
    """Score a text with the classifier using overlapping token windows.

    The text is tokenized once; windows of up to MAX_SEQ_LENGTH tokens are then
    taken every CHUNK_SIZE tokens and passed through the model one at a time.

    Args:
        s: Text to score; it is converted with ``str()`` before tokenizing.
        action_threshold: A window counts as an estimated action when its
            class-1 probability is strictly greater than this value.

    Returns:
        dict with keys
        ``mean_score`` (mean class-1 probability over all windows),
        ``n_chunks`` (number of windows scored) and
        ``n_estimated_actions`` (number of windows above ``action_threshold``).
    """
    input_ids = tokenizer(str(s), return_tensors="pt")["input_ids"][0]
    total_sequence_length = input_ids.shape[0]

    # Ceiling division: `// CHUNK_SIZE + 1` produced an extra, empty window whenever
    # the token count was an exact multiple of CHUNK_SIZE.
    n_chunks = -(-total_sequence_length // CHUNK_SIZE)

    scores = np.zeros(n_chunks)
    # Inference only: avoid building autograd graphs for every window.
    with torch.no_grad():
        for i in range(n_chunks):
            start = i * CHUNK_SIZE
            chunk = input_ids[start:start + MAX_SEQ_LENGTH].to(device)
            logits = model(chunk.unsqueeze(0))[0]
            # Probability of class index 1, taken to be the "action" class
            # (NOTE(review): confirm against model.config.id2label).
            scores[i] = logits.softmax(1)[0][1].item()

    return {
        "mean_score": np.mean(scores),
        "n_chunks": n_chunks,
        "n_estimated_actions": np.sum(scores > action_threshold),
    }

# print(chunked_predict(s))
# print(chunked_predict("Failure to comply with legal or regulatory requirements and/or changes."))
# print(chunked_predict("Failure to comply with legal or regulatory requirements and/or changes. We reduced water usage by 23% in 2020."))
# Score the first 20 documents with the one-window-at-a-time implementation.
df["content"].iloc[:20].progress_apply(chunked_predict)

100%|██████████| 100/100 [03:37<00:00,  2.17s/it]


0     {'mean_score': 0.8834124859769509, 'n_chunks':...
1     {'mean_score': 0.6637682869390631, 'n_chunks':...
2     {'mean_score': 0.4290465255569003, 'n_chunks':...
3     {'mean_score': 0.6792989016751085, 'n_chunks':...
4     {'mean_score': 0.6828923148223112, 'n_chunks':...
                            ...                        
95    {'mean_score': 0.4438948776369216, 'n_chunks':...
96    {'mean_score': 0.5643083453178406, 'n_chunks':...
97    {'mean_score': 0.9995898604393005, 'n_chunks':...
98    {'mean_score': 0.49197470108629204, 'n_chunks'...
99    {'mean_score': 0.6982663869857788, 'n_chunks':...
Name: content, Length: 100, dtype: object

In [None]:
import torch
import torch.nn.functional as F
def chunked_predict_fast(s, batch_size=256, action_threshold=0.8):
    """Score a text in batches of non-overlapping MAX_SEQ_LENGTH-token windows.

    The text is tokenized once, right-padded to a whole number of windows and
    passed through the model ``batch_size`` windows at a time. Unlike
    ``chunked_predict``, consecutive windows do not overlap.

    Args:
        s: Text to score; it is converted with ``str()`` before tokenizing.
        batch_size: Maximum number of windows per forward pass.
        action_threshold: A window counts as an estimated action when its
            class-1 probability is strictly greater than this value.

    Returns:
        dict with the same keys as ``chunked_predict``:
        ``mean_score``, ``n_chunks`` and ``n_estimated_actions``.
    """
    input_ids = tokenizer(str(s), return_tensors="pt")["input_ids"][0]
    attention_mask = torch.ones_like(input_ids)

    # Tokens missing to reach the next multiple of MAX_SEQ_LENGTH (0 when already
    # aligned, so no all-padding window is appended).
    n_pad = -input_ids.shape[0] % MAX_SEQ_LENGTH
    # Pad with the tokenizer's own pad id (the previous constant 0 is not the
    # padding index of this model); fall back to 0 only if none is defined.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0
    input_ids = F.pad(input_ids, (0, n_pad), "constant", pad_id)
    # Padded positions are masked out so they do not influence the prediction.
    attention_mask = F.pad(attention_mask, (0, n_pad), "constant", 0)

    chunks = input_ids.reshape(-1, MAX_SEQ_LENGTH)
    masks = attention_mask.reshape(-1, MAX_SEQ_LENGTH)
    n_chunks = chunks.shape[0]

    scores = torch.zeros(n_chunks)
    with torch.no_grad():
        # Stepping by batch_size never yields an empty batch, and slicing clips
        # the final (possibly shorter) batch on both sides of the assignment.
        for start in range(0, n_chunks, batch_size):
            stop = start + batch_size
            logits = model(
                input_ids=chunks[start:stop].to(device),
                attention_mask=masks[start:stop].to(device),
            )["logits"]
            # Class index 1, matching chunked_predict.
            scores[start:stop] = logits.softmax(1)[:, 1].cpu()

    scores = scores.numpy()
    return {
        "mean_score": np.mean(scores),
        "n_chunks": n_chunks,
        "n_estimated_actions": np.sum(scores > action_threshold),
    }

# Score the first 20 documents with the batched implementation.
df["content"].iloc[:20].progress_apply(chunked_predict_fast)

 10%|█         | 2/20 [00:00<00:02,  8.92it/s]

0:256 of 24
0:256 of 250


 15%|█▌        | 3/20 [00:02<00:16,  1.00it/s]

0:256 of 184


 20%|██        | 4/20 [00:05<00:25,  1.62s/it]

0:256 of 200


 25%|██▌       | 5/20 [00:07<00:28,  1.87s/it]

0:256 of 97


 30%|███       | 6/20 [00:08<00:21,  1.56s/it]

0:256 of 171


 35%|███▌      | 7/20 [00:10<00:20,  1.62s/it]

0:256 of 154


 40%|████      | 8/20 [00:11<00:19,  1.59s/it]

0:256 of 387


 40%|████      | 8/20 [00:13<00:19,  1.64s/it]


RuntimeError: ignored

In [None]:
# Sanity check of the pad-and-split step on the example document `s`.
# Everything is rebuilt here so the cell does not rely on names that only exist
# as locals inside chunked_predict_fast.
inputs = tokenizer(str(s), return_tensors="pt")
# Tokens missing to reach the next multiple of MAX_SEQ_LENGTH (0 when already aligned).
number_of_padding_tokens = -inputs["input_ids"].shape[1] % MAX_SEQ_LENGTH
padded_input_ids = F.pad(inputs["input_ids"], (0, number_of_padding_tokens), 'constant', 0)[0]
chunks = torch.split(padded_input_ids, MAX_SEQ_LENGTH, dim=0)
len(chunks)

250

In [None]:
# Disabled: attach the per-document scores to df and persist the result.
# NOTE(review): chunked_predict returns one dict per row, so the resulting Series of
# dicts presumably has to be expanded into columns before this three-column
# assignment works — confirm before re-enabling.
# df[["mean_score", "n_chunks", "n_estimated_actions"]] = df["content"].progress_apply(chunked_predict)
# df.to_pickle("esg_documents_for_dax_companies_with_scores.pkl")

In [None]:
# Tokens left over in the last, partial window of the example document `s`.
# Tokenized inline so the cell does not depend on an `inputs` variable from elsewhere.
tokenizer(str(s), return_tensors="pt")["input_ids"].shape[1] % MAX_SEQ_LENGTH

333