<a href="https://colab.research.google.com/github/LordLean/Extracting-Green-Bonds-Use-of-Proceeds/blob/main/evaluation/retriever_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Information Retieval

## Answer Retriever


In [1]:
!pip install rank-bm25

!pip install PyPDF2

!pip install tabula-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading PyPDF2-2.10.3-py3-none-any.whl (214 kB)
[K     |████████████████████████████████| 214 kB 3.9 MB/s 
Installing collected packages: PyPDF2
Successfully installed PyPDF2-2.10.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tabula-py
  Downloading tabula_py-2.5.1-py3-none-any.whl (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
Collecting distro
  Downloading distro-1.7.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.7.0 tabula-py-2.5.1


In [2]:
import numpy as np

import tabula
from rank_bm25 import BM25Okapi
from PyPDF2 import PdfReader

In [3]:
class TableReader:

  def __init__(self, pdf):
    self.pdf = pdf
    self.dfs = None

  def read_pages(self, pages="all", multiple_tables=True, stream=True):
    '''
    Return tables discovered within pdf.
    '''
    self.dfs = tabula.read_pdf(self.pdf, pages=pages, multiple_tables=multiple_tables, stream=stream)
    self.__clean_dfs()
    return self.dfs

  def __clean_dfs(self, thresh=2):
    self.dfs = [df.dropna(thresh=thresh) for df in self.dfs]


class Reader:

  def __init__(self, filename):
    self.reader = PdfReader(filename)
    self.tb = TableReader(filename)
    self.page_viewer = {page_num : {} for page_num in range(self.reader.numPages)}
    self.idx2page_item = []
  
  def __extract_text(self,):
    '''
    Page-wise text extraction and tokenize for BM25.
    '''
    text_index_mem = 0
    # List to store each tokenized corpus
    tokenized_corpus_list = []
    for i in range(self.reader.numPages):
      raw_text = self.reader.getPage(i).extractText()
      self.page_viewer[i]["raw_text"] = raw_text
      # Split text
      corpus = raw_text.split("\n \n")
      # Store results.
      self.page_viewer[i]["corpus"] = corpus
      for item in corpus:
        self.idx2page_item.append((i, item)) # page,textItem
      # Tokenize
      tokenized_corpus = [doc.split(" ") for doc in corpus]
      tokenized_corpus_list.append(tokenized_corpus)
    # BM25 computations only after the complete tokenized corpus is collated. 
    # Merge tokenized corpus'.
    tokenized_corpus_complete = [item for sublist in tokenized_corpus_list for item in sublist]
    # BM25
    self.bm25 = BM25Okapi(tokenized_corpus_complete)

  def __extract_tables(self):
    '''
    Page-wise table extractor.
    '''
    for i in range(self.reader.numPages):
      # page=0 will throw error using tabula.
      page = str(i+1)
      self.page_viewer[i]["tables"] = self.tb.read_pages(pages=page)

  def extract_pdf(self):
    # Extract data
    self.__extract_text()
    # self.__extract_tables()

  def print_page(self, page_num):
    '''
    Print separated sections of text given a page.
    '''
    corpus = self.page_viewer[page_num]["corpus"]
    for item in (corpus):
      print("\n{}\n".format("-"*60))
      print(item)
    print("\n{}\n".format("-"*60))
    for df in self.page_viewer[page_num]["tables"]:
      print(df.style)
      display(df)

  def __score(self, queries, weights):
    '''
    Compute the average BM25 score of each given query on each page of text.
    '''
    self.ranked_scores = []
    for query in queries:
      # tokenize query by whitespace.
      tokenized_query = query.split()
      # Compute score.
      doc_scores = self.bm25.get_scores(tokenized_query)
      self.ranked_scores.append(doc_scores)
    # Compute average (weighted) score against all queries.
    if not len(weights):
      # Equal weighting.
      self.average_score = np.average(self.ranked_scores, axis=0)
    elif len(queries) != len(weights):
        # Unequal number of elements.
        raise ValueError("Number of query and weight elements passed must be equal.")
    else:
      # Weighted average.
      self.average_score = np.average(self.ranked_scores, weights=weights, axis=0)
 
  def get_ranked_texts(self, queries, weights=[], n=5):
    '''
    Return n pages which scored highest using BM25.
    '''
    # Run score method to calculate BM25.
    self.__score(queries, weights)
    try:
      idx = sorted(range(len(self.average_score)), key=lambda i: self.average_score[i], reverse=True)[:n]
    except IndexError:
      idx = sorted(range(len(self.average_score)), key=lambda i: self.average_score[i], reverse=True)
    final_results = []
    for i in range(n):
      page_num, text = self.idx2page_item[idx[i]]
      # tables = self.page_viewer[page_num]["tables"]
      # final_results.append({"page_num":page_num, "text":text, "tables":tables})
      final_results.append(text)

    return final_results
    

## Answer Re-ranker (Neural: BERT / T5)

In [4]:
!pip install pygaggle

!pip install transformers==4.6.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygaggle
  Downloading pygaggle-0.0.3.1.tar.gz (33 kB)
Collecting coloredlogs==14.0
  Downloading coloredlogs-14.0-py2.py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
Collecting pydantic==1.5
  Downloading pydantic-1.5-cp37-cp37m-manylinux2014_x86_64.whl (7.3 MB)
[K     |████████████████████████████████| 7.3 MB 10.3 MB/s 
[?25hCollecting pyserini==0.10.1.0
  Downloading pyserini-0.10.1.0-py3-none-any.whl (63.3 MB)
[K     |████████████████████████████████| 63.3 MB 12 kB/s 
Collecting spacy==2.2.4
  Downloading spacy-2.2.4-cp37-cp37m-manylinux1_x86_64.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 6.2 MB/s 
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 41.3 MB/s 
[?25hCollecting tqdm==4.45.0
  Downloading tqdm-4.45.

In [5]:
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, MonoBERT

class Reranker:

  def __init__(self):
    self.mono5t = MonoT5()
    self.monobert = MonoBERT()

  def rerank(self, query, texts, method="T5"):
    query = Query(query)
    texts = [Text(text, {"docid" : i}, 0) for i, text in enumerate(texts)]

    if method == "T5":
      reranker = self.mono5t
    if method == "BERT":
      reranker = self.monobert

    reranked = reranker.rerank(query, texts)
    reranked.sort(key=lambda x: x.score, reverse=True)

    return reranked

2022-08-25 12:23:34 [INFO] loader: Loading faiss with AVX2 support.
2022-08-25 12:23:34 [INFO] loader: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2022-08-25 12:23:34 [INFO] loader: Loading faiss.
2022-08-25 12:23:34 [INFO] loader: Successfully loaded faiss.


# ICMA Database Upload

In [6]:
!wget https://www.icmagroup.org/assets/documents/Sustainable-finance/Database/ICMA-Sustainable-Bonds-Database-120822.xlsx

--2022-08-25 12:23:36--  https://www.icmagroup.org/assets/documents/Sustainable-finance/Database/ICMA-Sustainable-Bonds-Database-120822.xlsx
Resolving www.icmagroup.org (www.icmagroup.org)... 91.216.93.249
Connecting to www.icmagroup.org (www.icmagroup.org)|91.216.93.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 274575 (268K) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘ICMA-Sustainable-Bonds-Database-120822.xlsx’


2022-08-25 12:23:37 (760 KB/s) - ‘ICMA-Sustainable-Bonds-Database-120822.xlsx’ saved [274575/274575]



In [7]:
import os
import pandas as pd
import openpyxl

In [44]:
filename = "ICMA-Sustainable-Bonds-Database-120822.xlsx"

# select green bond sheet.
gb_sheet = pd.ExcelFile(filename).sheet_names[0] 

df = pd.read_excel(filename, sheet_name=gb_sheet, header=1)

In [45]:
# Use openpyxl to load xls with hyperlink text.
wb = openpyxl.load_workbook(filename)
ws = wb[gb_sheet]

hyperlink_list = []

for i in range(len(df)):
  try:
    hyperlink_list.append(ws.cell(row=(3+i), column=6).hyperlink.target)
  except:
    # Nan 
    hyperlink_list.append(None)

# Add list to df.
df["External Review Report Text"] = hyperlink_list

In [46]:
df["Issuer Category/Sector"].unique()

array(['Financial Institution', 'Corporate-Energy', 'Utility',
       'Corporate-Infrastructure', 'Corporate-Real Estate',
       'Corporate-Transportation', 'MDB', 'Agency', 'Corporate-agri food',
       'Corporate-Consumer services', 'Corporate-Consumer goods',
       'Sovereign', 'Corporate-Industry', 'Municipal', nan,
       'Corporate-Technology', 'Corporate-consumer services',
       'Corporate-Tourism', 'Corporate-Real estate', 'Corporate-Telecom',
       'Corporate-Water', 'Corporate-Healthcare'], dtype=object)

In [47]:
sectors = ["Corporate-Real Estate", "Corporate-Energy", "Corporate-Transportation"]

In [48]:
european = [
    'Spain', "The Netherlands", "Italy", "Sweden", "Norway", "France", "Luxembourg",
    "UK", "Belgium", "Hungary", "Switzerland", "Germany", "Finland", "Iceland", "Poland",
    "Czech Republic", "Denmark", "Ireland", "Greece", "Guernsey", "Austria", "Latvia",
    "Lithuania", "Romania", "Slovenia", "Slovakia",
]

sector = "Corporate-Real Estate"
# sector = "Corporate-Energy"
# sector = "Corporate-Transportation"
external = "sustainalytics".upper() # second-party opinion


df = df.loc[
    (df["Jurisdiction"].isin(european)) &
    (df["Issuer Category/Sector"] == sector) &
    (df["External Review Report"] == external)
] 

files = df["External Review Report Text"].to_list()

In [32]:
sector_name2url = {}

In [49]:
name2url = {link.strip().rsplit('/', 1)[-1] : link.strip() for link in files}
url2name = {link.strip() : link.strip().rsplit('/', 1)[-1]for link in files}
url2name

{'https://www.icmagroup.org/Emails/icma-vcards/Blackstone_External%20Review%20Report.pdf': 'Blackstone_External%20Review%20Report.pdf',
 'http://www.icmagroup.org/Emails/icma-vcards/Castellum_External%20Review%20Report.pdf': 'Castellum_External%20Review%20Report.pdf',
 'https://www.icmagroup.org/Emails/icma-vcards/Cibus_External%20Review%20Report.pdf': 'Cibus_External%20Review%20Report.pdf',
 'https://www.icmagroup.org/Emails/icma-vcards/CTP_External%20Review%20Report.pdf': 'CTP_External%20Review%20Report.pdf',
 'https://www.icmagroup.org/Emails/icma-vcards/Deutsche%20Wohnen_External%20Review%20Report.pdf': 'Deutsche%20Wohnen_External%20Review%20Report.pdf',
 'https://www.icmagroup.org/Emails/icma-vcards/Globalworth_External%20Review%20Report.pdf': 'Globalworth_External%20Review%20Report.pdf',
 'http://www.icmagroup.org/Emails/icma-vcards/ICADE_External%20Review%20Report.pdf': 'ICADE_External%20Review%20Report.pdf',
 'https://www.icmagroup.org/Emails/icma-vcards/Johnson_External%20Revi

In [50]:
sector_name2url[sector] = name2url

In [52]:
for sector in sectors:
  print(sector)
  for company in sector_name2url[sector].keys():
    print("     " +company)
  print("\n\n")

Corporate-Real Estate
     Blackstone_External%20Review%20Report.pdf
     Castellum_External%20Review%20Report.pdf
     Cibus_External%20Review%20Report.pdf
     CTP_External%20Review%20Report.pdf
     Deutsche%20Wohnen_External%20Review%20Report.pdf
     Globalworth_External%20Review%20Report.pdf
     ICADE_External%20Review%20Report.pdf
     Johnson_External%20Review%20Report.pdf
     Specialfastigheter-External%20Review%20Report.pdf
     tritax-eurobox_External%20Review%20Report.pdf
     Vesteda_External%20Review%20Report.pdf
     V%C3%ADa%20C%C3%A9lere_External%20Review%20Report.pdf
     Vonovia_External%20Review%20Report.pdf
     Xior_External%20Review%20Report.pdf



Corporate-Energy
     Baseload%20Capital_External%20Review%20Report.pdf
     E.ON_External%20Review%20Report.pdf
     E.ON_External%20Review%20Report%202021.pdf
     ESB_External%20Review%20Report.pdf
     innogy_External%20Review%20Report.pdf
     Midsummer_External%20Review%20Report.pdf
     Landsvirkjun-External%2

In [14]:
cwd = os.getcwd()

path = os.path.join(cwd, "documents")

os.mkdir(path)

for sec in sectors:
  os.mkdir(os.path.join(path, sec))

# # Create documents folder
# !mkdir documents
# # Create sector Specific directory
# !mkdir documents/Corporate-Energy

In [29]:
import os.path
import urllib.request

for link, name in url2name.items():
    filename = os.path.join('./documents/{}'.format(sector), name)
    if not os.path.isfile(filename):
        print('Downloading: ' + filename)
        try:
            urllib.request.urlretrieve(link, filename)
        except Exception as inst:
            print(inst)
            print('  Encountered unknown error. Continuing.')

Downloading: ./documents/Corporate-Transportation/Alpha%20Trains_External%20Review%20Report.pdf
Downloading: ./documents/Corporate-Transportation/Ferrovie_External%20Review%20Report.pdf
Downloading: ./documents/Corporate-Transportation/Volkswagen_External%20Review%20Report%202020.pdf


# Run Questions

In [54]:
from tqdm.notebook import tqdm_notebook

In [55]:
queries = [
    "what did the use of proceeds finance?",
    "What are the eligible categories for the use of proceeds?",
]

In [53]:
# dataframe to hold results
df_test = pd.DataFrame(columns=["Sector", "Company", "Query", "Method"]+["Passage {}".format(i+1) for i in range(10)])
df_test

Unnamed: 0,Company,Sector,Query,Method,Passage 1,Passage 2,Passage 3,Passage 4,Passage 5,Passage 6,Passage 7,Passage 8,Passage 9,Passage 10


In [30]:
reranker = Reranker()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1841.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691413.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=314.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340665848.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=571.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [31]:
bm25_count = 20
reranked_count = 10

In [57]:
for sector in tqdm_notebook(sectors):
  # Run retrieval and QA models over specificed SPO documents.
  for company in tqdm_notebook(sector_name2url[sector].keys()):
    # Extract PDF
    print(company)
    filename = "documents/{}/{}".format(sector,company)
    reader = Reader(filename)
    reader.extract_pdf()
    # Iterate through query, question pairs 
    for query in queries:
      # Get BM25 rankings
      try:
        texts = reader.get_ranked_texts([query], n=bm25_count)
      # No answers.
      except IndexError:
        print("No Results: {}\n       Query: {}".format(company, query))
        break
      for method in ["T5", "BERT"]:
        # Rerank
        reranked = reranker.rerank([query], texts, method=method)
        reranked = [item for item in reranked if len(item.text.strip())>0]
        reranked = reranked[:reranked_count]
        # Get text from pygaggle obj.
        reranked = [item.text for item in reranked]
        # Pad reranked to 10 entries.
        reranked += ["None"] * (10 - len(reranked))

        df_test.loc[len(df_test)] = [sector, company, query, method] + reranked

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))



Blackstone_External%20Review%20Report.pdf


  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Castellum_External%20Review%20Report.pdf




Cibus_External%20Review%20Report.pdf




CTP_External%20Review%20Report.pdf




Deutsche%20Wohnen_External%20Review%20Report.pdf




Globalworth_External%20Review%20Report.pdf




ICADE_External%20Review%20Report.pdf




Johnson_External%20Review%20Report.pdf




Specialfastigheter-External%20Review%20Report.pdf




tritax-eurobox_External%20Review%20Report.pdf
Vesteda_External%20Review%20Report.pdf




V%C3%ADa%20C%C3%A9lere_External%20Review%20Report.pdf
Vonovia_External%20Review%20Report.pdf




Xior_External%20Review%20Report.pdf







HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

Baseload%20Capital_External%20Review%20Report.pdf




E.ON_External%20Review%20Report.pdf




E.ON_External%20Review%20Report%202021.pdf




ESB_External%20Review%20Report.pdf




innogy_External%20Review%20Report.pdf




Midsummer_External%20Review%20Report.pdf




Landsvirkjun-External%20Review%20Report.pdf




neutral%20fuels_External%20Review%20Report.pdf
Repower_External%20Review%20Report.pdf




RWE_External%20Review%20Report.pdf




Synthesis%20Analytics_External%20Review%20Report.pdf




Water%20Wheel_External%20Review%20Report.pdf







HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Alpha%20Trains_External%20Review%20Report.pdf
Ferrovie_External%20Review%20Report.pdf
Volkswagen_External%20Review%20Report%202020.pdf








In [133]:
df_test_q1 = df_test.loc[
    (df_test["Query"] == queries[0])
] 
df_test_q1.reset_index(drop=True, inplace=True)

In [134]:
from IPython.display import clear_output 

# List to store inputs.
full_list = []

# Cell holds functionality to iterate through dataframe and print all 10 retrieved passages.
# for the purpose of hand labelling correctly relevant passages.
for i in range(len(df_test_q1)):
  for j, column in enumerate(df_test_q1.iloc[i]):
    if j > 3:
      print("-"*40, "{}".format(j-3), "-"*40)
      print(column)
      print("\n\n")
  x = input()
  x = [int(n) for n in x.split()]
  full_list.append(x)

  clear_output()

In [144]:
df_bert = df_test_q1.loc[
    (df_test["Method"] == "BERT")
] 
df_t5 = df_test_q1.loc[
    (df_test["Method"] == "T5")
] 

In [145]:
df_test_q1

Unnamed: 0,Company,Sector,Query,Method,Passage 1,Passage 2,Passage 3,Passage 4,Passage 5,Passage 6,Passage 7,Passage 8,Passage 9,Passage 10,labels
0,Blackstone_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,12 \nThe eligible categories for the use of p...,Conclusion \nBPPEH has developed the Blacks...,Evaluation Summary \nSustainalytics is of the...,5 • Project Evaluation and Selection: \n- BPP...,"The Company, under the supervision of the Gree...",Use of proceeds reporting : \n☐ Project -by-pr...,\nUse of proceeds categories as per GBP: \n☒...,BPPEH engaged Sustainalytics to review the F...,7 Section 3: Impact of Use of Proceeds \nAll ...,3 This docum ent contains Sustainalytics’ opin...,"[1, 2, 3, 7, 9, 10]"
1,Blackstone_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Use of proceeds reporting : \n☐ Project -by-pr...,"15 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",14 ☐ Linkage to individual bond(s) ☐ Other (p...,\nUse of proceeds categories as per GBP: \n☒...,Tracking of proceeds: \n☒ Green Bond proceeds...,Evaluation and selection \n☒ Credentials on t...,16 Disclaimer \nCopyright ©202 1 Sustainalyti...,6 BPPEH has provided to Sustainalytics a list ...,Conclusion \nBPPEH has developed the Blacks...,6 implementation of its group -wide sustainab...,"[4, 8, 9, 10]"
2,Castellum_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,2.3.1 Use of Proceeds \nThe proceeds of the ...,1. USE OF PROCEEDS \nOverall comment on secti...,Reporting \n Yes Castellum will publish a ye...,\n 5 applicable criteria or to repay a Green ...,Although Castellum does not commit to a ratio ...,2.3.2 Process for Project Evaluation and Sele...,2.3 Castellum’s Green Bond \nCastellum intend...,Castellum will publish a yearly report describ...,2.2.3 Management of Proceeds \nGreen Bond n...,\n4. REPORTING \nOverall comment on section ...,"[2, 5, 6, 7]"
3,Castellum_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Tracking of proceeds: \n☒ Green Bond proceeds...,\n 18 ☒ Disclosure of portfolio balance of \n...,"\n 2 EXECUTIVE SUMMARY \nCastellum, an indu...",1. INTRODUCTION \nCastellum intends to issue ...,2.3.1 Use of Proceeds \nThe proceeds of the ...,2.3 Castellum’s Green Bond \nCastellum intend...,\n 8 Management \nof Proceeds \n Yes The ne...,\n4. REPORTING \nOverall comment on section ...,Reporting \n Yes Castellum will publish a ye...,Where Castellum is responsible for project ope...,"[3, 6]"
4,Cibus_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,6 Conclusion \nCibus has developed the Cibu...,Section 3. Detailed review \nReviewers are en...,3 No information provided by Sustainalytics un...,Use of proceeds categories as per GBP: \n☐ Re...,2 Introduction \nCibus Nordic Real Estate AB ...,Evaluation Summary \nSustainalytics is of the...,"13 overarching objectives, strategy, policy an...",4. REPORTING \nOverall comment on section (if...,5 Section 3: Impact of Use of Proceeds \nThe ...,9 Review provider’s name: Sustainalytics \nC...,"[1, 2, 3, 5, 6, 9]"
5,Cibus_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,"Where appropriate, please specify name and dat...",12 ☐ Other (please specify): \nImpact report...,10 ☐ Pollution prevention and control ☐ Envir...,9 Review provider’s name: Sustainalytics \nC...,4. REPORTING \nOverall comment on section (if...,2. PROCESS FOR PROJECT EVALUATION AND SELECTIO...,Use of proceeds categories as per GBP: \n☐ Re...,2 Introduction \nCibus Nordic Real Estate AB ...,11 \n3. MANAGEMENT OF PROCEEDS \nOverall co...,14 Disclaimer \nCopyright ©2020 Sustainalytic...,"[3, 8]"
6,CTP_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,Conclusion \nCTP has developed the CTP Green...,"2 Introduction \nCTP Group (“CTP”, or “the Co...",9 The eligible categories for the use of proc...,Use of proceeds categories as per GBP: \n☒ Re...,Evaluation Summary \nSustainalytics is of the...,4. REPORTING \nOverall comment on section (if...,3 No infor mation provided by Sustainalytics u...,Section 3. Detailed review \nReviewe rs are e...,"12 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",5 habitats.” Concerning land use the Directive...,"[1, 2, 3, 4, 5, 7, 10]"
7,CTP_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,11 Frequency: \n☒ Annual ☐ Semi -annual \n☐...,"12 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",4. REPORTING \nOverall comment on section (if...,Use of proceeds categories as per GBP: \n☒ Re...,10 ☐ Other (please specify): \n3. MANAGEMEN...,13 Disclaimer \nCopyright ©2020 Sustainalytic...,2. PROCESS FOR PROJECT EVALUATION AND SELECTIO...,"2 Introduction \nCTP Group (“CTP”, or “the Co...","18 Ministry of Energy, “ Integrated National E...","4 environmental impact metrics, see Appendix 2...","[1, 4, 8, 10]"
8,Deutsche%20Wohnen_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,Conclusion \nDeutsche Wohnen has developed ...,10 \nThe eligible categories for the use of p...,Evaluation Summary \nSustainalytics is of the...,\nUse of proceeds categories as per GBP: \n☒...,"3 Sustainalytics’ Second -Party Opinion, while...",2 Introduction \nDeutsche Wohnen SE Group (...,Net proceeds will be managed by Deutsche Wohne...,Deutsche Wohnen intends to report on the allo...,Section 3. Detailed review \nReviewers are en...,"4 - Within the Renewable Energy category, Deut...","[1, 2, 3, 4, 5, 6, 10]"
9,Deutsche%20Wohnen_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,12 ☐ Linkage to individual bond(s) ☐ Other (p...,"13 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",Deutsche Wohnen intends to report on the allo...,\nUse of proceeds categories as per GBP: \n☒...,Tracking of procee ds: \n☒ Green bond proceed...,Evaluation and selection \n☒ Credentials on t...,14 Disclaimer \nCopyright ©202 1 Sustainalyti...,2 Introduction \nDeutsche Wohnen SE Group (...,"3 Sustainalytics’ Second -Party Opinion, while...",5 As part of the Company’s focus on its envir...,"[8, 9]"


In [146]:
df_bert

Unnamed: 0,Company,Sector,Query,Method,Passage 1,Passage 2,Passage 3,Passage 4,Passage 5,Passage 6,Passage 7,Passage 8,Passage 9,Passage 10,labels
1,Blackstone_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Use of proceeds reporting : \n☐ Project -by-pr...,"15 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",14 ☐ Linkage to individual bond(s) ☐ Other (p...,\nUse of proceeds categories as per GBP: \n☒...,Tracking of proceeds: \n☒ Green Bond proceeds...,Evaluation and selection \n☒ Credentials on t...,16 Disclaimer \nCopyright ©202 1 Sustainalyti...,6 BPPEH has provided to Sustainalytics a list ...,Conclusion \nBPPEH has developed the Blacks...,6 implementation of its group -wide sustainab...,"[4, 8, 9, 10]"
3,Castellum_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Tracking of proceeds: \n☒ Green Bond proceeds...,\n 18 ☒ Disclosure of portfolio balance of \n...,"\n 2 EXECUTIVE SUMMARY \nCastellum, an indu...",1. INTRODUCTION \nCastellum intends to issue ...,2.3.1 Use of Proceeds \nThe proceeds of the ...,2.3 Castellum’s Green Bond \nCastellum intend...,\n 8 Management \nof Proceeds \n Yes The ne...,\n4. REPORTING \nOverall comment on section ...,Reporting \n Yes Castellum will publish a ye...,Where Castellum is responsible for project ope...,"[3, 6]"
5,Cibus_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,"Where appropriate, please specify name and dat...",12 ☐ Other (please specify): \nImpact report...,10 ☐ Pollution prevention and control ☐ Envir...,9 Review provider’s name: Sustainalytics \nC...,4. REPORTING \nOverall comment on section (if...,2. PROCESS FOR PROJECT EVALUATION AND SELECTIO...,Use of proceeds categories as per GBP: \n☐ Re...,2 Introduction \nCibus Nordic Real Estate AB ...,11 \n3. MANAGEMENT OF PROCEEDS \nOverall co...,14 Disclaimer \nCopyright ©2020 Sustainalytic...,"[3, 8]"
7,CTP_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,11 Frequency: \n☒ Annual ☐ Semi -annual \n☐...,"12 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",4. REPORTING \nOverall comment on section (if...,Use of proceeds categories as per GBP: \n☒ Re...,10 ☐ Other (please specify): \n3. MANAGEMEN...,13 Disclaimer \nCopyright ©2020 Sustainalytic...,2. PROCESS FOR PROJECT EVALUATION AND SELECTIO...,"2 Introduction \nCTP Group (“CTP”, or “the Co...","18 Ministry of Energy, “ Integrated National E...","4 environmental impact metrics, see Appendix 2...","[1, 4, 8, 10]"
9,Deutsche%20Wohnen_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,12 ☐ Linkage to individual bond(s) ☐ Other (p...,"13 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",Deutsche Wohnen intends to report on the allo...,\nUse of proceeds categories as per GBP: \n☒...,Tracking of procee ds: \n☒ Green bond proceed...,Evaluation and selection \n☒ Credentials on t...,14 Disclaimer \nCopyright ©202 1 Sustainalyti...,2 Introduction \nDeutsche Wohnen SE Group (...,"3 Sustainalytics’ Second -Party Opinion, while...",5 As part of the Company’s focus on its envir...,"[8, 9]"
11,Globalworth_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Frequency: \n☒ Annual ☐ Semi -annual \n☐ Ot...,\nUse of proceeds reporting: \n☐ Project -by...,\nUse of proceeds categories as per GBP: \n☐...,\nTracking of proceeds: \n☒ Green Bond proce...,11 Evaluation and selection \n☒ Credentials o...,Section 2 . Review overview \nSCOPE OF REVIEW...,2 Introduction \nGlobalworth Real Estate Inve...,\nConclusion \nGlobalworth has developed th...,Evaluation Summary \nSustainalytics is of the...,"10 GWI, “Globalworth Real Estate Investments L...","[3, 7, 8, 9]"
13,ICADE_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Use of proceeds reporting: \n☐ Project -by-pr...,Tracking of proceeds: \n☒ Green Bond proceed...,15 \nUse of proceeds categories as per GBP: ...,Well positioned to mitigate common Environment...,"7 \nCO 2 emissions. In addit ion, the real e...","\nConclusion \nIcade, one of the leading com...","In France, the real estate sector accounts for...",Sustainalytics considers that Icade ’s Green B...,6 Similarly to a positive energy building whic...,"However, Sustainalytics notes that t he majori...","[3, 4, 6, 7, 9]"
15,Johnson_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,13 Information reported: \n☒ Allocated amount...,4. REPORTING \nOverall comment on section (if...,Use of proceeds categories as per GBP: \n☒ Re...,12 Information on Responsibilities and Accoun...,2 Introduction \nJohnson Controls Internation...,2. PROCESS FOR PROJECT EVALUATION AND SELECTIO...,Conclusion \nJCI has developed the Johnson C...,3 Any update of t he present Second -Party Opi...,6 projects could also lead to negative environ...,"4 Potential ).6 Furthermore, in relation to ...","[3, 5, 7, 8]"
17,Specialfastigheter-External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,Use of proceeds reporting: \n☒ Project -by-pr...,Tracking of proceeds: \n☒ Green Bond proceeds...,Use of proceeds categories as per GBP: \n☒ Re...,14 \n ☐ Other (please specify) : \nAdditional ...,Contribution of the proceeds of the green bond...,"As a state -owned entity, Specialfa stigheter ...",Specialfastigheter has engaged Sustainalytics...,"Through consultation, Sustainalytics understan...",Specialfastigheter has developed a Green Bond ...,Disclaimer \nAll rights reserved. No part of ...,"[3, 9]"
19,tritax-eurobox_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,BERT,13 Use of proceeds reporting : \n☐ Project -by...,12 Information on Responsibilities and Accoun...,"SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, IF A...",Use of proceeds categories as per GBP: \n☒ Re...,\nTracking of proceeds: \n☒ Green Bond proce...,\nEvaluation and selection \n☒ Credentials o...,"2 Introduction \nTritax EuroBox plc (""Tritax...",15 Disclaimer \nCopyright ©202 1 Sustainalyti...,4 been disclosed by the Group to represent a ...,8 Conclusion \nTritax Euro Box has developed...,"[4, 7, 9]"


In [147]:
df_t5

Unnamed: 0,Company,Sector,Query,Method,Passage 1,Passage 2,Passage 3,Passage 4,Passage 5,Passage 6,Passage 7,Passage 8,Passage 9,Passage 10,labels
0,Blackstone_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,12 \nThe eligible categories for the use of p...,Conclusion \nBPPEH has developed the Blacks...,Evaluation Summary \nSustainalytics is of the...,5 • Project Evaluation and Selection: \n- BPP...,"The Company, under the supervision of the Gree...",Use of proceeds reporting : \n☐ Project -by-pr...,\nUse of proceeds categories as per GBP: \n☒...,BPPEH engaged Sustainalytics to review the F...,7 Section 3: Impact of Use of Proceeds \nAll ...,3 This docum ent contains Sustainalytics’ opin...,"[1, 2, 3, 7, 9, 10]"
2,Castellum_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,2.3.1 Use of Proceeds \nThe proceeds of the ...,1. USE OF PROCEEDS \nOverall comment on secti...,Reporting \n Yes Castellum will publish a ye...,\n 5 applicable criteria or to repay a Green ...,Although Castellum does not commit to a ratio ...,2.3.2 Process for Project Evaluation and Sele...,2.3 Castellum’s Green Bond \nCastellum intend...,Castellum will publish a yearly report describ...,2.2.3 Management of Proceeds \nGreen Bond n...,\n4. REPORTING \nOverall comment on section ...,"[2, 5, 6, 7]"
4,Cibus_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,6 Conclusion \nCibus has developed the Cibu...,Section 3. Detailed review \nReviewers are en...,3 No information provided by Sustainalytics un...,Use of proceeds categories as per GBP: \n☐ Re...,2 Introduction \nCibus Nordic Real Estate AB ...,Evaluation Summary \nSustainalytics is of the...,"13 overarching objectives, strategy, policy an...",4. REPORTING \nOverall comment on section (if...,5 Section 3: Impact of Use of Proceeds \nThe ...,9 Review provider’s name: Sustainalytics \nC...,"[1, 2, 3, 5, 6, 9]"
6,CTP_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,Conclusion \nCTP has developed the CTP Green...,"2 Introduction \nCTP Group (“CTP”, or “the Co...",9 The eligible categories for the use of proc...,Use of proceeds categories as per GBP: \n☒ Re...,Evaluation Summary \nSustainalytics is of the...,4. REPORTING \nOverall comment on section (if...,3 No infor mation provided by Sustainalytics u...,Section 3. Detailed review \nReviewe rs are e...,"12 SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, I...",5 habitats.” Concerning land use the Directive...,"[1, 2, 3, 4, 5, 7, 10]"
8,Deutsche%20Wohnen_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,Conclusion \nDeutsche Wohnen has developed ...,10 \nThe eligible categories for the use of p...,Evaluation Summary \nSustainalytics is of the...,\nUse of proceeds categories as per GBP: \n☒...,"3 Sustainalytics’ Second -Party Opinion, while...",2 Introduction \nDeutsche Wohnen SE Group (...,Net proceeds will be managed by Deutsche Wohne...,Deutsche Wohnen intends to report on the allo...,Section 3. Detailed review \nReviewers are en...,"4 - Within the Renewable Energy category, Deut...","[1, 2, 3, 4, 5, 6, 10]"
10,Globalworth_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,\nConclusion \nGlobalworth has developed th...,\nUse of proceeds reporting: \n☐ Project -by...,\nUse of proceeds categories as per GBP: \n☐...,The eligible categor ies for the use of proce...,2 Introduction \nGlobalworth Real Estate Inve...,4 Sustainalytics’ Opinion \nSection 1: Sustai...,10 \nSection 3. Detailed review \nReviewers ...,Evaluation Summary \nSustainalytics is of the...,ABOUT ROLE(S) OF INDEPENDENT REVIEW PROVIDERS ...,Section 2 . Review overview \nSCOPE OF REVIEW...,"[1, 3, 4, 5, 6, 8]"
12,ICADE_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,15 \nUse of proceeds categories as per GBP: ...,Use of proceeds reporting: \n☐ Project -by-pr...,Sustainalytics is of the opinion that projects...,3 \nIcade has engaged Sustainalytics to prov...,"\nConclusion \nIcade, one of the leading com...","However, Sustainalytics notes that t he majori...",• Strong commitment to allocation and imp act ...,16 \n3. MANAGEMENT OF PROCEEDS \nOverall co...,Contribu tion of the proceeds of the green bon...,Tracking of proceeds: \n☒ Green Bond proceed...,"[1, 5]"
14,Johnson_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,Conclusion \nJCI has developed the Johnson C...,11 The eligible categories for the use of proc...,Evaluation Summary \nSustainalytics is of the...,Use of proceeds categories as per GBP: \n☒ Re...,3 Any update of t he present Second -Party Opi...,According to the International Energy Agency (...,2 Introduction \nJohnson Controls Internation...,Section 3. Detailed review \nReviewe rs are e...,4. REPORTING \nOverall comment on section (if...,14 Review provider(s): Date of publication: ...,"[1, 2, 3, 4, 5, 6, 7]"
16,Specialfastigheter-External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,Specialfastigheter has developed a Green Bond ...,"As a state -owned entity, Specialfa stigheter ...",12 \n Section 3. Detailed review \n1. USE OF...,Use of proceeds categories as per GBP: \n☒ Re...,Use of proceeds reporting: \n☒ Project -by-pr...,"Overall, Sustainalytics is of the opinion that...",3. MANAGEMENT OF PROCEEDS \nOverall comment o...,Specialfastigheter has engaged Sustainalytics...,Disclaimer \nAll rights reserved. No part of ...,"Through consultation, Sustainalytics understan...","[1, 2, 3, 4, 6]"
18,tritax-eurobox_External%20Review%20Report.pdf,Corporate-Real Estate,what did the use of proceeds finance?,T5,11 The eligible categories for the use of proc...,Evaluation Summary \nSustainalytics is of the...,8 Conclusion \nTritax Euro Box has developed...,"3 Sustainalytics ' Second -Party Opinion, whil...",13 Use of proceeds reporting : \n☐ Project -by...,Use of proceeds categories as per GBP: \n☒ Re...,4. REPORTING \nOverall comment on section (if...,Section 3. Detailed review \nReviewers are en...,"2 Introduction \nTritax EuroBox plc (""Tritax...","SPECIFY OTHER EXTERNAL REVIEWS AVAILABLE, IF A...","[1, 2, 3, 4, 6, 9]"


In [252]:
def calculate_metrics(df):
  labels = df["labels"]

  # recall@1
  recall_1 = len([label for label in labels if 1 in label]) / len(labels)
  # recall@5
  recall_5 = len([label for label in labels if any(item in label for item in list(range(1,6)))]) / len(labels)

  # mrr@5
  mrr_5 = [1/label[0] if label[0] <= 5 else 0 for label in df["labels"]]
  mrr_5 = sum(mrr_5) / len(mrr_5)

  # mrr@10
  mrr_10 = [1/label[0] if label[0] <= 10 else 0 for label in df["labels"]]
  mrr_10 = sum(mrr_10) / len(mrr_10)

  # map@5
  map_5 = []
  # map@10
  map_10 = []
  # for each label:
  # e.g. [4, 8, 9, 10]
  for label in df["labels"]:
    # @ 5 - [4]
    five = [num for num in label if num <= 5]
    # @ 10 - [4, 8, 9, 10]
    ten = [num for num in label if num <= 10]

    if five:
      # get average precision
      mini_5 = []
      for i, item in enumerate(five):
        # i+1 tracks the index of how many correct answers - e.g. 0+1 (for first answer)
        # item tracks the rank of correct answer - e.g. following example : 4
        mini_5.append( ((i+1)/item) )
      # average precision : (1/4) / 1
      avg_precision = sum(mini_5) / len(mini_5)
      # Store in main list
      map_5.append(avg_precision)
    else:
      map_5.append(0)

    if ten:
      # get average precision
      mini_10 = []
      for i, item in enumerate(ten):
        mini_10.append( ((i+1)/item) )
      # avg precision : (1/4 + 2/8 + 3/9 + 4/10) / 4
      avg_precision = sum(mini_10) / len(mini_10)
      # Store in main list
      map_10.append(avg_precision)
    else:
      map_10.append(0)

  map_5 = sum(map_5) / len(map_5)
  map_10 = sum(map_10) / len(map_10)

  return {
      "recall@1" : recall_1,
      "recall@5" : recall_5,
      "MRR@5" : mrr_5,
      "MRR@10" : mrr_10,
      "MAP@5" : map_5,
      "MAP@10" : map_10
  }

In [253]:
calculate_metrics(df_t5)

{'recall@1': 0.9310344827586207,
 'recall@5': 1.0,
 'MRR@5': 0.9655172413793104,
 'MRR@10': 0.9655172413793104,
 'MAP@5': 0.9286398467432949,
 'MAP@10': 0.878488023561915}

In [254]:
calculate_metrics(df_bert)

{'recall@1': 0.10344827586206896,
 'recall@5': 0.9310344827586207,
 'MRR@5': 0.35344827586206895,
 'MRR@10': 0.36350574712643674,
 'MAP@5': 0.34463601532567045,
 'MAP@10': 0.3622883732036875}