In [1]:
import pandas as pd
import json

import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

from transformers import logging

import os
import chardet
import re

from sacrebleu.metrics import BLEU

import fitz  
from bert_score import score

In [2]:
with open('../Data/bias_terms.json', 'r') as file:
    bias_json_data = json.load(file)
print(bias_json_data)

[{'Bias_Type': 'Adherence bias', 'Description': 'A systematic distortion in outcome data that arises when participants who adhere to a study protocol or intervention differ from those who do not adhere, when that difference relates to the outcome of interest.'}, {'Bias_Type': 'Admission rate bias', 'Description': 'Arises when the variables under study are affected by the selection of hospitalized subjects leading to a bias between the exposure and the\xa0disease under study.'}, {'Bias_Type': 'All’s well literature bias', 'Description': 'Occurs when publications omit or play down controversies or disparate results.'}, {'Bias_Type': 'Allocation bias', 'Description': 'Systematic difference in how participants are assigned to comparison groups in a clinical trial.'}, {'Bias_Type': 'Apprehension bias', 'Description': 'When a study participant responds differently due to being observed'}, {'Bias_Type': 'Ascertainment bias', 'Description': 'Systematic differences in the identification of indi

In [3]:
def pdf_to_txt(pdf_path, txt_path):
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
                    print(f"Error reading {pdf_path}: {e}")
                    
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        for page in doc:
            text = page.get_text()
            txt_file.write(text)
    print("extracted txt file")
    doc.close()

In [4]:
def read_text_with_encoder(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read(10000)
    
    detected = chardet.detect(raw_data)
    encoding = detected['encoding']
    #print("Detected encoding:", encoding)

    # Now open the file with the detected encoding
    with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
        text =  file.read()
        text = text.replace('\ufffd', '')
    return text

In [5]:
base_dir = '../Data/Policy_docs'
#folders = ['CHL', 'DEU', 'DNK', 'EGY', 'FRA', 'ITA', 'MUS', 'UAE', 'USA']
folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

contents_dict = {}


for folder in folders:
    folder_path = os.path.join(base_dir, folder).replace("\\", "/")
    #print(folder_path)
    
    try:
        new_folders = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
        continue
    
    country_docs = []
    # loop over multiple documents per country
    for new_folder in new_folders:
        # TEMP: stop if more than 5 docs are present per country
        
        new_folder_path = os.path.join(folder_path, new_folder).replace("\\", "/")
        #print(new_folder_path)
        files = os.listdir(new_folder_path)
        # Filter the list to only include .txt files
        txt_files = [f for f in files if f.lower().endswith('.txt')]
        pdf_file = [f for f in files if f.lower().endswith('.pdf')][0]
        pdf_file_path = os.path.join(new_folder_path, pdf_file).replace("\\", "/")
        
        
        txt_file = pdf_file.rsplit('.', 1)[0] + '.txt'
        txt_file_path = os.path.join(new_folder_path, txt_file)
        if not txt_files:
            # Transform pdf into txt file
            pdf_to_txt(pdf_file_path, txt_file_path)
            # Reload txt files after creating it
            txt_files = [f for f in files if f.lower().endswith('.txt')]
        

        for txt_file in txt_files:
            # Make sure to not open link txt file
            if txt_file != "link.txt":
                file_path = os.path.join(new_folder_path, txt_file).replace("\\", "/")
                try:
                    file_text = read_text_with_encoder(file_path)
                    country_docs.append(file_text)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
        
        
    contents_dict[folder] = country_docs

In [6]:
# Pattern matches a string that starts with one or more digits.
pattern = re.compile(r'^\d\s*')


splitted_documents = {}
for country in contents_dict.keys():
    country_docs = []
    for document in contents_dict[country]:
        document  = document.replace('\n', '')

        # Remove website links (matches http://, https://, and www. links)
        document = re.sub(r'\b(?:https?://|www\.)\S+\b', '', document)

        splitted_document = document.split('.')
        splitted_document = [s.strip() for s in splitted_document if s.strip()]
        splitted_document = [s for s in splitted_document if '@' not in s]

        splitted_document = [s for s in splitted_document if not pattern.match(s)]

        # Delete each chunk that consists of less than 10 characters
        splitted_document = [s for s in splitted_document if len(s) >= 10]

        splitted_document = [s for s in splitted_document if 'et al' not in s]
        
        country_docs.append(splitted_document)
    splitted_documents[country] = country_docs
    


In [7]:
print(len(splitted_documents))

28


In [30]:
#(print(doc(chunk)) for chunk in doc in splitted_documents["USA"])

for country in splitted_documents.keys():
    for doc in splitted_documents[country]:
        print(country)
        print(len(doc))


ARG
1875
ARG
6471
BRA
642
CAN
242
CHE
179
CHE
2149
CHL
877
COL
414
DEU
427
DNK
790
EGY
852
ESP
772
EU
388
EU
1667
FRA
383
GBR
353
GRC
33
IRL
77
IRL
432
IRL
8
ISR
828
ITA
195
JPN
1246
JPN
681
MUS
791
NLD
578
NLD
423
POL
1733
PRT
308
SRB
766
SWE
258
THA
2452
THA
238
UAE
405
URY
1480
USA
514
USA
24
USA
17
USA
576
USA
749
USA
788
USA
581
USA
76


In [None]:

logging.set_verbosity_error()

docs = [
    Document(page_content=item["Description"], metadata={"bias": item["Bias_Type"]})
    for item in bias_json_data
]

docs_text = [doc.page_content for doc in docs]

# model_type = "sentence-transformers/LaBSE"
model_type = "paraphrase-multilingual-MiniLM-L12-v2"

embeddings = HuggingFaceEmbeddings(model_name=model_type)

raw_embeddings = embeddings.embed_documents(docs_text)

# Apply L2 normalization
normalized_embeddings = []
for emb in raw_embeddings:
    
    norm = np.linalg.norm(emb)
    # Avoid division by zero
    if norm > 0:
        normalized_embeddings.append(emb / norm)
    else:
        normalized_embeddings.append(emb)
        
# Convert to a numpy array to and ensure float32 type for FAISS
normalized_embeddings = np.array(normalized_embeddings).astype("float32")

embedding_dim = normalized_embeddings.shape[1]

# Create a FAISS index that uses inner product (which, for normalized vectors, equals cosine similarity)
index = faiss.IndexFlatIP(embedding_dim)
index.add(normalized_embeddings)

docstore = {i: doc for i, doc in enumerate(docs)}

bleu_metric = BLEU(effective_order=True)

results_list = []
for country in splitted_documents.keys():
    doc_number = 1
    for doc in splitted_documents[country]:
    # Perform similarity search on policy sentences
        for chunk in doc:
            # Compute the raw query embedding
            query_emb = embeddings.embed_query(chunk)
            # Normalize the query embedding
            norm = np.linalg.norm(query_emb)
            if norm > 0:
                query_emb = query_emb / norm
            else:
                query_emb = query_emb
            # Convert to numpy array with shape (1, embedding_dim)
            query_emb_np = np.array([query_emb]).astype("float32")
            
            # Perform search in the FAISS index
            distances, indices = index.search(query_emb_np, k=1)
            best_idx = indices[0][0]
            best_score = distances[0][0] 
            
            best_doc = docstore[best_idx]            
            
            candidate = chunk  # this should be a string
            reference = best_doc.page_content  # also a string

            # sacreBLEU expects a list of references, so wrap the reference in a list
            sentence_bleu_score = bleu_metric.sentence_score(candidate, [reference]).score
            
            """
            model_type = "distilbert-base-multilingual-cased"
            _, _, bert_f1 = score([candidate], [reference], model_type = model_type, verbose=False)
            
            bert_score_value = bert_f1[0].item() if hasattr(bert_f1[0], "item") else bert_f1[0]
            """
            
            results_list.append({
                "Country": country,
                "Document_nr": doc_number,
                "Text Chunk": chunk,
                "Best Matching Bias": best_doc.metadata["bias"],
                "Bias Description": best_doc.page_content,
                "Similarity Score": best_score,
                "BLEU Score": sentence_bleu_score
            })
        print(doc_number)
        doc_number += 1
        

df_chunk_bias = pd.DataFrame(results_list)


ARTIFICIAL INTELLIGENCE FOR SOCIAL GOOD IN LATIN AMERICA AND  THE CARIBBEAN:    The Regional Landscape and  12 Country Snapshots A fAIr LAC initiative reporthttps:/ /  2020 Banco Interamericano de Desarrollo
Esta obra se encuentra sujeta a  una licencia Creative Commons IGO 3
org/licenses/by-nc-nd/3
No se permiten obras derivadas
Cualquier disputa relacionada con el uso de las obras del BID que no pueda resolverse amistosamente se someter a arbitraje de conformidad con las reglas de la CNUDMI (UNCITRAL)
El uso del nombre del BID para cualquier fin distinto al reconocimiento respectivo y el uso del logotipo del BID, no estn autorizados por esta licencia CC-IGO y requieren de un acuerdo de licencia adicional
Note que el enlace URL incluye trminos y condiciones adicionales de esta licencia
Las opiniones expresadas en esta publicacin son de los autores y no necesariamente reflejan el punto de vista del Banco Interamericano de Desarrollo, de su Directorio Ejecutivo ni de los pases que repre

KeyboardInterrupt: 

In [28]:
df_chunk_bias

Unnamed: 0,Country,Document_nr,Text Chunk,Best Matching Bias,Bias Description,Similarity Score,BLEU Score
0,ARG,1,ARTIFICIAL INTELLIGENCE FOR SOCIAL GOOD IN LAT...,Confirmation bias,The search for and use of information to suppo...,0.198970,1.379446
1,ARG,1,Esta obra se encuentra sujeta a una licencia ...,Availability bias,A distortion that arises from the use of infor...,0.232810,0.000000
2,ARG,1,org/licenses/by-nc-nd/3,One-sided reference bias,When authors restrict their references to only...,0.232993,0.000000
3,ARG,1,No se permiten obras derivadas,Language bias,Publication of research findings in a particul...,0.132271,0.000000
4,ARG,1,Cualquier disputa relacionada con el uso de la...,Hypothetical bias,A distortion that arises when an individual’s ...,0.219305,0.000000
...,...,...,...,...,...,...,...
32905,USA,5,Given the role of the AI R&D workforce in ad...,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.210792,2.513185
32906,USA,5,NITRD should study how best to characterize a...,Confirmation bias,The search for and use of information to suppo...,0.210815,1.274404
32907,USA,5,"As indicated by the outcome of the studies, ap...",Industry Sponsorship bias,A tendency for the methods and results of a st...,0.275070,3.156862
32908,USA,5,NATIONAL ARTIFICIAL INTELLIGENCE RESEARCH AND ...,Confirmation bias,The search for and use of information to suppo...,0.143324,0.125430


In [29]:
df_chunk_bias[(df_chunk_bias['Country'] == 'USA') & (df_chunk_bias['Document_nr'] == 1)].head(50)

Unnamed: 0,Country,Document_nr,Text Chunk,Best Matching Bias,Bias Description,Similarity Score,BLEU Score
31030,USA,1,Federal Data Strategy2020 Action Plan1,Confirmation bias,The search for and use of information to suppo...,0.20423,0.0
31031,USA,1,Identify Da ta Needs to Answer Priority Ag en...,Availability bias,A distortion that arises from the use of infor...,0.222693,0.0
31032,USA,1,Cons titute a Diverse Da ta Governan ce Body3,Language bias,Publication of research findings in a particul...,0.238861,4.767707
31033,USA,1,As sess Da ta and Rel ated Infr astructure Ma ...,Language bias,Publication of research findings in a particul...,0.139427,0.0
31034,USA,1,Identify Opportunities to Incre ase St aff Da...,Confirmation bias,The search for and use of information to suppo...,0.271194,2.083729
31035,USA,1,Identify Priority Da tasets for Ag ency Open D...,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.195771,1.81102
31036,USA,1,Publish and Update Data Invent ories7,Publication bias,When the likelihood of a study being published...,0.245957,0.0
31037,USA,1,Launch a F eder al Chief Da ta Offic er Council8,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.137328,1.81102
31038,USA,1,Improve Da ta and Model Resour ces for AI Rese...,Observer bias,The process of observing and recording informa...,0.136986,2.648568
31039,USA,1,Improve Financial Manag ement Da ta Sta ndard s10,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.286933,0.0


In [30]:
# Compute the 25th percentile similarity score for each group and assign it to a new series
thresholds = df_chunk_bias.groupby(['Country', 'Document_nr'])['Similarity Score'].transform(lambda x: x.quantile(0.25))

# Filter the rows that have a similarity score above the threshold
df_chunk_bias = df_chunk_bias[df_chunk_bias['Similarity Score'] > thresholds]

df_chunk_bias

Unnamed: 0,Country,Document_nr,Text Chunk,Best Matching Bias,Bias Description,Similarity Score,BLEU Score
0,ARG,1,ARTIFICIAL INTELLIGENCE FOR SOCIAL GOOD IN LAT...,Confirmation bias,The search for and use of information to suppo...,0.198970,1.379446
1,ARG,1,Esta obra se encuentra sujeta a una licencia ...,Availability bias,A distortion that arises from the use of infor...,0.232810,0.000000
2,ARG,1,org/licenses/by-nc-nd/3,One-sided reference bias,When authors restrict their references to only...,0.232993,0.000000
4,ARG,1,Cualquier disputa relacionada con el uso de la...,Hypothetical bias,A distortion that arises when an individual’s ...,0.219305,0.000000
5,ARG,1,El uso del nombre del BID para cualquier fin d...,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.188946,0.000000
...,...,...,...,...,...,...,...
32901,USA,5,"To help implement this Strategic Plan, NITRD s...",Industry Sponsorship bias,A tendency for the methods and results of a st...,0.228386,0.000000
32904,USA,5,While some reports have indicated a potential ...,One-sided reference bias,When authors restrict their references to only...,0.178106,0.744529
32905,USA,5,Given the role of the AI R&D workforce in ad...,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.210792,2.513185
32906,USA,5,NITRD should study how best to characterize a...,Confirmation bias,The search for and use of information to suppo...,0.210815,1.274404


In [31]:
def compute_prevalent_bias(group):
    # Compute statistics for each bias type in the group
    bias_stats = group.groupby("Best Matching Bias").agg(
        frequency=('Best Matching Bias', 'count'),
        avg_similarity=('Similarity Score', 'mean'),
        avg_bleu=('BLEU Score', 'mean'), 
        avg_length=('Text Chunk', lambda s: s.str.len().mean())
    )
    
    # Prevalence Score = frequency * (avg_similarity ** 2) * (avg_length ** 0.5)
    #bias_stats['prevalence'] = bias_stats['frequency'] * (bias_stats['avg_similarity'] ** 2) * (bias_stats['avg_length'] ** 0.5)
    bias_stats['prevalence'] = bias_stats['frequency'] * (bias_stats['avg_similarity'] ** 2)

    # Get the bias type with the maximum prevalence score
    best_bias = bias_stats['prevalence'].idxmax()
    bias_stats.head(20)

    # Return a series with the results
    return pd.Series({
        "Most Prevalent Bias": best_bias,
        "Prevalence Score": bias_stats.loc[best_bias, "prevalence"],
        "Bias Frequency": bias_stats.loc[best_bias, "frequency"],
        "Mean Similarity Score": bias_stats.loc[best_bias, "avg_similarity"],
        "Mean BLEU Score": bias_stats.loc[best_bias, "avg_bleu"]
    })


df_document_bias = df_chunk_bias.groupby(["Country", "Document_nr"]).apply(compute_prevalent_bias).reset_index()

df_document_bias


  df_document_bias = df_chunk_bias.groupby(["Country", "Document_nr"]).apply(compute_prevalent_bias).reset_index()


Unnamed: 0,Country,Document_nr,Most Prevalent Bias,Prevalence Score,Bias Frequency,Mean Similarity Score,Mean BLEU Score
0,ARG,1,Industry Sponsorship bias,34.525407,446,0.278229,2.02153
1,ARG,2,Informed presence bias,42.699146,437,0.312586,1.585495
2,BRA,1,Industry Sponsorship bias,9.926751,118,0.290043,0.646302
3,CAN,1,Confirmation bias,3.824637,47,0.285263,1.890667
4,CHE,1,Language bias,1.18464,30,0.198716,0.441193
5,CHE,2,Availability bias,7.552572,376,0.141727,0.764882
6,CHL,1,Language bias,19.676771,282,0.264151,0.483642
7,COL,1,Language bias,4.822405,57,0.290867,0.491033
8,DEU,1,Industry Sponsorship bias,8.811886,100,0.296848,2.243034
9,DNK,1,Confirmation bias,14.5907,168,0.294702,2.607769


In [21]:
json_data = df_document_bias.to_json(orient='records')

parsed_json = json.loads(json_data)
pretty_json = json.dumps(parsed_json, indent=4)
print(pretty_json)

with open("RAG_data/document_metrics.json", "w") as f:
    json.dump(parsed_json, f, indent=4)

[
    {
        "Country": "CHL",
        "Document_nr": 1,
        "Most Prevalent Bias": "Language bias",
        "Prevalence Score": 19.6767664701,
        "Mean Similarity Score": 0.2641510367,
        "Bias Frequency": 282,
        "Mean BLEU Score": 0.4836420824
    },
    {
        "Country": "DEU",
        "Document_nr": 1,
        "Most Prevalent Bias": "Industry Sponsorship bias",
        "Prevalence Score": 8.8118806481,
        "Mean Similarity Score": 0.2968481183,
        "Bias Frequency": 100,
        "Mean BLEU Score": 2.24303365
    },
    {
        "Country": "DNK",
        "Document_nr": 1,
        "Most Prevalent Bias": "Confirmation bias",
        "Prevalence Score": 14.5907027721,
        "Mean Similarity Score": 0.2947022617,
        "Bias Frequency": 168,
        "Mean BLEU Score": 2.6077690865
    },
    {
        "Country": "EGY",
        "Document_nr": 1,
        "Most Prevalent Bias": "Industry Sponsorship bias",
        "Prevalence Score": 16.4371029362,
  

In [22]:
export_structure = []

for country, docs in splitted_documents.items():
    for idx, doc in enumerate(docs, start=1):
        for sentence in doc:
            export_structure.append({
                "Country": country,
                "Document_nr": idx,
                "Text": sentence
            })

with open("RAG_data/documents_text.json", "w") as f:
    json.dump(export_structure, f, indent=4)

json_output = json.dumps(export_structure, indent=4)
print(json_output)

[
    {
        "Country": "CHL",
        "Document_nr": 1,
        "Text": "POLTICA NACIONAL DE INTELIGENCIA ARTIFICIALPOLTICA NACIONAL DE INTELIGENCIA ARTIFICIALCOMIT DE EXPERTOSCOORDINACIN REDACCIN Y EDICIN COMUNICACIONES  Y DISEOCOMIT INTERMINISTERIAL Marcelo ArenasPontificia Universidad Catlica de Chile e IM Fundamentos de los Datos Nstor BecerraUniversidad de Chile Raphael BergoeingUniversidad de Chile y Comisin Nacional de Productividad Alberto CerdaUniversidad de Chile Aisn EtcheverryAgencia Nacional de Investigacin y DesarrolloJos A"
    },
    {
        "Country": "CHL",
        "Document_nr": 1,
        "Text": "Guridi Carlos vilaDemin ArancibiaCarlos vilaMara Jos CaroJacinta GirardiNatalia Gonzlez R"
    },
    {
        "Country": "CHL",
        "Document_nr": 1,
        "Text": "GuridiAndrea Rivera Mnica MartinFrancisca MirandaPatricio AlarcnFernanda SchorrPilar Grant Ministerio de Ciencia, Tecnologa, Conocimiento e Innovacin Ministerio de Interior y Seguridad Pblica Mini