In [1]:
import pandas as pd
import json

import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

import os
import chardet
import re

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [2]:
with open('../Data/bias_terms.json', 'r') as file:
    bias_json_data = json.load(file)
print(bias_json_data)

[{'Bias_Type': 'Adherence bias', 'Description': 'A systematic distortion in outcome data that arises when participants who adhere to a study protocol or intervention differ from those who do not adhere, when that difference relates to the outcome of interest.'}, {'Bias_Type': 'Admission rate bias', 'Description': 'Arises when the variables under study are affected by the selection of hospitalized subjects leading to a bias between the exposure and the\xa0disease under study.'}, {'Bias_Type': 'All’s well literature bias', 'Description': 'Occurs when publications omit or play down controversies or disparate results.'}, {'Bias_Type': 'Allocation bias', 'Description': 'Systematic difference in how participants are assigned to comparison groups in a clinical trial.'}, {'Bias_Type': 'Apprehension bias', 'Description': 'When a study participant responds differently due to being observed'}, {'Bias_Type': 'Ascertainment bias', 'Description': 'Systematic differences in the identification of indi

In [3]:
def read_text_with_encoder(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read(10000)
    
    detected = chardet.detect(raw_data)
    encoding = detected['encoding']
    print("Detected encoding:", encoding)

    # Now open the file with the detected encoding
    with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
        text =  file.read()
        text = text.replace('\ufffd', '')
    return text

In [4]:
base_dir = '../Data/Policy_docs'
folders = ['CHL', 'DEU', 'DNK', 'EGY', 'FRA', 'ITA', 'MUS', 'UAE', 'USA']

contents_dict = {}


for folder in folders:
    folder_path = os.path.join(base_dir, folder).replace("\\", "/")
    print(folder_path)
    
    try:
        new_folders = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
        continue
    
    country_docs = []
    # loop over multiple documents per country
    for new_folder in new_folders:
        # TEMP: stop if more than 5 docs are present per country
        if len(country_docs) >= 5:
            break
        new_folder_path = os.path.join(folder_path, new_folder).replace("\\", "/")
        #print(new_folder_path)
        files = os.listdir(new_folder_path)
        # Filter the list to only include .txt files
        txt_files = [f for f in files if f.lower().endswith('.txt')]
        
        if not txt_files:
            print(f"No .txt file found in folder: {new_folder_path}")
            continue

        for txt_file in txt_files:
            # Make sure to not open link txt file
            if txt_file != "link.txt":
                file_path = os.path.join(new_folder_path, txt_file).replace("\\", "/")
                try:
                    file_text = read_text_with_encoder(file_path)
                    country_docs.append(file_text)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
        
        
    contents_dict[folder] = country_docs

../Data/Policy_docs/CHL
Detected encoding: utf-8
../Data/Policy_docs/DEU
Detected encoding: utf-8
../Data/Policy_docs/DNK
Detected encoding: utf-8
../Data/Policy_docs/EGY
Detected encoding: utf-8
../Data/Policy_docs/FRA
Error reading ../Data/Policy_docs/FRA/Good Practice Recommendations to Integrate Ethics in the Development of AI Solutions in Healthcare/Good Practice Recommendations to Integrate Ethics in the Development of AI Solutions in Healthcare.txt: [Errno 2] No such file or directory: '../Data/Policy_docs/FRA/Good Practice Recommendations to Integrate Ethics in the Development of AI Solutions in Healthcare/Good Practice Recommendations to Integrate Ethics in the Development of AI Solutions in Healthcare.txt'
../Data/Policy_docs/ITA
Detected encoding: utf-8
../Data/Policy_docs/MUS
Detected encoding: ascii
../Data/Policy_docs/UAE
Detected encoding: utf-8
../Data/Policy_docs/USA
Detected encoding: utf-8
No .txt file found in folder: ../Data/Policy_docs/USA/Addition of Software Spe

In [5]:
len(contents_dict['UAE'])

1

In [6]:
# Pattern matches a string that starts with one or more digits followed by at least one space and then any characters.
pattern = re.compile(r'^\d+\s+.*$')


splitted_documents = {}
for country in contents_dict.keys():
    country_docs = []
    for document in contents_dict[country]:
        document  = document.replace('\n', '')

        # Remove website links (matches http://, https://, and www. links)
        document = re.sub(r'\b(?:https?://|www\.)\S+\b', '', document)

        splitted_document = document.split('.')
        splitted_document = [s.strip() for s in splitted_document if s.strip()]
        splitted_document = [s for s in splitted_document if '@' not in s]

        splitted_document = [s for s in splitted_document if not pattern.match(s)]

        # Delete each chunk that consists of less than 10 characters
        splitted_document = [s for s in splitted_document if len(s) >= 10]

        splitted_document = [s for s in splitted_document if 'et al' not in s]
        
        country_docs.append(splitted_document)
    splitted_documents[country] = country_docs
    


In [7]:
splitted_documents['UAE'][0]

['UAE NATIONAL STRATEGY FOR ARTIFICIAL INTELLIGENCE 2031 | 3WE WILL TRANSFORM THE UAE INTO A WORLD LEADER IN AI BY INVESTING IN PEOPLE AND INDUSTRIES THAT ARE KEY TO OUR SUCCESS',
 'Page 6  MINISTERIAL FORWARDPage 8  EXECUTIVE SUMMARYPage 10  WHERE THE UAE HAS OPPORTUNITIES TO LEAD  P age 10  Industry Assets & Emerging Sectors  P age 13  Smart Government  P age 13  Data Sharing and Governance  P age 14  New Generation of Regional TalentPage 18  EIGHT STRATEGIC OBJECTIVESPage 20  OBJECTIVE 1:  BUILD A R EPUTATION AS AN AI DESTINATION  Page 20  UAI BrandPage 22  OBJECTIVE 2:  INCR EASE THE UAE COMPETITIVE ASSETS IN PRIORITY     SECT ORS THROUGH DEPLOYMENT OF AI  Page 22  Existing Assets  Page 23  Emerging Sectors  Page 25  Proof-of-Concept Support in Priority SectorsPage 26  OBJECTIVE 3: DEVELOP A FERTILE ECOSYSTEM FOR AI  Page 26  AI Network  Page 26  Applied AI Accelerator  Page 28  AI Incentive Scheme for Overseas Companies  Page 29  Business Support for UAE AI FirmsPage 30  OBJECTIVE

In [14]:
import nltk
nltk.data.path = ['C:\\Users\\busjo\\AppData\\Roaming\\nltk_data']
nltk.download('punkt', download_dir='C:\\Users\\busjo\\AppData\\Roaming\\nltk_data')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\busjo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
import os
print(os.listdir('C:\\Users\\busjo\\AppData\\Roaming\\nltk_data\\tokenizers\\punkt'))


['.DS_Store', 'czech.pickle', 'danish.pickle', 'dutch.pickle', 'english.pickle', 'estonian.pickle', 'finnish.pickle', 'french.pickle', 'german.pickle', 'greek.pickle', 'italian.pickle', 'malayalam.pickle', 'norwegian.pickle', 'polish.pickle', 'portuguese.pickle', 'PY3', 'README', 'russian.pickle', 'slovene.pickle', 'spanish.pickle', 'swedish.pickle', 'turkish.pickle']


In [11]:
nltk.download('punkt')

docs = [
    Document(page_content=item["Description"], metadata={"bias": item["Bias_Type"]})
    for item in bias_json_data
]

docs_text = [doc.page_content for doc in docs]

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

raw_embeddings = embeddings.embed_documents(docs_text)

# Apply L2 normalization
normalized_embeddings = []
for emb in raw_embeddings:
    
    norm = np.linalg.norm(emb)
    # Avoid division by zero
    if norm > 0:
        normalized_embeddings.append(emb / norm)
    else:
        normalized_embeddings.append(emb)
        
# Convert to a numpy array to and ensure float32 type for FAISS
normalized_embeddings = np.array(normalized_embeddings).astype("float32")

embedding_dim = normalized_embeddings.shape[1]

# Create a FAISS index that uses inner product (which, for normalized vectors, equals cosine similarity)
index = faiss.IndexFlatIP(embedding_dim)
index.add(normalized_embeddings)

docstore = {i: doc for i, doc in enumerate(docs)}

results_list = []
for country in splitted_documents.keys():
    doc_number = 1
    for doc in splitted_documents[country]:
    # Perform similarity search on policy sentences
        for chunk in doc:
            # Compute the raw query embedding
            query_emb = embeddings.embed_query(chunk)
            # Normalize the query embedding
            norm = np.linalg.norm(query_emb)
            if norm > 0:
                query_emb = query_emb / norm
            else:
                query_emb = query_emb
            # Convert to numpy array with shape (1, embedding_dim)
            query_emb_np = np.array([query_emb]).astype("float32")
            
            # Perform search in the FAISS index
            distances, indices = index.search(query_emb_np, k=1)
            best_idx = indices[0][0]
            best_score = distances[0][0] 
            
            best_doc = docstore[best_idx]

            # Compute the BLEU score comparing the chunk (candidate) to the bias description (reference)
            smoothing = SmoothingFunction().method1
            print(type(best_doc.page_content))
            print(best_doc.page_content)
            reference_tokens = nltk.word_tokenize(best_doc.page_content)
            candidate_tokens = nltk.word_tokenize(chunk)
            bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

            results_list.append({
                "Country": country,
                "Document nr.": doc_number,
                "Text Chunk": chunk,
                "Best Matching Bias": best_doc.metadata["bias"],
                "Bias Description": best_doc.page_content,
                "Similarity Score": best_score,
                "BLEU Score": bleu_score
            })
        doc_number += 1

df_chunk_bias = pd.DataFrame(results_list)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\busjo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<class 'str'>
The use of an insufficiently accurate method to detect the outcome of interest, such that clinically important differences are not detected.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\busjo/nltk_data'
    - 'c:\\Users\\busjo\\Documents\\JADS\\Thesis\\AI_Policy_Thesis\\venv\\nltk_data'
    - 'c:\\Users\\busjo\\Documents\\JADS\\Thesis\\AI_Policy_Thesis\\venv\\share\\nltk_data'
    - 'c:\\Users\\busjo\\Documents\\JADS\\Thesis\\AI_Policy_Thesis\\venv\\lib\\nltk_data'
    - 'C:\\Users\\busjo\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [42]:
df_chunk_bias[(df_chunk_bias['Country'] == 'USA') & (df_chunk_bias['Document nr.'] == 1)]

Unnamed: 0,Country,Document nr.,Text Chunk,Best Matching Bias,Bias Description,Similarity Score
4440,USA,1,Federal Data Strategy2020 Action Plan1,Confirmation bias,The search for and use of information to suppo...,0.204230
4441,USA,1,Identify Da ta Needs to Answer Priority Ag en...,Availability bias,A distortion that arises from the use of infor...,0.222693
4442,USA,1,Cons titute a Diverse Da ta Governan ce Body3,Language bias,Publication of research findings in a particul...,0.238861
4443,USA,1,As sess Da ta and Rel ated Infr astructure Ma ...,Language bias,Publication of research findings in a particul...,0.139427
4444,USA,1,Identify Opportunities to Incre ase St aff Da...,Confirmation bias,The search for and use of information to suppo...,0.271195
...,...,...,...,...,...,...
4970,USA,1,Future annual Action Plans will build on the 2...,Compliance bias,Participants compliant with an intervention di...,0.226393
4971,USA,1,Feedback from stakeholders has and will contin...,Information bias,Bias that arises from systematic differences i...,0.339518
4972,USA,1,Future annual Action Plans will build on and e...,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.283798
4973,USA,1,gov or visit strategy,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.358807


In [56]:
# Compute the 25th percentile similarity score for each group and assign it to a new series
thresholds = df_chunk_bias.groupby(['Country', 'Document nr.'])['Similarity Score'].transform(lambda x: x.quantile(0.25))

# Filter the rows that have a similarity score above the threshold
df_chunk_bias = df_chunk_bias[df_chunk_bias['Similarity Score'] > thresholds]

df_chunk_bias

Unnamed: 0,Country,Document nr.,Text Chunk,Best Matching Bias,Bias Description,Similarity Score
0,CHL,1,POLTICA NACIONAL DE INTELIGENCIA ARTIFICIALPOL...,Insensitive measure bias,The use of an insufficiently accurate method t...,0.199898
3,CHL,1,"Ingeniera Industrial, FCFM, Universidad de Chi...",Language bias,Publication of research findings in a particul...,0.275003
6,CHL,1,"tica, Aspectos Legales y Regulatorios, 49 e Im...",Racial bias,"A distortion arising from systemic, institutio...",0.283611
10,CHL,1,Ciberseguridad y Ciberdefensa 613,Language bias,Publication of research findings in a particul...,0.262125
11,CHL,1,Gnero 64Glosario 68Referencias 72RESUMENEJECUT...,Language bias,Publication of research findings in a particul...,0.282046
...,...,...,...,...,...,...
5676,USA,5,Assess Dat a and Related Infr astructure Matur...,Incorporation bias,When the results of an index test form part of...,0.290301
5678,USA,5,Identify Priority Dat a Asse ts for Ag ency Op...,Chronological bias,When study participants allocated earlier to a...,0.277415
5681,USA,5,Identify Opportunities to Incr ease Staff Dat ...,Volunteer bias,Participants volunteering to take part in a st...,0.263223
5687,USA,5,Develop a Dat a Ethics Fr amework15,Industry Sponsorship bias,A tendency for the methods and results of a st...,0.265969


In [60]:
def compute_prevalent_bias(group):
    # Compute statistics for each bias type in the group
    bias_stats = group.groupby("Best Matching Bias").agg(
        frequency=('Best Matching Bias', 'count'),
        avg_similarity=('Similarity Score', 'mean'),
        avg_length=('Text Chunk', lambda s: s.str.len().mean())
    )
    
    # Prevalence Score = frequency * (avg_similarity ** 2) * (avg_length ** 0.5)
    #bias_stats['prevalence'] = bias_stats['frequency'] * (bias_stats['avg_similarity'] ** 2) * (bias_stats['avg_length'] ** 0.5)
    bias_stats['prevalence'] = bias_stats['frequency'] * (bias_stats['avg_similarity'] ** 2)

    # Get the bias type with the maximum prevalence score
    best_bias = bias_stats['prevalence'].idxmax()
    bias_stats.head(20)

    # Return a series with the results
    return pd.Series({
        "Most Prevalent Bias": best_bias,
        "Prevalence Score": bias_stats.loc[best_bias, "prevalence"],
        "Mean Similarity Score": bias_stats.loc[best_bias, "avg_similarity"],
        "Bias Frequency": bias_stats.loc[best_bias, "frequency"]
    })

# Group by Country and Document nr. and apply the function
df_document_bias = df_chunk_bias.groupby(["Country", "Document nr."]).apply(compute_prevalent_bias).reset_index()

df_document_bias


  df_document_bias = df_chunk_bias.groupby(["Country", "Document nr."]).apply(compute_prevalent_bias).reset_index()


Unnamed: 0,Country,Document nr.,Most Prevalent Bias,Prevalence Score,Mean Similarity Score,Bias Frequency
0,CHL,1,Language bias,20.239629,0.264638,289
1,DEU,1,Industry Sponsorship bias,8.87711,0.296466,101
2,DNK,1,Confirmation bias,15.052646,0.294125,174
3,EGY,1,Industry Sponsorship bias,16.835706,0.284501,208
4,ITA,1,Language bias,5.016751,0.260373,74
5,MUS,1,Industry Sponsorship bias,12.737538,0.277006,166
6,UAE,1,Industry Sponsorship bias,8.657556,0.273192,116
7,USA,1,Industry Sponsorship bias,9.694664,0.298231,109
8,USA,2,Confirmation bias,0.434662,0.294843,5
9,USA,3,Information bias,0.536736,0.518043,2


In [59]:
df_chunk_bias[(df_chunk_bias['Country'] == 'USA') & (df_chunk_bias['Document nr.'] == 3)]

Unnamed: 0,Country,Document nr.,Text Chunk,Best Matching Bias,Bias Description,Similarity Score
5000,USA,3,NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVE...,Reporting biases,A systematic distortion that arises from the s...,0.347282
5001,USA,3,"Indeed, a key component ofCISE's mission is to...",Volunteer bias,Participants volunteering to take part in a st...,0.387219
5002,USA,3,Some research practices and methods may carry ...,Information bias,Bias that arises from systematic differences i...,0.619102
5003,USA,3,The increased reliance on computing and inform...,Availability bias,A distortion that arises from the use of infor...,0.388663
5004,USA,3,"Professional societies, national and global co...",Performance bias,Systematic differences in the care provided to...,0.470382
5005,USA,3,"Codes ofethics, for example, have been establi...",Information bias,Bias that arises from systematic differences i...,0.416984
5006,USA,3,Somecodes or standards are addressing privacy ...,Informed presence bias,The presence of a person’s information in an e...,0.306489
5007,USA,3,"Others emphasize theneed to ensure that users,...",Confirmation bias,The search for and use of information to suppo...,0.463255
5008,USA,3,Standardsand guidelines have also been establi...,Reporting biases,A systematic distortion that arises from the s...,0.346583
5009,USA,3,"With this Dear Colleague Letter (DCL), CISE in...",Reporting biases,A systematic distortion that arises from the s...,0.337394
