In [102]:
import pandas as pd
import json
import re
from sentence_transformers import SentenceTransformer
pd.set_option('display.max_colwidth', None)  # Show full column content

In [103]:
with open('../Data/bias_terms.json', 'r') as file:
    bias_json_data = json.load(file)
print(bias_json_data)

[{'Bias_Type': 'Adherence bias', 'Description': 'A systematic distortion in outcome data that arises when participants who adhere to a study protocol or intervention differ from those who do not adhere, when that difference relates to the outcome of interest.'}, {'Bias_Type': 'Admission rate bias', 'Description': 'Arises when the variables under study are affected by the selection of hospitalized subjects leading to a bias between the exposure and the\xa0disease under study.'}, {'Bias_Type': 'All’s well literature bias', 'Description': 'Occurs when publications omit or play down controversies or disparate results.'}, {'Bias_Type': 'Allocation bias', 'Description': 'Systematic difference in how participants are assigned to comparison groups in a clinical trial.'}, {'Bias_Type': 'Apprehension bias', 'Description': 'When a study participant responds differently due to being observed'}, {'Bias_Type': 'Ascertainment bias', 'Description': 'Systematic differences in the identification of indi

In [104]:
file_path = '../Data/Policy_docs/USA/AI and Society/AI and Society.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

policy_text = text
policy_text = policy_text.replace('\n', '')

policy_text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', policy_text)

policy_sentences = policy_text.split('.')
policy_sentences = [s.strip() for s in policy_sentences if s.strip()]
policy_sentences = [s for s in policy_sentences if '@' not in s]
policy_sentences = [s for s in policy_sentences if len(s) >= 10]
policy_sentences = [s for s in policy_sentences if 'et al' not in s]

bias_sentences = [
    "To reduce the use of information that merely supports an individual’s ideas, AI models must be trained on datasets encompassing diverse and potentially contradictory viewpoints to mitigate confirmation bias.",
    
    "AI systems must prioritize the relevance and quality of the data used over mere volume, to prevent bias arising from differences in the collection of information",
    
    "To address distortions caused by prejudice against racial or ethnic groups, developers must audit training datasets to ensure no group is underrepresented or mischaracterized, thereby reducing racial bias in AI outcomes.",
    
    "To mitigate selection bias, which occurs when sampled groups differ systematically from the target population, training datasets must be representative of the diversity within the actual end-user base.",
    
    "AI policy requires full transparency in documenting dataset sources and limitations, to prevent bias caused by selective reporting or omission of information."
]

[policy_sentences.append(sentence) for sentence in bias_sentences]
policy_sentences

["NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVENUEALEXANDRIA, VIRGINIA 22314NSF 19-018Dear Colleague Letter: EArly-concept Grants for ExploratoryResearch on Artificial Intelligence (AI) and Society - SupportedJointly with the Partnership on AINovember 1 5, 2018Dear Colleagues:The National Science Foundation's (NSF) Directorates for Computer and Information Scienceand Engineering (CISE) and Social, Behavioral and Economic Sciences (SBE) together withthe Partnership on AI (PAI) wish to notify the community of their interest in supporting EArly-concept Grants for Exploratory Research (EAGERs) to understand the social challengesarising from AI technology and enable scientific contributions to overcome them",
 'The last 20 years have seen rapid advances in machine learning, pattern recognition,planning, effective decision making, natural language processing, and machine vision',
 'Theseadvances have been fueled by increased data, faster computation, and improved algorithms',
 'They are yie

In [105]:
from langchain.docstore.document import Document
docs = [
    Document(page_content=item["Bias_Type"] + " " + item["Description"], metadata={"bias": item["Bias_Type"]})
    for item in bias_json_data
]
docs_text = [doc.page_content for doc in docs]
print(docs_text)

['Adherence bias A systematic distortion in outcome data that arises when participants who adhere to a study protocol or intervention differ from those who do not adhere, when that difference relates to the outcome of interest.', 'Admission rate bias Arises when the variables under study are affected by the selection of hospitalized subjects leading to a bias between the exposure and the\xa0disease under study.', 'All’s well literature bias Occurs when publications omit or play down controversies or disparate results.', 'Allocation bias Systematic difference in how participants are assigned to comparison groups in a clinical trial.', 'Apprehension bias When a study participant responds differently due to being observed', 'Ascertainment bias Systematic differences in the identification of individuals included in a study or distortion in the collection of data in a study.', 'Attrition bias Unequal loss of participants from study groups in a trial.', 'Availability bias A distortion that a

In [106]:
import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.docstore.document import Document
docs = [
    Document(page_content=item["Bias_Type"] + " " + item["Description"], metadata={"bias": item["Bias_Type"]})
    for item in bias_json_data
]
docs_text = [doc.page_content for doc in docs]

model = SentenceTransformer("sentence-transformers/LaBSE")

raw_embeddings = model.encode(docs_text, normalize_embeddings=True).astype("float32")

embedding_dim = raw_embeddings.shape[1]

# Create a FAISS index that uses inner product (which, for normalized vectors, equals cosine similarity)
index = faiss.IndexFlatIP(embedding_dim)
index.add(raw_embeddings)

docstore = {i: doc for i, doc in enumerate(docs)}

results_list = []

# Perform similarity search on policy sentences
for chunk in policy_sentences:
    # Compute the raw query embedding
    query_emb = model.encode([chunk], normalize_embeddings=True).astype("float32")
    
    # Perform search in the FAISS index
    distances, indices = index.search(query_emb, k=1)
    best_idx = indices[0][0]
    best_score = distances[0][0] 
    
    best_doc = docstore[best_idx]

    results_list.append({
        "Text Chunk": chunk,
        "Best Matching Bias": best_doc.metadata["bias"],
        "Similarity Score": best_score
    })

results_df = pd.DataFrame(results_list)


In [107]:
results_df

Unnamed: 0,Text Chunk,Best Matching Bias,Similarity Score
0,"NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVENUEALEXANDRIA, VIRGINIA 22314NSF 19-018Dear Colleague Letter: EArly-concept Grants for ExploratoryResearch on Artificial Intelligence (AI) and Society - SupportedJointly with the Partnership on AINovember 1 5, 2018Dear Colleagues:The National Science Foundation's (NSF) Directorates for Computer and Information Scienceand Engineering (CISE) and Social, Behavioral and Economic Sciences (SBE) together withthe Partnership on AI (PAI) wish to notify the community of their interest in supporting EArly-concept Grants for Exploratory Research (EAGERs) to understand the social challengesarising from AI technology and enable scientific contributions to overcome them",Industry Sponsorship bias,0.280374
1,"The last 20 years have seen rapid advances in machine learning, pattern recognition,planning, effective decision making, natural language processing, and machine vision",Information bias,0.304363
2,"Theseadvances have been fueled by increased data, faster computation, and improved algorithms",Information bias,0.300446
3,They are yielding increasingly diverse and large-scale applications deployed in settingssubject to unanticipated challenges with complex social effects,Racial bias,0.366127
4,NSF has long supported fundamental research enabling AI technology,Industry Sponsorship bias,0.339133
5,"With increases in thescale and diversity of deployments comes the need to better understand AI in the open world,including unforeseen circumstances and social impacts, and to craft approaches to AI thatconsider these from the start",Racial bias,0.36389
6,"Vital directions include developing principles for safe, robust,and trustworthy AI (including shared responsibilities between humans and AI systems);addressing issues of bias, fairness, and transparency of algorithmic intelligence; developingdeeper understanding of human-AI interaction and user education; and developing insightsabout the influences of AI on people and society",Racial bias,0.450619
7,"NSF and PAI will jointly support high-risk, high-reward research at the intersection of thesocial and technical dimensions of AI",Industry Sponsorship bias,0.319307
8,"Priority will be given to collaborative projects thatintegrate computer/computational science with the social, behavioral, and economicsciences",Adherence bias,0.334865
9,Proposals may expand understanding of the influences of AI on people andsociety or contribute technical innovations that overcome the emerging social challenges,Racial bias,0.447941
