In [1]:
import pandas as pd
import json

In [None]:
with open('../Data/bias_terms.json', 'r') as file:
    bias_json_data = json.load(file)
print(bias_json_data)

[{'Bias_Type': 'Adherence bias', 'Description': 'A systematic distortion in outcome data that arises when participants who adhere to a study protocol or intervention differ from those who do not adhere, when that difference relates to the outcome of interest.'}, {'Bias_Type': 'Admission rate bias', 'Description': 'Arises when the variables under study are affected by the selection of hospitalized subjects leading to a bias between the exposure and the\xa0disease under study.'}, {'Bias_Type': 'All’s well literature bias', 'Description': 'Occurs when publications omit or play down controversies or disparate results.'}, {'Bias_Type': 'Allocation bias', 'Description': 'Systematic difference in how participants are assigned to comparison groups in a clinical trial.'}, {'Bias_Type': 'Apprehension bias', 'Description': 'When a study participant responds differently due to being observed'}, {'Bias_Type': 'Ascertainment bias', 'Description': 'Systematic differences in the identification of indi

In [5]:
file_path = '../Data/Policy_docs/USA/Fairness, Ethics, Accountability, and Transparency/Fairness, Ethics, Accountability, and Transparency.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

policy_text = text
policy_text = policy_text.replace('\n', '')

policy_sentences = policy_text.split('.')
policy_sentences = [s.strip() for s in policy_sentences if s.strip()]
policy_sentences = [s for s in policy_sentences if '@' not in s]

# Add test sentences for comparison (generated by ChatGPT) Prompt: I am trying to create an NLP model that tries to classify which types of biases an AI policy document is trying to adhere to. The five types are: gender, sexual, religion, race, and disability bias. For testing, give me an example sentence for each bias jus described that could be found in AI policy documents  that tries to prevent that type of bias.
policy_sentences.append("The AI system shall treat all genders with equal respect, ensuring that its algorithms and decision-making processes do not reinforce or perpetuate traditional gender stereotypes")
policy_sentences.append("Our AI must be designed to avoid any form of sexual bias, ensuring that outputs and decisions do not discriminate based on sexual orientation or identity")
policy_sentences.append("The system is required to respect diverse religious beliefs, ensuring that no decision or content produced discriminates against or favors any particular faith")
policy_sentences.append("All data sources and model outputs will be routinely audited to prevent racial bias, ensuring that individuals from every racial and ethnic background receive equitable treatment")
policy_sentences.append("Our AI models shall incorporate universal design principles and undergo rigorous testing to ensure that they do not disadvantage individuals with disabilities, thereby promoting accessibility and inclusion")

policy_sentences

["NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVENUEALEXANDRIA, VIRGINIA 22314NSF 19-016Dear Colleague Letter: Fairness, Ethics, Accountability, andTransparency: Enabling Breakthrough Research to ExpandInclusivity in Computer and Information Science and EngineeringResearchNovember 2, 2018Dear Colleagues:The National Science Foundation's (NSF) Directorate for Computer and Information Scienceand Engineering (CISE) is committed to maximizing the positive consequences of theresearch that it funds through inclusive research approaches",
 "Indeed, a key component ofCISE's mission is to contribute to universal, transparent, and affordable participation in aninformation-based society",
 'Some research practices and methods may carry biases andinequities that can in turn have significant impacts on the scientific community and broadersociety',
 'The increased reliance on computing and information technologies may furtherincrease and automate such biases and inequities',
 'Professional societies,

In [6]:
import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

docs = [
    Document(page_content=item["Description"], metadata={"bias": item["Bias_Type"]})
    for item in bias_json_data
]

docs_text = [doc.page_content for doc in docs]

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

raw_embeddings = embeddings.embed_documents(docs_text)

# Apply L2 normalization
normalized_embeddings = []
for emb in raw_embeddings:
    
    norm = np.linalg.norm(emb)
    # Avoid division by zero
    if norm > 0:
        normalized_embeddings.append(emb / norm)
    else:
        normalized_embeddings.append(emb)
        
# Convert to a numpy array to and ensure float32 type for FAISS
normalized_embeddings = np.array(normalized_embeddings).astype("float32")

embedding_dim = normalized_embeddings.shape[1]

# Create a FAISS index that uses inner product (which, for normalized vectors, equals cosine similarity)
index = faiss.IndexFlatIP(embedding_dim)
index.add(normalized_embeddings)

docstore = {i: doc for i, doc in enumerate(docs)}

results_list = []

# Perform similarity search on policy sentences
for chunk in policy_sentences:
    # Compute the raw query embedding
    query_emb = embeddings.embed_query(chunk)
    # Normalize the query embedding
    norm = np.linalg.norm(query_emb)
    if norm > 0:
        query_emb = query_emb / norm
    else:
        query_emb = query_emb
    # Convert to numpy array with shape (1, embedding_dim)
    query_emb_np = np.array([query_emb]).astype("float32")
    
    # Perform search in the FAISS index
    distances, indices = index.search(query_emb_np, k=1)
    best_idx = indices[0][0]
    best_score = distances[0][0] 
    
    best_doc = docstore[best_idx]

    results_list.append({
        "Text Chunk": chunk,
        "Best Matching Bias": best_doc.metadata["bias"],
        "Bias Description": best_doc.page_content,
        "Similarity Score": best_score
    })

results_df = pd.DataFrame(results_list)


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
results_df

Unnamed: 0,Text Chunk,Best Matching Bias,Bias Description,Similarity Score
0,NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVE...,Reporting biases,A systematic distortion that arises from the s...,0.347282
1,"Indeed, a key component ofCISE's mission is to...",Volunteer bias,Participants volunteering to take part in a st...,0.387219
2,Some research practices and methods may carry ...,Information bias,Bias that arises from systematic differences i...,0.619102
3,The increased reliance on computing and inform...,Availability bias,A distortion that arises from the use of infor...,0.388663
4,"Professional societies, national and global co...",Performance bias,Systematic differences in the care provided to...,0.470382
5,"Codes ofethics, for example, have been establi...",Information bias,Bias that arises from systematic differences i...,0.416984
6,Somecodes or standards are addressing privacy ...,Informed presence bias,The presence of a person’s information in an e...,0.306489
7,"Others emphasize theneed to ensure that users,...",Confirmation bias,The search for and use of information to suppo...,0.463255
8,Standardsand guidelines have also been establi...,Reporting biases,A systematic distortion that arises from the s...,0.346583
9,"With this Dear Colleague Letter (DCL), CISE in...",Reporting biases,A systematic distortion that arises from the s...,0.337394
