In [20]:
import pandas as pd
import json
import os

import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.document_loaders import JSONLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI

from dotenv import load_dotenv

# Load settings from a local .env file before anything reads the environment.
# The recorded output `True` is this call's return value.
# NOTE(review): presumably this is what supplies OPENAI_API_KEY, which a later
# cell reads via os.environ — confirm the .env file defines it.
load_dotenv()

True

In [13]:
def load_bias_terms(file_path: str) -> list[Document]:
    """Turn a JSON file of bias definitions into LangChain Documents.

    Args:
        file_path: Path to a UTF-8 JSON file. It is expected to hold a list
            of objects, since every entry is read with ``.get``.

    Returns:
        One Document per entry, in file order. The page content has a
        "Bias: ..." line followed by a "Description: ..." line. A missing
        "Bias_Type" falls back to "Unknown Bias" and a missing "Description"
        to an empty string. Metadata carries ``type="definition"`` and the
        source path.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    def to_document(record) -> Document:
        # Build the two-line text that will later be embedded.
        name = record.get("Bias_Type", "Unknown Bias")
        summary = record.get("Description", "")
        return Document(
            page_content=f"Bias: {name}\nDescription: {summary}",
            metadata={"type": "definition", "source": file_path},
        )

    return [to_document(record) for record in records]

def load_document_text(file_path: str) -> list[Document]:
    """Turn a JSON file of document sentences into LangChain Documents.

    Args:
        file_path: Path to a UTF-8 JSON file. It is expected to hold a list
            of objects, since every entry is read with ``.get``.

    Returns:
        One Document per entry, in file order. The page content has three
        lines: country, document name and sentence text. A missing "Country"
        falls back to "Unknown country"; a missing "Document Name" or "Text"
        becomes an empty string. Metadata carries ``type="text"`` and the
        source path.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = json.load(handle)

    results = []
    for row in rows:
        # One labelled line per field, joined into the text that gets embedded.
        lines = (
            f"Country: {row.get('Country', 'Unknown country')}",
            f"Document name: {row.get('Document Name', '')}",
            f"Sentence: {row.get('Text', '')}",
        )
        results.append(
            Document(
                page_content="\n".join(lines),
                metadata={"type": "text", "source": file_path},
            )
        )
    return results

def load_document_metrics(file_path: str) -> list[Document]:
    """Turn a JSON file of per-document bias metrics into LangChain Documents.

    Args:
        file_path: Path to a UTF-8 JSON file. It is expected to hold a list
            of objects, since every entry is read with ``.get``.

    Returns:
        One Document per entry, in file order. The page content has three
        lines: country, document name and most prevalent bias. The four
        numeric metrics are stored in metadata only, not in the page content,
        next to ``type="bias type"`` and the source path. A missing metric is
        stored as an empty string.
    """
    # Metric fields copied verbatim from each entry into the metadata.
    metric_keys = (
        "Prevalence Score",
        "Mean Similarity Score",
        "Bias Frequency",
        "Mean Normalized TF-IDF",
    )

    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = json.load(handle)

    results = []
    for row in rows:
        place = row.get("Country", "Unknown country")
        title = row.get("Document Name", "")
        top_bias = row.get("Most Prevalent Bias", "")

        # Insertion order is kept: type first, then the metrics, then source.
        details = {"type": "bias type"}
        details.update((key, row.get(key, "")) for key in metric_keys)
        details["source"] = file_path

        results.append(
            Document(
                page_content=f"Country: {place}\nDocument Name: {title}\nMost Prevalent Bias: {top_bias}",
                metadata=details,
            )
        )
    return results


# Directory holding the three JSON inputs of the RAG corpus. The joined paths
# are the same strings as before, so the "source" metadata is unchanged.
RAG_DATA_DIR = "RAG_data"

bias_documents = load_bias_terms(f"{RAG_DATA_DIR}/bias_terms.json")
text_documents = load_document_text(f"{RAG_DATA_DIR}/documents_text.json")
document_metrics = load_document_metrics(f"{RAG_DATA_DIR}/document_metrics.json")

# Single corpus: bias definitions, then document sentences, then metrics.
documents = bias_documents + text_documents + document_metrics
print(f"Loaded {len(documents)} documents.")

# Spot-check one record near the end of the corpus. The position is counted
# from the end rather than using the absolute index 34843, which only exists
# while the corpus has at least 34844 entries. With the current 34846
# documents this prints the same record as before.
SAMPLE_OFFSET_FROM_END = 3
if len(documents) >= SAMPLE_OFFSET_FROM_END:
    print(documents[-SAMPLE_OFFSET_FROM_END])

Loaded 34846 documents.
page_content='Country: USA
Document Name: National Institute of Standards and Technology Principles for Explainable AI
Most Prevalent Bias: Spin bias' metadata={'type': 'bias type', 'Prevalence Score': 7.7077625815, 'Mean Similarity Score': 0.3668309429, 'Bias Frequency': 41, 'Mean Normalized TF-IDF': 0.5124818746, 'source': 'RAG_data/document_metrics.json'}


In [16]:
# Inspect the first per-document metrics record to check the page content
# and metadata layout produced by load_document_metrics.
# NOTE(review): the recorded output shows Document Name "Laura" for country
# ARG — confirm this is a genuine document title and not a data-entry artifact.
print(document_metrics[0])

page_content='Country: ARG
Document Name: Laura
Most Prevalent Bias: Racial bias' metadata={'type': 'bias type', 'Prevalence Score': 19.510558867, 'Mean Similarity Score': 0.2999204042, 'Bias Frequency': 216, 'Mean Normalized TF-IDF': 0.3011687773, 'source': 'RAG_data/document_metrics.json'}


In [24]:
import os

from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI  # Updated import

# Embedding model used to vectorise every Document in the corpus.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/LaBSE",
)

# Embed the whole corpus and index it in a FAISS vector store.
vectorstore = FAISS.from_documents(documents, embeddings)

# Retriever limited to the 3 most similar documents per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# Chat model gpt-4o-2024-08-06 at temperature 0. The API key is read from the
# environment, so a missing OPENAI_API_KEY raises KeyError here.
llm = ChatOpenAI(
    model_name="gpt-4o-2024-08-06",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

# RetrievalQA (RAG) chain combining the retriever with the chat model.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
)


In [None]:
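# Optional: uncomment to persist the FAISS index to disk so it can be reloaded
# later instead of re-embedding the whole corpus.
#vectorstore.save_local("../faiss_labse_index")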

In [26]:
# Query the RAG system using the invoke method.
# Sanity check: ask for one bias definition, the kind of fact stored in the
# "Bias: ... / Description: ..." documents built by load_bias_terms.
# The recorded output shows invoke returning a dict with the keys 'query'
# (the question) and 'result' (the generated answer).
query = "What is the description of Confirmation bias?"
result = qa_chain.invoke(query)
print(result)


{'query': 'What is the description of Confirmation bias?', 'result': 'The description of Confirmation bias is the search for and use of information to support an individual’s ideas, beliefs, or hypotheses.'}


In [9]:
# Query the RAG system using the invoke method.
# Aggregate-style question spanning all USA documents.
# NOTE(review): the retriever is configured with k=3, so the model sees at
# most three retrieved records. The recorded answer lists three different
# biases and does not single one out. A corpus-wide answer presumably needs
# aggregation over the metrics data rather than top-k retrieval — confirm the
# intended approach.
query = "What is the most prevalent bias from USA documents? explain why"
result = qa_chain.invoke(query)
print(result)

{'query': 'What is the most prevalent bias from USA documents? explain why', 'result': "The most prevalent bias in USA documents varies depending on the specific document. Document number 3 shows information bias, document number 5 exhibits reporting biases, and document number 2 displays confirmation bias. Each bias type influences the information presented in the document in different ways. Information bias occurs when there are errors in the data collection or measurement process, leading to inaccurate or incomplete information. Reporting biases involve selective reporting or distortion of information, skewing the overall presentation of facts. Confirmation bias refers to the tendency to search for, interpret, or remember information in a way that confirms one's preexisting beliefs or hypotheses. These biases can impact the accuracy and objectivity of the information presented in the documents from the USA."}


In [11]:
# Query the RAG system using the invoke method.
# Repeats the Confirmation-bias definition question asked in an earlier cell;
# the recorded answer text is identical.
# NOTE(review): execution counts in this notebook are out of order, so this
# output may come from a different kernel state than the chain-building cell —
# re-run top to bottom to confirm.
query = "What is the description of Confirmation bias?"
result = qa_chain.invoke(query)
print(result)

{'query': 'What is the description of Confirmation bias?', 'result': 'The description of Confirmation bias is the search for and use of information to support an individual’s ideas, beliefs, or hypotheses.'}


In [12]:
# Query the RAG system using the invoke method.
# Repeats the USA "most prevalent bias" question asked in an earlier cell.
# The recorded answer is worded differently from the earlier run but again
# names three biases without choosing one.
# NOTE(review): the differing outputs suggest the two runs used different
# chain or model configurations (execution counts are out of order) — confirm
# by re-running from a fresh kernel.
query = "What is the most prevalent bias from USA documents? explain why"
result = qa_chain.invoke(query)
print(result)

{'query': 'What is the most prevalent bias from USA documents? explain why', 'result': "The most prevalent biases from the USA documents provided are Information bias, Reporting biases, and Confirmation bias. Each document lists a different prevalent bias, so there isn't a single most prevalent bias across all documents. \n\n- Information bias occurs when there is a systematic error in the way data is collected, leading to inaccurate or misleading information.\n- Reporting biases happen when certain outcomes or results are selectively reported, often based on their nature or direction, which can skew the understanding of the data.\n- Confirmation bias is the tendency to search for, interpret, and remember information in a way that confirms one's preexisting beliefs or hypotheses.\n\nWithout additional context or data, it's not possible to determine which of these biases is the most prevalent overall in USA documents."}
