In [1]:
import pandas as pd
import json
import os

import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.document_loaders import JSONLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI

from dotenv import load_dotenv

load_dotenv()

True

In [23]:
def load_bias_terms(file_path: str) -> list[Document]:
    """Load bias definitions from a JSON file as LangChain Documents.

    Args:
        file_path: Path to a UTF-8 JSON file. The top-level value is iterated
            and every item is read with ``.get`` (assumes a list of dicts —
            confirm against the data file).

    Returns:
        One Document per entry. ``page_content`` holds the bias name and its
        description; ``metadata`` tags the entry as a ``"definition"`` and
        records ``file_path`` as its source.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    def _as_document(record) -> Document:
        # Missing keys fall back to a placeholder name / empty description.
        name = record.get("Bias_Type", "Unknown Bias")
        summary = record.get("Description", "")
        return Document(
            page_content=f"Bias: {name}\nDescription: {summary}",
            metadata={"type":"definition", "source": file_path},
        )

    return [_as_document(record) for record in entries]

def load_document_text(file_path: str) -> list[Document]:
    """Load document text chunks from a JSON file as LangChain Documents.

    Args:
        file_path: Path to a UTF-8 JSON file. The top-level value is iterated
            and every item is read with ``.get`` (assumes a list of dicts —
            confirm against the data file).

    Returns:
        One Document per entry. ``page_content`` combines the country, the
        document name and the text chunk; ``metadata`` tags the entry as
        ``"text"`` and records ``file_path`` as its source.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    def _as_document(record) -> Document:
        # Missing keys fall back to a placeholder country / empty strings.
        nation = record.get("Country", "Unknown country")
        title = record.get("Document Name", "")
        sentence = record.get("Text", "")
        return Document(
            page_content=f"Country: {nation}\nDocument name: {title}\nSentence: {sentence}",
            metadata={"type":"text", "source": file_path},
        )

    return [_as_document(record) for record in entries]

def load_document_metrics(file_path: str) -> list[Document]:
    """Load per-document bias metrics from a JSON file as LangChain Documents.

    Args:
        file_path: Path to a UTF-8 JSON file. The top-level value is iterated
            and every item is read with ``.get`` (assumes a list of dicts —
            confirm against the data file).

    Returns:
        One Document per entry. ``page_content`` holds the country, document
        name and most prevalent bias. The numeric metrics are stored in
        ``metadata`` alongside ``"type": "bias type"`` and the source path;
        a metric missing from the entry is stored as an empty string.
    """
    # Entry keys copied verbatim into the metadata, in this order.
    metric_keys = (
        "Prevalence Score",
        "Mean Similarity Score",
        "Bias Frequency",
        "Mean Normalized TF-IDF",
    )

    with open(file_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    results = []
    for record in entries:
        nation = record.get("Country", "Unknown country")
        title = record.get("Document Name", "")
        top_bias = record.get("Most Prevalent Bias", "")
        body = f"Country: {nation}\nDocument Name: {title}\nMost Prevalent Bias: {top_bias}"

        info = {"type":"bias type"}
        info.update((key, record.get(key, "")) for key in metric_keys)
        info["source"] = file_path

        results.append(Document(page_content=body, metadata=info))
    return results


# Load the three JSON corpora: bias definitions, document sentences and
# per-document bias metrics.
bias_documents = load_bias_terms("RAG_data/bias_terms.json")
text_documents = load_document_text("RAG_data/documents_text.json")
document_metrics = load_document_metrics("RAG_data/document_metrics.json")

# Merge everything into one list, definitions first, then text, then metrics.
documents = [*bias_documents, *text_documents, *document_metrics]
print(f"Loaded {len(documents)} documents.")
# Spot-check one entry; the hard-coded index requires at least 34844 documents.
print(documents[34843])

Loaded 43039 documents.
page_content='Country: THA
Document name: 12th National Economic and Social Development Plan
Sentence: 7 Develop management system s and conflict resolution mechanisms  regarding natural resource s and environment al issues : 3' metadata={'type': 'text', 'source': 'RAG_data/documents_text.json'}


In [24]:
# Inspect the first metrics Document to check its page content and metadata fields.
print(document_metrics[0])

page_content='Country: ARG
Document Name: Laura
Most Prevalent Bias: Racial bias' metadata={'type': 'bias type', 'Prevalence Score': 19.510558867, 'Mean Similarity Score': 0.2999204042, 'Bias Frequency': 216, 'Mean Normalized TF-IDF': 0.3011687773, 'source': 'RAG_data/document_metrics.json'}


In [25]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI  # Updated import

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/LaBSE")

index_path = "../Data/faiss_labse_index"

# Reuse the FAISS index saved at `index_path` when it exists; otherwise embed
# `documents` and save the new index there for later runs.
# NOTE(review): `allow_dangerous_deserialization=True` opts in to loading a
# serialized index from disk; presumably pickle-backed, so only load index
# files created locally — confirm against the LangChain FAISS docs.
if os.path.exists(index_path):
    vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    print("Vectorstre loaded")
else:
    print("Creating vector store")
    vectorstore = FAISS.from_documents(documents, embeddings)
    vectorstore.save_local(index_path)

# Create a retriever with the top-5 most similar documents
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# gpt-4o-2024-08-06
# Initialize your LLM using the updated ChatOpenAI for chat-based models
# The API key is read from the environment; a missing OPENAI_API_KEY raises KeyError here.

llm = ChatOpenAI(temperature=0, model_name="gpt-4o-2024-08-06", openai_api_key=os.environ["OPENAI_API_KEY"])

# Build the RetrievalQA (RAG) chain
qa_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever)


Vectorstre loaded


In [26]:
#vectorstore.save_local("../faiss_labse_index")

In [27]:
# Ask the RAG chain for a bias definition and print the dict returned by `invoke`.
query = "What is the description of Confirmation bias?"
result = qa_chain.invoke(query)
print(result)


{'query': 'What is the description of Confirmation bias?', 'result': 'The description of Confirmation bias is the search for and use of information to support an individual’s ideas, beliefs or hypotheses.'}


In [28]:
# Ask the RAG chain a question about per-country bias prevalence and print the
# dict returned by `invoke`.
query = "What is the most prevalent bias from USA documents? explain why"
result = qa_chain.invoke(query)
print(result)

{'query': 'What is the most prevalent bias from USA documents? explain why', 'result': 'The most prevalent bias from the USA documents provided is racial bias. This is evident from the documents "AI and Society" and "Fairness, Ethics, Accountability, and Transparency," both of which identify racial bias as the most prevalent bias. Racial bias is a significant concern in AI and societal contexts because it can lead to unfair treatment and discrimination against individuals based on their race, which is why it is a focal point in discussions about fairness, ethics, and accountability in AI systems.'}


In [33]:
# Ask the RAG chain a question about one specific document and print the dict
# returned by `invoke`. `query` stays bound to this question afterwards.
query = "Why does the U.S. National AI R&D Strategic Plan stress the importance of developing AI systems that can explain their decisions?"
result = qa_chain.invoke(query)
print(result)

{'query': 'Why does the U.S. National AI R&D Strategic Plan stress the importance of developing AI systems that can explain their decisions?', 'result': 'The U.S. National AI R&D Strategic Plan stresses the importance of developing AI systems that can explain their decisions because it is crucial for these systems to be transparent and capable of explaining the reasons for their results to users. This transparency is essential for ensuring that AI technologies are used responsibly, safely, and beneficially, and that they align with ethical, legal, and societal principles. By being able to explain their decisions, AI systems can help build trust with users and facilitate better understanding and oversight of AI technologies.'}


In [34]:
# Print the passages the retriever returns for the current `query`, so the
# generated answer can be compared with the context it was given.
# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `.invoke` call style already used for `qa_chain`.
docs = retriever.invoke(query)
for i, doc in enumerate(docs):
    print(f"\n--- Document {i} ---\n{doc.page_content}")


--- Document 0 ---
Country: USA
Document name: National AI R&D Strategic Plan
Sentence: Theoretical work is needed to better understand why AI techniques especially machine learning often work well in practice

--- Document 1 ---
Country: USA
Document name: National AI R&D Strategic Plan
Sentence: Designing a rchitecture s for e thical AI  Additional progress in fundamental research must be made to determine how to best design architectures for AI systems that incorporate ethical reasoning

--- Document 2 ---
Country: USA
Document name: National AI R&D Strategic Plan
Sentence: Thus, research ers must develop systems that are transparent, and intrinsically capable of explaining the reasons  for their results to users

--- Document 3 ---
Country: USA
Document name: National AI R&D Strategic Plan
Sentence: NATIONAL ARTIFICIAL INTELLIGENCE RESEARCH AND DEVELOPMENT STRATEGIC PLAN    8 AI R&D Strategic Plan  focuses on  the R&D investments needed to help define and advance policies that ens

In [None]:
# Maps each evaluation question to the list of source passages expected to
# support its answer (presumably compared against retriever output — the
# consuming code is not visible in this part of the notebook).
# Every passage is its own list element; keep the trailing commas, because
# adjacent string literals without a comma are silently merged into one.
ground_truth = {
    "What have Canadian Indigenous researchers and experts from around the world teamed up to publish?": [
        "ca/ai/ai-society14TH20TH24THAICan Impact Report 202022CENTRING INDIGENOUS PERSPECTIVES IN DESIGNING AIIndigenous researchers and experts from around the world have teamed up to publish Indigenous Protocol and Artificial Intelligence, a position paper32 that provides a starting place for designing ethical AI through an Indigenous-centred approach"
    ],
    "What role does Carlsberg play in Denmark’s national AI strategy, and how is AI used in their beer production process?": [
        "Artificial intelligence will help beer tasting at Carlsberg1,000 different beers are screened every day at the Carlsberg laboratory in Valby near Copenhagen"
    ],
    "What evidence does the Spanish strategy provide to demonstrate that Spain has a higher percentage of female researchers in AI compared to the EU average?": [
        "España ya cuenta con un mayor número de mujeres investigadores, el 38,8% frente al 33,8% de la UE 25, según datos del INE y Eurostat, lo que signiﬁca una buena posición para impulsar la reducción de esta brecha de género"
    ],
    "What limitations does the report identify in the EU’s use of experimental policymaking, such as regulatory sandboxes?": [
        "Assessment of current initiatives of the European Commission on better regulation Third, despite the plethora of tools available to Commission services in the Toolbox, the use of experimental policymaking in the form of regulatory sandboxes and similar instruments remain orphan of a general framework at the EU level, which would allow Member States to engage in experimental policymaking",
        # NOTE(review): the "', 'Text': '" fragment below looks like residue from a
        # pasted dict repr — confirm the intended passage against the source data.
        "Assessment of current initiatives of the EU on better regulation', 'Text': 'As it stands, Tool #69 on “Emerging Methods and Policy Instruments”, however laudable, risks providing only a theoretical opportunity for Commission policymakers, rooted in the possibility to include provisions for sandboxes in the legislative texts, rather than engaging in experimentation themselves, or relying on a structured process of experimentation in Member States while proposals are still in the making"
    ],
    "What strategy does the Italian government propose to reduce the innovation gap between northern and southern regions of the country using AI?": [
        "La rete nazionale per lIA sar inter regionale, e permetter ai territori di scambiarsi la conoscenza e le competenze per rimanere al passo dell innovazione dell IA, favorire ladozione di queste tecnologie da parte delle PMI, e quindi r idurre il divario nell'innovazione, nello sviluppo e nella competitivit delle imprese tra Nord e Sud"
    ],
    "What concrete measures does the Dutch strategy propose to double the reuse of public data for AI development within five years?": [
        "30 AI VOOR NEDERLANDVERGROTEN,VERSNELLEN EN VERBINDEN 31    1,0 0,8 0,4 0,6 0,2 0,00,240,180,23NederlandVerenigde statenOostenrijkMexicoJapanFrankrijkSpanjeColombia Nieuw-ZeelandDuitslandFinlandAustralieCanadaZuid-KoreaIsraelIerland GriekenlandNoorwegen ScoreItalie   Overheid stimuleert hergebruikData BeschikbaarheidData ToegankelijkheidOm de bruikbaarheid van publieke data te verbete-ren, moet data worden opgeschoond, gelabeld en gekoppeld",
        "Daarnaast moet er een duidelijk verdienmodel komen waarin de kosten voor de overheid in ver-houding zijn met de baten voor de directe gebrui-ker en de maatschappij",
        "Over vijf jaar bereiken we hopelijk een verdubbeling van het hergebruik van publieke data"
    ],
    "According to the Vinnova report, what factors mutually reinforce or inhibit the development of AI-based business and operational models in Sweden?": [
        "Business and operational models, data access and competence are mutually dependent and therefore heavily affected by each other in companies and public operations",
        "Without clear prospects of business benefits, the drivers of AI-based investments are inhibited",
        "If the business benefit is not clear, AI competence is also not viewed as an important factor for value creation and efficiency, affecting recruitment patterns and competence development",
        "Data access and possibilities of combining different data will be fundamentally significant for purposes of identifying the applications that can be developed"
    ],
    "Why does the U.S. National AI R&D Strategic Plan stress the importance of developing AI systems that can explain their decisions?": [
        "First, Strategy 4 emphasizes the need for explainable and transparent systems that are trusted by their users , perform in a manner that is acceptable to the users, and can be guaranteed to act as the user intended",
        "Thus, research ers must develop systems that are transparent, and intrinsically capable of explaining the reasons  for their results to users"
    ]
}