In [17]:
import pandas as pd
import json
import os

import numpy as np
import faiss
from langchain.docstore.document import Document
from langchain.document_loaders import JSONLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI

In [18]:
def load_bias_terms(file_path: str) -> list[Document]:

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    documents = []
    for entry in data:
        bias_type = entry.get("Bias_Type", "Unknown Bias")
        description = entry.get("Description", "")
        content = f"Bias: {bias_type}\nDescription: {description}"
        documents.append(Document(page_content=content, metadata={"type":"definition", "source": file_path}))
    
    return documents

def load_document_text(file_path: str) -> list[Document]:
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    documents = []
    for entry in data:
        country = entry.get("Country", "Unknown country")
        doc_nr = entry.get("Document_nr", "")
        text_chunk = entry.get("Text", "")
        content = f"Country: {country}\nDocument number: {doc_nr}\nSentence: {text_chunk}"
        documents.append(Document(page_content=content, metadata={"type":"text", "source": file_path}))
    return documents

def load_document_metrics(file_path: str) -> list[Document]:
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    documents = []
    for entry in data:
        country = entry.get("Country", "Unknown country")
        doc_nr = entry.get("Document_nr", "")
        bias_type = entry.get("Most Prevalent Bias", "")
        content = f"Country: {country}\nDocument number: {doc_nr}\nMost Prevalent Bias: {bias_type}"

        prevalence_score = entry.get("Prevalence Score", "")
        mean_similarity_score = entry.get("Mean Similarity Score", "")
        bias_frequency = entry.get("Bias Frequency", "")
        mean_BLEU_score = entry.get("Mean BLEU Score", "")
        metadata = {"type":"bias type", "Prevalence Score":prevalence_score, "Mean Similarity Score":mean_similarity_score, "Bias Frequency":bias_frequency, 
                    "Mean BLEU Score":mean_BLEU_score, "source": file_path}

        documents.append(Document(page_content=content, metadata=metadata))
    return documents


bias_documents = load_bias_terms("RAG_data/bias_terms.json")
text_documents = load_document_text("RAG_data/documents_text.json")
document_metrics = load_document_metrics("RAG_data/document_metrics.json")

documents = bias_documents + text_documents + document_metrics
print(f"Loaded {len(documents)} documents.")
print(documents[5643])

Loaded 5647 documents.
page_content='Country: USA
Document number: 2
Most Prevalent Bias: Confirmation bias' metadata={'type': 'bias type', 'Prevalence Score': 0.4346623644, 'Mean Similarity Score': 0.2948431373, 'Bias Frequency': 5, 'Mean BLEU Score': 1.2374582985, 'source': 'RAG_data/document_metrics.json'}


In [None]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI  # Updated import

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Create a FAISS vector store from the documents
vectorstore = FAISS.from_documents(documents, embeddings)

# Create a retriever with the top-3 most similar documents
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Initialize your LLM using the updated ChatOpenAI for chat-based models
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=os.environ["OPENAI_API_KEY"])

# Build the RetrievalQA (RAG) chain
qa_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever)


{'query': 'What is the description of Confirmation bias?', 'result': 'Confirmation bias is the search for and use of information to support an individual’s ideas, beliefs, or hypotheses.'}


In [None]:
# Query the RAG system using the invoke method
query = "What is the description of Confirmation bias?"
result = qa_chain.invoke(query)
print(result)

In [16]:
# Query the RAG system using the invoke method
query = "What is the most prevalent bias from USA documents? Explain why"
result = qa_chain.invoke(query)
print(result)

{'query': 'What is the most prevalent bias from USA documents? Explain why', 'result': "The most prevalent bias in USA documents varies depending on the specific document. Document number 3 shows information bias, document number 5 shows reporting biases, and document number 2 shows confirmation bias. Each bias has different characteristics and implications. Information bias occurs when there is a systematic error in the way data is collected, leading to incorrect conclusions. Reporting biases involve selective reporting of information, which can skew the overall picture presented. Confirmation bias is the tendency to search for, interpret, or remember information in a way that confirms one's preconceptions. Each bias can impact the accuracy and reliability of the information presented in the documents."}


In [None]:
# Initialize the BERT tokenizer (using the 'bert-base-uncased' model)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# A simple function to load a JSON file and convert it to a LangChain Document.
def load_json_to_document(file_path: str) -> Document:
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Assume the JSON contains a field "text". If not, convert the whole JSON to a string.
    text = data.get("text", str(data))
    return Document(page_content=text, metadata={"source": file_path})

# List your JSON file paths
doc_paths = ["doc1.json", "doc2.json", "doc3.json"]

# Load documents from the JSON files
documents = [load_json_to_document(path) for path in doc_paths]

# Optional: Function to tokenize a document with BERT and split into smaller chunks if needed.
def tokenize_document(doc: Document, max_tokens: int = 128):
    tokens = tokenizer.tokenize(doc.page_content)
    # If the document is longer than max_tokens, split it into chunks.
    if len(tokens) > max_tokens:
        chunks = []
        for i in range(0, len(tokens), max_tokens):
            chunk_tokens = tokens[i:i+max_tokens]
            chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
            chunks.append(Document(page_content=chunk_text, metadata=doc.metadata))
        return chunks
    else:
        return [doc]

# Process each document (this step is optional if you don't require chunking)
processed_docs = []
for doc in documents:
    processed_docs.extend(tokenize_document(doc))

# Initialize HuggingFace embeddings (you can choose a model based on your needs)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a FAISS vector store from the processed documents
vectorstore = FAISS.from_documents(processed_docs, embeddings)

# Convert the vectorstore into a retriever (here, returning the top-3 nearest neighbors)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Initialize your LLM (OpenAI's model in this case; ensure your API key is set in your environment)
llm = OpenAI(temperature=0)

# Create the RetrievalQA (RAG) chain tying the retriever and the LLM together
qa_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever)

# Query the RAG system
query = "What are the key points discussed in the documents?"
result = qa_chain.run(query)
print(result)

In [None]:
bias_terms_loader = JSONLoader("RAG_data/bias_terms.json", jq_schema=".[] | {page_content: .Description, metadata: {Bias_Type: .Bias_Type}}")
document_metrics_loader = JSONLoader("RAG_data/document_metrics.json", jq_schema=".")
documents_text_loader = JSONLoader("RAG_data/documents_text.json", jq_schema=".")

bias_terms = bias_terms_loader.load()
#document_metrics = document_metrics_loader.load()
#documents_text = documents_text_loader.load()

# Combine the documents into one list
documents = bias_terms + document_metrics + documents_text

# Initialize HuggingFaceEmbeddings with a BERT model.
# This will automatically use the BERT tokenizer from Hugging Face.
embeddings = HuggingFaceEmbeddings(model_name="bert-base-uncased")

# Build a FAISS vectorstore from the documents
vectorstore = FAISS.from_documents(documents, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Initialize your LLM and set up the RetrievalQA chain
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever)

# Run a query
query = "What are the most important types biases discussed in the USA policy documents"
result = qa_chain.run(query)
print(result)

ValueError: Expected page_content is string, got <class 'dict'> instead.                     Set `text_content=False` if the desired input for                     `page_content` is not a string