# RAG Implementation and Evaluation

In [None]:
# Install the packages this notebook uses: retrieval (langchain-community, ChromaDB,
# sentence_transformers), the Groq client, and the evaluation libraries.
!pip install -U langchain-community
!pip install datasets
!pip install evaluate
!pip install ChromaDB
!pip install groq
!pip install sentence_transformers
# The next three commands append their stdout to the file /.tmp instead of printing it in the cell.
# NOTE(review): /.tmp is a file at the filesystem root — confirm this is intended (vs. /tmp or /dev/null).
!pip install light-the-torch >> /.tmp
!ltt install torch torchvision >> /.tmp
!pip install fastai --upgrade >> /.tmp
!pip install bert_score
!pip install rouge_score
!pip install meteor_score
!pip install ragas


## Load dataset

In [2]:
# Dataset identifier passed to load_dataset(); also recorded in the experiment metadata at evaluation time.
DATASET = "neural-bridge/rag-dataset-1200"

In [3]:
from datasets import load_dataset

# Load the dataset named by DATASET. Later cells read the "context", "question" and
# "answer" columns of its "train" split.
rag_dataset = load_dataset(DATASET)

README.md:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

(…)-00000-of-00001-f0c158413defd454.parquet:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

(…)-00000-of-00001-06d83c58a8ea10e8.parquet:   0%|          | 0.00/604k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/960 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/240 [00:00<?, ? examples/s]

## Split dataset context into chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from datetime import datetime

# Module-level text splitter used to break large documents into chunks small
# enough for models with input-size limits.
#
# Configuration:
#   chunk_size=512           maximum size of a chunk, measured with length_function
#   chunk_overlap=30         amount shared between adjacent chunks, which helps keep
#                            context across chunk boundaries
#   length_function=len      chunk length is measured in characters
#   is_separator_regex=False separators are treated as literal strings, not regexes
#
# NOTE(review): split_documents() further down builds its own splitter with the
# same settings, so this instance is not referenced by the visible cells.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=False,
)

## Create the embeddings

In [5]:
from chromadb.utils import embedding_functions
from langchain.vectorstores.chroma import Chroma
import chromadb

# Initialize a Sentence Transformer embedding function using the "all-mpnet-base-v2" model.
# It is handed to the collection below so documents and queries are embedded with the same model.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Create a persistent ChromaDB client to interact with a vector database stored at the specified path.
# NOTE(review): this is an absolute Google Drive path; presumably Drive must be mounted first,
# but no mount call is visible in this notebook — confirm.
chroma_client = chromadb.PersistentClient(path='/content/drive/MyDrive/Colab Notebooks/RAG/db')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# If a collection with the name "context" already exists, it is retrieved; otherwise, a new collection is created.
# The `embedding_function` parameter specifies the function used to generate embeddings for the data stored in this collection.

chroma_collection = chroma_client.get_or_create_collection(
    name="context", embedding_function=sentence_transformer_ef
)
# Sanity check: show how many entries the collection currently holds
# (the recorded output below is 0, i.e. the collection was empty at that point).
chroma_collection.count()

0

In [7]:
import time
import hashlib
import random

def split_documents(documents, chunk_size=512, chunk_overlap=30):
    """
    Split a list of documents into smaller chunks with a recursive character-based splitter.

    Parameters:
        documents: A list of Document objects (here: dataset contexts) to be split.
        chunk_size (int): Maximum size of each chunk, measured with `len`. Defaults to 512.
        chunk_overlap (int): Overlap between adjacent chunks. Defaults to 30.

    Returns:
        list: The chunks produced by the splitter for all input documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def get_context(ds, split="train", limit=100):
    """
    Extract the "context" column of a dataset split and break it into chunks.

    Each selected context is wrapped in a `Document`, split with `split_documents`,
    and every resulting chunk is stored as a small dictionary.

    Parameters:
        ds: Mapping of split name -> columns; `ds[split]["context"]` must be sliceable.
        split (str): Name of the split to read. Defaults to "train".
        limit (int | None): Number of leading contexts to process. Defaults to 100;
            pass None to process every context in the split.

    Returns:
        list: A list of dictionaries, each holding one chunk's text under the key "page_content".
    """
    train_chunks = []
    for context in ds[split]["context"][:limit]:
        document = Document(page_content=context)
        # One source context can yield several chunks; keep them in order.
        train_chunks.extend(
            {"page_content": chunk.page_content}
            for chunk in split_documents([document])
        )
    return train_chunks

def generate_unique_id(chunk):
    """
    Build an ID string for a chunk.

    The ID has the form "<timestamp>_<hash>_<random>", where
      - <timestamp> is the current time in whole milliseconds,
      - <hash> is the first 8 hex characters of the MD5 of the chunk text,
      - <random> is a random integer between 100000 and 999999 (inclusive).

    Parameters:
        chunk (dict): Must contain the chunk text under the key "page_content".

    Returns:
        str: The assembled ID. Two calls for the same chunk give different IDs
             because of the timestamp and random components.
    """
    millis = int(time.time() * 1000)
    salt = random.randint(100000, 999999)
    digest = hashlib.md5(chunk["page_content"].encode('utf-8')).hexdigest()
    return "_".join((str(millis), digest[:8], str(salt)))



def add_context_to_chroma(chunks):
    """
    Store chunk texts in the module-level `chroma_collection`.

    Parameters:
        chunks (list[dict]): Chunk dictionaries with the text under "page_content".

    Returns:
        None. An empty `chunks` list is a no-op: nothing is sent to the collection.
    """
    # Nothing to store; skip the database call (Chroma is expected to reject an
    # empty id/document list — see the Collection.add documentation).
    if not chunks:
        return

    # One freshly generated ID per chunk, in the same order as the documents.
    chunk_ids = [generate_unique_id(chunk) for chunk in chunks]

    chroma_collection.add(
        documents=[chunk["page_content"] for chunk in chunks],
        ids=chunk_ids
    )

In [8]:
# Chunk the dataset contexts and store them in the vector DB.
chunks = get_context(rag_dataset)

# The collection is persistent and every call to add_context_to_chroma generates
# new IDs, so re-running this cell would store every chunk a second time.
# Only ingest when the collection is still empty.
if chroma_collection.count() == 0:
    add_context_to_chroma(chunks)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [9]:
# Sanity check after ingestion: fetch the stored entries and display the entry count.
# db_content is not read by any later cell visible in this notebook.
db_content = chroma_collection.get()
chroma_collection.count()

872

## Prepare Queries

In [10]:
# System prompt for the RAG run; query_rag appends the retrieved documents after "Context: ".
# NOTE: the backslash line continuations are inside the string literal, so the leading
# indentation of the continued lines is part of the prompt text sent to the model.
RAG_PROMPT = "Be helpful and answer questions concisely. \
              Utilize the context provided for accurate and specific information.\
              Context: "

# System prompt for the baseline run without retrieved context (used by query).
NO_RAG_PROMPT = "Be helpful and answer questions concisely."

# Sampling temperature passed to every chat-completion call.
TEMPERATURE = 0.2

In [11]:
import random

# Number of leading train examples used for the experiment.
n_questions = 100

# Reference answers and their questions, taken from the same leading slice of the
# train split so that answers[i] belongs to questions[i].
answers = list(rag_dataset["train"]["answer"][:n_questions])
questions = list(rag_dataset["train"]["question"][:n_questions])



In [12]:
# preprocess function for context
def remove_newlines(text_list):
    """Return a new list in which every newline of each string is replaced by one space."""
    flattened = []
    for entry in text_list:
        flattened.append(entry.replace('\n', ' '))
    return flattened

In [17]:
import string
import re

# preprocess function for context
def remove_decorations(text):
    """
    Strip list markers, line breaks and punctuation from a generated answer.

    Steps, in order:
      1. Remove a leading bullet ("*", "-", "•") from every line.
      2. Remove a leading numeric list marker ("1." / "2)") from every line.
      3. Replace newlines with spaces so words on adjacent lines stay separated.
      4. Remove all ASCII punctuation (string.punctuation).
      5. Collapse runs of whitespace and trim the ends.

    Parameters:
        text (str): Raw text.

    Returns:
        str: The cleaned, single-line text.
    """
    # Line-anchored patterns must run while the newlines are still present,
    # otherwise `^` only ever matches the very start of the text.
    text = re.sub(r'^[ \t]*[*•\-][ \t]*', '', text, flags=re.MULTILINE)
    # Require "." or ")" after the digits so a number that merely starts a line
    # (e.g. a year) is kept.
    text = re.sub(r'^[ \t]*\d+[.)][ \t]+', '', text, flags=re.MULTILINE)
    # Join lines with a space instead of deleting the newline, which would glue
    # the last word of one line to the first word of the next.
    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    return text.strip()

In [13]:
def query_rag(query_text: str, model, n_results: int = 5):
    """
    Answer a question with retrieval-augmented generation.

    Steps:
      1. Query the module-level `chroma_collection` for the `n_results` closest documents.
      2. Replace newlines in the retrieved documents with spaces.
      3. Send a chat completion through the module-level `client`, with `RAG_PROMPT`
         followed by the documents as the system message and the question as the
         user message.
      4. Clean the generated answer with `remove_decorations`.

    Parameters:
        query_text (str): The question to answer.
        model: Model name passed to the chat-completion call.
        n_results (int): Number of documents to retrieve. Defaults to 5.

    Returns:
        tuple: (text, documents) where
            - text (str): The generated answer after `remove_decorations`.
            - documents (list): The retrieved documents (newlines removed) used as context.
    """
    result = chroma_collection.query(
        query_texts=query_text,
        n_results=n_results
    )

    # Only one query text was sent, so the matching documents are at index 0.
    documents = result["documents"][0]
    documents = remove_newlines(documents)

    llm = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                # `documents` is a list, so the f-string inserts its Python list
                # representation (brackets and quotes included) into the prompt.
                "content": f"{RAG_PROMPT} {documents}"
            },{
                "role": "user",
                "content": query_text,
            }
        ],
        model=model,
        temperature = TEMPERATURE
    )
    text = llm.choices[0].message.content
    text = remove_decorations(text)
    return text, documents

def query(query_text: str, model):
    """
    Answer a question without any retrieved context (baseline).

    The question is sent through the module-level `client` with `NO_RAG_PROMPT`
    as the system message, so the answer relies only on the model itself.

    Parameters:
        query_text (str): The question to answer.
        model: Model name passed to the chat-completion call.

    Returns:
        str: The generated answer exactly as returned by the model.
             NOTE(review): unlike query_rag, the text is not passed through
             remove_decorations — confirm this asymmetry is intended for evaluation.
    """
    llm = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": NO_RAG_PROMPT
            },{
                "role": "user",
                "content": query_text,
            }
        ],
        model=model,
        temperature = TEMPERATURE
    )
    return llm.choices[0].message.content


In [18]:
# Try the RAG pipeline on a single query.
# NOTE: this cell needs `models` and (inside query_rag) `client`, which are assigned in
# cells that appear further down in the notebook, so it fails on a fresh top-to-bottom run.
res, cont = query_rag("What are some of the potential negative impacts of charity as discussed in the context?", models[0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## Run Queries

In [14]:
# Importing Necessary Libraries 
import os
from groq import Groq

# Instantiate the Groq client. The API key is taken from the GROQ_API_KEY
# environment variable instead of being hard-coded (and left empty) in the notebook.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [15]:
# Model names compared in the experiment; each is passed as `model` to query_rag / query.
models = ["llama-3.3-70b-versatile", "gemma2-9b-it"]

In [18]:

# For every question and every model, collect the RAG answer, the no-RAG answer and the
# retrieved chunks. `predictions` maps model name -> list of row dicts; the dict keys are
# the same ones used as CSV columns when the results are saved.
predictions = {model:[] for model in models}
for question in questions:
    for model in models:
        # query_rag returns (cleaned answer text, retrieved documents)
        answer_rag, context = query_rag(question, model)
        predictions[model].append({
            "question": question,
            "answer_with_rag": answer_rag,
            "answer_without_rag": query(question, model),
            "retrieved_contexts": context
        })

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## Save Answers to CSV File

In [21]:
import csv

# Write one CSV per model so the (slow) querying step does not have to be repeated.
# File name pattern: answers_<model>_<n_questions>.csv
for model in models:
    with open(f"answers_{model}_{n_questions}.csv", 'w', newline='') as file:
        writer = csv.DictWriter(
            file,
            fieldnames=["question", "answer_with_rag", "answer_without_rag", "retrieved_contexts"],
        )
        writer.writeheader()
        for row in predictions[model]:
            writer.writerow(row)

In [22]:
import csv

# Target file for the reference answers
csv_file_path = "answers.csv"

# One-column CSV: an "Answer" header followed by one row per reference answer
with open(csv_file_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows([["Answer"]] + [[answer] for answer in answers])



### Read Answer Files (if they already exist)

In [None]:
import csv

# Reload previously saved predictions: one CSV per model, each stored under its own
# model key. File name pattern: answers_<model>_<n_questions>.csv
predictions = {model: [] for model in models}

for model in models:
    with open('answers_' + model + '_' + str(n_questions) + '.csv', 'r', newline='') as file:
        reader = csv.DictReader(file)
        # csv.DictReader yields every field as a string, so "retrieved_contexts"
        # comes back as text rather than as a list.
        predictions[model] = list(reader)

# Evaluation

In [None]:
# Extra packages for the evaluation section (ragas is also installed by the first cell).
!pip install langchain openai weaviate-client ragas

In [None]:
from evaluate import load

# Load metrics
# Each metric is looked up by name through evaluate.load; the evaluation loop further
# down calls .compute(...) on all three.
bertscore = load('bertscore')
rouge = load('rouge')
bleu = load('bleu')

In [None]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import openai
from getpass import getpass

import os

# Placeholder; it is not read anywhere in the visible notebook. The real key is
# requested interactively below so it never has to be written into the file.
OPENAI_API_KEY=""

# Ask for the key without echoing it, then expose it through the environment
# variable as well as through the openai module attribute.
openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [None]:
import json
import numpy as np

# Define metrics for BERTScore
# (keys of the BERTScore result whose per-example values are averaged below)
bertscore_metrics = ["precision", "recall", "f1"]

# Experiment metadata
# The same dict object is attached to every result entry written by this cell.
experiment_metadata = {
    "dataset": DATASET,
    "num_questions": n_questions,
    "temperature": TEMPERATURE,
    "rag_config": ""
}

# Compute metrics for each model
# Predictions are compared position by position with `answers`, so predictions[model]
# must hold the answers in the same order as the reference answers.
# NOTE(review): answer_with_rag was passed through remove_decorations in query_rag, while
# answer_without_rag and the references were not — confirm the scores are comparable.
results = []
for model in models:
    rag_predictions = [entry["answer_with_rag"] for entry in predictions[model]]
    no_rag_predictions = [entry["answer_without_rag"] for entry in predictions[model]]
    
    # Compute BERTScore
    bertscore_rag = bertscore.compute(predictions=rag_predictions, references=answers, lang="en")
    bertscore_no_rag = bertscore.compute(predictions=no_rag_predictions, references=answers, lang="en")
    
    # Compute ROUGE
    rouge_rag = rouge.compute(predictions=rag_predictions, references=answers)
    rouge_no_rag = rouge.compute(predictions=no_rag_predictions, references=answers)
    
    # Compute BLEU
    bleu_rag = bleu.compute(predictions=rag_predictions, references=[[a] for a in answers])  # BLEU requires nested references
    bleu_no_rag = bleu.compute(predictions=no_rag_predictions, references=[[a] for a in answers])
    
    # Create results entry
    # BERTScore values are averaged over all examples; ROUGE and BLEU results are stored
    # as returned by their compute() calls.
    results.append({
        "experiment": experiment_metadata,
        "model": model,
        "metrics": {
            "RAG": {
                "BERTScore": {metric: np.mean(bertscore_rag[metric]) for metric in bertscore_metrics},
                "ROUGE": rouge_rag,
                "BLEU": bleu_rag
            },
            "No_RAG": {
                "BERTScore": {metric: np.mean(bertscore_no_rag[metric]) for metric in bertscore_metrics},
                "ROUGE": rouge_no_rag,
                "BLEU": bleu_no_rag
            }
        }
    })

# Save to JSONL
# The file is opened in append mode, so each run of this cell adds one line per model
# to whatever the file already contains.
output_file = "experiment_results.jsonl"
with open(output_file, "a") as f:
    for result in results:
        f.write(json.dumps(result) + "\n")