# RAG Implementation and Evaluation

In [None]:
import locale

# Override locale.getpreferredencoding so that it always reports "UTF-8" for the rest of the session.
# NOTE(review): presumably a workaround for `!shell` commands failing under a non-UTF-8 locale
# (e.g. on Colab) — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Install the notebook's third-party dependencies (no versions are pinned).
!pip install -U langchain-community
!pip install datasets
!pip install evaluate
!pip install ChromaDB
!pip install groq
!pip install sentence_transformers
# NOTE(review): the output of the next three commands is appended to `/.tmp`, a file in the
# filesystem root — confirm that a path under `/tmp` was not intended.
!pip install light-the-torch >> /.tmp
!ltt install torch torchvision >> /.tmp
!pip install fastai --upgrade >> /.tmp
# Packages used by the evaluation section (BERTScore / ROUGE / ragas).
!pip install bert_score
!pip install rouge_score
!pip install meteor_score
!pip install ragas


## Load dataset

In [3]:
# Identifier of the dataset passed to `load_dataset` below; also recorded in the experiment metadata.
DATASET = "neural-bridge/rag-dataset-1200"

In [4]:
from datasets import load_dataset

# Load the dataset; the captured output below reports a 960-example train split
# and a 240-example test split.
rag_dataset = load_dataset(DATASET)

README.md:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

(…)-00000-of-00001-f0c158413defd454.parquet:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

(…)-00000-of-00001-06d83c58a8ea10e8.parquet:   0%|          | 0.00/604k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/960 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/240 [00:00<?, ? examples/s]

## Split dataset context into chunks

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from datetime import datetime

# Chunking configuration, hoisted into named constants so it can be tuned in one place.
CHUNK_SIZE = 512     # maximum chunk length, measured with `len` (i.e. in characters)
CHUNK_OVERLAP = 30   # overlap between consecutive chunks, in the same unit

# Splitter used to cut each dataset context into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)

In [None]:
# Disabled draft of the chunking loop: the triple-quoted string below is never executed
# (as the last expression of the cell it is only echoed). If enabled, it would split the first
# 100 train contexts and store a timestamp-based "id" in each chunk's metadata.
"""chunks = []
for i, context in enumerate(rag_dataset["train"]["context"][:100]):
    document = Document(page_content=context)
    chunked_documents = splitter.split_documents([document])

    for j, chunk in enumerate(chunked_documents):
        unique_id = f"chunk_{i}_{j}_{int(datetime.now().timestamp() * 1000)}"
        chunk.metadata["id"] = unique_id
        chunks.append(chunk)"""

## Create the embeddings

In [6]:
from chromadb.utils import embedding_functions
from langchain.vectorstores.chroma import Chroma
import chromadb

# Embedding model and vector-store location, hoisted into constants so they can be changed in one place.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
# NOTE(review): absolute path under a Google Drive mount; no cell visible in this notebook mounts
# Drive, so verify that the location exists (or is meant to be created) at run time.
CHROMA_DB_PATH = '/content/drive/MyDrive/Colab Notebooks/RAG/db'

# Embedding function handed to the Chroma collection (used for both stored chunks and queries).
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

# Persistent (on-disk) Chroma client: stored records survive kernel restarts.
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Disabled draft: the triple-quoted string below is never executed. If enabled, it would open
# the "context" collection and add the chunks using the ids stored in each chunk's metadata.
"""chroma_collection = chroma_client.get_or_create_collection(
    name="context", embedding_function=sentence_transformer_ef
)

# Add train chunks to the Chroma collection
chroma_collection.add(
    documents=[chunk.page_content for chunk in chunks],
    metadatas=[chunk.metadata for chunk in chunks],
    ids=[chunk.metadata["id"] for chunk in chunks],
)"""

In [None]:
# Check the content of the DB to see whether there are duplicates or not.
# The collection handle is obtained here because the preceding cell that would create it is
# disabled (wrapped in a string), so `chroma_collection` does not exist yet on a fresh kernel.
chroma_collection = chroma_client.get_or_create_collection(
    name="context", embedding_function=sentence_transformer_ef
)
db_content = chroma_collection.get()
db_content

## Try old code (chunking and ingestion with generated IDs)

In [7]:
# Open the "context" collection (creating it if it does not exist yet) with the
# sentence-transformer embedding function, then display how many records it holds.
chroma_collection = chroma_client.get_or_create_collection(
    name="context", embedding_function=sentence_transformer_ef
)
chroma_collection.count()

0

In [9]:
import time
import hashlib
import random

def split_documents(documents, chunk_size=512, chunk_overlap=30):
    """Split documents into overlapping character chunks.

    Args:
        documents: List of ``Document`` objects to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``documents`` (the chunked documents).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def get_context(ds, n_contexts=100, split="train"):
    """Chunk the first ``n_contexts`` contexts of a dataset split.

    Args:
        ds: Mapping of split name to a table exposing a ``"context"`` column.
        n_contexts: Number of leading contexts to chunk (the original hard-coded 100).
        split: Name of the split to read (the original hard-coded ``"train"``).

    Returns:
        A list of ``{"page_content": <chunk text>}`` dicts, in document order.
    """
    return [
        {"page_content": chunk.page_content}
        for context in ds[split]["context"][:n_contexts]
        for chunk in split_documents([Document(page_content=context)])
    ]

def generate_unique_id(chunk):
    """Build an id of the form ``<epoch-ms>_<md5 prefix>_<random number>`` for a chunk.

    Args:
        chunk: Dict holding the chunk text under ``"page_content"``.

    Returns:
        A string joining, with underscores, the current time in milliseconds,
        the first 8 hex characters of the MD5 of the chunk text, and a random
        integer between 100000 and 999999 (inclusive).

    Note:
        The timestamp and random parts make the id different on every call,
        even for identical chunk text.
    """
    millis = int(time.time() * 1000)
    nonce = random.randint(100000, 999999)
    digest = hashlib.md5(chunk["page_content"].encode('utf-8')).hexdigest()
    return "_".join((str(millis), digest[:8], str(nonce)))



def add_context_to_chroma(chunks):
    """Add chunks to the module-level ``chroma_collection`` under freshly generated ids.

    Args:
        chunks: List of dicts holding the chunk text under ``"page_content"``.

    Returns:
        None. The chunks are stored in ``chroma_collection`` as a side effect.
    """
    if not chunks:
        # Nothing to store: skip the call rather than sending empty lists to Chroma.
        return

    # One generated id per chunk, in the same order as the documents.
    chunk_ids = [generate_unique_id(chunk) for chunk in chunks]

    chroma_collection.add(
        documents=[chunk["page_content"] for chunk in chunks],
        ids=chunk_ids,
    )

In [10]:
# Chunk the dataset contexts.
chunks = get_context(rag_dataset)

# Ingest only into an empty collection: the Chroma client is persistent and the ids are freshly
# generated on every call, so re-running this cell would otherwise store every chunk again.
if chroma_collection.count() == 0:
    add_context_to_chroma(chunks)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [11]:
# Fetch everything stored in the collection (kept in `db_content` for inspection)
# and display the number of stored records.
db_content = chroma_collection.get()
chroma_collection.count()

872

## Prepare Queries

In [12]:
# System prompt for retrieval-augmented queries; the retrieved context is appended after "Context: ".
# Built by implicit string concatenation: the previous backslash continuations embedded the
# source indentation (long runs of spaces) inside the prompt text.
RAG_PROMPT = (
    "Be helpful and answer questions concisely. "
    "Utilize the context provided for accurate and specific information. "
    "Context: "
)

# System prompt for the baseline queries that receive no retrieved context.
NO_RAG_PROMPT = "Be helpful and answer questions concisely."

# Sampling temperature passed to every chat completion request.
TEMPERATURE = 0.2

In [13]:
import random

# Number of leading train examples used as evaluation questions.
n_questions = 100

# Reference answers provided by the dataset for those examples.
answers = list(rag_dataset["train"]["answer"][:n_questions])

# The matching questions, in the same order as `answers`.
questions = list(rag_dataset["train"]["question"][:n_questions])



In [13]:
# Spot-check the first question and its reference answer.
print(questions[0])
print(answers[0])

Who found the answer to a search query collar george herbert essay?
Francisco Rogers found the answer to a search query collar george herbert essay.


In [14]:
def query_rag(query_text: str, model, n_results: int = 5):
    """Answer ``query_text`` with ``model``, using chunks retrieved from Chroma as context.

    The top ``n_results`` documents returned by ``chroma_collection.query`` are joined
    with a ``---`` separator and appended to ``RAG_PROMPT`` as the system message; the
    question itself is sent as the user message.

    Args:
        query_text: The question to answer (also used as the retrieval query).
        model: Model identifier passed to the chat completion request.
        n_results: Number of documents to retrieve (defaults to the original 5).

    Returns:
        A ``(answer, context)`` tuple: the content of the first completion choice and
        the joined context string that was sent to the model.
    """
    result = chroma_collection.query(
        query_texts=query_text,
        n_results=n_results,
    )

    # Build the context text from the top results of the (single) query.
    documents = result["documents"][0]
    context = "\n\n---\n\n".join(documents)

    completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": f"{RAG_PROMPT} {context}",
            },
            {
                "role": "user",
                "content": query_text,
            },
        ],
        model=model,
        temperature=TEMPERATURE,
    )
    return completion.choices[0].message.content, context

def query(query_text: str, model):
    """Answer ``query_text`` with ``model`` without any retrieved context.

    Sends ``NO_RAG_PROMPT`` as the system message and the question as the user
    message, using the module-level ``client`` and ``TEMPERATURE``.

    Returns:
        The content of the first completion choice.
    """
    system_message = {"role": "system", "content": NO_RAG_PROMPT}
    user_message = {"role": "user", "content": query_text}

    completion = client.chat.completions.create(
        messages=[system_message, user_message],
        model=model,
        temperature=TEMPERATURE,
    )
    return completion.choices[0].message.content


In [None]:
# Try a single query.
# NOTE(review): `models` and the Groq `client` used by `query_rag` are only defined in later
# cells (under "Run Queries"), so this cell fails on a fresh top-to-bottom run; execute it
# after those cells.
res, cont = query_rag("What are some of the potential negative impacts of charity as discussed in the context?", models[0])

## Run Queries

In [15]:
# Importing Necessary Libraries
import os
from getpass import getpass

from groq import Groq

# Instantiation of Groq Client.
# The key is read from the GROQ_API_KEY environment variable, falling back to an interactive
# prompt, so that no credential (or empty placeholder) is stored in the notebook.
client = Groq(api_key=os.environ.get("GROQ_API_KEY") or getpass("Please provide your Groq API key: "))

In [16]:
# Model identifiers compared in the experiments; each one is passed as `model` to the Groq client.
models = ["llama-3.1-8b-instant", "gemma2-9b-it"]

In [18]:

# For every question and every model, record the answer with retrieval, the answer
# without retrieval, and the context that was retrieved.
predictions = {name: [] for name in models}
for question in questions:
    for model in models:
        answer_rag, context = query_rag(question, model)
        answer_plain = query(question, model)
        record = {
            "question": question,
            "answer_with_rag": answer_rag,
            "answer_without_rag": answer_plain,
            "retrieved_contexts": context,
        }
        predictions[model].append(record)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the collected predictions (a dict mapping each model to its list of result dicts).
predictions

## Save Answers to CSV File

In [20]:
import csv

# Columns of the answer files; they match the keys of each prediction record.
CSV_FIELDS = ["question", "answer_with_rag", "answer_without_rag", "retrieved_contexts"]

# Write one file per model, named answers_<model>_<n_questions>.csv.
for model in models:
    csv_path = 'answers_' + model + '_' + str(n_questions) + '.csv'
    # Explicit UTF-8 so non-ASCII characters in answers and contexts are written the same
    # way regardless of the platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=CSV_FIELDS)
        writer.writeheader()
        writer.writerows(predictions[model])

### Read Answer Files (if they already exist)

In [None]:
import csv

# Reload the saved answers of every model.
# Previously only the file of `models[0]` was read, and its rows were stored under whatever
# value the loop variable `model` still held from an earlier cell (i.e. the wrong key).
predictions = {}
for model in models:
    csv_path = 'answers_' + model + '_' + str(n_questions) + '.csv'
    with open(csv_path, 'r', newline='', encoding='utf-8') as file:
        predictions[model] = list(csv.DictReader(file))

# Evaluation

In [None]:
# Install the packages used by the evaluation section (ragas is also installed in the setup cell).
!pip install langchain openai weaviate-client ragas

In [19]:
from evaluate import load

# Load the text-overlap / similarity metrics from the `evaluate` library. They are used at the
# end of the notebook to compare generated answers against the dataset's reference answers.
bertscore = load('bertscore')
rouge = load('rouge')
bleu = load('bleu')

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [45]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

OPENAI_API_KEY=""
!pip install langchain openai weaviate-client ragasimport os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

Please provide your OpenAI Key:  ········


In [70]:
import json

from datasets import Dataset

# Build the evaluation data from ONE model's predictions. The previous per-model loop overwrote
# its variables on every iteration, so only the last model was ever used; that choice is now
# explicit. Retrieval does not depend on the model (`query_rag` queries Chroma with the question
# only), so the retrieved contexts are the same for every model.
eval_model = models[-1]
eval_entries = predictions[eval_model]

rag_predictions = [entry["answer_with_rag"] for entry in eval_entries]
no_rag_predictions = [entry["answer_without_rag"] for entry in eval_entries]
context = [[entry["retrieved_contexts"]] for entry in eval_entries]
questions = [entry["question"] for entry in eval_entries]

# The reference answers are matched to the predictions by position.
if len(eval_entries) != len(answers):
    raise ValueError(
        f"{len(eval_entries)} predictions for {eval_model!r} but {len(answers)} reference answers"
    )

# `data` used to be defined only inside a string literal, so this cell failed with a NameError
# on a fresh kernel. The model output now goes under "answer" and the dataset's ground truth
# under "reference" (previously "reference" held the model's own answers, so the context
# metrics were scored against the prediction instead of the ground truth).
# "ground_truths" is kept because a later cell reads `dataset['ground_truths']`.
data = {
    "question": questions,
    "answer": rag_predictions,
    "contexts": context,
    "reference": list(answers),
    "ground_truths": list(answers),
}

# Convert the data dictionary to a JSON string
json_data = json.dumps(data, ensure_ascii=False, indent=4)

# Build a `datasets.Dataset` from the dictionary (input of the ragas evaluation)
dataset = Dataset.from_dict(data)

# Save JSON data to a file (optional)
with open("dataset.json", "w", encoding="utf-8") as f:
    f.write(json_data)


In [71]:

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

#print(dataset[:5]["contexts"])

# Score `dataset` with the context precision and context recall metrics
# (faithfulness and answer relevancy are imported above but left disabled).
result = evaluate(
    dataset = dataset, 
    metrics=[
        context_precision,
        context_recall,
        #faithfulness,
        #answer_relevancy,
    ],
)

# Tabular (pandas) view of the evaluation result.
df = result.to_pandas()

Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

## Last try: RAGAS evaluation in a single cell

In [81]:
import json
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Build the evaluation data from ONE model's predictions. The previous per-model loop rebuilt
# `data` on every iteration, so only the last model was ever evaluated; that choice is now
# explicit. Retrieval does not depend on the model (`query_rag` queries Chroma with the question
# only), so the retrieved contexts are the same for every model.
eval_model = models[-1]
eval_entries = predictions[eval_model]

rag_predictions = [entry["answer_with_rag"] for entry in eval_entries]
no_rag_predictions = [entry["answer_without_rag"] for entry in eval_entries]
context = [[entry["retrieved_contexts"]] for entry in eval_entries]  # one list of contexts per sample
questions = [entry["question"] for entry in eval_entries]

# The reference answers are matched to the predictions by position.
if len(eval_entries) != len(answers):
    raise ValueError(
        f"{len(eval_entries)} predictions for {eval_model!r} but {len(answers)} reference answers"
    )

# The model output goes under "answer" and the dataset's ground truth under "reference".
# Previously "reference" held the model's own answers, so context precision/recall were scored
# against the prediction instead of the ground truth.
# "ground_truths" is kept because a later cell reads `dataset['ground_truths']`.
data = {
    "question": questions,
    "answer": rag_predictions,
    "contexts": context,
    "reference": list(answers),
    "ground_truths": list(answers),
}

# Validate JSON structure
try:
    json_data = json.dumps(data, ensure_ascii=False, indent=4)
except (TypeError, ValueError) as e:
    print(f"JSON Serialization Error: {e}")
    raise

# Convert to Dataset using `datasets`
dataset = Dataset.from_dict(data)

# Save JSON data to a file (optional)
with open("dataset.json", "w", encoding="utf-8") as f:
    f.write(json_data)

# Evaluate with Ragas
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        # Uncomment if needed
        #faithfulness,
        #answer_relevancy,
    ],
)

# Convert results to a pandas DataFrame
df = result.to_pandas()

# Handle NaN values in results
df = df.fillna("N/A")  # Replace NaN with a placeholder


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Inspect the "ground_truths" column stored in the evaluation dataset.
dataset['ground_truths']

In [None]:
from ragas.dataset_schema import SingleTurnSample 
from ragas.metrics import Faithfulness


sample = []
for model in models:
    rag_predictions = [entry["answer_with_rag"] for entry in predictions[model]]
    no_rag_predictions = [entry["answer_without_rag"] for entry in predictions[model]]
    context = [entry["context"] for entry in predictions[model]]
    questions = [entry["question"] for entry in predictions[model]]

for i in range(len(rag_predictions)):
    sample.append(SingleTurnSample(
            user_input= questions[i],
            response= rag_predictions[i],
            retrieved_contexts=[context[i]]
        ))
    scorer = Faithfulness(llm=llm)
    await scorer.single_turn_ascore(sample[i])

In [None]:
import json
import numpy as np

# Define metrics for BERTScore
bertscore_metrics = ["precision", "recall", "f1"]

# Experiment metadata
experiment_metadata = {
    "dataset": DATASET,
    "num_questions": n_questions,
    "temperature": TEMPERATURE,
    "rag_config": ""
}


def _score_answers(candidate_answers):
    """Score generated answers against the reference ``answers``.

    Args:
        candidate_answers: Generated answers, in the same order as ``answers``.

    Returns:
        A dict with the mean BERTScore precision/recall/f1, the ROUGE result and
        the BLEU result, as returned by the loaded ``evaluate`` metrics.
    """
    bert = bertscore.compute(predictions=candidate_answers, references=answers, lang="en")
    return {
        "BERTScore": {metric: float(np.mean(bert[metric])) for metric in bertscore_metrics},
        "ROUGE": rouge.compute(predictions=candidate_answers, references=answers),
        # BLEU requires nested references (a list of references per prediction).
        "BLEU": bleu.compute(predictions=candidate_answers, references=[[a] for a in answers]),
    }


# Compute metrics for each model
results = []
for model in models:
    rag_predictions = [entry["answer_with_rag"] for entry in predictions[model]]
    no_rag_predictions = [entry["answer_without_rag"] for entry in predictions[model]]

    # Create results entry
    results.append({
        "experiment": experiment_metadata,
        "model": model,
        "metrics": {
            "RAG": _score_answers(rag_predictions),
            "No_RAG": _score_answers(no_rag_predictions),
        }
    })

# Save to JSONL. The file is opened in append mode, so every run adds one line per model.
output_file = "experiment_results.jsonl"
with open(output_file, "a", encoding="utf-8") as f:
    # `record` (not `result`) so the ragas `result` object from the earlier cell is not clobbered.
    for record in results:
        f.write(json.dumps(record) + "\n")