### ChatGPT first prompt :

"Identify the key areas of improvements in the code you previously gave for R2AG and rewrite them to improve the accuracy of the model."

### ChatGPT last prompt :

" I'm getting the following error :
TypeError: sequence item 0: expected str instance, dict found"

In [1]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import sklearn
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer

import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initializing OpenAI client
client = OpenAI()


To evaluate this model, we will use the MuSiQue dataset using pandas which consists of questions with their ground truths/answers for comparison.

In [3]:
# Loading the dataset
splits = {'train': 'musique_ans_v1.0_train.jsonl', 'validation': 'musique_ans_v1.0_dev.jsonl'}
df = pd.read_json("hf://datasets/dgslibisey/MuSiQue/" + splits["train"], lines=True)
df = df[["question", "answer", "paragraphs"]].dropna()
df = df.head(10)
df["answer"] = df["answer"].apply(lambda x: x[0] if isinstance(x, list) else x)



In [4]:
# Loading embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# R2Former definition
class R2Former(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(R2Former, self).__init__()
        self.self_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=8, batch_first=True)
        self.linear = nn.Linear(input_dim, hidden_dim)

    def forward(self, x):
        attn_output, _ = self.self_attention(x, x, x)
        return self.linear(attn_output)

# Initializing R2Former with correct input_dim to match embedding dim
r2former = R2Former(input_dim=384, hidden_dim=384)

In [5]:
# Retriever class
class Retriever:
    def __init__(self, documents, model):
        self.documents = documents
        self.model = model
        self.doc_embeddings = model.encode(documents, convert_to_tensor=True)

    def retrieve(self, query, top_k=5):
        query_embedding = self.model.encode([query], convert_to_tensor=True)
        cos_scores = torch.nn.functional.cosine_similarity(query_embedding, self.doc_embeddings)
        top_results = torch.topk(cos_scores, k=top_k)
        return [self.documents[idx] for idx in top_results.indices.tolist()]

# Preparing corpus for retrieval
corpus = df["paragraphs"].tolist()
retriever = Retriever(corpus, embedding_model)

In [6]:
# Re-ranking using R2Former
def transform_retrievals(retriever, query, r2former, top_k=5):
    retrieved_docs = retriever.retrieve(query, top_k=top_k)
    embeddings = retriever.model.encode(retrieved_docs, convert_to_tensor=True)
    embeddings = embeddings.unsqueeze(0)  # Adding batch dim
    refined_embeddings = r2former(embeddings).squeeze(0)
    query_embedding = retriever.model.encode([query], convert_to_tensor=True).squeeze(0)
    sim_scores = torch.nn.functional.cosine_similarity(refined_embeddings, query_embedding.unsqueeze(0), dim=1)
    top_indices = torch.topk(sim_scores, k=3).indices.tolist()

    top_docs = []
    for i in top_indices:
        doc = retrieved_docs[i]
        
        # If doc is a dictionary, trying to get the text key
        if isinstance(doc, dict):
            text = doc.get('text', '')  
            top_docs.append(text)
        
        # If doc is a list, joining all its elements into a single string
        elif isinstance(doc, list):
            # Assuming the list contains text data
            top_docs.append(" ".join(str(item) for item in doc))  # Converting each item to string and join
        
        # If doc is a plain string, appending it directly
        elif isinstance(doc, str):
            top_docs.append(doc)

    return refined_embeddings, top_docs

In [14]:
# Response generation
def generate_response(context, query):
    prompt = f"""
You are a helpful assistant. You are given paragraphs as context. 
Only use the information present in these paragraphs to answer the question. 
Do not make up any information, and do not use external knowledge.
Give only one or two word answer, nothing else.

Context:
{context}

Question: {query}
Answer (only using the above context):
"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content.strip()

In [None]:
# Sample question and context for testing
sample_context = """
Exercise is important because it helps to improve physical health, boosts mental well-being, and supports the immune system. Regular physical activity can prevent chronic diseases like heart disease, diabetes, and obesity. Additionally, it improves sleep, reduces stress, and boosts overall mood.
"""
sample_question = "Why is exercise important?"

# Generating response using the sample context and question
generated_answer = generate_response(sample_context, sample_question)

# The result :
print(f"Sample Question: {sample_question}")
print(f"Generated Answer: {generated_answer}")

Sample Question: Why is exercise important?
Generated Answer: Exercise is important because it helps to improve physical health, boosts mental well-being, and supports the immune system. It can prevent chronic diseases like heart disease, diabetes, and obesity. Additionally, exercise improves sleep, reduces stress, and boosts overall mood.


In [15]:
# ROUGE and F1 score evaluation
def rouge_score_evaluation(predicted_answer, ground_truth):
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = scorer.score(ground_truth, predicted_answer)
    return scores["rougeL"].fmeasure

def f1_score_evaluation(predicted_answer, ground_truth):
    pred_tokens = predicted_answer.lower().split()
    true_tokens = ground_truth.lower().split()
    common = set(pred_tokens) & set(true_tokens)
    if not pred_tokens or not true_tokens:
        return 0, 0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(true_tokens)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    return round(f1, 4), 1 if f1 > 0.5 else 0

In [16]:
# Evaluation loop
rouge_scores = []
f1_preds = []
f1_trues = []

for index, row in df.iterrows():
    query = row["question"]
    ground_truth = row["answer"]
    refined_embeddings, top_docs = transform_retrievals(retriever, query, r2former)
    context = "\n".join(top_docs)
    generated_answer = generate_response(context, query)
    rouge = rouge_score_evaluation(generated_answer, ground_truth)
    f1_val, f1_binary = f1_score_evaluation(generated_answer, ground_truth)

    rouge_scores.append(rouge)
    f1_preds.append(f1_binary)
    f1_trues.append(1)

    print(f"\nQuestion: {query}\nGenerated Answer: {generated_answer}\nGround Truth: {ground_truth}\nROUGE-L: {rouge}\nF1 Score: {f1_val}")


Question: When was the institute that owned The Collegian founded?
Generated Answer: 1960
Ground Truth: 1960
ROUGE-L: 1.0
F1 Score: 1.0

Question: What year saw the creation of the region where the county of Hertfordshire is located?
Generated Answer: 1994
Ground Truth: 1994
ROUGE-L: 1.0
F1 Score: 1.0

Question: When was the abolishment of the studio that distributed The Game?
Generated Answer: Not mentioned
Ground Truth: 1999
ROUGE-L: 0.0
F1 Score: 0

Question: When was the publisher of Crux launched?
Generated Answer: Not mentioned
Ground Truth: 1998
ROUGE-L: 0.0
F1 Score: 0

Question: Jan Šindel's was born in what country?
Generated Answer: Not mentioned
Ground Truth: Czech Republic
ROUGE-L: 0.0
F1 Score: 0

Question: What city is the person who broadened the doctrine of philosophy of language from?
Generated Answer: Copenhagen
Ground Truth: Copenhagen
ROUGE-L: 1.0
F1 Score: 1.0

Question: When was the baseball team winning the world series in 2015 baseball created?
Generated Answe

In [17]:
# Final evaluation
average_rouge = sum(rouge_scores) / len(rouge_scores)
overall_f1 = f1_score(f1_trues, f1_preds)
print(f"\nAverage ROUGE-L Score: {average_rouge}")
print(f"Overall F1 Score: {overall_f1}")


Average ROUGE-L Score: 0.5
Overall F1 Score: 0.6666666666666666
