In [None]:
# Load all required Libraries
import pandas as pd
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)
import os
from dotenv import load_dotenv
import numpy as np
load_dotenv()
API_KEY = os.getenv(OPENAI_API_KEY)


  from .autonotebook import tqdm as notebook_tqdm


True

# Read Passages from the Datasets and Drop rows if they are NA or empty

In [2]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

print(passages.shape)
passages.head()

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


# Tokenize Text and Generate Embeddings using Sentence Transformers

In [3]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode Text
embeddings = embedding_model.encode(
    passages['passage'].tolist()
)


embeddings_np = np.asarray(embeddings)
embeddings_np.shape

(3200, 384)

# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [5]:
# Define every column of your schema
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType


id_ = FieldSchema(name = "id", dtype = DataType.INT64, is_primary = True)        
passage = FieldSchema(name = "passage", dtype = DataType.VARCHAR, max_length = 9000)
embedding = FieldSchema(name = "embedding", dtype=DataType.FLOAT_VECTOR, dim = 384)  

In [6]:
schema = CollectionSchema(
    fields=[id_, passage, embedding],
    description="rag_mini"
)

In [7]:


client = MilvusClient("rag_wikipedia_mini.db")

# Create the Collection with Collection Name = "rag_mini". Make sure you define the schema variable while creating the collection


collection_name = "rag_mini"


client.create_collection(
    collection_name=collection_name,
    schema=schema
)
print("Collection created")

  from pkg_resources import DistributionNotFound, get_distribution
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collection created


**Convert your Pandas Dataframe to a list of dictionaries**
- The Dictionary at least have 3 keys [id, passage, embedding]

In [8]:
rag_data = [
    {"id": int(i), "passage": passages.iloc[i]["passage"], "embedding": embeddings[i].tolist()}
    for i in range(len(passages))
]
len(rag_data), rag_data[0].keys()


(3200, dict_keys(['id', 'passage', 'embedding']))

In [9]:
# Code to insert the data to your DB
res = client.insert(collection_name="rag_mini", data=rag_data)

print(res)

{'insert_count': 3200, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

- Do a Sanity Check on your database 

**Do not delete the below line during your submission**

In [10]:
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

Entity count: 41600
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'rag_mini', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 9000}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}


# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

In [11]:
import pandas as pd

queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")
queries
len(queries)

queries = queries.reset_index()
queries = queries[['question','answer']]
queries.head()

Unnamed: 0,question,answer
0,Was Abraham Lincoln the sixteenth President of...,yes
1,Did Lincoln sign the National Banking Act of 1...,yes
2,Did his mother die of pneumonia?,no
3,How many long was Lincoln's formal education?,18 months
4,When did Lincoln begin his political career?,1832


In [12]:

# Get the first question first
query = queries.iloc[0]["question"]
query_embedding = embedding_model.encode([query], normalize_embeddings=True)
import numpy as np
query_embedding = np.asarray(query_embedding, dtype="float32")
print(query_embedding.shape)

(1, 384)


#### Create Index on the embedding column on your DB

In [13]:
index_params = MilvusClient.prepare_index_params()


index_params.add_index(
    field_name="embedding",
    index_type="AUTOINDEX",
    metric_type="COSINE",
    params={"M": 8, "efConstruction": 64}
)


try:
    client.create_index(collection_name="rag_mini", index_params=index_params)
    print("Index created")
except Exception as e:
    print(f"Index creation result: {e}")

# Load collection into memory (required for search)
client.load_collection("rag_mini")
print("Collection loaded into memory")

Index created
Collection loaded into memory


In [14]:

output_ = client.search(
    collection_name="rag_mini",
    data=query_embedding,               
    anns_field="embedding",
    limit=50,
    output_fields=["id", "passage"],
    search_params={"metric_type": "COSINE", "params": {"ef": 64}}
)
output = list(output_)

In [15]:
len(output[0])

4

## Now get the Context 
- Initially use the first passage ONLY as your context
- In Later Experiments, you must try at least 2 different passage selection strategies (Top 3 / Top 5 / Top 10) and pass to your prompt

In [16]:

top = output_[0][0]

context = top['entity']['passage']
context[:500]

'Young Abraham Lincoln'

**Develop your Prompt**

In [17]:
system_prompt = f"You are helpful assistant. Answer the question using the provided context. If the answer is not contained in the context, say you don't know."

prompt = f"""{system_prompt} \n Context: {context}: \n Question: {query} """
print(prompt)

You are helpful assistant. Answer the question using the provided context. If the answer is not contained in the context, say you don't know. 
 Context: Young Abraham Lincoln: 
 Question: Was Abraham Lincoln the sixteenth President of the United States? 


# RAG Response for a Single Query

In [None]:
# Load the LLM Model you want to use
import os
from openai import OpenAI

# key = os.getenv("OPENAI_API_KEY")
# key = str(key)
llm_client = OpenAI(api_key = API_KEY)
def generate_answer(prompt, model = "gpt-5-nano-2025-08-07"):
    response = llm_client.responses.create(
        model = model,
        input = prompt
    )
    return response


answer_text = generate_answer(prompt).output_text
print(answer_text)



I don't know. The provided context doesn't include that information.


# Generate Responses for all the Queries in the Dataset

NOTE: Since processing each query takes at least 5-6 second, getting answers for all 918 rows of data is time-consume and costly. Therefore, for this analysis, we'll be only using 200 rows from the dataset to evaluate accuracy for the RAG protocol. 

In [None]:
TOP_K_CONTEXT = 5
MODEL_NAME = "gpt-4o-mini"

llm_client = OpenAI(api_key = API_KEY)
def generate_answer(prompt, model = "gpt-5-nano-2025-08-07"):
    response = llm_client.responses.create(
        model = model,
        input = prompt
    )
    return response

def generate_prompt(context, question):
    return f"""
        INSTRUCTIONS FOR ANSWERING QUESTIONS:
        TASK: Answer the given question using ONLY the provided context.
        PROCESS:
        1. Read the context thoroughly
        2. Determine if the context contains sufficient information to answer
        3. If sufficient: Provide a direct, accurate answer (yes/no/one word answer)
        4. If insufficient: State "unknown"
        REQUIREMENTS:
        - Base your answer strictly on the provided context
        - Do not use external knowledge
        - Be concise but complete
        - Maintain factual accuracy
        Context: {context}

        Question: {question}

        Following the instructions above, provide your answer:
    """

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
answers = []

queries_200 = queries.head(200)

for i, row in queries_200.iterrows():
    q = row['question']
    q_emb = embedding_model.encode([q], normalize_embeddings = True)
    q_emb = np.asarray(q_emb,dtype = "float32")
    output_ = client.search(
        collection_name="rag_mini",
        data=q_emb,               
        anns_field="embedding",
        limit=5,
        output_fields=["id", "passage"],
        search_params={"metric_type": "COSINE", "params": {"ef": 16}}

    )
    context = output_[0][0]['entity']['passage']
    
    prompt = generate_prompt(context, q)
    print(f"prompt {i} length: {len(prompt)}")
    answer = generate_answer(prompt)
    answers.append(answer.output_text)
    if i%10 == 0:
        print(f"processed {i} queries")

prompt 0 length: 767
processed 0 queries
prompt 1 length: 1797
prompt 2 length: 1131
prompt 3 length: 1613
prompt 4 length: 1163
prompt 5 length: 917
prompt 6 length: 835
prompt 7 length: 1791
prompt 8 length: 1671
prompt 9 length: 763
prompt 10 length: 1166
processed 10 queries
prompt 11 length: 1172
prompt 12 length: 1082
prompt 13 length: 744
prompt 14 length: 727
prompt 15 length: 1649
prompt 16 length: 1676
prompt 17 length: 1476
prompt 18 length: 1124
prompt 19 length: 1608
prompt 20 length: 1936
processed 20 queries
prompt 21 length: 730
prompt 22 length: 821
prompt 23 length: 1197
prompt 24 length: 964
prompt 25 length: 743
prompt 26 length: 1093
prompt 27 length: 1153
prompt 28 length: 843
prompt 29 length: 1028
prompt 30 length: 946
processed 30 queries
prompt 31 length: 1078
prompt 32 length: 1125
prompt 33 length: 872
prompt 34 length: 1181
prompt 35 length: 1306
prompt 36 length: 1165
prompt 37 length: 971
prompt 38 length: 1392
prompt 39 length: 834
prompt 40 length: 1415

In [23]:
len(answers)

200

# Finding out the Basic QA Metrics (F1 score, EM score)

In [27]:
# F1 Score and EM Score Implementation
import re
import string
from collections import Counter
import numpy as np

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.IGNORECASE)
        return re.sub(regex, ' ', text)
    
    def white_space_fix(text):
        return ' '.join(text.split())
    
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    
    def lower(text):
        return text.lower()
    
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def f1_score(prediction, ground_truth):
    """Calculate F1 score between prediction and ground truth."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)
    
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return 0
    
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    
    return f1

def exact_match_score(prediction, ground_truth):
    """Calculate exact match score between prediction and ground truth."""
    return int(normalize_answer(prediction) == normalize_answer(ground_truth))

# Calculate F1 and EM scores for all 200 queries
f1_scores = []
em_scores = []

print("Calculating F1 and EM scores...")
print("=" * 50)

for i in range(len(queries_200)):
    question = queries_200.iloc[i]['question']
    generated_answer = answers[i]
    ground_truth = queries_200.iloc[i]['answer'] 
    
    f1 = f1_score(generated_answer, ground_truth)
    em = exact_match_score(generated_answer, ground_truth)
    
    f1_scores.append(f1)
    em_scores.append(em)
    
    # Print first few examples
    if i == 0:
        print(f"\nExample {i+1}:")
        print(f"Question: {question}")
        print(f"Ground Truth: {ground_truth}")
        print(f"Generated Answer: {generated_answer}")
        print(f"F1 Score: {f1:.4f}")
        print(f"EM Score: {em}")
        print("-" * 30)

# Calculate overall metrics
overall_f1 = np.mean(f1_scores)
overall_em = np.mean(em_scores)

print(f"\n{'='*50}")
print(f"OVERALL RESULTS:")
print(f"{'='*50}")
print(f"Average F1 Score: {overall_f1:.4f}")
print(f"Average EM Score: {overall_em:.4f}")
print(f"Total Questions Evaluated: {len(queries_200)}")
print(f"{'='*50}")

# Additional statistics
print(f"\nF1 Score Statistics:")
print(f"Min F1: {min(f1_scores):.4f}")
print(f"Max F1: {max(f1_scores):.4f}")
print(f"Std F1: {np.std(f1_scores):.4f}")

print(f"\nEM Score Statistics:")
print(f"Exact Matches: {sum(em_scores)}")
print(f"Exact Match Rate: {sum(em_scores)/len(em_scores)*100:.2f}%")


Calculating F1 and EM scores...

Example 1:
Question: Was Abraham Lincoln the sixteenth President of the United States?
Ground Truth: yes
Generated Answer: unknown
F1 Score: 0.0000
EM Score: 0
------------------------------

OVERALL RESULTS:
Average F1 Score: 0.4675
Average EM Score: 0.4000
Total Questions Evaluated: 200

F1 Score Statistics:
Min F1: 0.0000
Max F1: 1.0000
Std F1: 0.4732

EM Score Statistics:
Exact Matches: 80
Exact Match Rate: 40.00%


# Add Two Advaned RAG Features

In [19]:
# Advanced RAG Integration - Query Rewriting and Reranking
# Following the same structure as your existing code

# Install required package first:
# pip install sentence-transformers

from sentence_transformers import CrossEncoder
import re

# Initialize reranker (one-time setup)
print("Initializing reranker...")
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
print("Reranker ready!")

def rewrite_query(query):
    """
    Query Rewriting: Generate multiple query variations
    """
    # Extract key terms from query
    words = re.findall(r'\b\w+\b', query.lower())
    stop_words = {'what', 'who', 'when', 'where', 'why', 'how', 'is', 'are', 'was', 'were', 'the', 'a', 'an', 'and', 'or', 'but'}
    key_terms = [w for w in words if len(w) > 2 and w not in stop_words]
    
    # Generate query variations
    variations = [query]  # Always include original query
    
    if key_terms:
        main_term = key_terms[0]
        variations.extend([
            f"Information about {main_term}",
            f"Details regarding {main_term}",
            f"What is {main_term}?",
            f"Tell me about {main_term}"
        ])
    
    return variations[:4]  # Limit to 4 variations

def enhanced_search_with_reranking(query, top_k=5):
    """
    Enhanced search combining query rewriting and reranking
    """
    # Step 1: Query rewriting
    query_variations = rewrite_query(query)
    print(f"Generated {len(query_variations)} query variations")
    
    # Step 2: Search for each variation (same as your existing search)
    all_docs = []
    for variation in query_variations:
        q_emb = embedding_model.encode([variation], normalize_embeddings=True)
        q_emb = np.asarray(q_emb, dtype="float32")
        
        output_ = client.search(
            collection_name="rag_mini",
            data=q_emb,
            anns_field="embedding",
            limit=20,  # Get more docs for reranking
            output_fields=["id", "passage"],
            search_params={"metric_type": "COSINE", "params": {"ef": 16}}
        )
        
        for result in output_[0]:
            all_docs.append({
                'id': result['entity']['id'],
                'passage': result['entity']['passage'],
                'distance': result['distance']
            })
    
    # Step 3: Remove duplicates (keep best distance for each doc)
    unique_docs = {}
    for doc in all_docs:
        doc_id = doc['id']
        if doc_id not in unique_docs or doc['distance'] < unique_docs[doc_id]['distance']:
            unique_docs[doc_id] = doc
    
    unique_documents = list(unique_docs.values())
    print(f"Retrieved {len(unique_documents)} unique documents")
    
    # Step 4: Reranking
    if len(unique_documents) > 1:
        query_doc_pairs = [(query, doc['passage']) for doc in unique_documents]
        relevance_scores = reranker.predict(query_doc_pairs)
        
        for i, doc in enumerate(unique_documents):
            doc['relevance_score'] = float(relevance_scores[i])
        
        # Sort by relevance score (higher is better)
        reranked_docs = sorted(unique_documents, key=lambda x: x['relevance_score'], reverse=True)
    else:
        reranked_docs = unique_documents
    
    return reranked_docs[:top_k]

# Enhanced answer generation function (same structure as your existing code)
def generate_enhanced_answer(context_docs, question):
    """
    Generate answer using enhanced context (multiple documents)
    """
    # Combine top documents as context
    context = "\n\n".join([doc['passage'] for doc in context_docs])
    
    # Use your existing prompt structure
    prompt = generate_prompt(context, question)
    
    # Use your existing generate_answer function
    answer = generate_answer(prompt)
    return answer.output_text

# Test the enhanced system
# def test_enhanced_rag():
#     """Test the enhanced RAG system with a sample query"""
    
#     sample_query = "Who was Abraham Lincoln?"
    
#     print("Testing Enhanced RAG System")
#     print("=" * 60)
    
#     # Get enhanced results
#     enhanced_docs = enhanced_search_with_reranking(sample_query, top_k=3)
    
#     print(f"\nTop {len(enhanced_docs)} documents after reranking:")
#     for i, doc in enumerate(enhanced_docs):
#         print(f"\nDocument {i+1}:")
#         relevance_score = doc.get('relevance_score')
#         if relevance_score is not None:
#             print(f"Relevance Score: {relevance_score:.3f}")
#         else:
#             print("Relevance Score: N/A")
#         print(f"Content: {doc['passage'][:200]}...")
    
#     # Generate answer
#     print(f"\nGenerating answer for: {sample_query}")
#     answer = generate_enhanced_answer(enhanced_docs, sample_query)
#     print(f"Answer: {answer}")
    
#     return enhanced_docs, answer

# # Run the test
# enhanced_results, enhanced_answer = test_enhanced_rag()

Initializing reranker...
Reranker ready!


In [35]:
TOP_K_CONTEXT = 3  
MODEL_NAME = "gpt-4o-mini"


print("Processing queries with Enhanced RAG (Query Rewriting + Reranking)...")
print("=" * 70)

enhanced_answers = []
queries_100 = queries.head(100)
for i, row in queries_100.iterrows():
    q = row['question']
    # Enhanced search with query rewriting and reranking
    enhanced_docs = enhanced_search_with_reranking(q, top_k=TOP_K_CONTEXT)
    
    # Generate answer using enhanced context
    answer = generate_enhanced_answer(enhanced_docs, q)
    enhanced_answers.append(answer)
    
    if i % 10 == 0:
        print(f"Processed {i} queries")
        print(f"Sample: {q[:50]}... -> {answer[:100]}...")

print(f"\nCompleted processing {len(enhanced_answers)} queries with Enhanced RAG!")

Processing queries with Enhanced RAG (Query Rewriting + Reranking)...
Generated 4 query variations
Retrieved 4 unique documents
Processed 0 queries
Sample: Was Abraham Lincoln the sixteenth President of the... -> Yes...
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 7 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 4 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Processed 10 queries
Sample: Did Lincoln start his political career in 1832?... -> Yes...
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique docu

## Evaluation of Advaned RAG Features (F1 score and EM)

In [38]:

# Calculate F1 and EM scores for Advanced RAG (100 queries)
advanced_f1_scores = []
advanced_em_scores = []

for i in range(len(queries_100)):
    question = queries_100.iloc[i]['question']
    generated_answer = enhanced_answers[i]  # Assuming enhanced_answers contains 100 answers
    ground_truth = queries_100.iloc[i]['answer']
    
    f1 = f1_score(generated_answer, ground_truth)
    em = exact_match_score(generated_answer, ground_truth)
    
    advanced_f1_scores.append(f1)
    advanced_em_scores.append(em)

# Calculate overall metrics for Advanced RAG
advanced_overall_f1 = np.mean(advanced_f1_scores)
advanced_overall_em = np.mean(advanced_em_scores)

print(f"\n{'='*50}")
print(f"ADVANCED RAG RESULTS:")
print(f"{'='*50}")
print(f"Average F1 Score: {advanced_overall_f1:.4f}")
print(f"Average EM Score: {advanced_overall_em:.4f}")
print(f"Total Questions Evaluated: {len(queries_100)}")
print(f"{'='*50}")

# Additional statistics for Advanced RAG
print(f"\nAdvanced RAG F1 Score Statistics:")
print(f"Min F1: {min(advanced_f1_scores):.4f}")
print(f"Max F1: {max(advanced_f1_scores):.4f}")
print(f"Std F1: {np.std(advanced_f1_scores):.4f}")

print(f"\nAdvanced RAG EM Score Statistics:")
print(f"Exact Matches: {sum(advanced_em_scores)}")
print(f"Exact Match Rate: {sum(advanced_em_scores)/len(advanced_em_scores)*100:.2f}%")

# Comparison with Basic RAG (assuming you have basic_answers for 200 queries)
print(f"\n{'='*60}")
print(f"COMPARISON: Basic RAG vs Advanced RAG")
print(f"{'='*60}")

# Basic RAG metrics (from your existing evaluation)
basic_overall_f1 = np.mean(f1_scores)  # Your existing f1_scores from 200 queries
basic_overall_em = np.mean(em_scores)   # Your existing em_scores from 200 queries

print(f"Basic RAG (200 queries):")
print(f"  Average F1 Score: {basic_overall_f1:.4f}")
print(f"  Average EM Score: {basic_overall_em:.4f}")
print(f"  Exact Match Rate: {sum(em_scores)/len(em_scores)*100:.2f}%")

print(f"\nAdvanced RAG (100 queries):")
print(f"  Average F1 Score: {advanced_overall_f1:.4f}")
print(f"  Average EM Score: {advanced_overall_em:.4f}")
print(f"  Exact Match Rate: {sum(advanced_em_scores)/len(advanced_em_scores)*100:.2f}%")

# Improvement analysis
f1_improvement = advanced_overall_f1 - basic_overall_f1
em_improvement = advanced_overall_em - basic_overall_em

print(f"\nIMPROVEMENT ANALYSIS:")
print(f"F1 Score Improvement: {f1_improvement:+.4f} ({f1_improvement/basic_overall_f1*100:+.1f}%)")
print(f"EM Score Improvement: {em_improvement:+.4f} ({em_improvement/basic_overall_em*100:+.1f}%)")

# Performance summary
if f1_improvement > 0:
    print(f"\n✅ Advanced RAG shows {f1_improvement:.4f} improvement in F1 score")
else:
    print(f"\n❌ Advanced RAG shows {f1_improvement:.4f} decrease in F1 score")

if em_improvement > 0:
    print(f"✅ Advanced RAG shows {em_improvement:.4f} improvement in EM score")
else:
    print(f"❌ Advanced RAG shows {em_improvement:.4f} decrease in EM score")

print(f"{'='*60}")


ADVANCED RAG RESULTS:
Average F1 Score: 0.6320
Average EM Score: 0.5400
Total Questions Evaluated: 100

Advanced RAG F1 Score Statistics:
Min F1: 0.0000
Max F1: 1.0000
Std F1: 0.4437

Advanced RAG EM Score Statistics:
Exact Matches: 54
Exact Match Rate: 54.00%

COMPARISON: Basic RAG vs Advanced RAG
Basic RAG (200 queries):
  Average F1 Score: 0.4675
  Average EM Score: 0.4000
  Exact Match Rate: 40.00%

Advanced RAG (100 queries):
  Average F1 Score: 0.6320
  Average EM Score: 0.5400
  Exact Match Rate: 54.00%

IMPROVEMENT ANALYSIS:
F1 Score Improvement: +0.1645 (+35.2%)
EM Score Improvement: +0.1400 (+35.0%)

✅ Advanced RAG shows 0.1645 improvement in F1 score
✅ Advanced RAG shows 0.1400 improvement in EM score


In [None]:
llm_client = OpenAI(api_key = API_KEY)
def generate_answer(prompt, model = "gpt-5-nano-2025-08-07"):
    response = llm_client.responses.create(
        model = model,
        input = prompt
    )
    return response

def generate_prompt(context, question):
    return f"""
        INSTRUCTIONS FOR ANSWERING QUESTIONS:
        TASK: Answer the given question using ONLY the provided context.
        PROCESS:
        1. Read the context thoroughly
        2. Determine if the context contains sufficient information to answer
        3. If sufficient: Provide a direct, accurate answer (yes/no/one word answer)
        4. If insufficient: State "unknown"
        REQUIREMENTS:
        - Base your answer strictly on the provided context
        - Do not use external knowledge
        - Be concise but complete
        - Maintain factual accuracy
        Context: {context}

        Question: {question}

        Following the instructions above, provide your answer:
    """

# Advanced Evaluation using RAGAs

In [28]:
# Simple RAGAs evaluation - avoiding column issues
from datasets import Dataset
import numpy as np

# Use first 10 queries
sample_queries = queries.head(10)

print("Generating data for RAGAs evaluation...")
print("=" * 50)

# Generate Basic RAG data
basic_answers_sample = []
basic_contexts_sample = []

for i in range(10):
    q = sample_queries.iloc[i]['question']
    
    # Basic search
    q_emb = embedding_model.encode([q], normalize_embeddings=True)
    q_emb = np.asarray(q_emb, dtype="float32")
    
    basic_output = client.search(
        collection_name="rag_mini",
        data=q_emb,
        anns_field="embedding",
        limit=1,
        output_fields=["id", "passage"],
        search_params={"metric_type": "COSINE", "params": {"ef": 16}}
    )
    
    basic_context = basic_output[0][0]['entity']['passage']
    basic_contexts_sample.append(basic_context)
    
    # Generate basic answer
    basic_prompt = f"Context: {basic_context}\nQuestion: {q}\nAnswer:"
    basic_answer = generate_answer(basic_prompt)
    basic_answers_sample.append(basic_answer.output_text)

# Generate Enhanced RAG data
enhanced_answers_sample = []
enhanced_contexts_sample = []

for i in range(10):
    q = sample_queries.iloc[i]['question']
    
    # Enhanced search
    enhanced_docs = enhanced_search_with_reranking(q, top_k=3)
    enhanced_contexts_sample.append(enhanced_docs)
    
    # Generate enhanced answer
    enhanced_answer = generate_enhanced_answer(enhanced_docs, q)
    enhanced_answers_sample.append(enhanced_answer)

print("Data generated successfully!")

# Prepare datasets with ALL required columns
basic_data = {
    "question": sample_queries['question'].tolist(),
    "answer": basic_answers_sample,
    "contexts": [[ctx] for ctx in basic_contexts_sample],
    "reference": sample_queries['answer'].tolist(),  # For context_precision
    "ground_truths": sample_queries['answer'].tolist()  # For other metrics
}

enhanced_data = {
    "question": sample_queries['question'].tolist(),
    "answer": enhanced_answers_sample,
    "contexts": [[doc['passage'] for doc in ctx_list] for ctx_list in enhanced_contexts_sample],
    "reference": sample_queries['answer'].tolist(),  # For context_precision
    "ground_truths": sample_queries['answer'].tolist()  # For other metrics
}

# Convert to datasets
basic_dataset = Dataset.from_dict(basic_data)
enhanced_dataset = Dataset.from_dict(enhanced_data)

print("Datasets prepared!")

# Run evaluation
print("\nRunning RAGAs evaluation...")
print("=" * 50)

basic_result = evaluate(
    dataset=basic_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision]
)

enhanced_result = evaluate(
    dataset=enhanced_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision]
)

# Display results
print("\nRAGAs EVALUATION RESULTS")
print("="*60)
print(f"{'Metric':<20} {'Basic RAG':<12} {'Enhanced RAG':<12} {'Improvement':<12}")
print("-" * 60)

metrics = ['faithfulness', 'answer_relevancy', 'context_precision']

for metric in metrics:
    basic_score = basic_result[metric]
    enhanced_score = enhanced_result[metric]
    
    # Handle lists
    if isinstance(basic_score, list):
        basic_score = np.mean(basic_score)
    if isinstance(enhanced_score, list):
        enhanced_score = np.mean(enhanced_score)
    
    improvement = enhanced_score - basic_score
    improvement_pct = (improvement / basic_score) * 100 if basic_score > 0 else 0
    
    print(f"{metric:<20} {basic_score:<12.4f} {enhanced_score:<12.4f} {improvement:+.4f} ({improvement_pct:+.1f}%)")

print("-" * 60)

# Analysis
improvements = 0
for m in metrics:
    basic_score = basic_result[m]
    enhanced_score = enhanced_result[m]
    
    if isinstance(basic_score, list):
        basic_score = np.mean(basic_score)
    if isinstance(enhanced_score, list):
        enhanced_score = np.mean(enhanced_score)
    
    if enhanced_score > basic_score:
        improvements += 1

print(f"\nANALYSIS:")
print(f"✅ Enhanced RAG improves {improvements}/{len(metrics)} metrics")

if improvements >= 2:
    print("🏆 Enhanced RAG significantly outperforms Basic RAG")
elif improvements >= 1:
    print("✅ Enhanced RAG shows moderate improvements")
else:
    print("⚠️ Enhanced RAG shows limited improvements")

print("\n" + "="*60)
print("EVALUATION COMPLETED")
print("="*60)

Generating data for RAGAs evaluation...
Generated 4 query variations
Retrieved 4 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 7 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 6 unique documents
Generated 4 query variations
Retrieved 4 unique documents
Data generated successfully!
Datasets prepared!

Running RAGAs evaluation...


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]Exception raised in Job[10]: IndexError(list index out of range)
Evaluating:   3%|▎         | 1/30 [00:01<00:56,  1.95s/it]Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[7]: IndexError(list index out of range)
Exception raised in Job[4]: IndexError(list index out of range)
Evaluating: 100%|██████████| 30/30 [00:25<00:00,  1.16it/s]
Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[10]: IndexError(list index out of range)
Exception raised in Job[4]: IndexError(list index out of range)
Evaluating:   3%|▎         | 1/30 [00:03<01:47,  3.72s/it]Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[7]: IndexError(list index out of range)
Evaluating:  23%|██▎       | 7/30 [00:07<00:19,  1.19it/s]Exception raised in Jo


RAGAs EVALUATION RESULTS
Metric               Basic RAG    Enhanced RAG Improvement 
------------------------------------------------------------
faithfulness         0.7417       0.9000       +0.1583 (+21.3%)
answer_relevancy     nan          nan          +nan (+0.0%)
context_precision    0.7000       1.0000       +0.3000 (+42.9%)
------------------------------------------------------------

ANALYSIS:
✅ Enhanced RAG improves 2/3 metrics
🏆 Enhanced RAG significantly outperforms Basic RAG

EVALUATION COMPLETED
