### Setup RAG System

In [1]:
import os

import faiss
import openai
import numpy as np
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
docs = [
    "Paris is the capital and most populous city of France. The city is famed for the Eiffel Tower.",
    "Jane Austen was an English novelist best known for 'Pride and Prejudice' and 'Sense and Sensibility'.",
    "The Great Wall of China is a series of fortifications built to protect the ancient Chinese states.",
    "Mount Everest, part of the Himalayas, is Earth’s highest mountain above sea level.",
    "Mike loves the color pink more than any other color."
]

In [3]:
client = openai.OpenAI()

In [4]:
def get_embedding(text):
    response = client.embeddings.create(model="text-embedding-3-small", input=text)
    return response.data[0].embedding

In [5]:
embeddings = np.array([get_embedding(d) for d in docs]).astype('float32')
index = faiss.IndexFlatIP(embeddings.shape[1])
faiss.normalize_L2(embeddings)
index.add(embeddings)

In [6]:
def retrieve(query, k):
    query_embedding = np.array([get_embedding(query)]).astype("float32")
    
    faiss.normalize_L2(query_embedding)
    _, idx = index.search(query_embedding, k)
    
    return [docs[i] for i in idx[0]]
    

def generate_answer(question, contexts):
    prompt = (
        "Answer the user question **only** with facts found in the context.\n\n"
        "Context:\n"
        + "\n".join(f"- {c}" for c in contexts)
        + f"\n\nQuestion: {question}\nAnswer:"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )

    return response.choices[0].message.content.strip()

### Evaluate RAG System with Ragas

In [7]:
from datasets import Dataset

questions = [
    "What is the capital of France?",
    "Who wrote Pride and Prejudice?",
    "Where is Mount Everest located?",
    "What is Mike's favorite color?"
]

ground_truths = [
    "Paris",
    "Jane Austen",
    "the Himalayas",
    "Pink"
]

rows = []

for question, ground_truth in zip(questions, ground_truths):
    context = retrieve(question, k=2)
    answer = generate_answer(question, context)
    rows.append(
        {
            "question": question,
            "contexts": context,
            "answer": answer,
            "reference": ground_truth,
        }
    )

evaluation_dataset = Dataset.from_list(rows)

In [8]:
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    faithfulness,
    context_precision,
    context_recall,
)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

[{'question': 'What is the capital of France?', 'contexts': ['Paris is the capital and most populous city of France. The city is famed for the Eiffel Tower.', 'Mike loves the color pink more than any other color.'], 'answer': 'Paris is the capital of France.', 'reference': 'Paris'}, {'question': 'Who wrote Pride and Prejudice?', 'contexts': ["Jane Austen was an English novelist best known for 'Pride and Prejudice' and 'Sense and Sensibility'.", 'Mike loves the color pink more than any other color.'], 'answer': "Jane Austen wrote 'Pride and Prejudice'.", 'reference': 'Jane Austen'}, {'question': 'Where is Mount Everest located?', 'contexts': ['Mount Everest, part of the Himalayas, is Earth’s highest mountain above sea level.', 'Paris is the capital and most populous city of France. The city is famed for the Eiffel Tower.'], 'answer': 'Mount Everest is located in the Himalayas.', 'reference': 'the Himalayas'}, {'question': "What is Mike's favorite color?", 'contexts': ['Mike loves the co

### Metrics Definitions

https://docs.ragas.io/en/v0.1.21/concepts/metrics/answer_correctness.html

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/faithfulness/#example

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_precision/

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_recall/

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/answer_relevance/

https://docs.ragas.io/en/v0.1.21/concepts/metrics/index.html

### High Scores

In [9]:
rows = []

context = docs[-1:]
question = questions[-1]
answer = generate_answer(question, context)

rows.append(
    {
        "user_input": question,
        "retrieved_contexts": context,
        "response": answer,
        "reference": ground_truths[-1]
    }
)

evaluation_dataset = Dataset.from_list(rows)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[{'user_input': "What is Mike's favorite color?", 'retrieved_contexts': ['Mike loves the color pink more than any other color.'], 'response': "Mike's favorite color is pink.", 'reference': 'Pink'}]
{'answer_correctness': 0.9645, 'answer_relevancy': 1.0000, 'faithfulness': 1.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}


### Wrong Context

In [10]:
rows = []

context = ['Vienna is the capital of Austria']
question = questions[-1]
answer = generate_answer(question, context)

rows.append(
    {
        "user_input": question,
        "retrieved_contexts": context,
        "response": answer,
        "reference": ground_truths[-1]
    }
)

evaluation_dataset = Dataset.from_list(rows)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[{'user_input': "What is Mike's favorite color?", 'retrieved_contexts': ['Vienna is the capital of Austria'], 'response': "The context does not provide information about Mike's favorite color.", 'reference': 'Pink'}]
{'answer_correctness': 0.1968, 'answer_relevancy': 0.0000, 'faithfulness': 1.0000, 'context_precision': 0.0000, 'context_recall': 0.0000}


### Correct Answer with Wrong Context

In [11]:
rows = []

context = ['Vienna is the capital of Austria']
question = questions[-1]
answer = generate_answer(question, context)

rows.append(
    {
        "user_input": question,
        "retrieved_contexts": context,
        "response": "Mike's favorite color is pink!",
        "reference": ground_truths[-1]
    }
)

evaluation_dataset = Dataset.from_list(rows)

scores = evaluate(
    evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)

print(rows)
print(scores)

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

[{'user_input': "What is Mike's favorite color?", 'retrieved_contexts': ['Vienna is the capital of Austria'], 'response': "Mike's favorite color is pink!", 'reference': 'Pink'}]
{'answer_correctness': 0.9644, 'answer_relevancy': 1.0000, 'faithfulness': 0.0000, 'context_precision': 0.0000, 'context_recall': 0.0000}


## Ollama Integration

In [12]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

from langchain_ollama.chat_models import ChatOllama
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [None]:
llm = ChatOllama(model="qwen3:4b", temperature=0)
ragas_llm = LangchainLLMWrapper(llm)

emb = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5")
ragas_emb = LangchainEmbeddingsWrapper(emb)

scores = evaluate(
    evaluation_dataset,
    metrics=[answer_correctness, answer_relevancy, faithfulness,
             context_precision, context_recall],
    llm=ragas_llm,
    embeddings=ragas_emb,
)

In [None]:
print(scores)