In [1]:
!pip install ragas datasets langchain elasticsearch openai langchain-openai

Collecting ragas
  Using cached ragas-0.2.15-py3-none-any.whl (190 kB)
Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl (491 kB)
Collecting langchain
  Using cached langchain-0.3.26-py3-none-any.whl (1.0 MB)
Collecting elasticsearch
  Using cached elasticsearch-9.0.2-py3-none-any.whl (914 kB)
Collecting openai
  Using cached openai-1.93.0-py3-none-any.whl (755 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.27-py3-none-any.whl (70 kB)
Collecting diskcache>=5.6.3
  Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Collecting appdirs
  Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting pydantic>=2
  Using cached pydantic-2.11.7-py3-none-any.whl (444 kB)
Collecting numpy
  Using cached numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl (5.3 MB)
Collecting langchain-community
  Using cached langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
Collecting tiktoken
  Using cached tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl (1.0 MB)
Co

In [2]:
import os
from getpass import getpass
from elasticsearch import Elasticsearch
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from ragas.llms import LangchainLLMWrapper
from datasets import Dataset
from langchain_openai import ChatOpenAI
import pandas

In [3]:
# Elasticsearch client used by the search helpers in this notebook. The host and
# the API key are entered through hidden getpass prompts instead of being hardcoded.
es = Elasticsearch(
    hosts=getpass("Host: "),
    api_key=getpass("API Key: "),
)

Host:  ········
API Key:  ········


In [4]:
# Name of the Elasticsearch index queried by search_books() and explore_index_data().
index_name = "books-local-test"

In [5]:
# Prefer the key from the environment; when it is not set, ask for it through a
# hidden prompt (same approach as the Elasticsearch credentials) instead of
# passing None to the client and failing later on the first request.
API_KEY = os.getenv("OPENAI_API_KEY") or getpass("OpenAI API Key: ")

# Chat model used both to generate answers (generate_answer) and as the judge
# LLM handed to ragas.evaluate (run_ragas_demo).
chat_llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.1,
    api_key=API_KEY,
)

In [6]:
def search_books(query, top_k=2):
    """Search the books index with a multi_match query and format the hits.

    Args:
        query: Free-text query matched against the description, title and
            author fields.
        top_k: Maximum number of hits requested from Elasticsearch.

    Returns:
        A ``(contexts, books_info)`` tuple. ``contexts`` holds one
        "<title> by <author>: <description>" string per hit and ``books_info``
        holds the corresponding ``_source`` dicts, both in hit order.
    """
    searched_fields = ["book_description", "book_title", "author_name"]
    returned_fields = ["book_title", "author_name", "book_description", "rating_score"]

    request = {
        "size": top_k,
        "query": {
            "multi_match": {
                "query": query,
                "fields": searched_fields,
                "type": "best_fields",
            }
        },
        "_source": returned_fields,
    }

    matches = es.search(index=index_name, body=request)["hits"]["hits"]
    print(f"🔎 Found {len(matches)} results for: {query}")

    # Keep the raw documents alongside the flattened text handed to the LLM.
    books_info = [match["_source"] for match in matches]
    contexts = [
        f"{doc['book_title']} by {doc['author_name']}: {doc['book_description']}"
        for doc in books_info
    ]
    return contexts, books_info

In [7]:
def generate_answer(question, contexts):
    """Ask the chat model to answer ``question`` using only ``contexts``.

    The context strings are joined with blank lines, echoed to stdout, and
    embedded in a book-recommendation prompt sent to ``chat_llm``.

    Args:
        question: The user question placed in the prompt.
        contexts: Iterable of context strings.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    joined_context = "\n\n".join(contexts)

    print("Context: ", joined_context, sep="\n")

    prompt_template = (
        "You are a helpful assistant that recommends books.\n"
        "Use only the information from the context below to answer the question.\n"
        "Do not include any books, authors, or details that are not explicitly present in the context.\n"
        "\n"
        "Repeat the exact book title and author from the context in your answer.\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question:\n"
        "{question}\n"
        "\n"
        "Answer:"
    )
    prompt = prompt_template.format(context=joined_context, question=question)

    reply = chat_llm.invoke(prompt)
    return reply.content.strip()

In [8]:
def _rating_as_float(book):
    """Return the book's ``rating_score`` as a float, or 0.0 if missing or unparsable."""
    try:
        return float(book.get('rating_score', 0))
    except (TypeError, ValueError):
        # The field can be present but null or non-numeric; rank such books lowest
        # instead of aborting the whole question.
        return 0.0


def create_dynamic_ground_truth(question, books_info):
    """Build a reference answer from the highest-rated retrieved book.

    Args:
        question: The user question; its lowercase form is checked for the
            phrases "science fiction", "fantasy" and "mystery" (in that order)
            to choose the sentence wording.
        books_info: List of book dicts with ``book_title``, ``author_name``
            and, optionally, ``rating_score``.

    Returns:
        A one-sentence recommendation naming the best-rated book, or
        "No relevant books found." when ``books_info`` is empty.

    Raises:
        KeyError: If the selected book lacks ``book_title`` or ``author_name``.
    """
    if not books_info:
        return "No relevant books found."

    # max() keeps the first book on ties, so retrieval order breaks equal ratings.
    best_book = max(books_info, key=_rating_as_float)
    title = best_book['book_title']
    author = best_book['author_name']

    topic = question.lower()
    if "science fiction" in topic:
        return f"A good science fiction book is '{title}' by {author}."
    if "fantasy" in topic:
        return f"'{title}' by {author} is a good fantasy book."
    if "mystery" in topic:
        return f"'{title}' by {author} is a good mystery novel."
    return f"I recommend '{title}' by {author}."

In [9]:
def explore_index_data():
    """Print up to five sample books from the index and return the raw hits.

    Issues a ``match_all`` query limited to the title, author and rating
    fields, prints one line per hit (rating shown as 'N/A' when absent) and
    returns the list of hit dicts from the response.
    """
    print("\n🧭 Exploring index data: ")

    hits = es.search(
        index=index_name,
        body={
            "size": 5,
            "query": {"match_all": {}},
            "_source": ["book_title", "author_name", "rating_score"],
        },
    )["hits"]["hits"]

    print("📙 Sample books in your index:")
    for source in (hit["_source"] for hit in hits):
        rating = source.get('rating_score', 'N/A')
        print(f"- '{source['book_title']}' by {source['author_name']} (Rating: {rating})")

    return hits

In [10]:
def run_ragas_demo():
    """Run the retrieval -> generation -> RAGAS evaluation demo end to end.

    For each demo question the function retrieves contexts with
    ``search_books``, generates an answer with ``generate_answer`` and builds
    a reference answer with ``create_dynamic_ground_truth``. Questions that
    yield no contexts, or that raise, are reported and skipped. The collected
    samples are scored with context precision, faithfulness and answer
    relevancy; the per-sample table and the column means are printed and the
    table is written to ``ragas_evaluation_results.csv``.

    Returns:
        The object returned by ``evaluate`` on success, or ``None`` when no
        Q&A pair could be built or the evaluation raised.
    """
    print("🚀 Demo: \n")

    # Called only for the sample listing it prints; the returned hits are not used here.
    explore_index_data()

    demo_questions = [
        "What's a good science fiction book with high ratings?",
        "Can you suggest a fantasy book by a popular author?",
        "What's a highly rated mystery novel?",
        "Recommend a book with good reviews",
    ]

    questions, contexts_list, answers, ground_truths = [], [], [], []

    for i, question in enumerate(demo_questions, 1):
        print(f"\n📚 Question {i}: {question}")

        try:
            contexts, books_info = search_books(question, top_k=3)

            if not contexts:
                print(f"No contexts found for question {i}")
                continue

            answer = generate_answer(question, contexts)
            print(f"Answer: {answer[:100]}...")

            ground_truth = create_dynamic_ground_truth(question, books_info)
            print(f"Ground Truth: {ground_truth}")

            # Append only after every step succeeded so the four lists stay aligned.
            questions.append(question)
            contexts_list.append(contexts)
            answers.append(answer)
            ground_truths.append(ground_truth)

        except Exception as e:
            # Best-effort demo: a failing question is reported and skipped.
            print(f"Error processing question {i}: {e}")

    if not questions:
        print("\nNo valid Q&A pairs generated.")
        return None

    eval_dataset = Dataset.from_dict({
        "question": questions,
        "contexts": contexts_list,
        "answer": answers,
        "ground_truth": ground_truths,
    })

    print(f"\n Created dataset with {len(questions)} Q&A pairs")

    # Print dataset for debugging
    print("\nDataset preview: ")
    samples = zip(questions, contexts_list, answers, ground_truths)
    for i, (question, contexts, answer, ground_truth) in enumerate(samples, 1):
        print(f"\nQ{i}: {question}")
        print(f"Contexts: {len(contexts)} items")
        print(f"Answer: {answer[:80]}...")
        print(f"Ground Truth: {ground_truth}")

    print("\nRunning RAGAS evaluation: ")
    try:
        result = evaluate(
            dataset=eval_dataset,
            metrics=[context_precision, faithfulness, answer_relevancy],
            llm=chat_llm,
            embeddings=None,
        )

        df = result.to_pandas()

        print("\nRAGAS EVALUATION RESULTS:")
        print(df)

        mean_scores = df.mean(numeric_only=True)
        print("\nAVERAGED METRICS:")
        for metric, value in mean_scores.items():
            print(f"{metric}: {value:.3f}")

        # Save results
        df.to_csv("ragas_evaluation_results.csv", index=False)
        print("\nResults saved to ragas_evaluation_results.csv")

        return result

    except Exception as e:
        print(f"RAGAS evaluation failed: {e}")
        print("This might be due to API rate limits or data format issues.")
        return None

In [11]:
# Run the demo and report the outcome. run_ragas_demo() returns None when it
# could not build or evaluate any Q&A pair, hence the truthiness check.
try:
    results = run_ragas_demo()
    print("\n🎉 Demo completed successfully!" if results else "\n⚠️ Demo completed with issues.")
except Exception as e:
    import traceback

    print(f"❌ Error during demo: {e}")
    traceback.print_exc()

🚀 Demo: 


🧭 Exploring index data: 
📙 Sample books in your index:
- 'Lucky 7' by Rae D. Magdon (Rating: 4.34)
- 'Salvation Lost' by Peter F. Hamilton (Rating: 4.34)
- 'Alien Warrior's Mate' by Vi Voxley (Rating: 3.74)
- 'On the Steel Breeze' by Alastair Reynolds (Rating: 4.03)
- 'Salvage Marines' by Sean-Michael Argo (Rating: 3.56)

📚 Question 1: What's a good science fiction book with high ratings?
🔎 Found 3 results for: What's a good science fiction book with high ratings?
Context: 
The Island of Doctor Moreau by H.G. Wells: Ranked among the classic novels of the English language and the inspiration for several unforgettable movies, this early work of H. G. Wells was greeted in 1896 by howls of protest from reviewers, who found it horrifying and blasphemous. They wanted to know more about the wondrous possibilities of science shown in his first book, The Time Machine, not its potential for misuse and terror. In The Island of Dr. Moreau, a shipwrecked gentleman named Edward Prendick, 

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]


RAGAS EVALUATION RESULTS:
                                          user_input  \
0  What's a good science fiction book with high r...   
1  Can you suggest a fantasy book by a popular au...   
2               What's a highly rated mystery novel?   
3                 Recommend a book with good reviews   

                                  retrieved_contexts  \
0  [The Island of Doctor Moreau by H.G. Wells: Ra...   
1  [There Will Be Time by Poul Anderson:  Time tr...   
2  [On the Steel Breeze by Alastair Reynolds: It ...   
3  [The Island of Doctor Moreau by H.G. Wells: Ra...   

                                            response  \
0        "The Island of Doctor Moreau" by H.G. Wells   
1   I suggest "There Will Be Time" by Poul Anderson.   
2  The Book of Time by Guillaume Prévost is a hig...   
3          The Island of Doctor Moreau by H.G. Wells   

                                           reference  context_precision  \
0  A good science fiction book is 'Alien Warrior'...   