## 0. Installing Libraries

In [3]:
!pip install llama-index llama-index-llms-gemini llama-index-embeddings-huggingface
!pip install llama-index-vector-stores-chroma pypdf pandas requests tmdbsimple tqdm
!pip install transformers torch
!pip install llama-index-retrievers-bm25
!pip install datasets  # For loading Amazon product datasets

Collecting llama-index
  Using cached llama_index-0.12.38-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-llms-gemini
  Using cached llama_index_llms_gemini-0.5.0-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-embeddings-huggingface
  Using cached llama_index_embeddings_huggingface-0.5.4-py3-none-any.whl.metadata (458 bytes)
Collecting llama-index-agent-openai<0.5,>=0.4.0 (from llama-index)
  Using cached llama_index_agent_openai-0.4.8-py3-none-any.whl.metadata (438 bytes)
Collecting llama-index-cli<0.5,>=0.4.1 (from llama-index)
  Using cached llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-embeddings-openai<0.4,>=0.3.0 (from llama-index)
  Using cached llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Using cached llama_index_indices_managed_llama_cloud-0.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-llms-openai<0.4

In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
import time
from llama_index.core import Settings, VectorStoreIndex, Document
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever
import pickle

from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
import logging
import tmdbsimple as tmdb
import sys
from dotenv import load_dotenv
from datasets import load_dataset

# Pull variables from a local .env file (if present) into os.environ
load_dotenv()

# Setting up logging - crucial for debugging RAG pipelines
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Gemini credentials must come from the environment / .env file, never from the
# notebook source. Either GOOGLE_API_KEY or GENAI_API_KEY is accepted.
# NOTE: a key was previously committed in this cell; it should be treated as
# compromised and revoked in Google AI Studio.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GENAI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "No Gemini API key found. Set GOOGLE_API_KEY (or GENAI_API_KEY) in the "
        "environment or in a .env file before running this notebook."
    )
os.environ["GOOGLE_API_KEY"] = _gemini_api_key

# Configuration using Gemini and BGE embeddings
Settings.llm = Gemini(model="models/gemini-2.0-flash", temperature=0.1)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
    embed_batch_size=10
)
Settings.chunk_size = 512
Settings.chunk_overlap = 50

  Settings.llm = Gemini(model="models/gemini-2.0-flash", temperature=0.1)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']


## 1. Getting Data 
### (You can start from the pickled file instead if you don't want to download the large dataset files)

In [2]:
# Fetch the review and metadata configurations of the Amazon Books corpus
print("Loading Amazon Books dataset...")
books_review_dataset, books_metadata_dataset = (
    load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        config_name,
        trust_remote_code=True,
        cache_dir='.',
    )
    for config_name in ("raw_review_Books", "raw_meta_Books")
)

# Materialise the 'full' split of each as a pandas DataFrame
books_review_df, books_metadata_df = (
    dataset['full'].to_pandas()
    for dataset in (books_review_dataset, books_metadata_dataset)
)

print(f"Loaded {len(books_review_df)} reviews and {len(books_metadata_df)} book metadata entries")

Loading Amazon Books dataset...


Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/28 [00:00<?, ?it/s]

Loaded 29475453 reviews and 4448181 book metadata entries


In [4]:
# Count reviews per book; value_counts() orders the result from most to least reviewed
book_review_counts = books_review_df['parent_asin'].value_counts()
# Keep books with at least 10 reviews, capped at the first 5000 of that ordering
popular_books = (
    book_review_counts.loc[book_review_counts.ge(10)]
    .iloc[:5000]
    .index
    .tolist()
)

# Restrict both tables to the selected books (copies, so later edits stay local)
popular_metadata = books_metadata_df.loc[books_metadata_df['parent_asin'].isin(popular_books)].copy()
popular_reviews = books_review_df.loc[books_review_df['parent_asin'].isin(popular_books)].copy()

print(f"Selected {len(popular_metadata)} popular books for our recommendation system")

Selected 5000 popular books for our recommendation system


In [5]:
# Coerce price and average_rating to numeric dtypes; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
popular_metadata['price'] = pd.to_numeric(popular_metadata['price'], errors='coerce')
popular_metadata['average_rating'] = pd.to_numeric(popular_metadata['average_rating'], errors='coerce')

In [6]:
import pandas as pd
import ast
from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm

def safe_parse_vectorized(series, field_type='string'):
    """Parse each value of a Series as a Python literal, never raising.

    Every value is stringified and passed to ``ast.literal_eval``.

    Args:
        series: pandas Series of raw values (typically literal strings).
        field_type: ``'author'`` returns the ``'name'`` entry when the parsed
            value is a dict; ``'dict'`` selects ``{}`` as the fallback; any
            other value selects ``""`` as the fallback.

    Returns:
        A Series of parsed values. Values that cannot be parsed, and values
        that parse to ``None`` (which is what a missing ``None`` cell
        stringifies to), are replaced by the fallback for ``field_type``.
    """
    def fallback():
        # Build a fresh container per row so rows never share one dict
        return {} if field_type == 'dict' else ""

    def parse_single(val):
        try:
            parsed = ast.literal_eval(str(val))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # The documented failure modes of literal_eval on malformed input;
            # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            return fallback()
        if parsed is None:
            return fallback()
        if field_type == 'author' and isinstance(parsed, dict):
            return parsed.get('name', '')
        return parsed

    return series.apply(parse_single)

def process_metadata_batch(metadata_df):
    """Normalise a slice of raw book metadata into a flat, typed DataFrame.

    Args:
        metadata_df: DataFrame with the columns ``parent_asin``, ``title``,
            ``description``, ``categories``, ``author``, ``average_rating``,
            ``price``, ``features``, ``details`` and ``store``.

    Returns:
        DataFrame with columns ``asin``, ``title``, ``description``,
        ``categories``, ``author``, ``rating``, ``price``, ``features``,
        ``details`` and ``store``; rows whose title is missing or empty are
        dropped.
    """
    def is_sequence(value):
        # is_list_like is False for str/bytes but True for list, tuple and
        # array-like cells (e.g. numpy arrays, which is what Arrow-backed
        # to_pandas() conversions produce for sequence columns).
        return pd.api.types.is_list_like(value) and not isinstance(value, dict)

    def join_text(value):
        # Sequence cells are joined with spaces; scalars are stringified
        if is_sequence(value):
            return ' '.join(str(item) for item in value)
        return str(value)

    result_df = pd.DataFrame({
        'asin': metadata_df['parent_asin'].astype(str),
        # fillna first: astype(str) alone would turn a missing title into the
        # non-empty string 'None' and defeat the filter below
        'title': metadata_df['title'].fillna('').astype(str),
        'description': metadata_df['description'].apply(join_text),
        'categories': metadata_df['categories'].apply(lambda x: list(x) if is_sequence(x) else []),
        'author': safe_parse_vectorized(metadata_df['author'], 'author'),
        'rating': pd.to_numeric(metadata_df['average_rating'], errors='coerce').fillna(0),
        'price': pd.to_numeric(metadata_df['price'], errors='coerce').fillna(0),
        'features': metadata_df['features'].apply(join_text),
        'details': safe_parse_vectorized(metadata_df['details'], 'dict'),
        'store': metadata_df['store'].astype(str),
    })

    # Filter out books without titles
    return result_df[result_df['title'].str.len() > 0]

def get_reviews_batch(reviews_df, asins, max_reviews=5):
    """Build one formatted review digest per book for a batch of ASINs.

    Reviews are ranked verified-purchase first, then by rating (highest
    first), and the top ``max_reviews`` per book are rendered as
    ``Rating`` / ``Title`` / ``Review`` lines separated by blank lines.

    Args:
        reviews_df: DataFrame with ``parent_asin``, ``verified_purchase`` and
            ``rating`` columns; ``title`` and ``text`` are used when present.
        asins: iterable of ASINs to collect reviews for.
        max_reviews: maximum number of reviews rendered per book.

    Returns:
        Dict mapping every ASIN in ``asins`` to its digest string
        (``""`` when the book has no reviews).
    """
    # Every requested ASIN gets an entry, defaulting to "no reviews"
    review_dict = {asin: "" for asin in asins}

    relevant_reviews = reviews_df[reviews_df['parent_asin'].isin(asins)]
    if relevant_reviews.empty:
        return review_dict

    # Sort once for all reviews
    relevant_reviews = relevant_reviews.sort_values(['verified_purchase', 'rating'], ascending=[False, False])

    # One pass over the groups instead of re-scanning the frame per ASIN;
    # groupby keeps the sorted row order inside each group.
    for asin, book_reviews in relevant_reviews.groupby('parent_asin', sort=False):
        review_texts = []
        for _, review in book_reviews.head(max_reviews).iterrows():
            parts = [f"Rating: {review.get('rating', 'N/A')}/5"]
            if pd.notna(review.get('title')) and review.get('title'):
                parts.append(f"Title: {review['title']}")
            if pd.notna(review.get('text')) and review.get('text'):
                parts.append(f"Review: {review['text']}")
            review_texts.append('\n'.join(parts))

        review_dict[asin] = '\n\n'.join(review_texts)

    return review_dict

def process_book_batch(metadata_batch, reviews_df, batch_size=1000):
    """Convert one metadata slice into book records with reviews attached.

    Args:
        metadata_batch: raw metadata rows for this batch.
        reviews_df: reviews table searched for the batch's ASINs.
        batch_size: accepted for signature compatibility; not used here.

    Returns:
        List of per-book dicts (one per surviving metadata row), each with an
        added ``'reviews'`` entry; an empty list when no row survives
        metadata processing.
    """
    books = process_metadata_batch(metadata_batch)

    if not books.empty:
        # Look up the review digests for the whole batch in one call,
        # then attach them by ASIN
        digests = get_reviews_batch(reviews_df, books['asin'].tolist())
        books['reviews'] = books['asin'].map(digests)
        return books.to_dict('records')

    return []

# Main processing with batching and parallelization
def process_all_books(metadata_df, reviews_df, batch_size=1000, n_jobs=None):
    """Run ``process_book_batch`` over the metadata in threaded batches.

    Args:
        metadata_df: metadata for every book to process.
        reviews_df: reviews table shared by all batches.
        batch_size: number of metadata rows per batch.
        n_jobs: worker count; defaults to one less than the CPU count
            (minimum 1) when ``None``.

    Returns:
        A single flat list with the book records of all batches, in batch order.
    """
    worker_count = n_jobs if n_jobs is not None else max(1, multiprocessing.cpu_count() - 1)

    print(f"Processing {len(metadata_df)} books in batches of {batch_size} using {worker_count} workers...")

    # Consecutive row slices of at most batch_size rows each
    batch_starts = range(0, len(metadata_df), batch_size)
    batches = [metadata_df[start:start + batch_size] for start in batch_starts]

    # Threading backend: workers share the DataFrames instead of copying them
    run_parallel = Parallel(n_jobs=worker_count, backend='threading')
    per_batch_books = run_parallel(
        delayed(process_book_batch)(chunk, reviews_df, batch_size)
        for chunk in tqdm(batches, desc="Processing batches")
    )

    # Concatenate the per-batch lists into one
    return [book for batch_books in per_batch_books for book in batch_books]

# Run the batched pipeline over the selected books and their reviews,
# 20 metadata rows per batch, then report how many records came out.
processed_books = process_all_books(popular_metadata, popular_reviews, batch_size=20)
print(f"Successfully processed {len(processed_books)} books")

Processing 5000 books in batches of 20 using 47 workers...


Processing batches: 100%|██████████| 250/250 [00:59<00:00,  4.17it/s]


Successfully processed 5000 books


In [9]:
# Cache the processed records on disk so later sessions can skip the
# dataset download and processing cells.
with open('processed_books.pkl', 'wb') as f:
    pickle.dump(processed_books, f)

In [2]:
# Restore the cached records. NOTE: unpickling executes arbitrary code, so
# only load a processed_books.pkl that you produced yourself or fully trust.
with open('processed_books.pkl', 'rb') as f:
    processed_books = pickle.load(f)

## 2. Creating Documents for Our RAG System


In [3]:
def create_book_document(book_data):
    """Build the retrieval Document for one processed book.

    Args:
        book_data: dict with the keys ``asin``, ``title``, ``author``,
            ``description``, ``categories``, ``rating`` and ``reviews``.

    Returns:
        A ``Document`` whose text holds title, author, description,
        categories, average rating and reader reviews, and whose metadata
        carries ``asin``, ``title``, ``author``, ``categories``
        (comma-joined), ``rating``, ``type`` and ``has_reviews``.
    """
    # Entries of 'categories' may be plain strings or nested lists;
    # flatten one level once and reuse it for both text and metadata.
    flat_categories = [
        cat
        for entry in book_data['categories']
        for cat in (entry if isinstance(entry, list) else [entry])
    ]
    author = book_data['author'] if book_data['author'] else ''

    # Main book information
    main_content = f"""
    Title: {book_data['title']}
    Author: {author if author else 'Unknown'}
    
    Description:
    {book_data['description']}
    
    Categories: {', '.join(flat_categories)}
    Average Rating: {book_data['rating']}/5.0

    """

    # Add review content
    reviews_content = "\nReader Reviews:\n"
    if book_data['reviews']:
        reviews_content += book_data['reviews']
    else:
        reviews_content += "No reviews available."

    # Combine all content
    full_content = main_content + "\n" + reviews_content

    # Create rich metadata for filtering and relationships
    metadata = {
        'asin': book_data['asin'],
        'title': book_data['title'],
        'author': author,
        'categories': ','.join(flat_categories),
        'rating': float(book_data['rating']) if book_data['rating'] else 0.0,
        'type': 'book',
        'has_reviews': bool(book_data['reviews'])
    }

    return Document(text=full_content, metadata=metadata)

# Build one Document per processed book, with a progress bar
documents = [
    create_book_document(book_data)
    for book_data in tqdm(processed_books, desc="Creating documents")
]

print(f"Created {len(documents)} documents for our book RAG system")

Creating documents: 100%|██████████| 5000/5000 [00:00<00:00, 27478.30it/s]

Created 5000 documents for our book RAG system





## 4. Building an Advanced RAG Pipeline with LlamaIndex

### Step 1: Chunking Documents into Nodes


In [4]:
# Parse our documents into nodes (chunks).
# chunk_size / chunk_overlap repeat the values assigned to Settings in the
# configuration cell; keep the two places in sync when tuning.
parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=50,
    paragraph_separator="\n\n",
)
nodes = parser.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} nodes from {len(documents)} documents")

Created 30960 nodes from 5000 documents


### Step 2: Setting Up Vector Storage


In [5]:
# Setup Chroma for efficient vector storage with 5k books
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from llama_index.core import StorageContext

# Create persistent Chroma database for our vector store (stored under
# ./book_chroma_db) and wrap its collection for LlamaIndex.
# NOTE(review): get_or_create_collection reuses an existing on-disk
# collection; re-running the indexing cell presumably adds to it rather than
# replacing it - confirm whether duplicates need to be cleared first.
chroma_client = chromadb.PersistentClient("./book_chroma_db")
chroma_collection = chroma_client.get_or_create_collection("book_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


### Step 3: Creating the Vector Index


In [6]:
# Create our vector index over all nodes, backed by the Chroma storage
# context - this may take a while for the 5K-book corpus
print("Creating vector index for 5K Books...")
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
print("Vector index created successfully!")

Creating vector index for 5K Books...
Vector index created successfully!


### Step 4: Implementing Query Transformation


In [7]:
from llama_index.core.prompts import PromptTemplate

# Query Transformation with HyDE
# This technique generates a hypothetical document that matches the query,
# then uses that for retrieval instead of the original query.
# The prompt below asks the configured LLM for a book-description-style
# passage; {context_str} is the placeholder that receives the user's query.
hyde = HyDEQueryTransform(
    llm=Settings.llm,
    hyde_prompt=PromptTemplate(
        "Given a query about books or literature, generate a detailed hypothetical book "
        "description that would match this query. Include plot elements, themes, character "
        "types, writing style, and genre information that would be relevant.\n"
        "Query: {context_str}\n"
        "Hypothetical Book Description: "
    )
)

### Step 5: Setting Up Retrieval Components


In [8]:
# Create BM25 retriever for keyword search
# BM25 excels at matching exact terms, complementing the semantic search.
# It is built directly from the in-memory nodes and returns the top 5 hits.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
)

# Create our vector retriever for semantic search (top 5 hits from the index)
vector_retriever = vector_index.as_retriever(
    similarity_top_k=5,
)

DEBUG:bm25s:Building index from IDs objects


### Step 6: Building the Hybrid Retrieval System


In [9]:
# Create hybrid retriever with query fusion
# This combines the results from both retrievers for better overall performance:
# the keyword (BM25) and semantic (vector) result lists are fused and the
# top 8 fused nodes are kept.
hybrid_retriever = QueryFusionRetriever(
    [bm25_retriever, vector_retriever],
    similarity_top_k=8,
    mode="reciprocal_rerank"  # Fuse the two ranked lists by reciprocal rank
)

# Add query transformation to our retriever
from llama_index.core.retrievers import TransformRetriever

# Wrap the hybrid retriever so each incoming query is first passed through the
# HyDE transform defined earlier.
transform_retriever = TransformRetriever(
    retriever=hybrid_retriever,
    query_transform=hyde,
)

### Step 7: Adding Re-ranking


In [10]:
# Add re-ranking for precision
# This step rescores the retrieved documents to ensure the most relevant come first;
# only the 5 best-scoring nodes are kept after rescoring.
reranker = SentenceTransformerRerank(
    model = "BAAI/bge-reranker-v2-m3",
    top_n = 5,
)

### Step 8: Creating the Final Query Engine with Structured Output


In [11]:
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import List
from pydantic import BaseModel, Field

# Structured-output schema for one recommended book. The Field descriptions
# are part of the schema handed to the query engine via output_cls, so treat
# them as prompt text rather than as plain comments.
class Book(BaseModel):
    """Represents a single book recommendation."""
    asin: str = Field(description="The Amazon ASIN identifier for the book")
    title: str = Field(description="The title of the book")
    authors: str = Field(description="The author(s) of the book")
    rating: float = Field(description="Average rating of the book")
    reason: str = Field(description="Brief explanation of why this book matches the query")

# Top-level structured response: the recommended books plus a short rationale.
# This is the class passed as output_cls to the query engine.
class BookRecommendations(BaseModel):
    """Represents a list of book recommendations."""
    books: List[Book] = Field(description="List of recommended books")
    summary: str = Field(description="Brief summary of the recommendation rationale")

# Final query engine: HyDE + hybrid retrieval, then re-ranking, with answers
# returned as BookRecommendations objects.
# NOTE(review): confirm that from_args actually consumes
# allow_parallel_tool_calls in the installed version.
book_rag_engine = RetrieverQueryEngine.from_args(
    retriever=transform_retriever,
    node_postprocessors=[reranker],
    output_cls=BookRecommendations,
    allow_parallel_tool_calls=False
)

## 5. Testing Our Advanced RAG Pipeline


In [19]:
# Thematic queries that cannot be answered by keyword matching alone
test_queries = [
    "books with unreliable narrators that make you question reality",
    "books about artificial intelligence that question what makes us human",
    "science fiction books that deal with time travel paradoxes",
    "books about dystopian futures that feel relevant to today's world",
    "literary fiction that explores family secrets across generations",
    "books where the main character has to choose between love and duty"
]

print("Testing our advanced book RAG system with complex thematic queries...\n")

for query in test_queries:
    print(f"📚 Query: {query}")
    print("-" * 80)

    try:
        response = book_rag_engine.query(query)

        # Structured answers expose a non-empty `books` list; anything else
        # is printed as-is
        if not getattr(response, 'books', None):
            print(f"Response: {response}")
        else:
            print("📖 Recommended Books:")
            for book in response.books:
                print(f"• {book.title} by {book.authors}")
                print(f"  Rating: {book.rating}/5.0")
                print(f"  Why it matches: {book.reason}")
                print()

            if getattr(response, 'summary', None):
                print(f"📝 Summary: {response.summary}")

    except Exception as e:
        # Keep going with the remaining queries if one of them fails
        print(f"❌ Error: {str(e)}")

    print("\n" + "="*100 + "\n")
    time.sleep(10)  # Avoid rate limits

Testing our advanced book RAG system with complex thematic queries...

📚 Query: books with unreliable narrators that make you question reality
--------------------------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📖 Recommended Books:
• The Woman in the Window: A Novel by A. J. Finn
  Rating: 4.3/5.0
  Why it matches: Narrator with illusions, delusions, reality

• Gone Girl by Gillian Flynn
  Rating: 4.1/5.0
  Why it matches: Features unreliable narrators and plot twists

• Shutter Island by Dennis Lehane
  Rating: 4.4/5.0
  Why it matches: Psychological thriller with secrets and paranoia

📝 Summary: These books feature unreliable narrators and psychological themes that create suspense and challenge the reader's perception of reality. 


📚 Query: books about artificial intelligence that question what makes us human
--------------------------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📖 Recommended Books:
• Do Androids Dream of Electric Sheep?: The inspiration for the films Blade Runner and Blade Runner 2049 by Philip K. Dick
  Rating: 4.4/5.0
  Why it matches: Explores philosophical questions about androids, respect for life, and the manipulation of moods, questioning what makes us human.

• The Kraken Project: A Novel (Wyman Ford Series Book 4) by Douglas Preston
  Rating: 4.4/5.0
  Why it matches: Features self-modifying AI software and explores the potential dangers and value of strong AI.

📝 Summary: These books explore themes related to artificial intelligence and what it means to be human.


📚 Query: science fiction books that deal with time travel paradoxes
--------------------------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📖 Recommended Books:
• The Time Traveler's Wife by Audrey Niffenegger
  Rating: 4.4/5.0
  Why it matches: Deals with time travel and its impact on relationships.

• Timeline by Michael Crichton
  Rating: 4.3/5.0
  Why it matches: Features quantum teleportation to medieval France, exploring the perils of time travel.

• Ready Player One by Ernest Cline
  Rating: 4.6/5.0
  Why it matches: Mentions the plot holes inherent in time travel films, particularly Back to The Future.

• The Grand Design by Stephen Hawking
  Rating: 4.5/5.0
  Why it matches: Explores radical ideas about the universe and challenges common sense, relevant to the theoretical aspects of time travel.

📝 Summary: Here are some science fiction books that deal with time travel paradoxes


📚 Query: books about dystopian futures that feel relevant to today's world
--------------------------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📖 Recommended Books:
• Brave New World by Aldous Huxley
  Rating: 4.4/5.0
  Why it matches: satirical prediction of a distant future became reality in so short a time

• Wool - Omnibus Edition by Hugh Howey
  Rating: 4.6/5.0
  Why it matches: best dystopian future novel

• The Last Survivors: A Dystopian Society in a Post Apocalyptic World by Bobby Adair
  Rating: 4.1/5.0
  Why it matches: perfect example of why I love the fantasy/science fiction genre

• Unwind (Unwind Dystology) by Neal Shusterman
  Rating: 4.5/5.0
  Why it matches: issues raised could not be more provocative--the sanctity of life, the meaning of being human

• The Amber Project: A Dystopian Sci-fi Novel (The Variant Saga Book 1) by JN Chaney
  Rating: 4.3/5.0
  Why it matches: continues to captivate fans of dystopian science fiction,combining hard sci-fi, exploration, and gritty action with a healthy dose ofintrigue

📝 Summary: Here are some books about dystopian futures that feel relevant to today's world


📚 Query

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📖 Recommended Books:
• The Two-Family House: A Novel by Lynda Cohen Loigman
  Rating: 4.2/5.0
  Why it matches: explores family secrets across generations

• America's First Daughter: A Novel by Stephanie Dray
  Rating: 4.5/5.0
  Why it matches: historical family saga

• Defending Jacob: A Novel by William Landay
  Rating: 4.3/5.0
  Why it matches: domestic drama that shatters a family

• The Secret History by Donna Tartt
  Rating: 4.2/5.0
  Why it matches: explores the complexities of relationships and hidden pasts

📝 Summary: Here are some literary fiction recommendations that explore family secrets across generations, domestic dramas, and hidden pasts, with a focus on character relationships and historical sagas.


📚 Query: books where the main character has to choose between love and duty
--------------------------------------------------------------------------------


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📖 Recommended Books:
• A Game of Thrones (Song of Ice and Fire) by George R. R. Martin
  Rating: 4.7/5.0
  Why it matches: At the center of the conflict lie the Starks of Winterfell, a family as harsh and unyielding as the land they were born to. Sweeping from a land of brutal cold to a distant summertime kingdom of epicurean plenty, here is a tale of lords and ladies, soldiers and sorcerers, assassins and bastards, who come together in a time of grim omens.

• Prince Lestat and the Realms of Atlantis: The Vampire Chronicles by Anne Rice
  Rating: 4.5/5.0
  Why it matches: 'The Prince,’ whom Rhoshamandes hated, was a young undeserving maverick blood drinker named Lestat. Lestat had done an “unspeakable” thing to Rhoshamandes, cutting off his left hand and then his arm.

• A Knight of the Seven Kingdoms (A Song of Ice and Fire) by George R. R. Martin
  Rating: 4.5/5.0
  Why it matches: Set 90 years before the events in A Game of Thrones, they chronicle the experiences of Ser Duncan the 

## 6. Creating Specialized Tools for Multi-Step Retrieval

In [20]:
from llama_index.core.tools import QueryEngineTool, FunctionTool
from llama_index.core.agent import ReActAgent
from llama_index.core.vector_stores import MetadataFilters, MetadataFilter, FilterOperator

class BookRecommendationSystem:
    """Book-discovery helpers built on the RAG engine, vector index and BM25 retriever.

    Each public method returns a human-readable string, which makes the
    methods usable directly as agent tools. ASIN and lower-cased-title lookup
    tables are built from ``processed_books`` at construction time.
    """

    # Longest description excerpt included by _format_book_details
    MAX_DESCRIPTION_CHARS = 10000

    def __init__(self, book_rag_engine, processed_books, vector_index, bm25_retriever):
        """Store the collaborators and build the lookup dictionaries.

        Args:
            book_rag_engine: query engine used by ``thematic_book_search``.
            processed_books: list of book dicts (``asin``, ``title``, ...).
            vector_index: index providing ``as_retriever`` for filtered search.
            bm25_retriever: keyword retriever used for fuzzy title lookup.
        """
        self.book_rag_engine = book_rag_engine
        self.processed_books = processed_books
        self.vector_index = vector_index
        self.bm25_retriever = bm25_retriever

        # Create lookup dictionaries (a later book wins on duplicate keys)
        self.asin_to_book = {book['asin']: book for book in processed_books}
        self.title_to_book = {book['title'].lower(): book for book in processed_books}

    @staticmethod
    def _flatten_categories(categories):
        """Flatten one level of nesting: entries may be strings or lists of strings."""
        return [
            cat
            for entry in categories
            for cat in (entry if isinstance(entry, list) else [entry])
        ]

    def book_similarity_search(self, book_title: str, num_recommendations: int = 5) -> str:
        """Find books similar to a given book using semantic similarity.

        Args:
            book_title: title (exact or approximate) of the reference book.
            num_recommendations: maximum number of books to list.

        Returns:
            A numbered list of similar books, or a message when the reference
            book is unknown or nothing similar is found.
        """
        # First, find the book in our dataset to get context
        book_data, _ = self.find_book_context(book_title)
        if not book_data:
            return f"Book '{book_title}' not found in our database. Cannot provide similar recommendations."

        # Exclude the reference book itself at retrieval time
        metadata_filters = MetadataFilters(filters=[
            MetadataFilter(
                key="asin",
                value=book_data['asin'],
                operator=FilterOperator.NE
            )
        ])
        # NOTE: the retriever returns chunks, and several chunks can belong to
        # the same book, so fewer than num_recommendations distinct books may
        # survive the de-duplication below.
        filtered_retriever = self.vector_index.as_retriever(
            similarity_top_k=num_recommendations + 2,
            filters=metadata_filters
        )

        # Use the book's content for similarity search
        query = f"books similar to {book_title} with themes like: {book_data.get('features', '')}"
        similar_nodes = filtered_retriever.retrieve(query)

        # Keep the first chunk seen for each distinct book
        recommendations = []
        seen_asins = set()
        for node in similar_nodes:
            node_asin = node.metadata.get('asin', '')
            if node_asin in seen_asins:
                continue
            seen_asins.add(node_asin)

            # Skip nodes whose ASIN is not in the loaded records instead of
            # failing with a KeyError
            book_info = self.asin_to_book.get(node_asin)
            if book_info is None:
                continue

            recommendations.append({
                'title': book_info['title'],
                'author': book_info['author'],
                'rating': book_info['rating'],
                'asin': node_asin
            })
            if len(recommendations) >= num_recommendations:
                break

        if not recommendations:
            return f"No similar books found for '{book_title}'"

        result = f"Books similar to '{book_title}':\n\n"
        # Number the list from 1 for readers
        for position, book in enumerate(recommendations, start=1):
            author_str = book['author'] if book['author'] else 'Unknown'
            result += f"{position}. 📚 {book['title']} by {author_str}\n"
            result += f"   asin: {book['asin']}\n\n"
        return result

    def author_and_rating_filter(self, author: str = None, min_rating: float = 0.0,
                               max_results: int = 5) -> str:
        """Filter books by author and/or minimum rating.

        Args:
            author: exact author name to match; ignored when empty/None.
            min_rating: minimum average rating; ignored when not above 0.
            max_results: number of chunks retrieved before de-duplication.

        Returns:
            One line per matching book, or a message when no criterion was
            given or nothing matched.
        """
        filters = []

        if min_rating > 0:
            filters.append(MetadataFilter(
                key="rating",
                value=min_rating,
                operator=FilterOperator.GTE
            ))

        if author:
            filters.append(MetadataFilter(
                key="author",
                value=author,
                operator=FilterOperator.EQ
            ))

        if not filters:
            return "Please specify an author or a minimum rating"

        filtered_retriever = self.vector_index.as_retriever(
            similarity_top_k=max_results,
            filters=MetadataFilters(filters=filters)
        )

        query = f"high quality books by {author}" if author else "high quality books"
        results = filtered_retriever.retrieve(query)
        if not results:
            return "No books found matching criteria"

        response = f"Books matching criteria (author: {author}, min rating: {min_rating}):\n"
        seen_asins = set()
        for node in results:
            node_asin = node.metadata.get('asin')
            if node_asin in seen_asins:
                # Another chunk of a book that is already listed
                continue
            seen_asins.add(node_asin)
            title = node.metadata.get('title', 'Unknown')
            authors = node.metadata.get('author', 'Unknown')
            rating = node.metadata.get('rating', 0)
            response += f"- {title} by {authors} (Rating: {rating}/5.0)\n"
        return response

    def thematic_book_search(self, mood: str, context: str = "") -> str:
        """Search for books based on themes, moods, or abstract concepts.

        Args:
            mood: the mood or theme to match.
            context: optional extra context appended to the query.

        Returns:
            Formatted recommendations from the RAG engine, or a message when
            the engine returns no books.
        """
        query = f"Books Matching Mood: {mood}"
        if context != "":
            # f-string and leading space so the context is actually
            # interpolated and separated from the mood
            query += f" with context: {context}"

        response = self.book_rag_engine.query(query.lower())

        if hasattr(response, 'books') and response.books:
            result = f"Books matching mood/theme '{mood}'"
            if context:
                result += f" with context '{context}'"
            result += ":\n\n"

            for book in response.books:
                result += f"📚 {book.title} by {book.authors}\n"
                result += f"   Rating: {book.rating}/5.0\n"
                result += f"   Asin: {book.asin}\n"
                result += f"   Why it matches: {book.reason}\n\n"

            if hasattr(response, 'summary'):
                result += f"Summary: {response.summary}"

            return result
        else:
            return f"No books found matching mood/theme: {mood}"

    def find_book_details(self, title: str) -> str:
        """Get detailed information about a specific book.

        Returns the formatted details, or a not-found message.
        """
        book_data, _ = self.find_book_context(title)
        if book_data:
            return self._format_book_details(book_data)
        return f"Book '{title}' not found in our database"

    def find_book_context(self, title: str) -> tuple:
        """Helper method to find book data and node for a given title.

        Returns:
            ``(book_data, node)``: ``node`` is ``None`` for an exact
            (case-insensitive) title match and the top BM25 hit for a fuzzy
            match; ``(None, None)`` when nothing is found or the BM25 lookup
            fails.
        """
        title_lower = title.lower()

        # Try exact match first
        if title_lower in self.title_to_book:
            return self.title_to_book[title_lower], None

        # Try BM25 search for partial matches
        try:
            results = self.bm25_retriever.retrieve(title)
            if results:
                # Get the book metadata from the first result
                asin = results[0].metadata.get('asin')
                if asin and asin in self.asin_to_book:
                    return self.asin_to_book[asin], results[0]

            return None, None
        except Exception as e:
            # Best effort: report the failure and treat the book as not found
            print(f"Error in find_book_context: {str(e)}")
            return None, None

    def _format_book_details(self, book):
        """Helper method to format book details as a multi-line string."""
        details = f"📚 {book['title']}\n"
        details += f"✍️ Author(s): {book['author'] if book['author'] else 'Unknown'}\n"
        details += f"⭐ Rating: {book['rating']}/5.0\n"
        details += f"📁 Categories: {', '.join(self._flatten_categories(book['categories']))}\n"

        description = book['description']
        if description:
            limit = self.MAX_DESCRIPTION_CHARS
            # Show the ellipsis only when the text was really cut off
            suffix = '...' if len(description) > limit else ''
            details += f"\n📖 Description:\n{description[:limit]}{suffix}\n"

        return details

# Initialize the recommendation system with the query engine, the processed
# book records, the vector index and the BM25 retriever built above
rec_system = BookRecommendationSystem(book_rag_engine, processed_books, vector_index, bm25_retriever)

In [21]:
# Create specialized tools for book discovery.
# Each FunctionTool wraps one bound method of rec_system; the name and
# description given here are what the agent sees when choosing a tool.
similarity_tool = FunctionTool.from_defaults(
    fn=rec_system.book_similarity_search,
    name="similarity_search",
    description="Find books similar to a given book title. Perfect for 'books like X' recommendations."
)


author_filter_tool = FunctionTool.from_defaults(
    fn=rec_system.author_and_rating_filter,
    name="author_rating_filter",
    description="Filter books by author, rating"
)

theme_tool = FunctionTool.from_defaults(
    fn=rec_system.thematic_book_search,
    name="thematic_search",
    description="Find books based on moods. Use mood parameter and context parameter."
)

book_details_tool = FunctionTool.from_defaults(
    fn=rec_system.find_book_details,
    name="book_details",
    description="Get detailed information about a specific book by title"
)

# Main RAG tool for general book queries (wraps the full query engine)
general_book_tool = QueryEngineTool.from_defaults(
    query_engine=book_rag_engine,
    name="general_book_search",
    description="General semantic search for books based on any query"
)

# Create the book recommendation agent.
# The persona and tool-usage guidelines are passed as `context`, not
# `system_prompt`: ReActAgent.from_tools has no `system_prompt` parameter, so
# that keyword was swallowed by **kwargs and never reached the LLM, whereas
# `context` is injected into the ReAct system header.
book_agent = ReActAgent.from_tools(
    [similarity_tool, author_filter_tool, theme_tool, book_details_tool, general_book_tool],
    llm=Settings.llm,
    verbose=True,
    context="""
    You are a knowledgeable book recommendation expert, like a librarian with access to a vast digital catalog.
    
    Your expertise covers:
    - Literature across all genres and time periods
    - Author relationships and writing styles
    - Thematic and mood-based recommendations
    - Series and book relationships
    
    Guidelines for tool usage:
    - For "books like [book title]" queries, use similarity_search
    - For author-specific or rating-filtered requests, use author_rating_filter
    - For mood-based, use thematic_search with mood and context parameters
    - For information about a specific book, use book_details
    - For general book discovery and themes use general_book_search
    
    Always provide thoughtful recommendations with explanations of why each book might appeal to the reader.
    Consider reading level, themes, writing style, and emotional impact in your suggestions.
    Be encouraging and help readers discover new authors and genres they might enjoy.
    """
)

In [25]:
# Seconds to pause between consecutive agent calls to avoid rate limits.
RATE_LIMIT_DELAY_SECONDS = 20

# Sample prompts used to smoke-test the agent.
test_queries = [
    "I loved 'The Hunger Games', what similar books should I read?",
    "Books that explore the meaning of consciousness and artificial intelligence",
    "High-rated books by George Orwell that aren't too hard to read",
    "I'm going through a difficult time and need something uplifting and inspiring",
    "Books similar to 'The Handmaid's Tale' but more hopeful about the future",
]

print("Testing our advanced book RAG system...\n")

# NOTE(review): book_agent.chat() is called on the same agent for every query,
# so conversation memory presumably carries over between them — confirm whether
# the queries are meant to be independent (and reset the agent if so).
for i, query in enumerate(test_queries, 1):
    print(f"📚 Query {i}: {query}")
    print("-" * 80)

    try:
        response = book_agent.chat(query)
        print(f"Response: {response}")
    except Exception as e:
        # Best-effort demo loop: report the failure and continue with the
        # remaining queries instead of aborting the whole cell.
        print(f"Error: {str(e)}")

    print("\n" + "="*100 + "\n")

    # No need to wait after the final query.
    if i < len(test_queries):
        print("Waiting to avoid rate limits...")
        time.sleep(RATE_LIMIT_DELAY_SECONDS)


Testing our advanced book RAG system...

📚 Query 1: I loved 'The Hunger Games', what similar books should I read?
--------------------------------------------------------------------------------
> Running step 580a9a47-4043-47e4-9084-30e27c722cef. Step input: I loved 'The Hunger Games', what similar books should I read?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use the similarity_search tool to find books similar to 'The Hunger Games'.
Action: similarity_search
Action Input: {'book_title': 'The Hunger Games', 'num_recommendations': 5}
[0m[1;3;34mObservation: Books similar to 'The Hunger Games':

0. 📚 Catching Fire (Hunger Games, Book Two): Volume 2 by Suzanne Collins
   asin: 0439023491

1. 📚 Mockingjay (The Hunger Games) by Suzanne Collins
   asin: 0439023513


[0m> Running step eb64538f-cdcd-4dba-9b66-ef8fa09afd21. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer:

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1;3;34mObservation: {"books":[{"asin":"B00GVQZR3C","title":"The Kraken Project: A Novel (Wyman Ford Series Book 4)","authors":"Douglas Preston","rating":4.4,"reason":"This book explores artificial intelligence in a thrilling story about a NASA program that escapes into the Internet and the attempt to control it for nefarious purposes"},{"asin":"0345404475","title":"Do Androids Dream of Electric Sheep?: The inspiration for the films Blade Runner and Blade Runner 2049","authors":"Philip K. Dick","rating":4.4,"reason":"This book delves into the nature of consciousness through the exploration of androids and their capacity for emotions"},{"asin":"1451697724","title":"The Soul of an Octopus: A Surprising Exploration into the Wonder of Consciousness","authors":"Sy Montgomery","rating":4.6,"reason":"This book explores the consciousness of octopuses, questioning the nature of mind and emotions in non-human beings"},{"asin":"B004G60FWM","title":"The Fall of Hyperion (Hyperion Cantos, Book 2)"

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1;3;34mObservation: Books matching mood/theme 'uplifting' with context 'difficult time':

📚 The Midnight Library: A Novel by Matt Haig
   Rating: 4.3/5.0
   Asin: 0525559477
   Why it matches: A powerful and uplifting story about regrets and the choices we make

📚 Good Days Start With Gratitude: A 52 Week Guide To Cultivate An Attitude Of Gratitude: Gratitude Journal by Unknown
   Rating: 4.6/5.0
   Asin: 1976436184
   Why it matches: An excellent way to focus on the positive and change your perspective to be thankful for the little things

📚 Go for No! Yes is the Destination, No is How You Get There by Andrea Waltz
   Rating: 4.7/5.0
   Asin: 0966398130
   Why it matches: Achieve courageous breakthrough performance through increasing failure rates and other unconventional approaches

Summary: These books are uplifting and focus on gratitude, personal growth, and overcoming regrets, offering an optimistic outlook on life and personal development.
[0m> Running step 8e36d6b1-7465-426e

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1;3;34mObservation: Books matching mood/theme 'hopeful' with context 'books':

📚 Maybe by Kobi Yamada
   Rating: 4.9/5.0
   Asin: 1946873756
   Why it matches: This book is a gentle, imaginative read brimming with positivity that encourages children to develop a creative, adventure-seeking spirit and invites readers to reflect on the possibilities within themselves and to stay resilient in the face of adversity.

📚 I've Been Thinking . . .: Reflections, Prayers, and Meditations for a Meaningful Life by Maria Shriver
   Rating: 4.7/5.0
   Asin: 0525522603
   Why it matches: This book is intended to uplift, inspire, and comfort the reader, offering wisdom, guidance, and strength.

📚 The Daily Stoic: 366 Meditations on Wisdom, Perseverance, and the Art of Living by Ryan Holiday
   Rating: 4.8/5.0
   Asin: 0735211736
   Why it matches: This book provides meditations on wisdom, perseverance, and the art of living, promoting tranquility, fearlessness, and freedom through education and self