In [None]:
# BLOG POST: https://www.mlwhiz.com/p/genai-series-building-rag-applications

## 0. Download the Data and Install Libraries

In [None]:
import kagglehub

# Download latest version
# The returned value is the local directory holding the dataset files; it is
# printed below so the location is visible in the notebook output.
path = kagglehub.dataset_download("tmdb/tmdb-movie-metadata")

print("Path to dataset files:", path)

In [None]:
!mv /home/ec2-user/.cache/kagglehub/datasets/tmdb/tmdb-movie-metadata/versions/2/* .

In [None]:
!pip install sentence_transformers

In [None]:
!pip install chromadb

In [None]:
!pip install google-genai

## 1. Load and Preprocess

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import os
import json
from google import genai
from pydantic import BaseModel, Field
from dotenv import load_dotenv
load_dotenv()  

True

In [2]:
# Load the two TMDB 5000 CSV files from the current working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
# NOTE(review): `credits` is not referenced by any later cell visible in this
# notebook, and the name shadows the interactive `credits` builtin - confirm
# whether this frame is still needed.
credits = pd.read_csv('tmdb_5000_credits.csv')

# Let's take a peek at what we're working with
print(f"Total movies: {len(movies)}")
# Last expression of the cell, so the two-row preview is rendered as a table.
movies[['title', 'overview', 'genres', 'release_date']].head(2)


Total movies: 4803


Unnamed: 0,title,overview,genres,release_date
0,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",2009-12-10
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",2007-05-19


In [3]:
# Extract genres from JSON string
def extract_genres(genres_json):
    """Return the genre names stored in a TMDB ``genres`` cell.

    Args:
        genres_json: A JSON string such as
            ``'[{"id": 28, "name": "Action"}]'``. Python-repr style text that
            uses single quotes is accepted as a fallback. Missing values
            (NaN/None), empty strings and non-string inputs are tolerated.

    Returns:
        A list of genre-name strings, in their original order. An empty list
        is returned when the value is missing or cannot be parsed as a list.
    """
    # NaN/None and other non-text cells simply carry no genres.
    if not isinstance(genres_json, str) or not genres_json.strip():
        return []

    try:
        # Parse the text as-is first: blindly swapping quotes would corrupt
        # valid JSON that contains an apostrophe (e.g. "Children's").
        genres_list = json.loads(genres_json)
    except json.JSONDecodeError:
        try:
            # Fallback for single-quoted, Python-repr style input.
            genres_list = json.loads(genres_json.replace("'", '"'))
        except json.JSONDecodeError:
            return []

    if not isinstance(genres_list, list):
        return []

    # Ignore malformed entries instead of discarding the whole list.
    return [
        genre['name']
        for genre in genres_list
        if isinstance(genre, dict) and 'name' in genre
    ]

# Apply extraction
# Adds a 'genres_list' column to `movies` in place; create_movie_documents
# reads that column when it builds the documents.
movies['genres_list'] = movies['genres'].apply(extract_genres)

# Now create rich document representations with movie descriptions
def create_movie_documents(movies):
    """Build one embeddable document record per row of the movies table.

    Every row is read for its 'id', 'title', 'overview', 'release_date' and
    'genres_list' values. The returned list holds one dict per row with:

    - 'id': the movie id converted to a string
    - 'content': the text to embed - a "Movie:" line followed by optional
      "Year:", "Genres:" and "Overview:" lines
    - 'metadata': title, year, "|"-joined genres and overview ("" if missing)
    """
    def build_record(row):
        movie_id, title, overview = row['id'], row['title'], row['overview']

        # Only the first four characters of the release date are kept (the year).
        release_date = row['release_date']
        has_date = pd.notna(release_date) and len(release_date) >= 4
        year = release_date[:4] if has_date else ""

        genres = row['genres_list']

        # Assemble the text that gets embedded, skipping fields that are missing.
        lines = [f"Movie: {title}\n"]
        if year:
            lines.append(f"Year: {year}\n")
        if genres and len(genres) > 0:
            lines.append(f"Genres: {', '.join(genres)}\n")
        if overview and pd.notna(overview):
            lines.append(f"Overview: {overview}\n")

        return {
            'id': str(movie_id),
            'content': "".join(lines),
            'metadata': {
                'title': title,
                'year': year,
                'genres': "|".join(genres),
                'overview': overview if pd.notna(overview) else "",
            },
        }

    return [build_record(row) for _, row in movies.iterrows()]

# Create our document collection
movie_documents = create_movie_documents(movies)
print(f"Created {len(movie_documents)} movie documents")
print(f"Sample document:")
# Index 1 is the second row of the CSV; its 'content' is the text that gets embedded.
print(movie_documents[1]['content'])


Created 4803 movie documents
Sample document:
Movie: Pirates of the Caribbean: At World's End
Year: 2007
Genres: Adventure, Fantasy, Action
Overview: Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.



## 3. Create our Embeddings

In [4]:
# Initialize our embedding model.
# all-mpnet-base-v2 is a sizeable download (the progress output below shows a
# 438M weights file); a smaller sentence-transformers model such as
# all-MiniLM-L6-v2 can be swapped in when speed matters more than quality.
model = SentenceTransformer('all-mpnet-base-v2')

# Initialize ChromaDB.
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once "my_movie_index" already exists in the running kernel.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="my_movie_index")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Embed and index the documents chunk by chunk so that only one chunk of
# embeddings is held in memory at a time.
batch_size = 1000
total_documents = len(movie_documents)

for start in range(0, total_documents, batch_size):
    chunk = movie_documents[start:start + batch_size]

    # Split the chunk into the parallel lists the collection expects.
    ids, contents, metadatas = [], [], []
    for entry in chunk:
        ids.append(entry['id'])
        contents.append(entry['content'])
        metadatas.append(entry['metadata'])

    # Encode the document texts, then store vectors, texts and metadata together.
    embeddings = model.encode(contents)
    collection.add(
        ids=ids,
        embeddings=embeddings.tolist(),
        documents=contents,
        metadatas=metadatas,
    )

print(f"Added {collection.count()} movies to the vector database")

Added 4803 movies to the vector database


In [6]:
# Smoke test: embed one free-text query and print the three nearest movies.
print("\nLet's test our vector store with a sample query...")
test_query = "sci-fi movies with philosophical themes"

# The query is encoded with the same model that encoded the documents.
test_embedding = model.encode(test_query).tolist()

results = collection.query(query_embeddings=[test_embedding], n_results=3)

# A single query embedding was sent, so only the first entry ([0]) of each
# result field is read.
for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"\nResult {i+1}:")
    print(f"Title: {metadata['title']}")
    print(f"Overview: {metadata['overview']}")
    print(f"Genres: {metadata['genres']}")



Let's test our vector store with a sample query...

Result 1:
Title: Interstellar
Overview: Interstellar chronicles the adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage.
Genres: Adventure|Drama|Science Fiction

Result 2:
Title: Transcendence
Overview: Two leading computer scientists work toward their goal of Technological Singularity,  as a radical anti-technology organization fights to prevent them from creating a world where computers can transcend the abilities of the human brain.
Genres: Thriller|Science Fiction|Drama|Mystery

Result 3:
Title: The Theory of Everything
Overview: The Theory of Everything is the extraordinary story of one of the world’s greatest living minds, the renowned astrophysicist Stephen Hawking, who falls deeply in love with fellow Cambridge student Jane Wilde.
Genres: Drama|Romance


## 4. Create Retrieval

### 4.1 Basic Retrieval

In [7]:
def query_expansion(query):
    """Produce rule-based rephrasings of a query to widen retrieval.

    Returns a four-item list: the query exactly as given, followed by three
    fixed-template variants built around it.
    """
    similar_to = f"Movies similar to {query}"
    plot_focused = f"Plot description: {query}"
    theme_focused = f"Movie themes and content about {query}"

    # The untouched query always comes first.
    return [query, similar_to, plot_focused, theme_focused]

def retrieve_movie_info(query, n_results=5):
    """Look up movies matching a query and each of its expansions.

    The query is expanded with query_expansion(); every variant is embedded
    with the module-level `model` and sent to the module-level `collection`,
    asking for `n_results` hits each.

    Returns a list of dicts with keys 'id', 'document', 'metadata' and 'query'
    (the variant that produced the hit). A movie id appears at most once - its
    first occurrence wins - so the list can hold more than `n_results` entries.
    """
    # Keyed by movie id; dict insertion order doubles as the result order.
    first_hit_by_id = {}

    for variant in query_expansion(query):
        variant_vector = model.encode(variant).tolist()
        hits = collection.query(
            query_embeddings=[variant_vector],
            n_results=n_results,
        )

        # One embedding was queried, so each field has a single inner list.
        hit_rows = zip(hits['documents'][0], hits['metadatas'][0], hits['ids'][0])
        for text, meta, movie_id in hit_rows:
            if movie_id in first_hit_by_id:
                continue
            first_hit_by_id[movie_id] = {
                'id': movie_id,
                'document': text,
                'metadata': meta,
                'query': variant,
            }

    return list(first_hit_by_id.values())

In [8]:
# Inspect the raw retrieval output (deduplicated hits across all query expansions).
retrieve_movie_info("sci-fi movies with philosophical themes")

[{'id': '157336',
  'document': 'Movie: Interstellar\nYear: 2014\nGenres: Adventure, Drama, Science Fiction\nOverview: Interstellar chronicles the adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage.\n',
  'metadata': {'year': '2014',
   'genres': 'Adventure|Drama|Science Fiction',
   'title': 'Interstellar',
   'overview': 'Interstellar chronicles the adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage.'},
  'query': 'sci-fi movies with philosophical themes'},
 {'id': '157353',
  'document': 'Movie: Transcendence\nYear: 2014\nGenres: Thriller, Science Fiction, Drama, Mystery\nOverview: Two leading computer scientists work toward their goal of Technological Singularity,  as a radical anti-technology organiz

In [9]:
# Check we have access to Gemini
# The key is read from the GENAI_API_KEY environment variable; load_dotenv() in
# the imports cell means it may come from a local .env file. os.environ.get
# yields None when the variable is not set.
client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))

# One throwaway prompt to confirm that the client and model name work.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=["whats your name"]
)
print(response.text)

I am a large language model, trained by Google.



In [10]:
# Schema of one recommendation; generate_rag_response passes list[Content] as
# the response_schema of its Gemini request.
class Content(BaseModel):
    # Title of the recommended movie.
    title: str = Field(..., description="title of the movie")
    # Explanation of why this movie was recommended.
    why_picked: str = Field(..., description="why did you pick it")


def generate_rag_response(query, context):
    """Ask Gemini to answer a movie query using retrieved context.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        context: Text describing the retrieved movies, inserted verbatim into
            the prompt.

    Returns:
        The `text` of the model response. The request asks for JSON output
        shaped as a list of `Content` items (title, why_picked).
    """
    # Format the prompt with retrieved context
    prompt = f"""
    You are a knowledgeable movie recommendation system with deep understanding of film plots, themes, and content. 
    Use the following retrieved information about movies to answer the user's question.
    
    User Query: {query}
    
    Retrieved Movie Information:
    {context}
    
    Based on this information, provide a helpful response to the user's query.
    If the user is asking about themes, plot elements, or specific content, use the movie overviews to provide detailed insights.

    If the retrieved information doesn't contain relevant details to answer the question, acknowledge the limitations and provide general movie information or suggestions.
    """
    
    # Request structured output: JSON constrained to a list of Content objects.
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt],
        config={
        "response_mime_type": "application/json",
        "response_schema": list[Content],
    })
    
    return response.text


def movie_rag(user_query):
    """Run the basic RAG pipeline: retrieve, build context, generate.

    Returns a `(response, results)` tuple: the generated answer and the
    retrieved records it was based on.
    """
    retrieved = retrieve_movie_info(user_query)

    # The context is the retrieved document texts separated by blank lines.
    context_blocks = [hit['document'] for hit in retrieved]
    answer = generate_rag_response(user_query, "\n\n".join(context_blocks))

    return answer, retrieved

In [11]:
# Run the basic pipeline once; `results` is kept to inspect what was retrieved.
response, results = movie_rag("romantic movies but sci-fi")

In [12]:
# The response is JSON text (a list of title / why_picked objects).
print(response)

[
  {
    "title": "The Lovers",
    "why_picked": "This film is an epic romance and time travel adventure set against the backdrop of war, featuring a British officer and an Indian woman in 18th century India, and a present-day marine biologist and his wife."
  },
  {
    "title": "Her",
    "why_picked": "This movie explores an unconventional love story between a lonely writer and his newly developed operating system, blending science fiction with a sweet tale about love and technology."
  },
  {
    "title": "Code 46",
    "why_picked": "A futuristic love story where the romance is doomed by genetic incompatibility, fitting both the romance and science fiction themes."
  }
]


In [13]:
# Titles of every retrieved movie, to compare against what the model picked.
print("Retrieval Context:", [x['metadata']['title'] for x in results])

Retrieval Context: ['The Lovers', 'Her', 'Code 46', 'Interstellar', 'Sex Drive', 'Star Trek Beyond', 'Two Lovers', 'The Last Five Years', 'Beginners']


In [14]:
# Test queries
# NOTE(review): the indexed documents only carry title, year, genres and
# overview, so criteria such as "good ratings" or "best" cannot be grounded in
# the retrieved context - confirm that this is the intended demonstration.
test_queries = [
    "Recommend me sci-fi movies like Inception",
    "What are the best comedy movies from the 90s?",
    "Movies about artificial intelligence with philosophical themes",
    "Family-friendly animated movies with good ratings"
]

# Run the basic pipeline on each query and show the answer plus a short sample
# of what was retrieved.
for query in test_queries:
    print(f"\nQuery: {query}")
    response, results = movie_rag(query)
    print("\nResponse:")
    print(response)
    print("\nRetrieved movies:")
    for res in results[:3]:  # Show top 3 retrieved movies
        print(f"- {res['metadata']['title']} ({res['metadata']['year']})")
    print("...")


Query: Recommend me sci-fi movies like Inception

Response:
[
  {
    "title": "Interstellar",
    "why_picked": "Like Inception, Interstellar is a complex science fiction film that delves into themes of reality, perception, and the human mind, all while presenting a visually stunning and thought-provoking narrative."
  },
  {
    "title": "Transcendence",
    "why_picked": "Transcendence explores the idea of implanting human consciousness into machines, similar to how Inception deals with planting ideas into the subconscious."
  },
  {
    "title": "Prometheus",
    "why_picked": "If you enjoyed the mystery and adventure aspects of Inception, Prometheus takes viewers on a journey to uncover the origins of mankind, filled with suspense and unexpected twists."
  },
  {
    "title": "I Origins",
    "why_picked": "I Origins shares the element of questioning reality and existence, this time through the eyes of a molecular biologist, akin to how Inception explores the nature of dreams and

### 4.2 HyDE Retrieval

In [15]:
def hyde_retrieval(query, n_results=5):
    """Retrieve movies with HyDE: embed a generated description, not the query.

    An LLM first writes a description of a movie that would answer the query.
    That text is embedded with the module-level `model` and used to search the
    module-level `collection`.

    Args:
        query: The user's free-text request.
        n_results: Number of hits to request from the collection.

    Returns:
        A list of dicts with keys 'id', 'document', 'metadata' and 'query'
        (always the original query), in the order returned by the collection.
    """
    # Step 1: Generate a hypothetical document using the LLM
    hyde_prompt = f"""
    You are a movie expert. Generate a detailed description of a movie that would perfectly answer this query: "{query}"
    Include details about plot, themes, genre, and style.
    """

    # Plain-text output is requested on purpose: the generated text is embedded
    # and compared against prose movie documents, so forcing schema-less JSON
    # would only add braces and key names to the embedded text.
    hyde_response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[hyde_prompt],
    )

    # Fall back to the raw query if the model returned no text, so retrieval
    # still works instead of failing inside the encoder.
    hypothetical_document = hyde_response.text or query

    # Step 2: Embed this hypothetical document instead of the query
    hyde_embedding = model.encode(hypothetical_document).tolist()

    # Step 3: Use this embedding for retrieval
    results = collection.query(
        query_embeddings=[hyde_embedding],
        n_results=n_results
    )

    # Format results; one embedding was queried, so read the first inner list.
    return [
        {
            'id': movie_id,
            'document': doc,
            'metadata': metadata,
            'query': query,
        }
        for doc, metadata, movie_id in zip(results['documents'][0],
                                           results['metadatas'][0],
                                           results['ids'][0])
    ]


def movie_hyde_rag(user_query):
    """Run the RAG pipeline with HyDE retrieval (15 hits requested).

    Returns a `(response, results)` tuple: the generated answer and the
    retrieved records it was based on.
    """
    retrieved = hyde_retrieval(user_query, 15)

    # The context is the retrieved document texts separated by blank lines.
    context_blocks = [hit['document'] for hit in retrieved]
    answer = generate_rag_response(user_query, "\n\n".join(context_blocks))

    return answer, retrieved

In [16]:
# HyDE pipeline on the Inception query; the basic pipeline is run on the same
# query a few cells further down for comparison.
response, results = movie_hyde_rag("sci-fi movies like Inception")

In [17]:
# Answer produced by the HyDE pipeline.
print(response)

[
  {
    "title": "Project Almanac",
    "why_picked": "Features a group of teens who discover plans for a time machine and build one, leading to unforeseen consequences, similar to the high-stakes situations and technological themes in Inception."
  },
  {
    "title": "Frequency",
    "why_picked": "Involves altering past events with ripple effects on the present, creating a complex narrative of cause and effect, reminiscent of the layered reality manipulation in Inception."
  },
  {
    "title": "Transcendence",
    "why_picked": "Explores the idea of technological singularity and computers surpassing human intelligence, presenting a thought-provoking scenario about the nature of reality and consciousness, akin to the philosophical underpinnings of Inception."
  },
  {
    "title": "Timecrimes",
    "why_picked": "Deals with the consequences of time travel and the creation of multiple versions of oneself, resulting in a mind-bending plot with escalating stakes, similar to the intri

In [18]:
# Same query through the basic pipeline, to compare against the HyDE answer.
response, results = movie_rag("sci-fi movies like Inception")

In [19]:
# Answer produced by the basic pipeline.
print(response)

[
  {
    "title": "Transcendence",
    "why_picked": "Like Inception, Transcendence delves into the complex themes of the human mind and technology, presenting a thrilling and mysterious narrative about pushing the boundaries of what's possible."
  },
  {
    "title": "Interstellar",
    "why_picked": "Interstellar shares Inception's grand scale and mind-bending concepts, exploring the limits of human understanding and the potential of the human mind through space travel and complex scientific theories."
  },
  {
    "title": "I Origins",
    "why_picked": "Similar to Inception's exploration of reality and perception, I Origins blends science fiction with deep philosophical questions, challenging the protagonist's beliefs and understanding of the world through scientific discoveries."
  },
  {
    "title": "Dawn of the Planet of the Apes",
    "why_picked": "shares Inception's action and suspenseful elements, the film presents a world where the lines between different groups are blurr

### 4.3 Retrieval with Query Decomposition

In [20]:
# Schema of one decomposed sub-query; decompose_query passes list[Query] as the
# response_schema of its Gemini request.
class Query(BaseModel):
    query: str  # text of a single sub-query
    
def decompose_query(query):
    """Ask the LLM to split a complex query into simpler sub-queries.

    The model is asked for JSON shaped as a list of `Query` objects; the
    'query' text of each item is whitespace-stripped and returned in order as
    a list of strings.
    """
    decompose_prompt = f"""
    Break down this complex movie-related query into 2-3 simpler sub-queries:
    "{query}"
    """

    # Structured output: JSON constrained to a list of Query objects.
    generation_config = {
        "response_mime_type": "application/json",
        "response_schema": list[Query],
    }
    decompose_response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[decompose_prompt],
        config=generation_config,
    )

    # Each parsed item is a dict carrying the sub-query under the 'query' key.
    parsed_items = json.loads(decompose_response.text)
    return [item['query'].strip() for item in parsed_items]

In [21]:
# A multi-constraint query, reused by the comparison cells that follow.
complex_query = "I want science fiction movies that deal with time travel but also have strong character development and emotional depth"
decompose_query(complex_query)

['Find science fiction movies about time travel.',
 'Identify movies known for strong character development.',
 'Find movies known for their emotional depth.']

In [22]:
def retrieve_with_decomposition(query, n_results=3):
    """Retrieve movies for a complex query by searching its sub-queries.

    The query is split with decompose_query(); each sub-query is passed to
    retrieve_movie_info() with `n_results`. Progress is printed along the way.

    Returns the combined hits with duplicate movie ids removed (the first
    occurrence is kept). Hits stay in retrieval order; no re-ranking is done.
    """
    print(f"Original query: {query}")

    # Step 1: split the query into simpler sub-queries.
    sub_queries = decompose_query(query)
    print(f"Decomposed into: {sub_queries}")

    # Step 2: run retrieval once per sub-query.
    hits_per_sub_query = []
    for sub_query in sub_queries:
        hits = retrieve_movie_info(sub_query, n_results=n_results)
        print(f"Retrieved {len(hits)} results for: '{sub_query}'")
        hits_per_sub_query.append(hits)

    # Step 3: merge, keeping only the first hit seen for each movie id.
    merged_by_id = {}
    for hits in hits_per_sub_query:
        for hit in hits:
            merged_by_id.setdefault(hit['id'], hit)

    return list(merged_by_id.values())

def movie_decomposition_rag(user_query):
    """Run the RAG pipeline with query-decomposition retrieval.

    Five hits are requested per sub-query. Returns a `(response, results)`
    tuple: the generated answer and the retrieved records it was based on.
    """
    retrieved = retrieve_with_decomposition(user_query, 5)

    # The context is the retrieved document texts separated by blank lines.
    context_blocks = [hit['document'] for hit in retrieved]
    answer = generate_rag_response(user_query, "\n\n".join(context_blocks))

    return answer, retrieved

In [23]:
# Decomposition pipeline on the complex query; the prints come from
# retrieve_with_decomposition.
response, results = movie_decomposition_rag(complex_query)

Original query: I want science fiction movies that deal with time travel but also have strong character development and emotional depth
Decomposed into: ['Find science fiction movies about time travel.', 'List movies known for strong character development.', 'Find movies known for their emotional depth and impact.']
Retrieved 7 results for: 'Find science fiction movies about time travel.'
Retrieved 12 results for: 'List movies known for strong character development.'
Retrieved 8 results for: 'Find movies known for their emotional depth and impact.'


In [24]:
# NOTE(review): this repeats the call from the previous cell. The printed
# sub-queries differ between the two runs, so the decomposition step is not
# deterministic - confirm whether both cells are needed.
response, results = movie_decomposition_rag(complex_query)
print(response)

Original query: I want science fiction movies that deal with time travel but also have strong character development and emotional depth
Decomposed into: ['Find science fiction movies featuring time travel.', 'Find movies with strong character development.', 'Find movies known for their emotional depth.']
Retrieved 7 results for: 'Find science fiction movies featuring time travel.'
Retrieved 11 results for: 'Find movies with strong character development.'
Retrieved 7 results for: 'Find movies known for their emotional depth.'
[
  {
    "title": "About Time",
    "why_picked": "This movie blends time travel with a strong emphasis on character development and emotional depth as the protagonist uses his abilities to improve his relationships and find personal fulfillment."
  },
  {
    "title": "Somewhere in Time",
    "why_picked": "This film uses time travel as a means to explore a deep and enduring love connection, focusing on the emotional journey of the main characters as they overcom

In [25]:
# Same complex query through the basic pipeline, for comparison.
response, results = movie_rag(complex_query)

In [26]:
# Answer produced by the basic pipeline for the complex query.
print(response)

[
  {
    "title": "About Time",
    "why_picked": "This movie blends time travel with strong character development and emotional depth. The plot focuses on Tim's personal journey as he uses his ability to travel through time to improve his life and find love, offering a mix of comedy, drama, and science fiction."
  },
  {
    "title": "The Time Traveler's Wife",
    "why_picked": "This film centers on the love story between Henry and Clare, complicated by Henry's genetic disorder that causes him to involuntarily travel through time. The narrative explores their relationship across different moments in time, emphasizing emotional depth and character development within a time travel context."
  }
]


In [27]:
# Same complex query through the HyDE pipeline, for comparison.
response, results = movie_hyde_rag(complex_query)

In [28]:
# Answer produced by the HyDE pipeline for the complex query.
print(response)

[
  {
    "title": "About Time",
    "why_picked": "This movie focuses on a character who uses time travel to improve his life and relationships, offering strong character development and emotional depth as he navigates the consequences of his actions."
  },
  {
    "title": "Frequency",
    "why_picked": "This film blends time travel with a crime drama, exploring the emotional bond between a father and son who communicate across time to prevent a tragedy, showcasing the impact of changing the past."
  },
  {
    "title": "The Butterfly Effect",
    "why_picked": "This movie delves into the consequences of altering past events, emphasizing character development as the protagonist grapples with the unforeseen and often devastating outcomes of his time-traveling actions, creating a suspenseful and emotionally charged narrative."
  }
]
