# Naive RAG

# A RAG pipeline

## Download Movies Dataset from Kaggle

Follow these steps to download the dataset:

1. **Get your Kaggle API credentials:**
   - Go to https://www.kaggle.com/settings/account
   - Scroll down to "API" section
   - Click "Create New Token"
   - This downloads a `kaggle.json` file

2. **Upload the kaggle.json file** to this workspace or run the cell below to set it up

In [None]:
# Setup Kaggle credentials
# Option 1: Upload kaggle.json manually, then run:
!mkdir -p ~/.kaggle
# If you uploaded kaggle.json to the workspace root:
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

### Download Movies Dataset 

In [None]:
# Download the Movies Dataset from Kaggle
!kaggle datasets download -d rounakbanik/the-movies-dataset

# Unzip the dataset
!unzip -o the-movies-dataset.zip

# List the files
!ls -lh *.csv

print("\n✅ Dataset downloaded and extracted!")

In [11]:
import pandas as pd
df = pd.read_csv('movies_metadata.csv')
df.head()

  df = pd.read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [None]:
# Download required NLTK data
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
print("✅ NLTK data downloaded!")

# Create VectorStore using CromeDB

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import NLTKTextSplitter
import chromadb

df = df.loc[:5000,['original_title', 'overview']]

## Step 1: Chunk the Overview Column using NLTKTextSplitter

In [None]:




text_splitter = NLTKTextSplitter(chunk_size=1500)

def split_overview(overview):
    if pd.isna(overview):
        return []
    return text_splitter.split_text(str(overview))

df['chunks'] = df['overview'].apply(split_overview)

# Flatten the dataframe for easier processing
chunked_df = df.explode('chunks').reset_index(drop=True)




## Step 2: Embed with a SentenceTransformer Encoder

In [None]:

embedder = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose another model

# Make sure all chunks are strings and not empty
def encode_chunk(chunk):
    if not isinstance(chunk, str) or chunk.strip() == "":
        return None
    return embedder.encode(chunk).tolist()

chunked_df['embeddings'] = chunked_df['chunks'].apply(encode_chunk)

# Drop rows where 'embeddings' is None
chunked_df.dropna(subset=['embeddings'], inplace=True)



## Step 3: Store in ChromaDB

In [None]:

# Initialize ChromaDB client and collection
client = chromadb.Client()
collection = client.create_collection(name='movies')

# Insert data into ChromaDB
for idx, row in chunked_df.iterrows():
    collection.add(
        ids=[str(idx)],
        embeddings=[row['embeddings']],
        metadatas=[{
            'original_title': row['original_title'],
            'chunk': row['chunks']
        }]
    )

print("Data successfully stored in ChromaDB.")

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from chromadb import Client
from sentence_transformers import SentenceTransformer
import chromadb
import torch

# Load the SentenceTransformer model for encoding queries
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # Use the same model as for embedding documents

# Load the text generation model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map='auto' # load it in the current GPU
)
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=True,
    max_new_tokens=800
)







## Function to retrieve top_k documents from ChromaDB

In [None]:

def retrieve_documents(query, collection, top_k=5):
    # Embed the query using the SentenceTransformer model
    query_embedding = sentence_model.encode(query).tolist()
    
    # Search for top_k similar documents in the collection
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    
    if not results['documents']:
        print("No results found for the query.")
        return [], []
    
    # Extract chunks and titles from the results
    chunks = []
    titles = []
    for document in results['metadatas'][0]:
        chunks.append(document['chunk'])
        titles.append(document['original_title'])
    
    return chunks, titles



## Function to generate answer based on retrieved chunks and titles

In [None]:

def generate_answer(query, chunks, titles, text_generation_pipeline):
    # Prepare the context from chunks and titles
    context = "\n\n".join([f"Title: {title}\nChunk: {chunk}" for title, chunk in zip(titles, chunks)])
    
    # Prepare the prompt
    prompt = f"""[INST]
    Instruction: You're an expert in movie suggestions. Your task is to analyze carefully the context and come up with an exhaustive answer to the following question:
    {query}
    
    Here is the context to help you:

    {context}

    [/INST]"""
    
    # Generate the answer using the model
    generated_text = text_generation_pipeline(prompt)[0]['generated_text']
    
    return generated_text



## Example usage

In [None]:

client = chromadb.Client()
# Use get_or_create_collection to avoid errors if collection doesn't exist
#collection = client.get_or_create_collection(name='movies')

query = "What are some good movies to watch on a rainy day?"
top_k = 5

# Retrieve documents
chunks, titles = retrieve_documents(query, collection, top_k)
print(f"Retrieved Chunks: {chunks}")
print(f"Retrieved Titles: {titles}")

## Generate answer

In [None]:
# Generate answer
if chunks and titles:
    answer = generate_answer(query, chunks, titles, text_generation_pipeline)
    print(answer)
else:
    print("No relevant documents found to generate an answer.")