<a href="https://colab.research.google.com/github/MrPrabhathPeri/Cine-Chat-Project/blob/main/Cine_Chat_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import json

# 1. Read the raw movies CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='tmdb_5000_movies.csv',
)

# 2. Function to extract names from the complicated JSON columns
def extract_names(text):
    """Flatten a JSON-encoded list of objects into a space-separated string of names.

    Example: ``'[{"name": "Action"}, {"name": "Adventure"}]'`` becomes
    ``"Action Adventure"``.

    Args:
        text: A JSON string encoding a list of dicts that each carry a
            ``'name'`` key. Non-string cells (e.g. NaN or None for a missing
            value) are tolerated.

    Returns:
        The ``'name'`` values joined by single spaces, or ``""`` when the cell
        is missing, is not valid JSON, or does not have the expected shape.
    """
    try:
        # Convert string representation of list to actual list
        data_list = json.loads(text)
        # Extract the 'name' key from each item and join them with spaces
        return " ".join(item['name'] for item in data_list)
    except (TypeError, ValueError, KeyError):
        # TypeError:  the cell is not str/bytes (NaN/None), the payload is not a
        #             list of dicts, or a name is not a string.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses ValueError).
        # KeyError:   an item has no 'name' field.
        # Anything else (KeyboardInterrupt, MemoryError, ...) signals a real
        # problem and is deliberately left to propagate rather than swallowed.
        return ""

# 3. Flatten the JSON-encoded genre and keyword columns into plain text
print("Cleaning data... this might take a second.")
for raw_col, flat_col in (('genres', 'genre_names'), ('keywords', 'keyword_names')):
    df[flat_col] = df[raw_col].apply(extract_names)

# 4. A missing plot summary becomes "" so the concatenation below never produces NaN
df['overview'] = df['overview'].fillna("")

# 5. Build the single text field the embedding model will read:
#    labelled genre, then keywords, then plot.
genre_part = "Genre: " + df['genre_names']
keyword_part = " Keywords: " + df['keyword_names']
plot_part = " Plot: " + df['overview']
df['combined_text'] = genre_part + keyword_part + plot_part

# 6. Persist only what later steps need: ID, title and the combined text
final_df = df.loc[:, ['id', 'title', 'combined_text']]
final_df.to_csv('clean_movies.csv', index=False)

print("Success! Created 'clean_movies.csv'.")
print(final_df.head())

Cleaning data... this might take a second.
Success! Created 'clean_movies.csv'.
       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                       combined_text  
0  Genre: Action Adventure Fantasy Science Fictio...  
1  Genre: Adventure Fantasy Action Keywords: ocea...  
2  Genre: Action Adventure Crime Keywords: spy ba...  
3  Genre: Action Crime Drama Thriller Keywords: d...  
4  Genre: Action Adventure Science Fiction Keywor...  


In [2]:
!pip install chromadb tqdm

Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.0-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [3]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
import os

# 1. Open the on-disk vector store
# Vectors are persisted in the 'movie_db' folder, so a later session can reuse
# them instead of rebuilding.
client = chromadb.PersistentClient(
    path="movie_db",
)

# 2. Choose the embedding function
# all-MiniLM-L6-v2 is a lightweight sentence-transformers model; it is what
# turns text into vectors for this collection.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# 3. Fetch the "movies" collection, creating it on first use
# (a collection plays roughly the role a table does in SQL)
collection = client.get_or_create_collection(name="movies", embedding_function=sentence_transformer_ef)

# 4. Load your clean data
df = pd.read_csv('clean_movies.csv')

# Skip the build only when every movie is already stored. Testing merely for a
# non-empty collection would treat an interrupted (partial) build as finished
# and leave the database permanently incomplete.
stored_count = collection.count()
if stored_count >= len(df):
    print(f"Database already contains {stored_count} movies. Skipping build.")
else:
    print(f"Building database with {len(df)} movies... this might take 2-3 minutes.")

    # We add data in batches to be safe
    batch_size = 100

    # Prepare lists for ChromaDB
    # It needs: IDs (unique), Documents (the text), and Metadatas (extra info like Title)
    ids = [str(i) for i in df['id'].tolist()]
    documents = df['combined_text'].tolist()
    metadatas = df[['title', 'id']].to_dict(orient='records')

    # Loop and add
    for i in range(0, len(df), batch_size):
        end = min(i + batch_size, len(df))
        print(f"Adding batch {i} to {end}...")

        # upsert rather than add: when resuming after an interrupted run, ids
        # that were already stored are simply written again, so the loop is
        # safe to repeat from the start.
        collection.upsert(
            ids=ids[i:end],
            documents=documents[i:end],
            metadatas=metadatas[i:end]
        )

    # Only claim a successful build when one actually ran.
    print("------------------------------------------------")
    print("✅ Database successfully built!")

print(f"Total movies stored: {collection.count()}")

# ------------------------------------------------
# SANITY CHECK: run one query end-to-end to confirm retrieval works
# ------------------------------------------------
print("\n🔎 Test Search: 'A movie about space and aliens'")
results = collection.query(query_texts=["A movie about space and aliens"], n_results=3)

# A single query text was sent, so its hits are the first entry of 'metadatas'.
top_matches = results['metadatas'][0]
for rank, meta in enumerate(top_matches, start=1):
    print(f"{rank}. {meta['title']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Building database with 4803 movies... this might take 2-3 minutes.
Adding batch 0 to 100...
Adding batch 100 to 200...
Adding batch 200 to 300...
Adding batch 300 to 400...
Adding batch 400 to 500...
Adding batch 500 to 600...
Adding batch 600 to 700...
Adding batch 700 to 800...
Adding batch 800 to 900...
Adding batch 900 to 1000...
Adding batch 1000 to 1100...
Adding batch 1100 to 1200...
Adding batch 1200 to 1300...
Adding batch 1300 to 1400...
Adding batch 1400 to 1500...
Adding batch 1500 to 1600...
Adding batch 1600 to 1700...
Adding batch 1700 to 1800...
Adding batch 1800 to 1900...
Adding batch 1900 to 2000...
Adding batch 2000 to 2100...
Adding batch 2100 to 2200...
Adding batch 2200 to 2300...
Adding batch 2300 to 2400...
Adding batch 2400 to 2500...
Adding batch 2500 to 2600...
Adding batch 2600 to 2700...
Adding batch 2700 to 2800...
Adding batch 2800 to 2900...
Adding batch 2900 to 3000...
Adding batch 3000 to 3100...
Adding batch 3100 to 3200...
Adding batch 3200 to 3300.

In [5]:
!pip install groq

Collecting groq
  Downloading groq-0.37.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.37.1-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.37.1


In [8]:
import chromadb
from chromadb.utils import embedding_functions
import os
from groq import Groq

# --------------------------------------------------------------
# CONFIGURATION
# --------------------------------------------------------------
# Do NOT paste the key into this cell: it would be saved with the notebook and
# leak as soon as the notebook is shared or committed. The key is read from the
# GROQ_API_KEY environment variable; if that is unset, you are prompted for it
# and the input is not echoed.
import getpass

GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")

# --------------------------------------------------------------
# 1. SETUP THE BRAIN (LLM + VECTOR DB)
# --------------------------------------------------------------
# Initialize Groq Client (The LLM)
client = Groq(api_key=GROQ_API_KEY)

# Initialize ChromaDB (The Memory)
# get_collection (not get_or_create) is used on purpose: the "movies"
# collection must already have been built in the 'movie_db' folder.
chroma_client = chromadb.PersistentClient(path="movie_db")
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = chroma_client.get_collection(name="movies", embedding_function=sentence_transformer_ef)

# --------------------------------------------------------------
# 2. THE RAG FUNCTION
# --------------------------------------------------------------
def get_movie_recommendation(user_query, n_results=3):
    """Answer a movie request using retrieval-augmented generation.

    Queries the module-level Chroma ``collection`` for the stored movies closest
    to ``user_query``, then asks the module-level Groq ``client`` to recommend
    the best one based only on those movies.

    Args:
        user_query: The user's free-text request.
        n_results: How many movies to retrieve and show to the model.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The model's reply text. If the search returns no movies, a fixed
        "nothing found" message is returned and the model is not called.
    """
    print(f"\nThinking about: '{user_query}'...")

    # STEP A: RETRIEVE
    # Search DB for the most relevant movies
    results = collection.query(
        query_texts=[user_query],
        n_results=n_results
    )

    # One query text was sent, so index 0 holds its hits.
    retrieved_movies = results['documents'][0]
    movie_titles = [meta['title'] for meta in results['metadatas'][0]]

    if not retrieved_movies:
        # With nothing retrieved the prompt would contain no movie data, and the
        # model could only make something up.
        return "Sorry, I couldn't find any movies matching that request."

    # Create a "Context String" to feed the AI
    # This is where we "Augment" the knowledge
    context = "".join(
        f"Movie {position} Title: {title}\nPlot: {plot}\n\n"
        for position, (title, plot) in enumerate(zip(movie_titles, retrieved_movies), start=1)
    )

    # Describe the number of movies actually retrieved, which can be lower than
    # n_results when the collection is small.
    movie_count = len(retrieved_movies)
    movie_count_phrase = f"{movie_count} movie" + ("" if movie_count == 1 else "s")

    # STEP B: GENERATE
    # Construct the System Prompt
    system_prompt = f"""
    You are an expert movie recommender.
    I will provide you with information about {movie_count_phrase}.
    Based strictly on these movies, answer the user's request.
    Recommend the best one and explain why.

    HERE IS THE MOVIE DATA:
    {context}
    """

    # Call the LLM
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_query}
        ],
        model="llama-3.3-70b-versatile",
    )

    return chat_completion.choices[0].message.content

# --------------------------------------------------------------
# 3. TEST IT
# --------------------------------------------------------------
# Keep asking until the user types 'quit' (compared case-insensitively).
while (user_input := input("\n\nAsk for a movie (or type 'quit'): ")).lower() != 'quit':
    response = get_movie_recommendation(user_input)
    print("\nAI ANSWER:")
    print(response)



Ask for a movie (or type 'quit'): I want a sad romantic movie where someone dies.

Thinking about: 'I want a sad romantic movie where someone dies.'...

AI ANSWER:
Based on the movie data provided, I would recommend "Things We Lost in the Fire" as the best fit for a sad romantic movie where someone dies. 

This movie matches your request because it is a drama that involves the loss of a husband, and it explores the themes of sadness, loss, and grief. The plot revolves around a recent widow and her family coping with the death of her husband, which aligns with your request for a movie where someone dies. While it's not strictly a romantic movie, it does have romantic elements and deals with the emotional aftermath of a loss, making it a good fit for your request.

In contrast, "Run, Hide, Die" is more of a thriller horror movie, and while it does involve death, it doesn't quite fit the sad romantic genre. "Poetic Justice" does involve loss and sadness, but it's more focused on the poe

In [14]:
%%writefile app.py
import streamlit as st
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
import os
from groq import Groq

# --------------------------------------------------------------
# PAGE SETUP: page title/icon first, then the on-page heading and caption
# --------------------------------------------------------------
st.set_page_config(
    page_title="Cine-Chat",
    page_icon="🎬",
)
st.title("🎬 Cine-Chat: The AI Movie Expert")
st.caption("Powered by Llama 3.3 & RAG")

# --------------------------------------------------------------
# SETUP (CACHED)
# --------------------------------------------------------------
def _extract_names(text):
    """Join the 'name' values of a JSON-encoded list of dicts with spaces ("" if unparsable)."""
    import json  # local import: only needed while building the database

    try:
        return " ".join(item['name'] for item in json.loads(text))
    except (TypeError, ValueError, KeyError):
        # Missing cell (NaN/None), malformed JSON, or an item without 'name'.
        return ""


def _resolve_groq_api_key():
    """Return the Groq API key from Streamlit secrets, else the environment, else None."""
    try:
        return st.secrets["GROQ_API_KEY"]
    except Exception:
        # The secret (or the secrets file itself) is not available. The exact
        # exception type is left open here (presumably it differs between
        # Streamlit versions), but a bare `except:` is avoided so that
        # BaseException-derived control-flow signals are never swallowed.
        return os.environ.get("GROQ_API_KEY")


@st.cache_resource
def load_resources():
    """Create the Groq client and the Chroma "movies" collection, building the latter if empty.

    Cached with ``st.cache_resource``.

    The API key is taken from ``st.secrets["GROQ_API_KEY"]`` or, failing that,
    the ``GROQ_API_KEY`` environment variable. If neither is set, an error is
    shown and the script is stopped instead of creating a client with a
    placeholder key that can never authenticate.

    When the collection is empty it is populated from ``tmdb_5000_movies.csv``;
    if that file is missing, an error is shown and the script is stopped.

    Returns:
        tuple: ``(client, collection)`` - the Groq client and the Chroma collection.
    """
    GROQ_API_KEY = _resolve_groq_api_key()
    if not GROQ_API_KEY:
        st.error("Groq API key not found. Add GROQ_API_KEY to the Streamlit secrets "
                 "or set it as an environment variable.")
        st.stop()

    client = Groq(api_key=GROQ_API_KEY)

    db_path = "movie_db"
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection(name="movies", embedding_function=sentence_transformer_ef)

    # BUILD DB IF EMPTY
    if collection.count() == 0:
        st.info("Building database... (~1 min)")
        if not os.path.exists('tmdb_5000_movies.csv'):
            st.error("CSV file not found. Please upload it!")
            st.stop()

        df = pd.read_csv('tmdb_5000_movies.csv')

        # One text document per movie: labelled genre, keywords and plot.
        df['combined_text'] = ("Genre: " + df['genres'].apply(_extract_names) +
                               " Keywords: " + df['keywords'].apply(_extract_names) +
                               " Plot: " + df['overview'].fillna(""))

        ids = [str(i) for i in df['id'].tolist()]
        documents = df['combined_text'].tolist()
        metadatas = df[['title', 'id']].to_dict(orient='records')

        # Insert in fixed-size batches rather than in one huge call.
        batch_size = 200
        for i in range(0, len(df), batch_size):
            end = min(i + batch_size, len(df))
            collection.add(ids=ids[i:end], documents=documents[i:end], metadatas=metadatas[i:end])
        st.success("Database built!")

    return client, collection

client, collection = load_resources()

# --------------------------------------------------------------
# CHAT INTERFACE
# --------------------------------------------------------------
# Seed the conversation with a greeting when no history is stored yet.
if "messages" not in st.session_state:
    st.session_state["messages"] = [{"role": "assistant", "content": "I am a movie expert. Ask me anything!"}]

# Replay the stored history so earlier turns stay visible.
for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input("Ask for a movie recommendation..."):
    # Show and record the user's turn.
    st.chat_message("user").write(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Retrieve the three closest movies and format them as context for the model.
    hits = collection.query(query_texts=[prompt], n_results=3)
    context_text = "".join(
        f"Movie: {meta['title']}\nPlot: {doc}\n\n"
        for doc, meta in zip(hits['documents'][0], hits['metadatas'][0])
    )

    system_prompt = f"""
    You are a movie expert. The user wants a recommendation.
    Here are 3 relevant movies:
    {context_text}
    Recommend the best one and explain why.
    """

    llm_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    chat_completion = client.chat.completions.create(
        messages=llm_messages,
        model="llama-3.3-70b-versatile",
    )

    # Show and record the assistant's turn.
    response = chat_completion.choices[0].message.content
    st.chat_message("assistant").write(response)
    st.session_state.messages.append({"role": "assistant", "content": response})

Overwriting app.py


In [18]:
import os
from pyngrok import ngrok

# ---------------------------------------------------------
# 1. AUTHENTICATE
# ---------------------------------------------------------
# The authtoken is a credential and must never be written into the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable; if that is unset,
# you are prompted for it and the input is not echoed.
# NOTE(review): a token was previously hardcoded in this cell - treat it as
# leaked and rotate it in the ngrok dashboard.
import getpass

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# ---------------------------------------------------------
# 2. CLEANUP & LAUNCH
# ---------------------------------------------------------
# Kill any existing tunnels so we don't get errors
ngrok.kill()

# Run Streamlit in the background (silently).
# stdout and stderr are discarded (/dev/null) so they don't clutter the screen;
# point the redirect at a file instead if you need the logs for debugging.
os.system("streamlit run app.py > /dev/null 2>&1 &")

# ---------------------------------------------------------
# 3. OPEN THE TUNNEL
# ---------------------------------------------------------
# Open a tunnel to port 8501 (where Streamlit lives)
try:
    public_url = ngrok.connect(8501).public_url
    print("✅ Success! Your App is running.")
    print(f"🚀 Click this link to use it: {public_url}")
except Exception as e:
    # Best-effort: report the failure in the cell output instead of raising.
    print("❌ Error starting ngrok:", e)

ERROR:pyngrok.process.ngrok:t=2025-12-05T11:27:10+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"
ERROR:pyngrok.process.ngrok:t=2025-12-05T11:27:10+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"


❌ Error starting ngrok: The ngrok process errored on start: authentication failed: The authtoken you specified does not look like a proper ngrok authtoken.\nYour authtoken: NGROK_AUTHTOKEN\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n.
