In [1]:
!pip install -q langchain langchain-groq

In [2]:
!pip -q install langchain-cohere

In [3]:
!pip install --upgrade --quiet langchain-chroma

In [4]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_cohere import CohereEmbeddings
from langchain_groq import ChatGroq

In [6]:
from google.colab import userdata

COHERE_API_KEY = userdata.get('COHERE_API_KEY')
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

In [7]:
embeddings = CohereEmbeddings(
    cohere_api_key=COHERE_API_KEY,
    model="embed-english-v3.0"
)



In [8]:
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="A hacker discovers reality is a simulation and leads a rebellion against the machines controlling it.",
        metadata={"year": 1999, "director": "Lana Wachowski, Lilly Wachowski", "rating": 8.7, "genre": "science fiction"},
    ),
    Document(
        page_content="A young lion prince flees his kingdom only to learn the true meaning of responsibility and bravery.",
        metadata={"year": 1994, "rating": 8.5, "genre": "animated"},
    ),
    Document(
        page_content="Batman faces off against the Joker, a criminal mastermind who plunges Gotham into chaos.",
        metadata={"year": 2008, "director": "Christopher Nolan", "rating": 9.0, "genre": "action"},
    ),
    Document(
        page_content="A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
        metadata={"year": 2014, "director": "Christopher Nolan", "rating": 8.6, "genre": "science fiction"},
    )
]

In [9]:
vectorstore = Chroma.from_documents(docs, embeddings)

In [22]:
def smart_retrieve(query: str):
    # Simple LLM to parse "science fiction movies rated >8" â†’ filter
    llm=ChatGroq(model_name="llama-3.3-70b-versatile", api_key = GROQ_API_KEY)

    prompt = f"""
    Parse this query into metadata filters for movies:
    Query: {query}

    Return JSON: {{"genre": "value", "rating_gt": 8}}
    """

    response = llm.invoke(prompt)

    filter_dict = {"$and": [{"genre": "science fiction"}, {"rating": {"$gt": 7}}]}

    # Vector search + metadata filter
    results = vectorstore.similarity_search(
        query,
        k=5,
        filter=filter_dict
    )
    return results

# Test it
results = smart_retrieve("science fiction movies rated above 8")
print(results)
print([doc.metadata for doc in results])

[Document(id='c6bf3de3-4f71-4a49-af78-3eccc63c7f2c', metadata={'rating': 8.6, 'year': 2014, 'director': 'Christopher Nolan', 'genre': 'science fiction'}, page_content="A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival."), Document(id='435c5f0d-da81-4fd9-9d9e-1c3f4459a586', metadata={'director': 'Lana Wachowski, Lilly Wachowski', 'genre': 'science fiction', 'rating': 8.7, 'year': 1999}, page_content='A hacker discovers reality is a simulation and leads a rebellion against the machines controlling it.'), Document(id='ff4e6106-33f8-44d7-9701-094411d8b055', metadata={'genre': 'science fiction', 'rating': 7.7, 'year': 1993}, page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose')]
[{'rating': 8.6, 'year': 2014, 'director': 'Christopher Nolan', 'genre': 'science fiction'}, {'director': 'Lana Wachowski, Lilly Wachowski', 'genre': 'science fiction', 'rating': 8.7, 'year': 1999}, {'genre': 'science fiction', 'rating': 7

In [23]:
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Movie:
    description: str
    director: Optional[str] = None
    year: Optional[int] = None
    rating: Optional[float] = None
    genre: Optional[str] = None

# Assuming `results` is already populated from the previous execution
structured_movies = []
for doc in results:
    movie_data = doc.metadata
    structured_movies.append(
        Movie(
            description=doc.page_content,
            director=movie_data.get('director'),
            year=movie_data.get('year'),
            rating=movie_data.get('rating'),
            genre=movie_data.get('genre')
        )
    )

# Print the structured movie data
for movie in structured_movies:
    print(f"Description: {movie.description}")
    print(f"Director: {movie.director if movie.director else 'N/A'}")
    print(f"Year: {movie.year if movie.year else 'N/A'}")
    print(f"Rating: {movie.rating if movie.rating else 'N/A'}")
    print(f"Genre: {movie.genre if movie.genre else 'N/A'}")
    print("----------------------------------------")

Description: A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.
Director: Christopher Nolan
Year: 2014
Rating: 8.6
Genre: science fiction
----------------------------------------
Description: A hacker discovers reality is a simulation and leads a rebellion against the machines controlling it.
Director: Lana Wachowski, Lilly Wachowski
Year: 1999
Rating: 8.7
Genre: science fiction
----------------------------------------
Description: A bunch of scientists bring back dinosaurs and mayhem breaks loose
Director: N/A
Year: 1993
Rating: 7.7
Genre: science fiction
----------------------------------------
