# Searching Strategies

## Data

In [9]:
# In-memory sample catalogue used throughout the notebook.
# Each entry is a dict with three string fields:
#   "title"       - the movie title
#   "description" - a one-sentence plot summary
#   "genre"       - one or more genre labels in a single comma-separated string
movies = [
    {"title": "Inception", "description": "A thief who enters the dreams of others to steal secrets is given a task to plant an idea in someone's mind.", "genre": "Sci-Fi, Thriller"},
    {"title": "The Matrix", "description": "A hacker discovers the reality he lives in is a simulation and joins a rebellion against its controllers.", "genre": "Sci-Fi, Action"},
    {"title": "Interstellar", "description": "A team of explorers travel through a wormhole in space to save humanity from a dying Earth.", "genre": "Sci-Fi, Drama"},
    {"title": "The Dark Knight", "description": "Batman battles the Joker, a criminal mastermind who wants to create chaos in Gotham City.", "genre": "Action, Crime"},
    {"title": "Titanic", "description": "A love story unfolds between a young aristocrat and a poor artist aboard the ill-fated Titanic.", "genre": "Romance, Drama"},
    {"title": "Avatar", "description": "A paraplegic Marine is sent to an alien planet where he becomes part of an indigenous tribe and fights for their survival.", "genre": "Sci-Fi, Adventure"},
    {"title": "The Godfather", "description": "The aging patriarch of an organized crime dynasty transfers control of his empire to his reluctant son.", "genre": "Crime, Drama"},
    {"title": "Pulp Fiction", "description": "Interwoven stories of crime, violence, and redemption in Los Angeles.", "genre": "Crime, Drama"},
    {"title": "The Shawshank Redemption", "description": "A man wrongly imprisoned for murder finds friendship and hope inside a maximum-security prison.", "genre": "Drama"},
    {"title": "Forrest Gump", "description": "A man with a low IQ unknowingly influences major historical events while searching for his lost love.", "genre": "Drama, Romance"},
    {"title": "The Avengers", "description": "Earth's mightiest heroes must unite to stop an alien invasion led by Loki.", "genre": "Action, Sci-Fi"},
    {"title": "Gladiator", "description": "A betrayed Roman general fights for vengeance as a gladiator in the Colosseum.", "genre": "Action, Drama"},
    {"title": "Joker", "description": "A mentally troubled comedian's descent into madness leads to the birth of Gotham's infamous villain.", "genre": "Crime, Drama"},
    {"title": "The Lion King", "description": "A young lion cub flees after his father's murder but returns to reclaim his kingdom.", "genre": "Animation, Drama"},
    {"title": "The Lord of the Rings: The Fellowship of the Ring", "description": "A young hobbit embarks on a quest to destroy a powerful ring that could doom Middle-earth.", "genre": "Fantasy, Adventure"},
    {"title": "Star Wars: A New Hope", "description": "A farm boy joins a rebellion to defeat an evil empire and rescue a princess.", "genre": "Sci-Fi, Adventure"},
    {"title": "Black Panther", "description": "The king of Wakanda must protect his nation from enemies while embracing his role as the Black Panther.", "genre": "Action, Sci-Fi"},
    {"title": "The Terminator", "description": "A cyborg assassin from the future is sent to kill a woman whose unborn child will lead a resistance against machines.", "genre": "Sci-Fi, Action"},
    {"title": "Jurassic Park", "description": "Scientists clone dinosaurs for a theme park, but chaos ensues when security fails.", "genre": "Sci-Fi, Adventure"},
    {"title": "Deadpool", "description": "A wisecracking mercenary with accelerated healing seeks revenge on the man who experimented on him.", "genre": "Action, Comedy"}
]


## Create Documents

In [10]:
from langchain.schema import Document

# Wrap every movie in a Document: the description becomes the page content,
# while the title and genre are carried along as metadata.
documents = [
    Document(
        page_content=entry["description"],
        metadata={field: entry[field] for field in ("title", "genre")},
    )
    for entry in movies
]

# Show the resulting Document objects
print(documents)


[Document(metadata={'title': 'Inception', 'genre': 'Sci-Fi, Thriller'}, page_content="A thief who enters the dreams of others to steal secrets is given a task to plant an idea in someone's mind."), Document(metadata={'title': 'The Matrix', 'genre': 'Sci-Fi, Action'}, page_content='A hacker discovers the reality he lives in is a simulation and joins a rebellion against its controllers.'), Document(metadata={'title': 'Interstellar', 'genre': 'Sci-Fi, Drama'}, page_content='A team of explorers travel through a wormhole in space to save humanity from a dying Earth.'), Document(metadata={'title': 'The Dark Knight', 'genre': 'Action, Crime'}, page_content='Batman battles the Joker, a criminal mastermind who wants to create chaos in Gotham City.'), Document(metadata={'title': 'Titanic', 'genre': 'Romance, Drama'}, page_content='A love story unfolds between a young aristocrat and a poor artist aboard the ill-fated Titanic.'), Document(metadata={'title': 'Avatar', 'genre': 'Sci-Fi, Adventure'},

## Embedding and ChromaDB

In [11]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# Sentence-transformers model used to embed the movie descriptions.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the Chroma index over the genre-tagged documents.
# - An explicit collection name keeps these records out of Chroma's default
#   collection, which any other `Chroma.from_documents(...)` call made without
#   a name in the same kernel would otherwise share.
# - Deterministic ids are meant to make this cell idempotent: re-running it
#   should overwrite the same records rather than append duplicates
#   (assumes the LangChain wrapper upserts by id - confirm for the installed
#   version).
vector_store = Chroma.from_documents(
    documents,
    embedding_model,
    ids=[f"movie-{index}" for index in range(len(documents))],
    collection_name="movies_vector_search",
)

  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


## Vector Search

In [17]:
def vector_search(query, top_k=2):
    """Return the movies whose descriptions are most similar to ``query``.

    Args:
        query: Free-text description passed to the vector store's
            ``similarity_search``.
        top_k: Number of documents requested from the vector store.

    Returns:
        A list of dicts with ``title``, ``description`` and ``genre`` keys,
        in the order the vector store returned the documents. A document
        whose metadata lacks ``title`` or ``genre`` gets ``"Unknown"`` for
        that field instead of raising ``KeyError`` (the same fallback the
        result-printing code in this notebook uses).
    """
    matches = vector_store.similarity_search(query, k=top_k)
    return [
        {
            "title": match.metadata.get("title", "Unknown"),
            "description": match.page_content,
            "genre": match.metadata.get("genre", "Unknown"),
        }
        for match in matches
    ]

def search_by_genre(query, genre, top_k=3, fetch_k=None):
    """Search movies by similarity to ``query``, keeping only a given genre.

    The genre filter is applied after the similarity search, so more than
    ``top_k`` candidates are requested first; otherwise non-matching
    documents among the nearest neighbours would shrink the result below
    ``top_k`` even when further matches exist.

    Args:
        query: Free-text description passed to the vector store's
            ``similarity_search``.
        genre: Genre label to keep. Matching is case-insensitive and by
            substring against the document's ``genre`` metadata string.
        top_k: Maximum number of documents to return.
        fetch_k: Number of candidates to request before filtering. Defaults
            to ``top_k * 5``; values smaller than ``top_k`` are raised to
            ``top_k``.

    Returns:
        Up to ``top_k`` documents, in the order the vector store returned
        them, whose metadata has a ``genre`` entry containing ``genre``.
        An empty list when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    if fetch_k is None:
        fetch_k = top_k * 5
    # NOTE(review): fetch_k may exceed the number of indexed documents;
    # presumably the store then just returns everything it has - confirm for
    # the installed Chroma version.
    candidates = vector_store.similarity_search(query, k=max(fetch_k, top_k))

    wanted = genre.lower()
    matching = [
        doc for doc in candidates
        if "genre" in doc.metadata and wanted in doc.metadata["genre"].lower()
    ]

    # Candidates arrive ranked by similarity, so the first top_k survivors
    # are the best matches within the genre.
    return matching[:top_k]

# Query 1: plain similarity search over the movie descriptions
query1 = "A complex story about exploring dreams and reality."
results1 = vector_search(query1, top_k=2)
print("General Vector Search Results:")
for movie in results1:
    # vector_search returns plain dicts, so fields are read by key.
    print(
        "Title: {}, Genre: {}, Description: {}".format(
            movie["title"], movie["genre"], movie["description"]
        )
    )

# Query 2: similarity search restricted to a single genre
query2 = "An epic adventure through space and beyond."
results2 = search_by_genre(query2, genre="Sci-Fi", top_k=2)
print("\nFiltered Search by Genre Results:")
for movie in results2:
    # search_by_genre returns Document objects, so fields come from the
    # metadata mapping and page_content.
    print(
        "Title: {}, Genre: {}, Description: {}".format(
            movie.metadata.get("title", "Unknown"),
            movie.metadata.get("genre", "Unknown"),
            movie.page_content,
        )
    )

General Vector Search Results:
Title: Inception, Genre: Sci-Fi, Thriller, Description: A thief who enters the dreams of others to steal secrets is given a task to plant an idea in someone's mind.
Title: Pulp Fiction, Genre: Crime, Drama, Description: Interwoven stories of crime, violence, and redemption in Los Angeles.

Filtered Search by Genre Results:
Title: Interstellar, Genre: Sci-Fi, Drama, Description: A team of explorers travel through a wormhole in space to save humanity from a dying Earth.
Title: Avatar, Genre: Sci-Fi, Adventure, Description: A paraplegic Marine is sent to an alien planet where he becomes part of an indigenous tribe and fights for their survival.


## Semantic Search

In [None]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.schema import Document

# Step 2: Generate embeddings using Hugging Face Embeddings
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Convert movie data to Documents.
# Genre is kept in the metadata because this cell rebinds `documents` and
# `vector_store`: the genre-aware search helpers defined earlier in the
# notebook read that field, and would otherwise break when re-run after
# this cell.
documents = [
    Document(
        page_content=movie["description"],
        metadata={"title": movie["title"], "genre": movie["genre"]},
    )
    for movie in movies
]

# Initialize ChromaDB with movie documents.
# - A dedicated collection name keeps these records out of Chroma's default
#   collection, so they are not mixed with documents indexed by another
#   `Chroma.from_documents(...)` call in the same kernel.
# - Deterministic ids are meant to make re-running this cell overwrite the
#   same records rather than append duplicates (assumes the LangChain
#   wrapper upserts by id - confirm for the installed version).
vector_store = Chroma.from_documents(
    documents,
    embedding_model,
    ids=[f"movie-{index}" for index in range(len(documents))],
    collection_name="movies_semantic_search",
)

# Step 3: Set up a Hugging Face pipeline for LLM
llm_model_name = "google/flan-t5-large"
# NOTE(review): device=0 requests the first accelerator (the recorded run
# reported "mps:0"); presumably this needs changing on a CPU-only machine -
# confirm against the installed transformers version.
hf_pipeline = pipeline("text2text-generation", model=llm_model_name, device=0)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Step 4: Create a Retrieval Chain using the retriever
retriever = vector_store.as_retriever()
# Custom prompt to ensure specific results
def custom_prompt(query: str):
    """Build the instruction text sent to the QA chain for ``query``.

    The prompt embeds the user's query together with a comma-separated list
    of every title found in the module-level ``documents`` list, and asks
    for only the matching titles.

    Args:
        query: The user's search request, inserted verbatim into the prompt.

    Returns:
        The prompt as a single multi-line string. Every line is indented by
        four spaces, and the text begins with a newline and ends with an
        indented empty line.
    """
    known_titles = ", ".join(doc.metadata["title"] for doc in documents)
    prompt_lines = (
        "",
        "    You are a helpful assistant tasked with retrieving movie titles based on descriptions.",
        f"    Query: {query}",
        "    From the following dataset, only provide the movie titles that match:",
        f"    Dataset: {known_titles}",
        "    Response:",
        "    ",
    )
    return "\n".join(prompt_lines)

# Define the QA chain on top of the LLM and retriever.
# No custom prompt template is wired into the chain itself; the text built by
# custom_prompt() is passed in below as the chain's query.
# NOTE(review): that presumably means the retriever searches with the whole
# prompt text (including the title list), not just the user's question -
# confirm whether that is intended.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Step 5: Perform semantic search
semantic_query = "Find me movies about dinosaurs"
custom_query = custom_prompt(semantic_query)
# Chain.run() is deprecated; invoke() takes the chain's input mapping
# ("query" for RetrievalQA) and returns a dict with the answer under "result".
semantic_results = qa_chain.invoke({"query": custom_query})["result"]

print("Semantic Search Results:")
print(semantic_results)


Device set to use mps:0


Semantic Search Results:
Jurassic Park
