# movie_search_solution.ipynb

# Section 1: Install & Import Libraries 

In [7]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Section 2: Load Dataset 

In [8]:
# Read the movie catalogue from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="movies.csv")
# Print the first five rows as plain text to sanity-check the load.
print(df.head(n=5))

              title                                               plot
0         Spy Movie  A spy navigates intrigue in Paris to stop a te...
1  Romance in Paris  A couple falls in love in Paris under romantic...
2      Action Flick  A high-octane chase through New York with expl...


# Section 3: Load Embedding Model 

In [9]:
# Instantiate the sentence-embedding model by its published name; this single
# `model` object is reused for both the plot texts and the search queries.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Section 4: Encode Plots 

In [10]:
# Convert every plot into an embedding with ONE batched encode() call instead
# of one call per row: encode() accepts a list of sentences, batches them
# internally, and returns one vector per input in input order.
# Wrapping the 2-D result in list() stores one 1-D vector per DataFrame row,
# which is the layout the search function reads from df["embeddings"].
df["embeddings"] = list(model.encode(df["plot"].tolist()))


# Section 5: Define Search Function

In [11]:

def search_movies(query, top_n=5):
    """
    Return the top_n movies whose plots are most similar to a search query.

    The query is encoded with the notebook-level ``model`` and compared, via
    cosine similarity, against the precomputed vectors stored in
    ``df["embeddings"]``.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``plot`` and ``similarity``, sorted by descending
        similarity and keeping the original row index of ``df``.

    Notes
    -----
    The shared ``df`` is only read, never written: scores are attached to a
    derived frame so repeated searches do not leave a stale ``similarity``
    column behind in the notebook's global state.
    """
    query_embedding = model.encode(query)
    plot_embeddings = df["embeddings"].tolist()
    # cosine_similarity expects 2-D inputs, so the single query vector is
    # wrapped in a list; row 0 of the result holds one score per movie.
    similarities = cosine_similarity([query_embedding], plot_embeddings)[0]
    ranked = (
        df[["title", "plot"]]
        .assign(similarity=similarities)  # new frame; df itself is untouched
        .sort_values("similarity", ascending=False)
        .head(top_n)
    )
    return ranked


# Section 6: Test Example Query 

In [12]:
# Run a sample query and print the ranked matches as plain text.
print(search_movies(query="spy thriller in Paris", top_n=5))

              title                                               plot  \
0         Spy Movie  A spy navigates intrigue in Paris to stop a te...   
1  Romance in Paris  A couple falls in love in Paris under romantic...   
2      Action Flick  A high-octane chase through New York with expl...   

   similarity  
0    0.769684  
1    0.388030  
2    0.256777  
