In [1]:
import os

# huggingface_hub evaluates this flag once, when it is first imported (which
# happens while importing sentence_transformers below). Setting it after that
# import has no effect, so it must be exported before any third-party import.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer
import faiss

# Lift pandas' display truncation so long overview texts are shown in full.
# NOTE(review): max_rows=None also means a bare DataFrame expression prints every
# row; always slice (.head(), .sample()) before displaying large frames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# Read the prepared movie metadata and print its schema / non-null summary.
movies_csv = os.path.join('data', 'prepared_movies.csv')
df_movies = pd.read_csv(movies_csv)
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16960 entries, 0 to 16959
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            16960 non-null  int64  
 1   cast          16960 non-null  object 
 2   directors     16960 non-null  object 
 3   writers       16960 non-null  object 
 4   keywords      16960 non-null  object 
 5   release_date  16960 non-null  object 
 6   title         16960 non-null  object 
 7   overview      16960 non-null  object 
 8   genres        16960 non-null  object 
 9   popularity    16960 non-null  float64
 10  vote_count    16960 non-null  int64  
 11  vote_average  16960 non-null  float64
 12  poster_path   16891 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 1.7+ MB


1. Select Specific Columns Only

In [3]:
# Keep only the identifier, the title and the overview text in an independent
# frame, so df_movies itself is left untouched.
df_plot = df_movies[['id', 'title', 'overview']].copy()
df_plot.head()

Unnamed: 0,id,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
1,8844,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."
2,15602,Grumpier Old Men,"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe."
4,11862,Father of the Bride Part II,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own."


2. Use Sentence Transformers to get embeddings  
Reference: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [4]:
# Embed every overview with the pretrained 'all-mpnet-base-v2' sentence-transformer.
# The list is built in frame order; later cells index embedding_matrix by df_plot
# row position, so encode() is relied on to return one row per input, in input order.
descriptions = df_plot['overview'].tolist()
model = SentenceTransformer('all-mpnet-base-v2')
# Single encode() call over the whole corpus (530 batches in the recorded run).
embedding_matrix = model.encode(descriptions, show_progress_bar=True)
# Sanity check: the recorded run printed (16960, 768), i.e. one vector per movie.
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Batches:   0%|          | 0/530 [00:00<?, ?it/s]

Embedding matrix shape: (16960, 768)


In [5]:
# Rebuild the index as a fresh 0..n-1 range so index labels coincide with row
# positions (they are later used with embedding_matrix[...] and df_plot.iloc).
df_plot = df_plot.reset_index(drop=True)
# Title -> row-position lookup. Titles become index labels without de-duplication,
# so a title shared by several movies maps to several positions.
indices = pd.Series(df_plot.index, index=df_plot['title'])

3. Save matrices and indices

In [6]:
# Persist both artifacts to disk as pickles.
# Note: the matrix is written as returned by encode(); the L2 normalisation is
# only applied in a later cell, so the file holds the un-normalised vectors.
for pickle_path, payload in (
    ('data/overview_embedding_matrix.pkl', embedding_matrix),  # Save embedding_matrix
    ('data/overview_indices.pkl', indices),                    # Save indices
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

4. Use FAISS for similarity search

In [7]:
# Scale every row to unit L2 norm. No return value is captured: the array is
# modified in place, so embedding_matrix holds normalised vectors from here on.
faiss.normalize_L2(embedding_matrix)
embedding_dimension = embedding_matrix.shape[1]
# Flat (exhaustive) inner-product index; on unit-length vectors the inner product
# equals the cosine similarity.
faiss_index = faiss.IndexFlatIP(embedding_dimension) 
# Vectors are added in row order; get_recommendations relies on the resulting
# Faiss ids matching df_plot row positions.
faiss_index.add(embedding_matrix)
print(f"Faiss index created with {faiss_index.ntotal} vectors.")

Faiss index created with 16960 vectors.


In [8]:
# Function to find movie index 
def find_movie_index(title, indices_map):
    """
    Look up the row position stored for ``title`` in ``indices_map``.

    ``indices_map`` is any mapping-like object supporting ``in`` and ``[]``
    (here a pandas Series keyed by title). When the lookup yields a Series,
    which happens if the title occurs more than once, the first entry is returned.
    Returns None when the title is not present.
    """
    if title not in indices_map:
        return None

    match = indices_map[title]
    if isinstance(match, pd.Series):
        # Several rows share this title: settle on the first one.
        return match.iloc[0]
    return match

In [12]:
def get_recommendations(title, n=10):
    """
    Return the top ``n`` movies whose overview embedding is most similar to ``title``'s.

    Relies on the notebook-level ``indices`` (title -> row position),
    ``embedding_matrix`` (one row per movie), ``faiss_index`` and ``df_plot``.
    ``faiss_index`` is a flat inner-product index, so the search is exhaustive
    (exact), not approximate. The embeddings are L2-normalised before being
    indexed, so the returned scores are cosine similarities.

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in ``df_plot['title']``. If several
        movies share the title, the first one is used.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``id`` and ``similarity`` (rounded to 2 decimals),
        most similar first. Empty (same columns) when the title is unknown,
        ``n`` is not positive, or the search yields no other movie.
    """
    empty_result = pd.DataFrame(columns=['title', 'id', 'similarity'])

    # Unknown titles come back as None; without this guard int(None) would raise
    # an opaque TypeError further down.
    idx = find_movie_index(title, indices)
    if idx is None or n <= 0:
        return empty_result
    idx = int(idx)

    # Faiss expects a 2-D (n_queries, dim) array, hence the reshape.
    query_vector = embedding_matrix[idx].reshape(1, -1)

    # Ask for one extra neighbour: the query movie is itself in the index and
    # normally comes back as the best match.
    distances, movie_indices = faiss_index.search(query_vector, n + 1)
    distances, movie_indices = distances[0], movie_indices[0]

    # Drop Faiss' -1 padding, ids outside df_plot, and the query movie itself.
    keep = (
        (movie_indices >= 0)
        & (movie_indices < len(df_plot))
        & (movie_indices != idx)
    )
    movie_indices = movie_indices[keep][:n]
    similarity_scores = distances[keep][:n]

    if movie_indices.size == 0:
        return empty_result

    results_df = df_plot.iloc[movie_indices][['title', 'id']].copy()
    # Both arrays were filtered with the same mask, so they stay aligned row by row.
    results_df['similarity'] = similarity_scores.round(2)

    return results_df

5. Sample Recommendations

In [13]:
# Spot-check the recommender on a few titles from different genres.
sample_titles = [
    "Interstellar",                # Example 1: A popular sci-fi movie
    "10 Things I Hate About You",  # Example 2: A romantic comedy
    "Toy Story",                   # Example 3: An animated film
]

for sample_title in sample_titles:
    print(f'\n--- Recommendations for "{sample_title}" ---')
    display(get_recommendations(sample_title))


--- Recommendations for "Interstellar" ---


Unnamed: 0,title,id,similarity
12134,Star Trek Beyond,188927,0.56
15895,ISRA 88,401222,0.55
9954,Age of Tomorrow,275619,0.54
3001,A Brief History of Time,1358,0.54
14360,The Visit: An Alien Encounter,308063,0.52
11446,Hollywood between Paranoia and Sci-Fi. The Power of Myth,326591,0.51
15383,Voyage of Time: Life's Journey,86822,0.51
14154,Time Runner,48315,0.51
12871,Another World,262897,0.51
7787,Prometheus,70981,0.5



--- Recommendations for "10 Things I Hate About You" ---


Unnamed: 0,title,id,similarity
14964,The Edge of Seventeen,376660,0.61
10344,BFFs,268174,0.61
9757,G.B.F.,174323,0.61
10639,It Felt Like Love,153854,0.6
11244,The DUFF,272693,0.59
6820,The Loved Ones,46420,0.59
9450,Scorned,242033,0.59
9609,Forgetting the Girl,183433,0.59
6448,Normal Adolescent Behavior,20478,0.59
2041,The Man in the Moon,17474,0.58



--- Recommendations for "Toy Story" ---


Unnamed: 0,title,id,similarity
6529,Toy Story 3,10193,0.83
1656,Toy Story 2,863,0.82
13977,Welcome to Happiness,340255,0.53
1208,Child's Play 3,11187,0.48
3073,Elf,10719,0.48
6196,Mickey's Once Upon a Christmas,15400,0.48
10175,Hawaiian Vacation,77887,0.47
8713,The Kings of Summer,156700,0.47
7164,Mickey's Twice Upon a Christmas,13378,0.46
9141,Toy Story of Terror!,213121,0.45
