# Loading Libraries

In [3]:
import pandas as pd
import os
import numpy as np

ROUGH

# Loading the Datasets for Content-Based Recommendation

In [17]:
# Read the pre-processed TMDB content table from ../data/processed_data
data_dir = os.path.join('..', 'data', 'processed_data')
tmdb_csv_path = os.path.join(data_dir, 'tmdb_content_processed.csv')
tmdb_content_processed = pd.read_csv(tmdb_csv_path)

# Report the table size, then show the first rows as the cell output
print("TMDB content processed dataset shape:", tmdb_content_processed.shape)
tmdb_content_processed.head()


TMDB content processed dataset shape: (10922, 19)


Unnamed: 0,title,release_date,revenue,runtime,imdb_id,original_language,overview,production_companies,spoken_languages,plot_summary,plot_synopsis,genres,cast,directors,averageRating,numVotes,release_year,release_month,release_day
0,Inception,2010-07-15,825532764,148,1375666,English,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","Dom Cobb is a skilled thief, the absolute best...","Dominick ""Dom"" Cobb (Leonardo DiCaprio) and bu...","Action, Adventure, Sci-Fi, Thriller","Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",Christopher Nolan,8.8,2681459,2010,7,15
1,Interstellar,2014-11-05,701729206,169,816692,English,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English,In the near future around the American Midwest...,"In the future, crop blight has caused civiliza...","Adventure, Drama, Sci-Fi","Ellen Burstyn, Matthew McConaughey, Mackenzie ...",Christopher Nolan,8.7,2342692,2014,11,5
2,The Dark Knight,2008-07-16,1004558444,152,468569,English,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin",Set within a year after the events of Batman B...,A gang of 6 criminals rob a Gotham City mob ba...,"Action, Crime, Drama, Thriller","Christian Bale, Heath Ledger, Aaron Eckhart, M...",Christopher Nolan,9.0,3018672,2008,7,16
3,Avatar,2009-12-15,2923706026,162,499549,English,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","When his brother is killed in a robbery, parap...","In 2154, humans have depleted Earth's natural ...","Action, Adventure, Fantasy, Sci-Fi","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron,7.9,1430332,2009,12,15
4,The Avengers,2012-04-25,1518815515,143,848228,English,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian","Loki, the adopted brother of Thor, teams-up wi...",The Asgardian Loki (Tom Hiddleston) encounters...,"Action, Sci-Fi","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon,8.0,1507612,2012,4,25


# Sentence transformer based movie recommendations

In [14]:
def create_moviedata(row):
    """Build the multi-line text description of one movie that is later embedded.

    Args:
        row: A mapping-like record (pandas Series or dict) read through ``row.get``.
            Fields used: ``title``, ``averageRating``, ``plot_synopsis``, ``genres``,
            ``directors``, ``cast``, ``production_companies``, ``original_language``.

    Returns:
        str: Eight ``Label: value`` lines joined by newlines. Comma-separated
        fields are re-joined with ", " after trimming empty entries, the cast is
        limited to its first 10 names, and the language is capitalized. Missing
        values (None / NaN / pd.NA) are rendered as an empty string so that the
        literal text "nan" never ends up in the description.
    """
    def is_missing(val):
        # Only scalars can be "missing"; pd.isna on a list/array would return an array.
        return pd.api.types.is_scalar(val) and pd.isna(val)

    def clean_text(val):
        return "" if is_missing(val) else str(val)

    def split_clean(val):
        if is_missing(val):
            return []
        return [part.strip() for part in str(val).split(",") if part.strip()]

    title = clean_text(row.get("title"))
    rating = clean_text(row.get("averageRating"))
    plot = clean_text(row.get("plot_synopsis"))
    genres = ", ".join(split_clean(row.get("genres")))
    director = ", ".join(split_clean(row.get("directors")))
    cast = ", ".join(split_clean(row.get("cast"))[:10])  # keep the first 10 cast members only
    companies = ", ".join(split_clean(row.get("production_companies")))
    language = clean_text(row.get("original_language")).capitalize()

    # Labels are kept exactly as before (including the lower-case "title:").
    return "\n".join([
        f"title: {title}",
        f"Rating: {rating}",
        f"Plot: {plot}",
        f"Genres: {genres}",
        f"Director: {director}",
        f"Cast: {cast}",
        f"Production Companies: {companies}",
        f"Original Language: {language}",
    ])

In [15]:
# Build one text description per movie (row-wise apply of create_moviedata) and
# store it in a new 'movie_description' column; this is the text that gets embedded later.
tmdb_content_processed['movie_description'] = tmdb_content_processed.apply(create_moviedata, axis=1)

In [16]:
# Persist the table (now including 'movie_description') next to the other processed data
output_dir = os.path.join('..', 'data', 'processed_data')
os.makedirs(output_dir, exist_ok=True)  # no error if the folder is already there
movies_metadata_path = os.path.join(output_dir, 'movies_metadata.csv')
tmdb_content_processed.to_csv(movies_metadata_path, index=False)


In [9]:
print(tmdb_content_processed['movie_description'][0])

title: Inception
Rating: 8.8
Plot: Dominick "Dom" Cobb (Leonardo DiCaprio) and business partner Arthur (Joseph Gordon-Levitt) are "extractors", people who perform corporate espionage using an experimental military technology to infiltrate the subconscious of their targets and extract information while experiencing shared dreaming. Their latest target is Japanese businessman Saito (Ken Watanabe). The extraction from Saito fails when sabotaged by a memory of Cobb's deceased wife Mal (Marion Cotillard). After Cobb's and Arthur's associate sells them out, Saito reveals that he was actually auditioning the team to perform the difficult act of "inception": planting an idea in a person's subconscious.In order to break up the energy conglomerate of ailing competitor Maurice Fischer (Pete Postlethwaite), Saito wants Cobb to plant the idea of dissolving the company into the mind of Fischer's heir, son Robert Fischer (Cillian Murphy). Should Cobb succeed, Saito tells Cobb he will use his influenc

In [None]:
tmdb_content_processed.head()

OPTIONAL

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data files
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt_tab')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize free text token by token
def lemmatize_text(text):
    """Tokenize ``text``, lemmatize every token, and return them joined by single spaces."""
    return ' '.join(lemmatizer.lemmatize(tok) for tok in word_tokenize(text))

# Apply the lemmatization function to the 'movie_description' column and keep the
# result in a separate 'movie_description_processed' column
tmdb_content_processed['movie_description_processed'] = tmdb_content_processed['movie_description'].apply(lemmatize_text)

In [None]:
tmdb_content_processed['movie_description_processed'][0]


Creating Embeddings

Run this section the first time through to create the embeddings. If you have already saved them, skip it and load them from disk in the optional section below.

In [None]:
import torch

# Report whether a CUDA device is visible and, if so, the name of device 0
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", cuda_available)

gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No GPU"
print("GPU Name:", gpu_name)

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Use the GPU when one is available, otherwise fall back to CPU so this cell
# also runs on machines without CUDA (a hard-coded "cuda" fails there).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

In [12]:
# Encode movie descriptions chunk by chunk with the global `model`
def batch_encode(texts, batch_size=512):
    """Encode ``texts`` in consecutive slices of ``batch_size`` items.

    Args:
        texts: Sliceable sequence of strings (supports ``len`` and ``texts[a:b]``).
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        list: One embedding per input text, in input order.
    """
    vectors = []
    chunk_starts = range(0, len(texts), batch_size)
    for start in tqdm(chunk_starts, desc="Batch encoding"):
        chunk_vectors = model.encode(texts[start:start + batch_size], show_progress_bar=False)
        vectors.extend(chunk_vectors)  # appends one entry per text in the chunk
    return vectors

# Create embeddings for the movie descriptions.
# batch_encode returns one entry per description, in order, so the list lines up
# row-for-row with the DataFrame and each cell of 'embedding' holds one vector.
tmdb_content_processed["embedding"] = batch_encode(tmdb_content_processed["movie_description"].tolist())

Batch encoding: 100%|██████████| 22/22 [00:19<00:00,  1.11it/s]


--------------------- OPTIONAL SECTION ---------------------------------

In [None]:
#If you want to save the embeddings to a file as a numpy array uncomment the following lines
#import pandas as pd
#import numpy as np

# Save embeddings separately
#embedding_matrix = np.vstack(tmdb_content_processed["embedding"].values)
#np.save("../data/processed_data/embedding_matrix_content_ST.npy", embedding_matrix)

In [None]:
# Optional: persist the embeddings keyed by imdb_id as a pickle file so they can be reloaded later
embedding_records = {
    "imdb_id": tmdb_content_processed["imdb_id"],
    "embedding": list(tmdb_content_processed["embedding"]),
}
embedding_df = pd.DataFrame(embedding_records)
embedding_df.to_pickle("../data/processed_data/embedding_with_ids.pkl")

In [None]:
# Load the embeddings from the .pkl file
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
loaded_embedding_df = pd.read_pickle("../data/processed_data/embedding_with_ids.pkl")

# Keep a single embedding per imdb_id so the left merge below cannot multiply rows.
loaded_embedding_df = loaded_embedding_df.drop_duplicates(subset="imdb_id")

# Merge the embeddings back into the tmdb_content_processed DataFrame using 'imdb_id'.
# Any 'embedding' column already in memory (from the encoding cell or from a previous
# run of this cell) is dropped first; otherwise pandas would produce
# 'embedding_x' / 'embedding_y' and later lookups of ["embedding"] would fail.
tmdb_content_processed = tmdb_content_processed.drop(columns=["embedding"], errors="ignore").merge(
    loaded_embedding_df,
    on="imdb_id",
    how="left",
    validate="many_to_one"
)

# Verify the merge
print("Shape of tmdb_content_processed after merging:", tmdb_content_processed.shape)
print(tmdb_content_processed.head())

--------------- END --------------------

In [22]:
import faiss
import numpy as np

# Vector width is taken from the first stored embedding
first_embedding = tmdb_content_processed["embedding"].iloc[0]
embedding_dim = len(first_embedding)

# Flat (exact) L2 index of that width
index = faiss.IndexFlatL2(embedding_dim)

# Stack the per-movie vectors into one 2-D array, one row per movie, and index them
embedding_matrix = np.vstack(tmdb_content_processed["embedding"].values)
index.add(embedding_matrix)

In [23]:
# Scale every row to unit length so that inner product equals cosine similarity
raw_embedding_matrix = np.vstack(tmdb_content_processed["embedding"].values)
row_norms = np.linalg.norm(raw_embedding_matrix, axis=1, keepdims=True)
embedding_matrix = raw_embedding_matrix / row_norms

# Rebuild `index` as an inner-product index over the normalized rows
# (this replaces the L2 index bound to the same name in the previous cell)
index = faiss.IndexFlatIP(embedding_dim)
index.add(embedding_matrix)

In [24]:
def recommend_movies_1(title: str, top_k: int = 5):
    """Return the titles of the ``top_k`` nearest neighbours of ``title`` in the FAISS index.

    Uses the module-level ``tmdb_content_processed`` DataFrame and FAISS ``index``.

    Args:
        title: Exact movie title to look up (first match is used if titles repeat).
        top_k: Number of recommendations to return.

    Returns:
        list[str]: Recommended titles, nearest first; or the string
        ``"No match found for '<title>'."`` when the title is not in the dataset.
    """
    title_matches = np.flatnonzero(tmdb_content_processed["title"].values == title)
    if title_matches.size == 0:
        return f"No match found for '{title}'."

    # FAISS ids are row positions (vectors were added in DataFrame order), so work
    # with the positional index throughout instead of mixing .loc labels and .iloc.
    query_pos = int(title_matches[0])
    query_vector = np.asarray(
        tmdb_content_processed["embedding"].iloc[query_pos], dtype="float32"
    ).reshape(1, -1)

    # Ask for one extra hit so the query movie itself can be removed.
    _, indices = index.search(query_vector, top_k + 1)

    # Drop the query by position (it is not guaranteed to be hit #0 when other rows
    # tie with it) and skip the -1 padding FAISS returns when it has too few vectors.
    neighbour_positions = [
        int(pos) for pos in indices[0] if pos != -1 and pos != query_pos
    ][:top_k]

    return tmdb_content_processed.iloc[neighbour_positions]["title"].tolist()

In [26]:
# Replace this with a known title from your dataset
recommend_movies_1("Inception", top_k=10)

['Sniper 2',
 'Source Code',
 'Enter the Void',
 "Sniper: Assassin's End",
 'Extracted',
 'The Cell',
 'Backtrace',
 'Glorious',
 'Extreme Ops',
 'Antitrust']

Writing the embeddings to ChromaDB and using ChromaDB's native cosine similarity to recommend

In [19]:
import chromadb
from chromadb.config import Settings
# Initialize ChromaDB client
# NOTE(review): with default Settings() this is presumably an in-memory client, so
# collections would not survive a kernel restart - confirm against the installed chromadb version.
client = chromadb.Client(Settings())
# List all collections in ChromaDB (the output below shows an empty list on a fresh client)
collections = client.list_collections()
print("Collections in ChromaDB:", collections)

Collections in ChromaDB: []


In [20]:
# Create the collection that will hold the embeddings, or reuse it if it already exists
# (plain create_collection raises when this cell is run a second time).
# "hnsw:space": "cosine" selects cosine distance explicitly, matching this section's
# stated intent instead of relying on the collection's default metric.
collection_name = "movie_recommend_contentbased_ratings"
collection = client.get_or_create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

In [21]:
# Add embeddings and metadata to ChromaDB in fixed-size batches
batch_size = 1024
n_rows = len(tmdb_content_processed)
# Ceiling division: the previous `n // batch_size + 1` over-counted by one whenever
# n_rows was an exact multiple of batch_size.
total_batches = (n_rows + batch_size - 1) // batch_size

for batch_number, start in enumerate(range(0, n_rows, batch_size), start=1):
    batch = tmdb_content_processed.iloc[start:start + batch_size]
    collection.add(
        ids=batch["imdb_id"].astype(str).tolist(),  # ids are the imdb_id values as strings
        embeddings=np.vstack(batch["embedding"].values).tolist(),
        metadatas=batch[["title", "averageRating"]].to_dict(orient="records")
    )
    print(f"Processed batch {batch_number}/{total_batches}")

Processed batch 1/11
Processed batch 2/11
Processed batch 3/11
Processed batch 4/11
Processed batch 5/11
Processed batch 6/11
Processed batch 7/11
Processed batch 8/11
Processed batch 9/11
Processed batch 10/11
Processed batch 11/11


In [34]:
# Recommendation function
def recommend_movies_with_ratings(title: str, top_k: int = 5, n_candidates: int = 10):
    """Recommend movies similar to ``title``, re-ranked by ``averageRating``.

    Queries the module-level ChromaDB ``collection`` for the nearest neighbours of
    the movie's embedding, removes the movie itself, sorts the remaining
    candidates by rating (highest first) and returns the best ``top_k`` titles.

    Args:
        title: Exact movie title to look up in ``tmdb_content_processed``.
        top_k: Number of titles to return.
        n_candidates: Minimum number of neighbours fetched before re-ranking.
            The fetch size is raised to ``top_k + 1`` when needed so that
            ``top_k`` is never silently capped.

    Returns:
        list[str]: Recommended titles; or the string
        ``"No match found for '<title>'."`` when the title is not in the dataset.
    """
    title_mask = tmdb_content_processed["title"] == title
    if not title_mask.any():
        return f"No match found for '{title}'."

    # First row carrying this title; positional access avoids label/position mix-ups.
    query_row = tmdb_content_processed[title_mask].iloc[0]
    query_id = str(query_row["imdb_id"])  # same string form used as the ChromaDB id
    query_embedding = np.asarray(query_row["embedding"]).tolist()

    # Query ChromaDB for similar movies
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=max(n_candidates, top_k + 1),
    )

    # Exclude the query movie by id rather than assuming it is always the first hit.
    candidates = [
        meta
        for cand_id, meta in zip(results["ids"][0], results["metadatas"][0])
        if cand_id != query_id
    ]

    def rating_key(meta):
        # Missing / NaN ratings sort last instead of breaking the comparison.
        rating = meta.get("averageRating")
        if isinstance(rating, (int, float)) and rating == rating:
            return rating
        return float("-inf")

    # Sort candidates by averageRating in descending order
    sorted_candidates = sorted(candidates, key=rating_key, reverse=True)

    # Return the top-k recommended titles
    return [candidate["title"] for candidate in sorted_candidates[:top_k]]
    

In [41]:
# Example usage
print(recommend_movies_with_ratings("The Dark Knight", top_k=5))

['Batman: The Dark Knight Returns, Part 2', 'Batman Begins', 'Batman: Under the Red Hood', 'Batman: The Dark Knight Returns, Part 1', 'The Batman']


# SVD based Recommender

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
import time # To time operations
import re

In [3]:
# Read the processed MovieLens movie table and ratings table
data_dir = os.path.join('..', 'data', 'processed_data')
movies_csv_path = os.path.join(data_dir, 'movielens_content_processed.csv')
ratings_csv_path = os.path.join(data_dir, 'movielens_ratings.csv')
movies_df = pd.read_csv(movies_csv_path)
ratings_df = pd.read_csv(ratings_csv_path)


In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
import re
def extract_year_from_title(title):
    """Split a title such as ``"Toy Story (1995)"`` into ``("Toy Story", "1995")``.

    Args:
        title: Movie title, optionally containing a four-digit year in parentheses.

    Returns:
        tuple: ``(title_without_year, year)`` where ``year`` is the four-digit
        string of the first ``(YYYY)`` group found. When no such group exists,
        or when ``title`` is not a string (None / NaN), the input is returned
        unchanged together with ``None``.
    """
    if not isinstance(title, str):
        # Missing titles previously raised TypeError inside re.search.
        return title, None

    match = re.search(r'\((\d{4})\)', title)
    if match is None:
        return title, None

    # Remove every "(YYYY)" group together with the whitespace around it.
    title_without_year = re.sub(r'\s*\(\d{4}\)\s*', '', title)
    return title_without_year, match.group(1)

# Apply the function to the DataFrame: one (title, year) pair per row, built in a
# single DataFrame instead of creating a pd.Series for every row.
parsed_titles = pd.DataFrame(
    [extract_year_from_title(raw_title) for raw_title in movies_df['title']],
    columns=['title', 'year'],
    index=movies_df.index,
)
if 'year' in movies_df.columns:
    # Re-running this cell: titles no longer carry "(YYYY)", so keep the years
    # extracted earlier rather than overwriting them with None.
    parsed_titles['year'] = parsed_titles['year'].combine_first(movies_df['year'])

movies_df['title'] = parsed_titles['title']
movies_df['year'] = parsed_titles['year']

movies_df.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,114709,862,1995
1,2,Jumanji,Adventure|Children|Fantasy,113497,8844,1995
2,3,Grumpier Old Men,Comedy|Romance,113228,15602,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,114885,31357,1995
4,5,Father of the Bride Part II,Comedy,113041,11862,1995


In [7]:
def replace_genres_separator(df):
    """Rewrite the ``genres`` column from ``"A|B|C"`` to ``"A, B, C"``.

    Args:
        df: DataFrame with a string ``genres`` column. It is modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, for call chaining.
    """
    # regex=False: '|' is the regex alternation operator, so make the literal
    # replacement explicit instead of depending on the pandas-version default.
    df['genres'] = df['genres'].str.replace('|', ', ', regex=False)
    return df

# Apply the function to movies_df
movies_df = replace_genres_separator(movies_df)
movies_df.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year
0,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",114709,862,1995
1,2,Jumanji,"Adventure, Children, Fantasy",113497,8844,1995
2,3,Grumpier Old Men,"Comedy, Romance",113228,15602,1995
3,4,Waiting to Exhale,"Comedy, Drama, Romance",114885,31357,1995
4,5,Father of the Bride Part II,Comedy,113041,11862,1995


In [8]:
# Attach the movie metadata to every rating row (default inner join on movieId)
df = ratings_df.merge(movies_df, on='movieId')
print("\nSample of merged data:")
df


Sample of merged data:


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,imdbId,tmdbId,year
0,1,1,4.0,964982703,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",114709,862,1995
1,1,3,4.0,964981247,Grumpier Old Men,"Comedy, Romance",113228,15602,1995
2,1,6,4.0,964982224,Heat,"Action, Crime, Thriller",113277,949,1995
3,1,47,5.0,964983815,Seven (a.k.a. Se7en),"Mystery, Thriller",114369,807,1995
4,1,50,5.0,964982931,"Usual Suspects, The","Crime, Mystery, Thriller",114814,629,1995
...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split,"Drama, Horror, Thriller",4972582,381288,2017
100832,610,168248,5.0,1493850091,John Wick: Chapter Two,"Action, Crime, Thriller",4425200,324552,2017
100833,610,168250,5.0,1494273047,Get Out,Horror,5052448,419430,2017
100834,610,168252,5.0,1493846352,Logan,"Action, Sci-Fi",3315342,263115,2017


In [12]:
# Save the merged ratings and metadata dataframe.
# index=False keeps the DataFrame's row index out of the CSV file.
output_path = '../data/processed_data/ratings_metadata.csv'
df.to_csv(output_path, index=False)
print(f"\nSaved merged ratings and metadata to: {output_path}")



Saved merged ratings and metadata to: ../data/processed_data/ratings_metadata.csv


In [9]:
# --- Block 2: Create User-Item Matrix & Apply SVD ---
# Builds a users x movies rating matrix from ratings_df, subtracts each user's row
# mean, runs a truncated SVD, and keeps the movie-side factors as `movie_embeddings`.
start_time = time.time()

# 1. Map user and movie IDs to contiguous integer indices (0 to N-1),
#    in order of first appearance in ratings_df.
user_ids = ratings_df['userId'].unique()
# Only movies that actually have ratings get a column in the matrix.
movie_ids_from_ratings = ratings_df['movieId'].unique()

user_id_to_idx = {user_id: i for i, user_id in enumerate(user_ids)}
# Column order of the rating matrix. The rows of `movie_embeddings` (Vt.T) follow this
# same order, so this mapping and its inverse below must be kept together with the embeddings.
movie_id_to_idx_for_matrix = {movie_id: i for i, movie_id in enumerate(movie_ids_from_ratings)}
# Inverse mapping: matrix column index -> original movieId.
idx_to_movie_id_from_matrix = {i: movie_id for movie_id, i in movie_id_to_idx_for_matrix.items()}


# Map the original IDs in ratings_df to these new indices
row_indices = ratings_df['userId'].map(user_id_to_idx)
col_indices = ratings_df['movieId'].map(movie_id_to_idx_for_matrix)
rating_values = ratings_df['rating']

# Create the sparse matrix
# Defensive filter: drop rows whose user or movie ID did not map to an index.
# Both mappings are built from the same ratings_df columns, so this should not remove anything.
valid_indices = pd.Series(row_indices).notna() & pd.Series(col_indices).notna()
user_movie_rating_sparse_matrix = csr_matrix(
    (rating_values[valid_indices], (row_indices[valid_indices], col_indices[valid_indices])),
    shape=(len(user_ids), len(movie_ids_from_ratings))
)
print(f"\nShape of sparse user-movie rating matrix: {user_movie_rating_sparse_matrix.shape}")

# Densify: user/movie pairs without a rating become 0 in R_dense.
R_dense = user_movie_rating_sparse_matrix.toarray()
# NOTE(review): this mean runs over every movie column, including the zero-filled
# unrated cells, so it is not the mean of the user's actual ratings; after the
# subtraction unrated cells become negative. Confirm this is the intended treatment.
user_ratings_mean = np.mean(R_dense, axis=1)
R_demeaned = R_dense - user_ratings_mean.reshape(-1, 1)

# Perform SVD
K = 50  # Number of latent factors (embedding dimension)
# Tip: Experiment with K. Smaller K is faster but might be less accurate. Larger K is slower.
print(f"Performing SVD with K={K} factors...")
U, sigma, Vt = svds(R_demeaned, k=K)

# The movie embeddings (latent factors) are in Vt.T: one row per movie column of the
# rating matrix. U and sigma are not used further in this cell.
movie_embeddings = Vt.T
print(f"Shape of movie embeddings matrix: {movie_embeddings.shape}") # (n_movies_in_ratings, K)

matrix_svd_time = time.time() - start_time
print(f"Matrix creation and SVD took {matrix_svd_time:.2f} seconds.")


Shape of sparse user-movie rating matrix: (610, 9724)
Performing SVD with K=50 factors...
Shape of movie embeddings matrix: (9724, 50)
Matrix creation and SVD took 0.92 seconds.


In [11]:
# Persist the SVD outputs (embeddings, index mapping, metadata) for later reuse
print("\n--- Block 2.5: Saving SVD Embeddings and Mappings ---")

# Movie embeddings as a NumPy array file
embeddings_path = "../data/processed_data/movie_embeddings_movielens.npy"
np.save(embeddings_path, movie_embeddings)
print(f"Saved movie embeddings to {embeddings_path}")

# Matrix index -> movieId mapping as JSON
idx_map_path = "../data/processed_data/idx_to_movie_id_movielens.json"
idx_to_movie_id_series = pd.Series(idx_to_movie_id_from_matrix)
idx_to_movie_id_series.to_json(idx_map_path)
print(f"Saved index to movie ID mapping to {idx_map_path}")

# Movie metadata used when presenting recommendations
metadata_path = "../data/processed_data/movies_metadata_for_recs_movielens.csv"
movies_df.to_csv(metadata_path, index=False)
print(f"Saved movie metadata to {metadata_path}")



--- Block 2.5: Saving SVD Embeddings and Mappings ---
Saved movie embeddings to ../data/processed_data/movie_embeddings_movielens.npy
Saved index to movie ID mapping to ../data/processed_data/idx_to_movie_id_movielens.json
Saved movie metadata to ../data/processed_data/movies_metadata_for_recs_movielens.csv


In [None]:
# --- Performance Tip: Save and Load Embeddings ---
# For frequent use, save the computed embeddings and mappings to avoid re-computing SVD every time.
# Example:
# np.save("movie_embeddings.npy", movie_embeddings)
# pd.Series(idx_to_movie_id_from_matrix).to_json("idx_to_movie_id.json")
# movies_df.to_csv("movies_metadata_for_recs.csv", index=False) # Save relevant movies_df

# To load:
# movie_embeddings = np.load("movie_embeddings.npy")
# idx_to_movie_id_from_matrix = pd.read_json("idx_to_movie_id.json", typ='series').to_dict()
# movies_df = pd.read_csv("movies_metadata_for_recs.csv")
# print("Pre-computed embeddings and mappings would be loaded here if available.")

In [17]:
print("\n--- Block 3: Recommendation Function (Returns List, Optional Printing) ---")

def get_recommendations_list(query_movie_id_original, embeddings_matrix,
                             idx_movie_map, all_movies_metadata_df,
                             num_recommendations=5):
    """
    Rank movies by cosine similarity of their embeddings to a query movie.

    Args:
        query_movie_id_original (int): Original movieId to find neighbours for.
        embeddings_matrix (np.ndarray): Movie embeddings, one row per movie.
        idx_movie_map (dict): Row index of ``embeddings_matrix`` -> original movieId.
        all_movies_metadata_df (pd.DataFrame): Metadata with 'movieId', 'title', 'genres'.
        num_recommendations (int): Maximum number of movies to return.

    Returns:
        list: Dictionaries with 'movieId', 'title', 'genres' and 'similarity_score',
              most similar first. The query movie itself and movies without a
              metadata row are skipped. An empty list is returned (after printing
              an error line) when the query movie has no row in the embeddings.
    """
    # Invert the mapping so the query's row in the embedding matrix can be found.
    internal_idx_by_movie_id = {movie_id: idx for idx, movie_id in idx_movie_map.items()}

    if query_movie_id_original not in internal_idx_by_movie_id:
        print(f"Error: Movie ID {query_movie_id_original} was not included in the SVD model (e.g., no ratings).")
        return []

    query_row = internal_idx_by_movie_id[query_movie_id_original]
    query_vector = embeddings_matrix[query_row, :].reshape(1, -1)

    # Similarity of the query to every movie, then row indices from most to least similar.
    similarity_scores = cosine_similarity(query_vector, embeddings_matrix).flatten()
    ranked_internal_indices = np.argsort(similarity_scores)[::-1]

    picks = []
    for internal_idx in ranked_internal_indices:
        if len(picks) >= num_recommendations:
            break

        candidate_movie_id = idx_movie_map.get(internal_idx)
        if candidate_movie_id is None or candidate_movie_id == query_movie_id_original:
            continue

        metadata_rows = all_movies_metadata_df[all_movies_metadata_df['movieId'] == candidate_movie_id]
        if metadata_rows.empty:
            continue  # no metadata for this movie; it does not count towards the total

        first_row = metadata_rows.iloc[0]
        picks.append({
            'movieId': candidate_movie_id,
            'title': first_row['title'],
            'genres': first_row['genres'],
            'similarity_score': similarity_scores[internal_idx]
        })

    return picks


--- Block 3: Recommendation Function (Returns List, Optional Printing) ---


In [18]:
def print_recommendations(query_movie_id_original, recommendations_list, all_movies_metadata_df):
    """
    Prints the recommendations in a neat format.

    Args:
        query_movie_id_original: Original movieId the recommendations were made for.
        recommendations_list: List of dicts with 'movieId', 'title', 'genres' and
            'similarity_score' (as produced by get_recommendations_list). May be
            empty or None.
        all_movies_metadata_df (pd.DataFrame): Metadata with 'movieId' and 'title',
            used to display the query movie's title.

    Returns:
        None. With no recommendations, a "No similar movies found" line is printed
        only if the query movie exists in the metadata; otherwise nothing is printed.
    """
    # Single lookup of the query title (previously repeated up to three times).
    matching_titles = all_movies_metadata_df.loc[
        all_movies_metadata_df['movieId'] == query_movie_id_original, 'title'
    ].values
    query_in_metadata = len(matching_titles) > 0

    if not recommendations_list:
        # An unknown query ID is assumed to have been reported by the caller already.
        if query_in_metadata:
            print(f"\nNo similar movies found for '{matching_titles[0]}' (after filtering).")
        return

    query_movie_title = matching_titles[0] if query_in_metadata else f"ID: {query_movie_id_original}"

    print(f"\nTop {len(recommendations_list)} recommendations for '{query_movie_title}':")
    print("----------------------------------------------------")
    for rank, movie in enumerate(recommendations_list, start=1):
        print(f"  Rank {rank}: {movie['title']} (ID: {movie['movieId']})")
        print(f"     Genres: {movie['genres']}")
        print(f"     Similarity Score: {movie['similarity_score']:.4f}\n")
    print("----------------------------------------------------")

In [20]:
# MovieLens movieId 79132 is Inception (the printed heading below confirms it), not Toy Story
example_movie_id = 79132
recommended_movies_list_1 = get_recommendations_list(
    example_movie_id,
    movie_embeddings,
    idx_to_movie_id_from_matrix,
    movies_df,
    num_recommendations=5,  # top 5
)
# Only print when at least one recommendation came back
if recommended_movies_list_1:
    print_recommendations(example_movie_id, recommended_movies_list_1, movies_df)


Top 5 recommendations for 'Inception':
----------------------------------------------------
  Rank 1: Inglourious Basterds (ID: 68157)
     Genres: Action, Drama, War
     Similarity Score: 0.8168

  Rank 2: Dark Knight, The (ID: 58559)
     Genres: Action, Crime, Drama, IMAX
     Similarity Score: 0.7498

  Rank 3: Shutter Island (ID: 74458)
     Genres: Drama, Mystery, Thriller
     Similarity Score: 0.6898

  Rank 4: Dark Knight Rises, The (ID: 91529)
     Genres: Action, Adventure, Crime, IMAX
     Similarity Score: 0.6886

  Rank 5: Django Unchained (ID: 99114)
     Genres: Action, Drama, Western
     Similarity Score: 0.6645

----------------------------------------------------


In [51]:
# Demonstrate the behaviour for a movieId that has no row in the SVD embeddings
example_movie_id_not_rated = 999999  # assumed to be absent from the ratings data
print(f"\nAttempting recommendations for potentially non-existent Movie ID {example_movie_id_not_rated}:")
recommended_movies_list_4 = get_recommendations_list(
    query_movie_id_original=example_movie_id_not_rated,
    embeddings_matrix=movie_embeddings,
    idx_movie_map=idx_to_movie_id_from_matrix,
    all_movies_metadata_df=movies_df,
    num_recommendations=5,
)
# Expected to be empty, in which case only the error line from the lookup is shown
if recommended_movies_list_4:
    print_recommendations(example_movie_id_not_rated, recommended_movies_list_4, movies_df)


Attempting recommendations for potentially non-existent Movie ID 999999:
Error: Movie ID 999999 was not included in the SVD model (e.g., no ratings).
