## Content-Based Filtering

### Setup

Importing Libraries

In [1]:
import ipywidgets as widgets
import numpy as np
import os
import pandas as pd
import pickle
import torch

from IPython.display import display, clear_output
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Loading Preprocessed Data

In [2]:
# Read both preprocessed tables in one pass: the merged metadata/keywords table
# first, then the table of qualified movies.
df, df_qualified = (
    pd.read_csv(csv_path)
    for csv_path in (
        "preprocessed_data/merged_metadata_keywords.csv",
        "preprocessed_data/qualified_movies.csv",
    )
)

### Sentence Embedding Generation and Similarity Calculation

Initialize the model with the 'all-MiniLM-L6-v2' pre-trained model

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pre-trained sentence encoder and move it onto the chosen device.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = model.to(device)

Encode a list of sentences in the 'soup' column using the pre-trained model

In [None]:
# First run only: uncomment this cell to compute the sentence embeddings and save them to disk

# sentence_embeddings = model.encode(df["soup"].tolist())

# folder_name = "embedding_data_cbf"

# if not os.path.exists(folder_name):
#     os.makedirs(folder_name)

# with open(f"{folder_name}\sentence_embeddings.pkl", "wb") as f:
#     pickle.dump(sentence_embeddings, f)

In [4]:
folder_name = "embedding_data_cbf"

# Build the path with os.path.join: the previous f-string embedded a literal
# backslash ("\s"), which is an invalid escape sequence and only acts as a path
# separator on Windows.
embeddings_path = os.path.join(folder_name, "sentence_embeddings.pkl")

if os.path.exists(embeddings_path):
    # Reuse the cached embeddings.
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # this notebook, never files from an untrusted source.
    with open(embeddings_path, "rb") as f:
        sentence_embeddings = pickle.load(f)
else:
    # No cache yet (first run): encode the 'soup' column and cache the result so
    # that "Restart Kernel -> Run All" works without uncommenting any cell.
    sentence_embeddings = model.encode(df["soup"].tolist())
    os.makedirs(folder_name, exist_ok=True)
    with open(embeddings_path, "wb") as f:
        pickle.dump(sentence_embeddings, f)

Compute cosine similarity

In [None]:
# First run only: uncomment this cell to compute the cosine similarity matrix and save it to disk

# cos_sim = cosine_similarity(sentence_embeddings)

# with open(f"{folder_name}\cos_sim.pkl", "wb") as f:
#     pickle.dump(cos_sim, f)

In [5]:
# Build the path with os.path.join: the previous f-string embedded a literal
# backslash ("\c"), which is an invalid escape sequence and only acts as a path
# separator on Windows.
cos_sim_path = os.path.join(folder_name, "cos_sim.pkl")

if os.path.exists(cos_sim_path):
    # Reuse the cached similarity matrix.
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # this notebook, never files from an untrusted source.
    with open(cos_sim_path, "rb") as f:
        cos_sim = pickle.load(f)
else:
    # No cache yet (first run): compute the pairwise cosine similarity of the
    # sentence embeddings and cache it for subsequent runs.
    cos_sim = cosine_similarity(sentence_embeddings)
    os.makedirs(folder_name, exist_ok=True)
    with open(cos_sim_path, "wb") as f:
        pickle.dump(cos_sim, f)

Construct a reverse map of movie titles to indices

In [6]:
# Reverse map: lower-cased movie title -> row label in `df`.
movie_indices = pd.Series(df.index, index=df["title"].str.lower())

# Keep only the first row for each title. Series.drop_duplicates() would compare
# the *values* (the row labels, which are already unique) and therefore never
# removed repeated titles; de-duplicating on the index guarantees that a lookup
# by title yields a single row label.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep="first")]
movie_indices.head()

title
toy story                      0
jumanji                        1
grumpier old men               2
waiting to exhale              3
father of the bride part ii    4
dtype: int64

### Pipeline
Steps:
1. Retrieve the sorted indices of movies based on their similarity scores to a given movie.
2. Filter out movies that are not in the qualified movies chart based on IMDb's weighted rating.
3. Print out the details of the top 5 most recommended movies.

In [7]:
def get_sorted_movie_indices(
    title: str, cos_sim: np.ndarray, title_to_index: "pd.Series | None" = None
) -> "list[int] | None":
    """
    Retrieve the indices of all other movies, sorted by similarity to a given movie.

    :param title: The title of the movie to find similar movies for (case-insensitive).
    :param cos_sim: The cosine similarity matrix of movies.
    :param title_to_index: Optional Series mapping lower-cased titles to row indices of
        ``cos_sim``. Defaults to the notebook-level ``movie_indices`` map.
    :return: A list of movie indices ordered from most to least similar, excluding the
        queried movie itself, or None (after printing a message) if the title is unknown.
    """
    if title_to_index is None:
        # Fall back to the reverse map built earlier in the notebook.
        title_to_index = movie_indices

    try:
        # Get the index of the movie that matches the title
        movie_index = title_to_index[title.lower()]
    except KeyError:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    # If there are multiple movies with the same title, pick the first one.
    # Use .iloc: plain [0] on a title-indexed Series relies on a positional
    # fallback that pandas has deprecated.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    # Pairwise similarity scores of all movies with that movie
    sim_scores = np.asarray(cos_sim[movie_index])

    # Sort by descending similarity; a stable sort keeps the original row order
    # among movies with equal scores.
    ranked = np.argsort(-sim_scores, kind="stable")

    # Exclude the queried movie by index rather than dropping whatever sorted
    # first: another movie with a similarity of 1.0 (e.g. identical text) may
    # sort ahead of it, in which case the query itself would be recommended.
    return [int(index) for index in ranked if index != movie_index]

In [8]:
def get_qualified_movies(
    df: pd.DataFrame,
    df_qualified: pd.DataFrame,
    sorted_movie_indices: "list[int] | None",
) -> pd.DataFrame:
    """
    Filter out movies that are not in the qualified movies chart based on IMDB's weighted rating.

    :param df: The DataFrame containing movie details.
    :param df_qualified: The DataFrame containing qualified movie details.
    :param sorted_movie_indices: A list of movie indices sorted by similarity scores,
        or None when no movie matched (treated as an empty list).
    :return: The rows of ``df`` (detail columns only) whose ``id`` appears in
        ``df_qualified``, in the order given by ``sorted_movie_indices``.
    """
    movie_details = [
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "overview",
        "vote_count",
        "vote_average",
        "runtime",
    ]

    # A failed title lookup upstream yields None; return an empty frame with the
    # expected columns instead of raising inside .loc.
    if sorted_movie_indices is None:
        sorted_movie_indices = []

    # Select the rows in similarity order, then keep only the qualified ids.
    sorted_movies = df.loc[sorted_movie_indices, movie_details]
    is_qualified = sorted_movies["id"].isin(df_qualified["id"])
    return sorted_movies[is_qualified]

In [9]:
def get_movie_recommendations_cbf(title: str, num_recommendations: int = 5) -> None:
    """
    Print movie recommendations for a given title using content-based filtering.

    :param title: The title of the movie to find similar movies for.
    :param num_recommendations: The number of recommended movies, defaults to 5.
    """

    def _whole_number(value) -> str:
        # int() raises on NaN, so render missing numeric fields as "N/A".
        return "N/A" if pd.isna(value) else str(int(value))

    sorted_movie_indices = get_sorted_movie_indices(title, cos_sim)
    if sorted_movie_indices is None:
        # Unknown title: the lookup has already printed a message, so stop here
        # instead of passing None on to the filtering step.
        return

    qualified_movies = get_qualified_movies(df, df_qualified, sorted_movie_indices)
    recommended_movies = qualified_movies.head(num_recommendations)

    for _, movie in recommended_movies.iterrows():
        # Print the movie details
        print(f"Title: {movie['title']}")
        print(f"Overview: {movie['overview']}")
        print(f"Genres: {movie['genres']}")
        print(f"Original Language: {movie['original_language']}")
        print(f"Runtime: {_whole_number(movie['runtime'])} mins")
        print(f"Production Countries: {movie['production_countries']}")
        print(f"Release Date: {movie['release_date']}")
        print(
            f"Rating: {movie['vote_average']} out of 10 ({_whole_number(movie['vote_count'])} ratings)"
        )
        print("")

### Demo

In [None]:
# Controls for the demo: a free-text search box, a dropdown listing every
# distinct title, and a fixed-width button that triggers the recommendation.
search_box = widgets.Text(placeholder="Search movie title...")
dropdown = widgets.Dropdown(options=df["title"].unique())
button = widgets.Button(
    description="Get Movie Recommendations",
    layout=widgets.Layout(width="200px"),
)


def on_search_box_value_change(change):
    """
    Event listener for the search box widget. Updates the dropdown options based on the search query.

    :param change: The change notification from the search box; ``change.new`` holds the new text.
    """

    search_value = change.new.lower()

    # Match the query as a plain substring. With the default regex matching,
    # input such as "(" raises re.error inside the observer and "." matches any
    # character. na=False keeps missing titles out of the mask.
    is_match = (
        df["title"].str.lower().str.contains(search_value, regex=False, na=False)
    )
    options = df.loc[is_match, "title"].unique()

    if len(options) > 0:
        dropdown.options = options
        dropdown.label = options[0]
    else:
        # Sentinel entry that the button handler checks for.
        dropdown.options = ["Movie not found"]
        dropdown.label = "Movie not found"


def on_button_click(button):
    """
    Click handler for the button widget: redraws the controls and prints the
    recommendations for the title currently selected in the dropdown.
    """

    # Wipe the previous output, then put the three controls back on screen.
    clear_output()
    for control in (search_box, dropdown, button):
        display(control)

    selected_title = dropdown.value

    # The sentinel entry means the last search matched nothing.
    if selected_title == "Movie not found":
        print("Movie Not Found")
        return

    # Mirror the selection in the search box before printing the results.
    search_box.value = selected_title
    get_movie_recommendations_cbf(selected_title)


# Attach event listeners to the widgets
# Wire up the callbacks: refresh the dropdown whenever the search text changes,
# and show recommendations when the button is clicked.
search_box.observe(on_search_box_value_change, names="value")
button.on_click(on_button_click)

# Render the three controls, in order.
display(search_box, dropdown, button)

Text(value='Iron Man', placeholder='Search movie title...')

Dropdown(index=1, options=('Tetsuo: The Iron Man', 'Iron Man', 'Iron Man 2', 'The Invincible Iron Man', 'Iron …

Button(description='Get Movie Recommendations', layout=Layout(width='200px'), style=ButtonStyle())

Title: Iron Man 3
Overview: When Tony Stark's world is torn apart by a formidable terrorist called the Mandarin, he starts an odyssey of rebuilding and retribution.
Genres: Action, Adventure, Science Fiction
Original Language: en
Runtime: 130 mins
Production Countries: China, United States of America
Release Date: 18/4/2013
Rating: 6.8 out of 10 (8951 ratings)

Title: Iron Man 2
Overview: With the world now aware of his dual life as the armored superhero Iron Man, billionaire inventor Tony Stark faces pressure from the government, the press and the public to share his technology with the military. Unwilling to let go of his invention, Stark, with Pepper Potts and James 'Rhodey' Rhodes at his side, must forge new alliances – and confront powerful enemies.
Genres: Adventure, Action, Science Fiction
Original Language: en
Runtime: 124 mins
Production Countries: United States of America
Release Date: 28/4/2010
Rating: 6.6 out of 10 (6969 ratings)

Title: Avengers: Age of Ultron
Overview: Wh