## Content-Based Filtering

### Setup

Importing Libraries

In [1]:
import gradio as gr
import numpy as np
import os
import pandas as pd
import pickle
import torch

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Loading Preprocessed Data

In [2]:
# Directory that holds the CSV files produced by the preprocessing step.
DATA_DIR = "preprocessed_data"

# Full movie table: supplies the titles, the "soup" text and the detail columns used below.
df = pd.read_csv(f"{DATA_DIR}/merged_metadata_keywords.csv")
# Qualified-movies chart: its "id" column is used later to filter recommendations.
df_qualified = pd.read_csv(f"{DATA_DIR}/qualified_movies.csv")

### Sentence Embedding Generation and Similarity Calculation

Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model (on the GPU when one is available)

In [3]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pre-trained sentence encoder and move it to the selected device.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

Encode the sentences in the 'soup' column into embeddings using the pre-trained model

In [4]:
# Uncomment and run this cell the first time you run the notebook; it computes the sentence embeddings and saves them to disk.

# sentence_embeddings = model.encode(df["soup"].tolist())

# folder_name = "embedding_data_cbf"

# if not os.path.exists(folder_name):
#     os.makedirs(folder_name)

# with open(f"{folder_name}\sentence_embeddings.pkl", "wb") as f:
#     pickle.dump(sentence_embeddings, f)

In [5]:
folder_name = "embedding_data_cbf"

# Build the path with os.path.join: the previous "\s" inside the f-string was an
# invalid escape sequence and only resolved to a sub-folder on Windows.
embeddings_path = os.path.join(folder_name, "sentence_embeddings.pkl")

if os.path.exists(embeddings_path):
    # Import sentence_embeddings from the file.
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(embeddings_path, "rb") as f:
        sentence_embeddings = pickle.load(f)
else:
    # No cache yet (first run): encode the "soup" column and store the result
    # so later runs can skip this expensive step.
    sentence_embeddings = model.encode(df["soup"].tolist())
    os.makedirs(folder_name, exist_ok=True)
    with open(embeddings_path, "wb") as f:
        pickle.dump(sentence_embeddings, f)

Compute cosine similarity

In [6]:
# Uncomment and run this cell the first time you run the notebook; it computes the cosine-similarity matrix and saves it to disk.

# cos_sim = cosine_similarity(sentence_embeddings)

# with open(f"{folder_name}\cos_sim.pkl", "wb") as f:
#     pickle.dump(cos_sim, f)

In [7]:
# Build the path with os.path.join: the previous "\c" inside the f-string was an
# invalid escape sequence and only resolved to a sub-folder on Windows.
cos_sim_path = os.path.join(folder_name, "cos_sim.pkl")

if os.path.exists(cos_sim_path):
    # Import cos_sim from the file.
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(cos_sim_path, "rb") as f:
        cos_sim = pickle.load(f)
else:
    # No cache yet (first run): compute the pairwise cosine similarity of the
    # sentence embeddings and store it for later runs.
    cos_sim = cosine_similarity(sentence_embeddings)
    os.makedirs(folder_name, exist_ok=True)
    with open(cos_sim_path, "wb") as f:
        pickle.dump(cos_sim, f)

Construct a reverse map of movie titles to indices

In [8]:
# Lower-case the titles so that lookups are case-insensitive.
lowercase_titles = df["title"].map(lambda title: title.lower())

# Map each lower-cased title to its row label in df.
# Note: drop_duplicates() compares the Series *values* (the row labels), so
# repeated titles stay in the index and a lookup may return several rows.
movie_indices = pd.Series(df.index, index=lowercase_titles).drop_duplicates()
movie_indices.head()

title
toy story                      0
jumanji                        1
grumpier old men               2
waiting to exhale              3
father of the bride part ii    4
dtype: int64

### Pipeline
Steps:
1. Retrieve the sorted indices of movies based on their similarity scores to a given movie.
2. Filter out movies that are not in the qualified movies chart based on IMDB's weighted rating.
3. Return the details of the top recommended movies (5 by default).

In [9]:
def get_sorted_movie_indices(title: str, cos_sim: np.ndarray) -> "list[int] | None":
    """
    Retrieve the sorted indices of movies based on their similarity scores to a given movie.

    The lookup is case-insensitive. The queried movie itself is always excluded
    from the result, even when other movies tie with it at the top score.

    :param title: The title of the movie to find similar movies for.
    :param cos_sim: The cosine similarity matrix of movies.
    :return: A list of movie indices ordered from most to least similar, or
        None (after printing a message) when the title is unknown or not a string.
    """
    # A non-string title (e.g. None from a cleared input) cannot match anything.
    if not isinstance(title, str):
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    try:
        # Get the index of the movie that matches the title
        movie_index = movie_indices[title.lower()]
    except KeyError:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    # If there are multiple movies with the same title, pick the first one.
    # Use .iloc for positional access: the Series is indexed by title, so
    # `movie_index[0]` relied on a deprecated positional fallback.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = np.asarray(cos_sim[movie_index])

    # Sort by descending similarity. A stable sort keeps ties in ascending
    # index order, matching sorted(..., reverse=True) on (index, score) pairs.
    ranked = np.argsort(-sim_scores, kind="stable")

    # Drop the queried movie explicitly instead of assuming it sorts first;
    # a movie with an identical score and a smaller index would otherwise be
    # dropped in its place.
    return ranked[ranked != movie_index].tolist()

In [10]:
def get_qualified_movies(
    df: pd.DataFrame, df_qualified: pd.DataFrame, sorted_movie_indices: list[int]
) -> pd.DataFrame:
    """
    Keep only the movies that appear in the qualified movies chart.

    The rows of ``df`` are first arranged in the order given by
    ``sorted_movie_indices`` and reduced to the detail columns; rows whose
    ``id`` is absent from ``df_qualified["id"]`` are then discarded.

    :param df: The DataFrame containing movie details.
    :param df_qualified: The DataFrame containing qualified movie details.
    :param sorted_movie_indices: A list of movie indices sorted by similarity scores.
    :return: A Pandas DataFrame containing the qualified movies sorted by similarity scores.
    """
    detail_columns = (
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "runtime",
    )

    # Select the rows in similarity order, restricted to the detail columns.
    ranked = df.loc[sorted_movie_indices, list(detail_columns)]

    # Boolean mask: True where the movie id is present in the qualified chart.
    is_qualified = ranked["id"].isin(df_qualified["id"])
    return ranked.loc[is_qualified]

In [11]:
def get_movie_recommendations_cbf(
    title: str, num_recommendations: int = 5
) -> pd.DataFrame:
    """
    Get movie recommendations based on a given title using content-based filtering.

    Movies are ranked by cosine similarity to the given title, filtered down to
    the qualified movies, truncated to ``num_recommendations`` rows and returned
    with display-friendly column names.

    :param title: The title of the movie to find similar movies for.
    :param num_recommendations: The number of recommended movies, defaults to 5.
    :return: A Pandas DataFrame containing the recommended movies; an empty
        DataFrame with the same columns when the title is not found.
    """
    # Source column -> display label.
    display_columns = {
        "id": "ID",
        "title": "Title",
        "genres": "Genres",
        "original_language": "Language",
        "production_countries": "Production Countries",
        "release_date": "Release Date",
        "runtime": "Runtime",
    }

    sorted_movie_indices = get_sorted_movie_indices(title, cos_sim)

    # The lookup returns None for an unknown title; passing None on to the
    # filtering step would raise, so return an empty, correctly labelled frame.
    if sorted_movie_indices is None:
        return pd.DataFrame(columns=list(display_columns.values()))

    qualified_movies = get_qualified_movies(df, df_qualified, sorted_movie_indices)

    # rename() returns a new frame, so the filtered frame is left untouched.
    return qualified_movies.head(num_recommendations).rename(columns=display_columns)

### Demo

In [12]:
# Build the Gradio UI: a title dropdown, a button that triggers the
# recommendation pipeline, and a table that shows the result.
with gr.Blocks(theme=gr.themes.Soft(text_size="lg")) as demo:
    # Page heading.
    gr.Markdown(
        """
    # Movie Recommendation System
    
    """
    )

    # Input: every distinct title in the dataset is selectable.
    title_choices = df["title"].unique().tolist()
    title = gr.Dropdown(choices=title_choices, label="Movie Title", value="Iron Man")

    # Trigger and output components.
    recommend_button = gr.Button("Get Movie Recommendations")
    recommended_movies = gr.DataFrame(label="Movie Recommendations")

    # Clicking the button passes the selected title to the recommender and
    # renders the returned DataFrame.
    recommend_button.click(
        fn=get_movie_recommendations_cbf,
        inputs=[title],
        outputs=recommended_movies,
    )

    # Example titles that fill the dropdown when clicked.
    example_titles = [
        "Captain America: The First Avenger",
        "The Conjuring",
        "Toy Story",
        "Final Destination 5",
    ]
    examples = gr.Examples(examples=example_titles, inputs=[title])

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


