## Hybrid Recommendation System

### Importing Libraries

In [1]:
import ipywidgets as widgets
import numpy as np
import os
import pandas as pd
import pickle
import torch

from IPython.display import display, clear_output
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### Data Loading

In [2]:
# Inputs read from preprocessed_data/: metadata merged with keywords, and the qualified-movies table
df_metadata_keywords = pd.read_csv("preprocessed_data/merged_metadata_keywords.csv")
df_qualified = pd.read_csv("preprocessed_data/qualified_movies.csv")
# Inputs read from data/: user ratings, and the movieId <-> tmdbId link table
df_ratings = pd.read_csv("data/ratings_small.csv")
# Only the two ID columns of links.csv are kept
df_ids = pd.read_csv("data/links.csv")[["movieId", "tmdbId"]]

### Data Preprocessing

#### Rating
100,000 ratings from 700 users on 9,000 movies.

In [3]:
# Preview the first rows of the ratings table
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


#### IDs
Contains the movie IDs and TMDB IDs of all the movies featured in the Full MovieLens dataset.

In [4]:
# Preview the first rows of the movieId <-> tmdbId link table
df_ids.head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [5]:
# Inspect dtypes and non-null counts (tmdbId is float64 and has missing entries)
df_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  45843 non-null  int64  
 1   tmdbId   45624 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 716.4 KB


In [6]:
# Clean the ID mapping in a single chained expression:
#   1. discard rows with a missing value,
#   2. call the TMDB column "id",
#   3. store "id" as integers.
df_ids = (
    df_ids.dropna()
    .rename(columns={"tmdbId": "id"})
    .astype({"id": int})
)

Merge the DataFrames df_metadata_keywords and df_ids based on the "id" column

In [7]:
# Inner join: keep only movies present in both the metadata and the ID mapping
df_merged = pd.merge(df_metadata_keywords, df_ids, on="id", how="inner")

Filter df_merged to keep only rows where the 'movieId' is present in df_ratings

In [8]:
# Boolean mask: True for movies that have at least one rating
has_rating = df_merged["movieId"].isin(df_ratings["movieId"])

# Keep the rated movies and renumber the rows 0..n-1
df_merged = df_merged.loc[has_rating].reset_index(drop=True)
df_merged.shape

(8992, 15)

### Content-Based Filtering

Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

Encode a list of sentences in the 'soup' column using the pre-trained model

In [None]:
# Uncomment this cell if you run it for the first time

# sentence_embeddings = model.encode(df_merged["soup"].tolist())

# folder_name = "embedding_data_hrs"

# if not os.path.exists(folder_name):
#     os.makedirs(folder_name)

# with open(f"{folder_name}\sentence_embeddings.pkl", "wb") as f:
#     pickle.dump(sentence_embeddings, f)

In [10]:
folder_name = "embedding_data_hrs"

# Build the path with os.path.join: a hard-coded "\" separator only works on
# Windows, and "\s" inside a string literal is an invalid escape sequence.
embeddings_path = os.path.join(folder_name, "sentence_embeddings.pkl")

if os.path.exists(embeddings_path):
    # NOTE: pickle.load can execute arbitrary code. Only load cache files that
    # were written by this notebook.
    with open(embeddings_path, "rb") as f:
        sentence_embeddings = pickle.load(f)
else:
    # No cache yet: encode the "soup" text and store the result so that later
    # runs can skip the expensive encoding step.
    sentence_embeddings = model.encode(df_merged["soup"].tolist())
    os.makedirs(folder_name, exist_ok=True)
    with open(embeddings_path, "wb") as f:
        pickle.dump(sentence_embeddings, f)

Compute cosine similarity

In [None]:
# Uncomment this cell if you run it for the first time

# cos_sim = cosine_similarity(sentence_embeddings)

# with open(f"{folder_name}\cos_sim.pkl", "wb") as f:
#     pickle.dump(cos_sim, f)

In [11]:
# Build the path with os.path.join: a hard-coded "\" separator only works on
# Windows, and "\c" inside a string literal is an invalid escape sequence.
cos_sim_path = os.path.join(folder_name, "cos_sim.pkl")

if os.path.exists(cos_sim_path):
    # NOTE: pickle.load can execute arbitrary code. Only load cache files that
    # were written by this notebook.
    with open(cos_sim_path, "rb") as f:
        cos_sim = pickle.load(f)
else:
    # No cache yet: compute the pairwise cosine similarity of the sentence
    # embeddings and store it for later runs.
    cos_sim = cosine_similarity(sentence_embeddings)
    os.makedirs(folder_name, exist_ok=True)
    with open(cos_sim_path, "wb") as f:
        pickle.dump(cos_sim, f)

Construct a reverse map of movie titles to indices

In [12]:
# Map each lower-cased title to its row label in df_merged.
# NOTE: Series.drop_duplicates() compares the values (the row labels), not the
# title index, so repeated titles stay in the map; get_sorted_movie_indices
# handles a lookup that returns several rows.
movie_indices = pd.Series(
    df_merged.index, index=df_merged["title"].apply(lambda title: title.lower())
).drop_duplicates()

### Collaborative Filtering

We'll be using the Surprise library to implement SVD.

In [13]:
# Wrap the (userId, movieId, rating) columns in a Surprise dataset using the default Reader
reader = Reader()
data = Dataset.load_from_df(df_ratings[["userId", "movieId", "rating"]], reader)
svd = SVD()
# 5-fold cross-validation reporting RMSE and MAE
# NOTE(review): no random seed is set here, so the scores presumably differ between runs — confirm
cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8947  0.8996  0.9015  0.8948  0.8898  0.8961  0.0041  
MAE (testset)     0.6898  0.6893  0.6955  0.6864  0.6862  0.6894  0.0034  
Fit time          0.70    0.68    0.65    0.66    0.64    0.67    0.02    
Test time         0.08    0.17    0.07    0.07    0.07    0.09    0.04    


{'test_rmse': array([0.89473957, 0.89960587, 0.90152149, 0.89482135, 0.88983788]),
 'test_mae': array([0.68983375, 0.68934321, 0.69547106, 0.68638129, 0.68619641]),
 'fit_time': (0.6957011222839355,
  0.676469087600708,
  0.6527445316314697,
  0.6585137844085693,
  0.6441705226898193),
 'test_time': (0.0780792236328125,
  0.16643476486206055,
  0.07138752937316895,
  0.07303571701049805,
  0.06867194175720215)}

In [14]:
# Refit the same SVD instance on the full trainset built from all ratings
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2224d6c5030>

In [15]:
# Ratings submitted by user 1
df_ratings.query("userId == 1")

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


Predict the rating of User 1 for the movie with the movie ID 12.

In [16]:
user_id = 1
movie_id = 12

# svd.predict returns a prediction object; its .est attribute is the estimated rating
predicted_rating = svd.predict(user_id, movie_id)
predicted_rating.est

2.12875402332611

Create a new DataFrame with "id" as the index

In [17]:
# Index by "id" so that a movie's row (including its movieId) can be looked up with .loc
indices_map = df_merged.set_index("id")
indices_map.head()

Unnamed: 0_level_0,keywords,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime,soup,movieId
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
862,"jealousy, toy, boy, friendship, friends, rival...",Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0,"animation, comedy, family en led by woody, and...",1
8844,"board game, disappearance, based on children's...",Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0,"adventure, fantasy, family en when siblings ju...",2
15602,"fishing, best friend, duringcreditsstinger, ol...",Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0,"romance, comedy en a family wedding reignites ...",3
31357,"based on novel, interracial relationship, sing...",Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0,"comedy, drama, romance en cheated on, mistreat...",4
11862,"baby, midlife crisis, confidence, aging, daugh...",Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0,comedy en just when george banks has recovered...,5


### Hybrid Recommendation System

#### Pipeline
Steps:
1. Retrieve the sorted indices of movies based on their similarity scores to a given movie.
2. Filter out movies that are not in the qualified movies chart (built from IMDb's weighted rating), then keep the top 50 qualified movies.
3. Predict the user rating for these 50 qualified movies using SVD and sort the DataFrame based on the estimated user rating.
4. Print out the details of the top 5 most recommended movies.

In [18]:
def get_sorted_movie_indices(title: str, cos_sim: np.ndarray) -> "list[int] | None":
    """
    Retrieve the indices of all other movies, ordered by similarity to a given movie.

    The title is looked up case-insensitively in the module-level
    ``movie_indices`` map.

    :param title: The title of the movie to find similar movies for.
    :param cos_sim: The cosine similarity matrix of movies.
    :return: A list of movie indices sorted from most to least similar, with the
        queried movie itself excluded, or None (after printing a message) when
        the title is unknown.
    """
    try:
        # Get the index of the movie that matches the title
        movie_index = movie_indices[title.lower()]
    except KeyError:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    # If there are multiple movies with the same title, pick the first one.
    # .iloc is required here: the Series is labelled by title, so [0] would rely
    # on the deprecated positional fallback of Series.__getitem__.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]

    # Pairwise similarity scores of all movies with that movie
    sim_scores = cos_sim[movie_index]

    # Rank every movie by descending similarity. sorted() is stable, so movies
    # with equal scores keep their original relative order.
    ranked_indices = sorted(
        range(len(sim_scores)), key=lambda index: sim_scores[index], reverse=True
    )

    # Drop the queried movie explicitly instead of assuming it is ranked first:
    # another movie with an identical score could otherwise be dropped in its place.
    return [index for index in ranked_indices if index != movie_index]

In [19]:
def get_qualified_movies(df: pd.DataFrame, df_qualified:pd.DataFrame, sorted_movie_indices: list[int]) -> pd.DataFrame:
    """
    Filter out movies that are not in the qualified movies chart based on IMDB's weighted rating.

    :param df: The DataFrame containing movie details.
    :param df_qualified: The DataFrame containing qualified movie details.
    :param sorted_movie_indices: A list of movie indices sorted by similarity scores.
        None or an empty list yields an empty result.
    :return: A new DataFrame with the detail columns of the qualified movies, in the
        order given by ``sorted_movie_indices``.
    """
    movie_details = [
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "overview",
        "vote_count",
        "vote_average",
        "runtime",
    ]

    # Nothing to rank (e.g. the title lookup failed): return an empty frame with
    # the expected columns instead of failing inside .loc
    if sorted_movie_indices is None or len(sorted_movie_indices) == 0:
        return df[movie_details].head(0).copy()

    # Select the rows in similarity order, then keep those whose id is qualified
    sorted_movies = df.loc[sorted_movie_indices, movie_details]
    is_qualified = sorted_movies["id"].isin(df_qualified["id"])

    # Return an independent copy so callers can add columns without touching df
    return sorted_movies[is_qualified].copy()

In [20]:
def predict_user_rating(userId: int, qualified_movies: pd.DataFrame, indices_map: pd.DataFrame) -> pd.DataFrame:
    """
    Predict the user rating for qualified movies using SVD and return the sorted DataFrame.

    Uses the module-level ``svd`` model. The input DataFrame is not modified.

    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data; must have an "id" column.
    :param indices_map: A Pandas DataFrame indexed by "id" with a "movieId" column, used to
        translate each "id" into the movie ID passed to the SVD model.
    :return: A new Pandas DataFrame containing the qualified movies plus an
        "estimated_user_rating" column, sorted by that column in descending order.
    """

    def _estimate_rating(movie_key):
        movie_id = indices_map.loc[movie_key, "movieId"]
        # When the same "id" occurs more than once in indices_map, .loc returns
        # a Series; use the first match so svd.predict receives a single ID.
        if isinstance(movie_id, pd.Series):
            movie_id = movie_id.iloc[0]
        return svd.predict(userId, movie_id).est

    # assign() builds a new frame, so the caller's DataFrame (often a slice of a
    # larger one) is left untouched and no SettingWithCopyWarning is raised.
    rated_movies = qualified_movies.assign(
        estimated_user_rating=qualified_movies["id"].apply(_estimate_rating)
    )

    return rated_movies.sort_values(by=["estimated_user_rating"], ascending=False)

In [21]:
def get_movie_recommendations_hybrid(
    title: str, userId: int, num_candidates: int = 50, num_recommendations: int = 5
) -> None:
    """
    Print movie recommendations based on a given title and user ID.

    The most similar qualified movies are taken as candidates, re-ranked by the
    rating estimated for the user, and the best ones are printed. Nothing is
    recommended when the title cannot be found.

    :param title: The title of the movie to find similar movies for.
    :param userId: The ID of the user.
    :param num_candidates: How many of the most similar qualified movies to re-rank.
    :param num_recommendations: How many recommendations to print.
    """
    # Get recommended movie indices based on the given title
    sorted_movie_indices = get_sorted_movie_indices(title, cos_sim)
    if sorted_movie_indices is None:
        # The lookup has already reported the unknown title
        return

    # Filter out bad movies and select the top qualified candidates
    qualified_movies = get_qualified_movies(
        df_merged, df_qualified, sorted_movie_indices
    ).head(num_candidates)

    # Predict user ratings for qualified movies and select the top recommended movies
    final_qualified_movies = predict_user_rating(
        userId, qualified_movies, indices_map
    ).head(num_recommendations)

    def _whole_number(value) -> str:
        # int() raises ValueError on NaN, so show a placeholder for missing values
        return "N/A" if pd.isna(value) else str(int(value))

    for _, movie in final_qualified_movies.iterrows():
        # Print the movie details
        print(f"Title: {movie['title']}")
        print(f"Overview: {movie['overview']}")
        print(f"Genres: {movie['genres']}")
        print(f"Original Language: {movie['original_language']}")
        print(f"Runtime: {_whole_number(movie['runtime'])} mins")
        print(f"Production Countries: {movie['production_countries']}")
        print(f"Release Date: {movie['release_date']}")
        print(
            f"Rating: {movie['vote_average']} out of 10 ({_whole_number(movie['vote_count'])} ratings)"
        )
        print("")

### Demo

In [22]:
# Text inputs: free-text title search and the user ID
search_box = widgets.Text(placeholder="Search movie title...")
userId_text = widgets.Text(placeholder="Enter a user ID")

# Title picker (initially every distinct title) and the trigger button
dropdown = widgets.Dropdown(options=df_merged["title"].unique())
button = widgets.Button(
    description="Get Movie Recommendations",
    layout=widgets.Layout(width="200px"),
)

# Two rows: inputs on top, picker and button below
container = widgets.HBox([search_box, userId_text])
container2 = widgets.HBox([dropdown, button])


def on_search_box_value_change(change):
    """
    Event listener for the search box widget. Updates the dropdown options based on the search query.

    The query is matched as a case-insensitive plain substring of the movie
    titles. When nothing matches, the dropdown shows the single placeholder
    entry "Movie not found".

    :param change: The change notification from the widget; ``change.new`` is the new search text.
    """
    search_value = change.new.lower()
    titles = df_merged["title"]

    # regex=False: treat the query literally, so typing characters such as "(" or
    # "[" cannot raise a regular-expression error.
    # na=False: a missing title counts as "no match" instead of breaking the mask.
    is_match = titles.str.lower().str.contains(search_value, regex=False, na=False)
    options = titles[is_match].unique()

    if len(options) > 0:
        dropdown.options = options
        dropdown.label = options[0]
    else:
        dropdown.options = ["Movie not found"]
        dropdown.label = "Movie not found"


def on_button_click(button):
    """
    Event listener for the button widget. Displays the recommended movies.

    Clears the previous output, redraws the widgets, validates the user ID and
    the selected title, and then prints the recommendations.

    :param button: The button instance that was clicked (unused).
    """
    clear_output()
    display(container)
    display(container2)

    # Only the integer conversion is guarded. Keeping the recommender call out of
    # the try block prevents an unrelated ValueError raised further down from
    # being reported as an invalid user ID.
    try:
        userId = int(userId_text.value)
    except ValueError:
        userId = None

    if userId is None or userId not in df_ratings["userId"].values:
        print("User ID not found. Please enter a valid user ID!")
        return

    title = dropdown.value
    if title == "Movie not found":
        print("Movie Not Found")
        return

    # Mirror the chosen title back into the search box
    search_box.value = title
    get_movie_recommendations_hybrid(title, userId)

# Wire up the callbacks: refresh the dropdown whenever the search text changes,
# and produce recommendations when the button is pressed
button.on_click(on_button_click)
search_box.observe(on_search_box_value_change, names="value")

# Initial render: heading followed by the two widget rows
print("Movie Recommendation System")
display(container, container2)

HBox(children=(Text(value='Iron Man', placeholder='Search movie title...'), Text(value='1', placeholder='Enter…

HBox(children=(Dropdown(index=1, options=('Tetsuo: The Iron Man', 'Iron Man', 'Iron Man 2', 'Iron Man 3'), val…

Title: The Matrix
Overview: Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.
Genres: Action, Science Fiction
Original Language: en
Runtime: 136 mins
Production Countries: Australia, United States of America
Release Date: 30/3/1999
Rating: 7.9 out of 10 (9079 ratings)

Title: X-Men: First Class
Overview: Before Charles Xavier and Erik Lensherr took the names Professor X and Magneto, they were two young men discovering their powers for the first time. Before they were arch-enemies, they were closest of friends, working together with other mutants (some familiar, some new), to stop the greatest threat the world has ever known.
Genres: Action, Science Fiction, Adventure
Original Language: en
Runtime: 132 mins
Production Countries: United States of America
Release Date: 24/5/2011
Rating: 7.1 out of 10 (5252 ratings)

Title: Megamind
Overview: Bumbling supervil