## Hybrid Recommendation System

### Importing Libraries

In [1]:
import gradio as gr
import numpy as np
import os
import pandas as pd
import pickle
import torch

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### Data Loading

In [2]:
# Movie metadata merged with keywords (includes the "soup" text used for embeddings)
df_metadata_keywords = pd.read_csv("preprocessed_data/merged_metadata_keywords.csv")

# Movies that belong to the "qualified" chart used to filter recommendations
df_qualified = pd.read_csv("preprocessed_data/qualified_movies.csv")

# Explicit user ratings: userId, movieId, rating, timestamp
df_ratings = pd.read_csv("data/ratings_small.csv")

# MovieLens <-> TMDB identifier mapping; only these two columns are needed
df_ids = pd.read_csv("data/links.csv").loc[:, ["movieId", "tmdbId"]]

### Data Preprocessing

#### Rating
100,000 ratings from 700 users on 9,000 movies.

In [3]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


#### IDs
Contains the movie IDs and TMDB IDs of all the movies featured in the Full MovieLens dataset.

In [4]:
df_ids.head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [5]:
df_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  45843 non-null  int64  
 1   tmdbId   45624 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 716.4 KB


In [6]:
# Clean the ID mapping in a single chained pass:
#   1. discard rows that have a missing value,
#   2. rename "tmdbId" to "id" so it matches the key of the metadata frame,
#   3. cast the (float) IDs to integers.
df_ids = (
    df_ids.dropna()
    .rename(columns={"tmdbId": "id"})
    .astype({"id": int})
)

Merge the DataFrames df_metadata_keywords and df_ids based on the "id" column

In [7]:
# Inner join on "id": only movies present in both frames are kept
df_merged = pd.merge(df_metadata_keywords, df_ids, on="id")

Filter df_merged to keep only rows where the 'movieId' is present in df_ratings

In [8]:
# Keep only movies that received at least one rating
has_ratings = df_merged["movieId"].isin(df_ratings["movieId"])

# Resetting the index makes the row labels run 0..n-1; later cells use these
# labels to address rows of the cosine-similarity matrix.
df_merged = df_merged.loc[has_ratings].reset_index(drop=True)
df_merged.shape

(8992, 16)

### Content-Based Filtering

- Initialize the model with the 'all-MiniLM-L6-v2' pre-trained model
- Encode a list of sentences in the 'soup' column using the pre-trained model

In [9]:
# Uncomment this cell if you run it for the first time
# if torch.cuda.is_available():
#     device = "cuda"
# else:
#     device = "cpu"

# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# sentence_embeddings = model.encode(df_merged["soup"].tolist())

# folder_name = "embedding_data_hrs"

# if not os.path.exists(folder_name):
#     os.makedirs(folder_name)

# with open(os.path.join(folder_name, "sentence_embeddings.pkl"), "wb") as f:
#     pickle.dump(sentence_embeddings, f)

In [10]:
folder_name = "embedding_data_hrs"

# Build the path with os.path.join instead of a hard-coded backslash: "\s" is
# an invalid escape sequence (a warning today, an error in future Python
# versions) and only resolves to a sub-folder on Windows.
embeddings_path = os.path.join(folder_name, "sentence_embeddings.pkl")

# Import sentence_embeddings from the file
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open(embeddings_path, "rb") as f:
    sentence_embeddings = pickle.load(f)

Compute cosine similarity

In [11]:
# Uncomment this cell if you run it for the first time

# cos_sim = cosine_similarity(sentence_embeddings)

# with open(os.path.join(folder_name, "cos_sim.pkl"), "wb") as f:
#     pickle.dump(cos_sim, f)

In [12]:
# Import cos_sim from the file
# The path is built with os.path.join: the previous "\c" inside the f-string is
# an invalid escape sequence and only worked as a path separator on Windows.
cos_sim_path = os.path.join(folder_name, "cos_sim.pkl")

# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open(cos_sim_path, "rb") as f:
    cos_sim = pickle.load(f)

Construct a reverse map of movie titles to indices

In [13]:
# Map every lower-cased title to the row label of that movie in df_merged
movie_indices = pd.Series(df_merged.index, index=df_merged["title"].str.lower())

# Several movies can share a title. Series.drop_duplicates() compares the
# *values* (the row labels, which are already unique), so it never removed
# anything. De-duplicate on the title index instead and keep the first
# occurrence, so that a lookup by title always yields a single row label.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep="first")]

### Collaborative Filtering

We'll be using the Surprise library to implement SVD.

In [14]:
# Seed the random number generators so the folds and the learned factors are
# reproducible across "Restart & Run All". Surprise uses NumPy's global RNG
# whenever no explicit random_state is supplied (this covers the CV split).
SEED = 42
np.random.seed(SEED)

# NOTE(review): Reader() defaults to rating_scale=(1, 5); confirm that the
# ratings file contains no values below 1 (e.g. 0.5), otherwise pass the scale.
reader = Reader()
data = Dataset.load_from_df(df_ratings[["userId", "movieId", "rating"]], reader)

# random_state makes both this cross-validation and the later full fit deterministic
svd = SVD(random_state=SEED)
cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8940  0.9001  0.8959  0.8980  0.8936  0.8963  0.0025  
MAE (testset)     0.6867  0.6966  0.6883  0.6896  0.6865  0.6895  0.0037  
Fit time          0.77    0.95    0.86    0.71    0.75    0.81    0.09    
Test time         0.15    0.08    0.08    0.09    0.09    0.10    0.03    


{'test_rmse': array([0.89398029, 0.90005274, 0.8958722 , 0.89803327, 0.89355723]),
 'test_mae': array([0.68665694, 0.69657699, 0.68831797, 0.68958196, 0.68654384]),
 'fit_time': (0.770005464553833,
  0.9543032646179199,
  0.8649904727935791,
  0.7105824947357178,
  0.7518835067749023),
 'test_time': (0.1527559757232666,
  0.07551002502441406,
  0.07500267028808594,
  0.08951592445373535,
  0.09251046180725098)}

In [15]:
# Build a training set from every available rating and fit the SVD model on it;
# this fitted `svd` object is the one used for predictions further down.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20d8d07b5e0>

In [16]:
df_ratings[df_ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


Predict the rating of User 1 for the movie with the movie ID 12.

In [17]:
# IDs as they appear in the userId / movieId columns of df_ratings
user_id = 1
movie_id = 12

# predict() returns a prediction object; its .est attribute holds the estimated rating
predicted_rating = svd.predict(user_id, movie_id)
predicted_rating.est

2.2207074826278803

Create a new DataFrame with "id" as the index

In [18]:
# Re-index the merged frame by "id" (the former tmdbId) so that a movie's row,
# and with it its MovieLens "movieId", can be looked up directly from that id.
indices_map = df_merged.set_index("id")
indices_map.head()

Unnamed: 0_level_0,keywords,title,genres,original_language,overview,tagline,production_countries,release_date,status,vote_average,vote_count,runtime,soup,weighted_rating,movieId
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
862,"jealousy, toy, boy, friendship, friends, rival...",Toy Story,"Animation, Comedy, Family",en,"Led by Woody, Andy's toys live happily in his ...",,United States of America,30/10/1995,Released,7.7,5415.0,81.0,"animation, comedy, family en led by woody, and...",7.6,1
8844,"board game, disappearance, based on children's...",Jumanji,"Adventure, Fantasy, Family",en,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,United States of America,15/12/1995,Released,6.9,2413.0,104.0,"adventure, fantasy, family en when siblings ju...",6.8,2
15602,"fishing, best friend, duringcreditsstinger, ol...",Grumpier Old Men,"Romance, Comedy",en,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,United States of America,22/12/1995,Released,6.5,92.0,101.0,"romance, comedy en a family wedding reignites ...",5.9,3
31357,"based on novel, interracial relationship, sing...",Waiting to Exhale,"Comedy, Drama, Romance",en,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,United States of America,22/12/1995,Released,6.1,34.0,127.0,"comedy, drama, romance en cheated on, mistreat...",5.7,4
11862,"baby, midlife crisis, confidence, aging, daugh...",Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,United States of America,10/2/1995,Released,5.7,173.0,106.0,comedy en just when george banks has recovered...,5.7,5


### Hybrid Recommendation System

#### Pipeline
Steps:
1. Get a sorted DataFrame of movies based on their similarity scores to a given movie.
2. Filter out movies that are not in the qualified movies chart and sort the movies based on similarity scores and IMDB's weighted rating.
3. Predict the user rating for qualified movies using SVD and return the sorted DataFrame.
4. Get 5 movie recommendations based on a given title and a user ID.

In [19]:
def get_sorted_similar_movies(
    title: str,
    cos_sim: np.ndarray,
    df_merged: pd.DataFrame,
    title_index: "pd.Series | None" = None,
) -> "pd.DataFrame | None":
    """
    Get a DataFrame of movies sorted by their similarity to a given movie.

    :param title: The title of the movie to find similar movies for (case-insensitive).
    :param cos_sim: The square cosine similarity matrix of movies; row/column i
        must correspond to the row labelled i in df_merged.
    :param df_merged: The DataFrame containing movie details.
    :param title_index: Optional Series mapping lower-cased titles to row labels
        of df_merged. Defaults to the notebook-level ``movie_indices``.
    :return: A DataFrame of all other movies ordered from most to least similar,
        with a numeric "similarity_scores" column rounded to one decimal, or
        None (after printing a message) when the title is unknown.
    """
    if title_index is None:
        title_index = movie_indices

    # A non-string title (e.g. an empty UI field) is treated like an unknown title
    key = title.lower() if isinstance(title, str) else None
    if key is None or key not in title_index.index:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    movie_index = title_index.loc[key]

    # If there are multiple movies with the same title, pick the first one.
    # .iloc is required: plain [0] on a string-indexed Series is a deprecated
    # positional fallback.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    # Pairwise similarity of every movie with the requested one
    similarities = np.asarray(cos_sim[movie_index], dtype=float)

    # Stable descending sort (ties keep their original order), then remove the
    # requested movie explicitly. Dropping "the first entry" is not safe: another
    # movie with an equal score and a lower index would be dropped instead.
    order = np.argsort(-similarities, kind="stable")
    order = order[order != movie_index]

    movie_details = [
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "runtime",
        "weighted_rating",
    ]

    sorted_similar_movies = df_merged.loc[order, movie_details]

    # Keep the scores numeric so later sorts compare numbers rather than strings
    sorted_similar_movies["similarity_scores"] = [
        round(float(score), 1) for score in similarities[order]
    ]

    return sorted_similar_movies

In [20]:
def get_qualified_movies(
    df_qualified: pd.DataFrame, sorted_similar_movies: pd.DataFrame
) -> pd.DataFrame:
    """
    Keep only the similar movies that appear in the qualified movies chart and
    rank them by similarity score, then by weighted rating (both descending).

    :param df_qualified: The DataFrame containing qualified movie details; only its "id" column is used.
    :param sorted_similar_movies: The DataFrame of similar movies, with "id",
        "similarity_scores" and "weighted_rating" columns.
    :return: A Pandas DataFrame with the qualified rows of sorted_similar_movies in ranked order.
    """
    # Boolean mask: True where the movie's id is part of the qualified chart
    is_qualified = sorted_similar_movies["id"].isin(df_qualified["id"])

    ranking_columns = ["similarity_scores", "weighted_rating"]
    return sorted_similar_movies[is_qualified].sort_values(
        by=ranking_columns, ascending=False
    )

In [21]:
def predict_user_rating(
    userId: int,
    qualified_movies: pd.DataFrame,
    indices_map: pd.DataFrame,
    model=None,
) -> pd.DataFrame:
    """
    Predict the user rating for qualified movies and return them sorted by it.

    The input frame is not modified; a copy with an extra
    "predicted_user_rating" column is returned.

    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data,
        with "id", "similarity_scores" and "weighted_rating" columns.
    :param indices_map: A Pandas DataFrame indexed by "id" with a "movieId"
        column, used to translate each "id" into the ID the model was trained on.
    :param model: Optional fitted model exposing ``predict(uid, iid)`` whose
        result has an ``est`` attribute. Defaults to the notebook-level ``svd``.
    :return: A Pandas DataFrame containing the final qualified movies sorted by
        predicted user rating, then similarity score, then weighted rating.
    :raises KeyError: If a movie "id" is missing from indices_map.
    """
    if model is None:
        model = svd

    # One movieId per id: if an id occurs more than once in indices_map, .loc
    # would return several rows instead of a scalar, so keep the first match.
    movie_id_by_id = indices_map["movieId"]
    movie_id_by_id = movie_id_by_id[~movie_id_by_id.index.duplicated(keep="first")]

    # Work on a copy: the caller usually passes a slice of another frame, and
    # writing a column into it mutates caller state and triggers
    # SettingWithCopyWarning.
    rated_movies = qualified_movies.copy()

    # Estimated rating for each movie, rounded to one decimal
    rated_movies["predicted_user_rating"] = pd.Series(
        [
            round(model.predict(userId, movie_id_by_id.loc[movie_id]).est, 1)
            for movie_id in rated_movies["id"]
        ],
        index=rated_movies.index,
        dtype=float,
    )

    final_qualified_movies = rated_movies.sort_values(
        by=["predicted_user_rating", "similarity_scores", "weighted_rating"],
        ascending=False,
    )
    return final_qualified_movies

In [33]:
def get_movie_recommendations_hybrid(
    title: str, user_id: int
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Get movie recommendations based on a given title and user ID.

    :param title: The title of the movie to find similar movies for.
    :param user_id: The ID of the user.
    :return: A tuple of two Pandas DataFrames: the recommended movies (descriptive
        columns) and the recommendation criteria (predicted user rating,
        similarity score and weighted rating per movie). Both are empty when the
        title is not found.
    """
    # Internal column name -> label shown to the user
    display_names = {
        "id": "ID",
        "title": "Title",
        "genres": "Genres",
        "original_language": "Language",
        "production_countries": "Production Countries",
        "release_date": "Release Date",
        "runtime": "Runtime",
        "weighted_rating": "Weighted Rating",
        "similarity_scores": "Similarity Score",
        "predicted_user_rating": "Predicted User Rating",
    }
    detail_columns = [
        "ID",
        "Title",
        "Genres",
        "Language",
        "Production Countries",
        "Release Date",
        "Runtime",
    ]
    criteria_columns = [
        "ID",
        "Title",
        "Predicted User Rating",
        "Similarity Score",
        "Weighted Rating",
    ]

    # Get the movies most similar to the given title
    sorted_similar_movies = get_sorted_similar_movies(title, cos_sim, df_merged)

    # An unknown title yields None; return empty tables instead of crashing
    if sorted_similar_movies is None:
        return (
            pd.DataFrame(columns=detail_columns),
            pd.DataFrame(columns=criteria_columns),
        )

    # Filter out bad movies and select the top 50 qualified movies
    qualified_movies = get_qualified_movies(df_qualified, sorted_similar_movies).head(50)

    # Predict user ratings for qualified movies and select the top 5;
    # columns are renamed by name so the result does not depend on column order
    recommended_movies = (
        predict_user_rating(user_id, qualified_movies, indices_map)
        .head(5)
        .rename(columns=display_names)
    )

    recommendation_criteria = recommended_movies[criteria_columns]
    return recommended_movies[detail_columns], recommendation_criteria

### Demo

In [34]:
# gradio is already imported in the first cell; the import is repeated here
import gradio as gr

# Components are laid out in the order they are created inside the Blocks context.
# NOTE: `title` and `user_id` below rebind notebook-level names; `user_id` was a
# plain int in the single-prediction cell earlier.
with gr.Blocks(theme=gr.themes.Soft(text_size="lg")) as demo:
    gr.Markdown(
        """
    # Movie Recommendation System
    """
    )
    # Only titles present in df_merged can be selected
    title = gr.Dropdown(
        choices=df_merged["title"].unique().tolist(),
        label="Movie Title",
        value="Iron Man",
    )
    user_id = gr.Number(
        value=1, label="User ID", info="Please enter a number between 1 and 671!"
    )
    recommend_button = gr.Button("Get Movie Recommendations")
    # Output tables, filled by the two DataFrames the callback returns
    recommended_movies = gr.DataFrame(label="Movie Recommendations")
    recommendation_criteria = gr.DataFrame(label="Recommendation Criteria")
    # Clicking the button calls the hybrid recommender with (title, user_id)
    recommend_button.click(
        get_movie_recommendations_hybrid,
        inputs=[title, user_id],
        outputs=[recommended_movies, recommendation_criteria]
    )
    # Example titles offered as inputs for the title dropdown
    examples = gr.Examples(
        examples=[
            "Captain America: The First Avenger",
            "The Conjuring",
            "Toy Story",
            "Final Destination 5",
        ],
        inputs=[title],
    )

# Start the local web server for the demo
demo.launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.


