# Section 0

Necessary Imports

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
import numpy as np

# Section 1

Load files

In [8]:
ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv", usecols=["userId", "movieId", "rating"])
movies_df = pd.read_csv("data/ml-latest-small/movies.csv", usecols=["movieId", "title"])

df = ratings_df.merge(movies_df, on="movieId")

n_users = df.userId.nunique()
n_movies = df.movieId.nunique()

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df.userId = user_encoder.fit_transform(df["userId"])
df.movieId = movie_encoder.fit_transform(df["movieId"])

df.head(1)

Unnamed: 0,userId,movieId,rating,title
0,0,0,4.0,Toy Story (1995)


In [9]:
svd_predictions = joblib.load("svd_reconstructed")
kmeans = joblib.load("kmeans_model")

# Section 2
Define our `hybrid_recommendation()` function.

```
Arguments:
    user_id: The (not encoded) user id.
    n_movies: How many movies to return
    predicted_ratings: The factorized matrix returned by SVD, TSVD, NMF...
```
First encode the user ID. Then select the appropriate row of the ratings matrix and copy it to a new memory address. The indices of movies the user has rated are changed to zero to get filtered out of `argsort()`. Then select the clusters the user is interested in and combine the movies. All other movies are again filtered out by setting their rating to zero. Finally return the appropriate movies.

In [10]:
def hybrid_recommendation(user_id, n_movies, predicted_ratings):

    #Encode the user
    user_id = user_encoder.transform([user_id])[0]

    #Copy the user row from the svd predictions matrix.
    #Required because without copying we will be writting
    #in the same memory as the original svd_predictions matrix.
    user_ratings = predicted_ratings[user_id, :].copy()

    #Get a list of the movies the user has rated.
    user_rated_movies = df[df["userId"] == user_id].movieId.values

    #If a user has rated a specific movie then change the user's predicted
    #rating to 0 so that it won't be selected as a recommendation.
    for index, _ in enumerate(user_ratings):
        if index in user_rated_movies:
            user_ratings[index] = 0

    #Typical numpy argsort
    top_movie_indices = np.argsort(user_ratings)[::-1][:3]

    clusters_for_user = list()
    for movie_id in top_movie_indices:
        for cluster_id in range(kmeans.n_clusters):
            if movie_id in np.where(kmeans.labels_ == cluster_id)[0]:
                clusters_for_user.append(cluster_id)
                break
    clusters_for_user = list(set(clusters_for_user))
    
    clustered_movies_combined = list()
    for cluster_id in clusters_for_user:
        clustered_movies_combined.append(np.where(kmeans.labels_ == cluster_id)[0])
    clustered_movies_combined = np.concatenate(clustered_movies_combined)

    user_ratings[np.setdiff1d(df.movieId.unique(), clustered_movies_combined)] = 0
    top_movie_indices = np.argsort(user_ratings)[::-1][:n_movies]
    #Decode the movie indices. This is required since we will return
    #incorrect movie indices or get an if the label doesn't exist.
    top_movie_indices = movie_encoder.inverse_transform(top_movie_indices)
    return movies_df[movies_df["movieId"].isin(top_movie_indices)][["title"]]

In [13]:
top_movies_svd = hybrid_recommendation(1, 30, svd_predictions)
top_movies_svd

Unnamed: 0,title
9,GoldenEye (1995)
31,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
474,Blade Runner (1982)
483,"Nightmare Before Christmas, The (1993)"
507,Terminator 2: Judgment Day (1991)
613,Trainspotting (1996)
659,"Godfather, The (1972)"
706,2001: A Space Odyssey (1968)
793,Die Hard (1988)
896,One Flew Over the Cuckoo's Nest (1975)
