In [7]:
import pandas as pd

# Cap on the number of rating rows used, to keep the downstream SVD / kNN
# steps quick. `nrows` makes the parser stop after that many rows instead of
# parsing the entire file and then throwing most of it away with `.head()`.
MAX_RATING_ROWS = 500_000

ratings = pd.read_csv('datas/ratings_filtered_vf.csv', nrows=MAX_RATING_ROWS)
movies = pd.read_csv('datas/movies.csv')



In [2]:
# Lookup table movieId -> title, used to turn recommended ids back into names.
movie_titles = {
    movie_key: title
    for movie_key, title in zip(movies['movieId'], movies['title'])
}

In [3]:
import numpy as np
from scipy.sparse import csr_matrix

def create_user_item_matrix(df):
    """Build a sparse user x movie ratings matrix and its id/index lookups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``userId``, ``movieId`` and ``rating``.

    Returns
    -------
    user_item_matrix : scipy.sparse.csr_matrix
        Shape ``(num_users, num_movies)``; entry ``[u, m]`` holds the rating
        of the user at row index ``u`` for the movie at column index ``m``.
        NOTE(review): scipy sums duplicate (row, col) entries when building
        from coordinates, so repeated (userId, movieId) rows would be added
        together -- confirm the input has no duplicates.
    user_id_to_index : dict
        Maps each distinct ``userId`` to its row index.
    movie_id_to_index : dict
        Maps each distinct ``movieId`` to its column index.
    index_to_user_id : dict
        Inverse of ``user_id_to_index``.
    index_to_movie_id : dict
        Inverse of ``movie_id_to_index``.

    Indices follow the sorted order of the distinct ids.
    """
    # One pass per id column: `np.unique` returns the sorted distinct ids and,
    # via `return_inverse`, the position of every row's id in that sorted
    # array -- which is exactly the matrix row / column index for that rating.
    user_ids, user_indices = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_indices = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    num_users = len(user_ids)
    num_movies = len(movie_ids)

    user_id_to_index = dict(zip(user_ids, range(num_users)))
    movie_id_to_index = dict(zip(movie_ids, range(num_movies)))

    index_to_user_id = dict(zip(range(num_users), user_ids))
    index_to_movie_id = dict(zip(range(num_movies), movie_ids))

    user_item_matrix = csr_matrix(
        (df["rating"].to_numpy(), (user_indices, movie_indices)),
        shape=(num_users, num_movies),
    )

    return user_item_matrix, user_id_to_index, movie_id_to_index, index_to_user_id, index_to_movie_id

# Build the sparse ratings matrix once and keep both directions of the
# id <-> index lookups for later cells.
(
    user_item_matrix,
    user_id_to_index,
    movie_id_to_index,
    index_to_user_id,
    index_to_movie_id,
) = create_user_item_matrix(ratings)

In [4]:
from sklearn.decomposition import TruncatedSVD

# Transposing puts movies on the rows, so fit_transform yields one
# 20-dimensional latent vector per movie.
# TruncatedSVD's default solver is randomized; pinning random_state makes the
# embeddings (and therefore the recommendations below) reproducible on re-run.
svd = TruncatedSVD(n_components=20, n_iter=10, random_state=42)
movie_feature_matrix = svd.fit_transform(user_item_matrix.T)
movie_feature_matrix.shape

(19163, 20)

In [5]:
from sklearn.neighbors import NearestNeighbors

def recommend_movies_knn(movie_id, feature_matrix, movie_id_to_index, index_to_movie_id, num_recommendations, distance_metric='cosine'):
    """Return the ids of the movies closest to ``movie_id`` in feature space.

    Parameters
    ----------
    movie_id
        Id of the movie to find neighbours for.
    feature_matrix
        Feature matrix laid out as ``(n_features, n_movies)``; it is
        transposed internally so that each row is one movie.
    movie_id_to_index : dict
        Maps a movie id to its row index in the transposed matrix.
    index_to_movie_id : dict
        Maps a row index back to the movie id.
    num_recommendations : int
        Maximum number of movie ids to return.
    distance_metric : str, default 'cosine'
        Passed to ``NearestNeighbors`` as ``metric``.

    Returns
    -------
    list
        Up to ``num_recommendations`` movie ids, nearest first, never
        containing ``movie_id`` itself. Empty (after printing
        "No movie found") when ``movie_id`` is not in ``movie_id_to_index``.
        Shorter than requested when there are not enough other movies.
    """
    if movie_id not in movie_id_to_index:
        print("No movie found")
        return []

    # Rows of `movie_vectors` are movies (callers hand in the transpose).
    movie_vectors = feature_matrix.T
    movie_index = movie_id_to_index[movie_id]

    query_vector = movie_vectors[movie_index]
    if isinstance(query_vector, np.ndarray):
        # A dense row comes back 1-D; kneighbors needs shape (1, n_features).
        query_vector = query_vector.reshape(1, -1)

    # Ask for one extra neighbour because the query movie is part of the
    # fitted set, but never for more neighbours than there are movies --
    # NearestNeighbors raises in that case.
    n_neighbors = min(num_recommendations + 1, movie_vectors.shape[0])

    knn_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=distance_metric)
    knn_model.fit(movie_vectors)
    neighbors = knn_model.kneighbors(query_vector, return_distance=False)

    # Drop the query movie by index rather than by position: with tied
    # distances (e.g. duplicate or all-zero vectors) it is not guaranteed to
    # be the first neighbour returned.
    recommended_movie_ids = [
        index_to_movie_id[int(neighbor_index)]
        for neighbor_index in neighbors[0]
        if int(neighbor_index) != movie_index
    ]

    return recommended_movie_ids[:num_recommendations]

In [6]:
# Demo: look up the 10 nearest neighbours of one movie and print their titles.
movie_id = 109487
similar_movie_ids = recommend_movies_knn(
    movie_id,
    # The function expects (n_features, n_movies) and transposes it back.
    movie_feature_matrix.T,
    movie_id_to_index,
    index_to_movie_id,
    num_recommendations=10,
    distance_metric='cosine',
)
movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}:")
for similar_movie_id in similar_movie_ids:
    similar_title = movie_titles[similar_movie_id]
    print(similar_title)

Because you watched Interstellar (2014):
The Martian (2015)
Wolf of Wall Street, The (2013)
Dark Knight Rises, The (2012)
Inception (2010)
Django Unchained (2012)
The Imitation Game (2014)
Mad Max: Fury Road (2015)
Ex Machina (2015)
Shutter Island (2010)
Deadpool (2016)
