In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

df_movies = pd.read_csv(
    "../Data/ml-latest-small/movies.csv",
    usecols=["movieId", "title"],
    dtype={"movieIe": "int32", "title": "str"},
)
df_ratings = pd.read_csv(
    "../Data/ml-latest-small/ratings.csv",
    usecols=["userId", "movieId", "rating"],
    dtype={"userId": "int32", "movieId": "int32", "rating": "float"},
)


In [3]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Sparse matrix

movies_users = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0)
mat_movies_users = csr_matrix(movies_users.values)

In [5]:
movies_users

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
mat_movies_users

<9724x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [7]:
# Euclidean Distance
# Manhattan Distance
# Minkowski Distance
# Cosine Similarity


model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20)

In [8]:
model_knn.fit(mat_movies_users)

In [9]:
# Recommender(movie_name) => List of Movies recommended

def recommender(movie_name, data, model, n_recommendations):
    model.fit(data)
    idx=process.extractOne(movie_name, df_movies["title"])[2]
    print(f"Movie Selected: {df_movies['title'][idx]} Index: {idx}")
    print("Searching for recommendations...")
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations)
    for i in indices:
        print(df_movies["title"][i].where(i!=idx))

recommender("shawshank redemption", mat_movies_users, model_knn, 20)

Movie Selected: Shawshank Redemption, The (1994) Index: 277
Searching for recommendations...
277                                           NaN
314                           Forrest Gump (1994)
257                           Pulp Fiction (1994)
510              Silence of the Lambs, The (1991)
46                     Usual Suspects, The (1995)
461                       Schindler's List (1993)
2224        Home Alone 2: Lost in New York (1992)
97                              Braveheart (1995)
1938                   Walk on the Moon, A (1999)
123                              Apollo 13 (1995)
43                    Seven (a.k.a. Se7en) (1995)
4791                           Cooler, The (2003)
659                         Godfather, The (1972)
1283                  For Richer or Poorer (1997)
398                          Fugitive, The (1993)
418                          Jurassic Park (1993)
3633                    White Water Summer (1987)
507             Terminator 2: Judgment Day (1991)
224    