In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process



In [3]:
movies='data/small/movies.csv'
ratings='data/small/ratings.csv'

df_movies=pd.read_csv(movies, usecols=['movieId', 'title'], dtype={'movieId' : 'int32', 'title' : 'str'})
df_ratings=pd.read_csv(ratings, usecols=['userId', 'movieId', 'rating'], dtype={'userId':'int32', 'movieId':'int32', 'rating':'float32'})

In [4]:
df_movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [5]:
df_movies[df_movies["movieId"]==50798]

Unnamed: 0,movieId,title
6398,50798,Epic Movie (2007)


In [6]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
mask = df_movies["movieId"].isin(df_ratings["movieId"].unique())

df_movies_adjusted = df_movies[mask].reset_index(drop=True)
df_movies_adjusted

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9719,193581,Black Butler: Book of the Atlantic (2017)
9720,193583,No Game No Life: Zero (2017)
9721,193585,Flint (2017)
9722,193587,Bungo Stray Dogs: Dead Apple (2018)


In [8]:
# Sparse Matrix
#           Users
#           [4,4,5] A
# Movies    [3,3,4] B ==    Cos(A,B)  => 0.95
#           [3,2,1]

movies_users=df_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
mat_movies_users=csr_matrix(movies_users.values)

In [9]:
movies_users

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
movies_users[610]

movieId
1         5.0
2         0.0
3         0.0
4         0.0
5         0.0
         ... 
193581    0.0
193583    0.0
193585    0.0
193587    0.0
193609    0.0
Name: 610, Length: 9724, dtype: float32

In [12]:
print(movies_users.iloc[6428])

userId
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    0.0
607    0.0
608    0.0
609    0.0
610    0.0
Name: 51933, Length: 610, dtype: float32


In [13]:
mat_movies_users[6428]

<1x610 sparse matrix of type '<class 'numpy.float32'>'
	with 1 stored elements in Compressed Sparse Row format>

In [14]:
mat_movies_users.shape

(9724, 610)

In [15]:
mat_movies_users[4237].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [16]:
mat_movies_users[3879]

<1x610 sparse matrix of type '<class 'numpy.float32'>'
	with 1 stored elements in Compressed Sparse Row format>

In [17]:
mat_movies_users[3879].toarray()

array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. 

In [18]:
df_movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [19]:
# Euclidean Distance
# Manhattan Distance
# Minkowski Distance
# Cosine Similarity

model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20)

In [20]:
#model_knn.fit(mat_movies_users)

In [21]:
temp = df_movies["movieId"][6398]
temp

50798

In [22]:
df_movies[df_movies["title"]=="Toy Story 2 (1999)"]

Unnamed: 0,movieId,title
2355,3114,Toy Story 2 (1999)


In [23]:
# Recommender function (movie_name) => List of movies recommended

def recommender(movie_name, data, model, n_recommendations):
    model.fit(data)
    idx=process.extractOne(movie_name, df_movies_adjusted["title"])[2]
    print("Movie Selected: ", df_movies_adjusted["title"][idx], "Index: ", idx)
    print("Searching for recommendations")
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations)
    print(indices)
    for i, d in zip(indices, distances):
        for index, distance in zip(i, d):
            if(index!=idx):
                print(f"{index}: {df_movies_adjusted['title'][index]} {distance}")

recommender("Toy Story", mat_movies_users, model_knn, 20)

Movie Selected:  Toy Story (1995) Index:  0
Searching for recommendations
[[   0 2353  418  615  224  314  322  910  546  963  968 3189  506  123
   257  897  815 1182   31  277]]
2353: Toy Story 2 (1999) 0.427398681640625
418: Jurassic Park (1993) 0.4343631863594055
615: Independence Day (a.k.a. ID4) (1996) 0.435738205909729
224: Star Wars: Episode IV - A New Hope (1977) 0.4426117539405823
314: Forrest Gump (1994) 0.45290398597717285
322: Lion King, The (1994) 0.4588547945022583
910: Star Wars: Episode VI - Return of the Jedi (1983) 0.4589107036590576
546: Mission: Impossible (1996) 0.461087167263031
963: Groundhog Day (1993) 0.4658311605453491
968: Back to the Future (1985) 0.4696187973022461
3189: Shrek (2001) 0.4720233678817749
506: Aladdin (1992) 0.47214072942733765
123: Apollo 13 (1995) 0.47967529296875
257: Pulp Fiction (1994) 0.48196732997894287
897: Star Wars: Episode V - The Empire Strikes Back (1980) 0.48581868410110474
815: Willy Wonka & the Chocolate Factory (1971) 0.48775

In [24]:
# Recommender function (movie_name) => List of movies recommended

def recommender(movie_name, data, model, n_recommendations):
    model.fit(data)
    idx=process.extractOne(movie_name, df_movies["title"])[2]
    print("Movie Selected: ", df_movies["title"][idx], "Index: ", idx)
    print("Searching for recommendations")
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations)
    print(indices)
    for i, d in zip(indices, distances):
        for index, distance in zip(i, d):
            if(index!=idx):
                print(f"{index}: {df_movies['title'][index]} {distance}")

recommender("Toy story", mat_movies_users, model_knn, 20)

Movie Selected:  Toy Story (1995) Index:  0
Searching for recommendations
[[   0 2353  418  615  224  314  322  910  546  963  968 3189  506  123
   257  897  815 1182   31  277]]
2353: 'night Mother (1986) 0.427398681640625
418: Jurassic Park (1993) 0.4343631863594055
615: Independence Day (a.k.a. ID4) (1996) 0.435738205909729
224: Star Wars: Episode IV - A New Hope (1977) 0.4426117539405823
314: Forrest Gump (1994) 0.45290398597717285
322: Lion King, The (1994) 0.4588547945022583
910: Once Upon a Time in the West (C'era una volta il West) (1968) 0.4589107036590576
546: Mission: Impossible (1996) 0.461087167263031
963: Diva (1981) 0.4658311605453491
968: Arsenic and Old Lace (1944) 0.4696187973022461
3189: Rififi (Du rififi chez les hommes) (1955) 0.4720233678817749
506: Aladdin (1992) 0.47214072942733765
123: Apollo 13 (1995) 0.47967529296875
257: Pulp Fiction (1994) 0.48196732997894287
897: Cheech and Chong's Up in Smoke (1978) 0.48581868410110474
815: Willy Wonka & the Chocolate Fa

In [25]:
movies_users.iloc[2353]

userId
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    0.0
607    3.0
608    2.5
609    0.0
610    5.0
Name: 3114, Length: 610, dtype: float32

In [26]:
df_movies[df_movies["movieId"]== 3114]

Unnamed: 0,movieId,title
2355,3114,Toy Story 2 (1999)
