In [1]:
import pandas as pd
import numpy as np

In [3]:
# Column -> dtype maps for the two CSV files; the keys double as the list of
# columns to load, so only the listed columns are read.
movie_dtypes = {"movieId": "int32", "title": "str"}
rating_dtypes = {"userId": "int32", "movieId": "int32", "rating": "float32"}

movies_df = pd.read_csv("movies.csv", usecols=list(movie_dtypes), dtype=movie_dtypes)
ratings_df = pd.read_csv("ratings.csv", usecols=list(rating_dtypes), dtype=rating_dtypes)

In [5]:
# Preview the first five rows of the movie table.
movies_df.head(5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [6]:
# Preview the first five rows of the ratings table.
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
# Attach a title to every rating. The default inner join keeps only ratings
# whose movieId is present in movies_df.
df = ratings_df.merge(movies_df, on="movieId")
df.head(5)

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [9]:
# Discard rows that have no title (mutates df in place).
df.dropna(subset=["title"], inplace=True)

# Count the non-null ratings each title received and expose the result as a
# two-column frame: "title" and "combinedrating".
ratings_per_title = df.groupby("title")["rating"].count()
total_rating_combined = ratings_per_title.rename("combinedrating").reset_index()
total_rating_combined.head(5)

Unnamed: 0,title,combinedrating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [10]:
# Attach the per-title rating count to every rating row.
#
# Any existing "combinedrating" column is dropped first so that this cell is
# idempotent: without that, re-running the cell merges the counts a second
# time and pandas renames the clashing columns to "combinedrating_x" /
# "combinedrating_y", which breaks the later filter on "combinedrating".
# On a first run there is nothing to drop (errors="ignore"), so the result is
# the same as a plain left merge on "title".
df = (
    df.drop(columns=["combinedrating"], errors="ignore")
      .merge(total_rating_combined, on="title", how="left")
)
df.head()

Unnamed: 0,userId,movieId,rating,title,combinedrating
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [12]:
# Render floats with three decimal places from here on (global pandas option),
# then print summary statistics of the per-row rating counts.
pd.set_option("display.float_format", "{:.3f}".format)
combined_rating_summary = df["combinedrating"].describe()
print(combined_rating_summary)

count   100836.000
mean        58.759
std         61.965
min          1.000
25%         13.000
50%         39.000
75%         84.000
max        329.000
Name: combinedrating, dtype: float64


In [13]:
# Minimum number of ratings a title needs in order to be kept.
popularity_threshold = 50

# Boolean mask over rating rows whose title meets the threshold.
is_popular = df["combinedrating"] >= popularity_threshold
rating_popular_movie = df[is_popular]
rating_popular_movie.head(5)

Unnamed: 0,userId,movieId,rating,title,combinedrating
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [16]:
# Reshape to one row per title and one column per user, holding the rating
# (pivot_table averages if a title/user pair occurs more than once).
user_ratings_by_title = pd.pivot_table(
    rating_popular_movie, values="rating", index="title", columns="userId"
)

# Title/user pairs with no rating become 0.
movies_features_df = user_ratings_by_title.fillna(0)
movies_features_df.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [17]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the title x user rating matrix in compressed sparse row form.
movie_features_matrix = csr_matrix(movies_features_df.to_numpy())

# Brute-force nearest-neighbour search using cosine distance between the
# per-title rating vectors. fit() returns the estimator itself, so the bare
# name on the last line displays the fitted model.
model_knn = NearestNeighbors(algorithm="brute", metric="cosine").fit(movie_features_matrix)
model_knn

NearestNeighbors(algorithm='brute', metric='cosine')

In [18]:
# Pick the movie to generate recommendations for.
#
# A seeded generator makes the pick -- and everything printed from it --
# reproducible across "Restart & Run All". Set QUERY_SEED to None to draw a
# different movie on every run.
QUERY_SEED = 42
# Number of neighbours requested from the model; the printing cell treats the
# first returned neighbour as the query movie itself.
N_NEIGHBORS = 6

rng = np.random.default_rng(QUERY_SEED)
n_movies = movies_features_df.shape[0]
query_index = int(rng.integers(n_movies))  # uniform over row positions [0, n_movies)
print(query_index)

# kneighbors raises when asked for more neighbours than there are fitted
# samples, so cap the request at the number of movies in the feature matrix.
query_vector = movies_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(
    query_vector, n_neighbors=min(N_NEIGHBORS, n_movies)
)

120


In [22]:
# Walk the neighbours in the order kneighbors returned them. Position 0 is used
# only to print the header naming the query movie; every later position is
# printed as a numbered recommendation with its distance.
neighbour_positions = indices.ravel()
neighbour_distances = distances.ravel()

for rank, (movie_pos, dist) in enumerate(zip(neighbour_positions, neighbour_distances)):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(movies_features_df.index[query_index]))
        continue
    print('{0}: {1}, with distance of {2}'.format(rank, movies_features_df.index[movie_pos], dist))

Recommendations for Dead Poets Society (1989):

1: Rain Man (1988), with distance of 0.4642203450202942
2: Truman Show, The (1998), with distance of 0.5002989172935486
3: Good Will Hunting (1997), with distance of 0.5062745809555054
4: Stand by Me (1986), with distance of 0.5145639181137085
5: Back to the Future (1985), with distance of 0.5266998410224915
