In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from fuzzywuzzy import process
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn

In [2]:
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
count_ratings = ratings.groupby('movieId').count()
more_than_100_ratings = count_ratings.loc[count_ratings['rating'] >= 100]
filtered_ratings_dataset = ratings[ratings['movieId'].isin(more_than_100_ratings.index)]
filtered_ratings_dataset

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176
...,...,...,...,...
100217,610,48516,5.0,1479542152
100310,610,58559,4.5,1493844688
100326,610,60069,4.5,1493844866
100380,610,68954,3.5,1493844881


In [4]:
# Count no. of ratings for each movie
count_ratings = ratings.groupby('movieId').count()
    
# throw away movies that have been rated by less than 100 users
more_than_100_ratings = count_ratings.loc[count_ratings['rating'] >= 100]
filtered_ratings_dataset = ratings[ratings['movieId'].isin(more_than_100_ratings.index)]


In [9]:
ratings.movieId.nunique()

9724

In [13]:
count_ratings.sort_values(ascending = False,by = 'rating')

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,329,329,329
318,317,317,317
296,307,307,307
593,279,279,279
2571,278,278,278
...,...,...,...
4093,1,1,1
4089,1,1,1
58351,1,1,1
4083,1,1,1


In [14]:
more_than_100_ratings

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,215,215,215
2,110,110,110
6,102,102,102
10,132,132,132
32,177,177,177
...,...,...,...
48516,107,107,107
58559,149,149,149
60069,104,104,104
68954,105,105,105


In [15]:
filtered_ratings_dataset

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176
...,...,...,...,...
100217,610,48516,5.0,1479542152
100310,610,58559,4.5,1493844688
100326,610,60069,4.5,1493844866
100380,610,68954,3.5,1493844881


In [5]:
avg_rating = filtered_ratings_dataset.groupby('movieId').mean().sort_values('rating', ascending=False).drop(columns = 'timestamp')

In [16]:
avg_rating

Unnamed: 0_level_0,userId,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,302.299685,4.429022
858,297.744792,4.289062
2959,319.894495,4.272936
1221,307.581395,4.259690
48516,319.168224,4.252336
...,...,...
185,319.062500,3.040179
434,310.247525,3.034653
586,312.750000,2.995690
153,296.832117,2.916058


In [17]:
def get_watched_movies(userId):
    """
    It gets the userId and returns the list of movies rated by that particular user.
    (The user has given ratings means he/she has already watched it.)
    """
    watched_movies = ratings.loc[ratings['userId'] == userId]['movieId']
    return list(watched_movies)

liked_items = get_watched_movies(1)
liked_items

[1,
 3,
 6,
 47,
 50,
 70,
 101,
 110,
 151,
 157,
 163,
 216,
 223,
 231,
 235,
 260,
 296,
 316,
 333,
 349,
 356,
 362,
 367,
 423,
 441,
 457,
 480,
 500,
 527,
 543,
 552,
 553,
 590,
 592,
 593,
 596,
 608,
 648,
 661,
 673,
 733,
 736,
 780,
 804,
 919,
 923,
 940,
 943,
 954,
 1009,
 1023,
 1024,
 1025,
 1029,
 1030,
 1031,
 1032,
 1042,
 1049,
 1060,
 1073,
 1080,
 1089,
 1090,
 1092,
 1097,
 1127,
 1136,
 1196,
 1197,
 1198,
 1206,
 1208,
 1210,
 1213,
 1214,
 1219,
 1220,
 1222,
 1224,
 1226,
 1240,
 1256,
 1258,
 1265,
 1270,
 1275,
 1278,
 1282,
 1291,
 1298,
 1348,
 1377,
 1396,
 1408,
 1445,
 1473,
 1500,
 1517,
 1552,
 1573,
 1580,
 1587,
 1617,
 1620,
 1625,
 1644,
 1676,
 1732,
 1777,
 1793,
 1804,
 1805,
 1920,
 1927,
 1954,
 1967,
 2000,
 2005,
 2012,
 2018,
 2028,
 2033,
 2046,
 2048,
 2054,
 2058,
 2078,
 2090,
 2093,
 2094,
 2096,
 2099,
 2105,
 2115,
 2116,
 2137,
 2139,
 2141,
 2143,
 2161,
 2174,
 2193,
 2253,
 2268,
 2273,
 2291,
 2329,
 2338,
 2353,
 2366,
 

In [26]:
recommended_movie_ids = avg_rating[~avg_rating.index.isin(liked_items)].sort_values(by = 'movieId').iloc[0:5]['rating']

In [27]:
recommended_movie_ids

movieId
2     3.431818
10    3.496212
32    3.983051
34    3.652344
39    3.293269
Name: rating, dtype: float64

In [29]:
movies = pd.read_csv('./data/ml-latest-small/movies.csv', index_col=0)

In [30]:
movies.loc[movies.index.isin(recommended_movie_ids.index)]['title']

movieId
2                                Jumanji (1995)
10                             GoldenEye (1995)
32    Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
34                                  Babe (1995)
39                              Clueless (1995)
Name: title, dtype: object

In [31]:
recommended_movies = pd.DataFrame(movies.loc[movies.index.isin(recommended_movie_ids.index)]['title'])

In [32]:
recommended_movies

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
2,Jumanji (1995)
10,GoldenEye (1995)
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
34,Babe (1995)
39,Clueless (1995)


In [33]:
recommended_movies.join(recommended_movie_ids).sort_values('rating', ascending=False)

Unnamed: 0_level_0,title,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),3.983051
34,Babe (1995),3.652344
10,GoldenEye (1995),3.496212
2,Jumanji (1995),3.431818
39,Clueless (1995),3.293269
