In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Install surprise package
!pip install scikit-surprise



In [3]:
movies = pd.read_csv('movies_grouplens_movielens_100k.csv')
movies.shape

(9742, 3)

In [4]:
ratings = pd.read_csv('ratings_grouplens_movielens_100k.csv')
ratings.shape

(100836, 4)

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# In our case we don't need timestamp, so we drop it
ratings = ratings.drop(['timestamp'],axis = 1)

In [7]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [8]:
# Normally I would like to use "load_from_df"-method to load dataset directly, but it didn't work. 
# So i decided to write to csv and load it.
ratings.to_csv("new_ratings.csv",header = None, index = False)

In [9]:
from surprise import Dataset, Reader

file_path = 'new_ratings.csv'
reader = Reader(line_format='user item rating', sep=',', rating_scale = (1,5))

data = Dataset.load_from_file(file_path, reader=reader)

In [10]:
# build training dataset
trainset = data.build_full_trainset()

print("Number of users: ", trainset.n_users)
print("Number of items: ", trainset.n_items)

Number of users:  610
Number of items:  9724


In [11]:
#build the model
from surprise import KNNWithMeans

#set user_based True
sim_options = {'name':'pearson', 'user_based':True}

#Knn model: takes mean ratings of each user into account
#k: max numbers of neighbors, k_min: minimum number of neighbors taking into account
algo = KNNWithMeans(k=15, min_k = 5, sim_options = sim_options, verbose = True)

In [12]:
#accuracy of the model
from surprise.model_selection import cross_validate

results = cross_validate(algo = algo, data = data, measures = ['RMSE'], cv = 5, return_train_measures = True)
print(results['test_rmse'].mean())

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.8941736519002912


In [13]:
#training the model on our dataset
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x24a81fd4430>

In [14]:
# get top 10 predictions
from collections import defaultdict

def get_top_n_predictions(predictions, n=10):
    top_n = defaultdict(list)
    #map predictions to each user
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid,est))
        
    #sort the predictions for each user and get the k highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    #returns a dictionary where keys are user ids and values are lists of tuples.    
    return top_n

In [15]:
#create a dictionary to map movieId and movie names
movie_id_title_map = {}

for movie_id, title in zip(movies['movieId'].values, movies['title'].values):
    movie_id_title_map[str(movie_id)] = title

In [None]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
top_n_predictions = get_top_n_predictions(predictions, n=10)

In [None]:
class MovieHistory:
    movie_id = None
    title = None 
    rating = None

def movie_history_of_user(rating_df, user_id):
    user_ratings = rating_df[rating_df['userId'] == user_id]
    user_rating_list = []

    for index, row in user_ratings.iterrows():
        movie_history = MovieHistory()
        movie_id = int(row['movieId'])
        movie_history.movie_id = movie_id
        movie_history.title = movieid_title_map[str(movie_id)]
        movie_history.rating = row['rating']
        user_rating_list.append(movie_history)
    return user_rating_list

In [None]:
movie_history_of_user = movie_history_of_user(ratings, 5)
for x in movie_history_of_user:
    print(x.movie_id, x.title, x.rating)

In [None]:
# Top n movie recommendation for user
def movie_recommendations_for_user(user_id , top_n_predictions, movieid_title_map):
    print("Predictions for User Id : " , user_id)
    user_ratings = top_n_predictions[user_id]
    for item_id , rating in user_ratings :
        print(movieid_title_map[item_id] , " : " , rating)

In [None]:
movie_recommendations_for_user('5', top_n_predictions, movieid_title_map)