In [1]:
import pandas as pd
import os
from surprise import Dataset, KNNBasic, Reader, accuracy, SVD
from surprise.model_selection import cross_validate, PredefinedKFold

In [2]:
# Load the MovieLens item catalogue: one "<movie id>|<title>|..." record per line.
# Only the first two fields (id and title) are kept, both as strings.
# u.item contains non-ASCII titles stored as ISO-8859-1, so the encoding is given
# explicitly instead of relying on the platform default; the context manager
# guarantees the handle is closed even if reading fails.
with open('ml-100k/u.item', 'r', encoding='ISO-8859-1') as items_stream:
    item_data = [
        line.rstrip('\n').split('|')[:2]
        for line in items_stream
        if line.strip()  # skip blank lines (e.g. the empty one after the final newline)
    ]

In [3]:
# Ratings of the training split; the code below reads its `user_id` and
# `item_id` columns.
database = pd.read_csv('ml-100k/u1.base.csv')
user_set = set(database.user_id)
item_set = set(database.item_id)

# For every user, the items of the split that the user has NOT rated.
# A single groupby pass collects each user's rated items, instead of re-scanning
# the whole frame with a string-built query once per user.
rated_by_user = dict(iter(database.groupby('user_id')['item_id']))
not_watch = {user: item_set.difference(rated_by_user[user]) for user in user_set}

In [4]:
# Locations of the predefined train/test split inside the dataset directory.
files_dir = os.path.expanduser('ml-100k/')
train_file, test_file = (files_dir + split_name for split_name in ('u1.base', 'u1.test'))
folds_files = [(train_file, test_file)]

# Reader selected by the name 'ml-100k', used to parse the single fold above;
# PredefinedKFold is the splitter that later iterates over that fold.
reader = Reader('ml-100k')
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

In [5]:
# Similarity settings for the KNN model: cosine similarity computed between users.
sim_options = {'name': 'cosine', 'user_based': True}

algo_knn = KNNBasic(k=4, min_k=2, sim_options=sim_options)
# NOTE(review): SVD is created without a random_state — confirm whether the
# reported RMSE is expected to be reproducible across runs.
algo_svd = SVD()

# Fit both models on every predefined fold and report their RMSE on its test part.
for trainset, testset in pkf.split(data):
    algo_knn.fit(trainset)
    algo_svd.fit(trainset)

    predictions_knn = algo_knn.test(testset)
    predictions_svd = algo_svd.test(testset)

    rmse_knn = accuracy.rmse(predictions_knn, verbose=False)
    rmse_svd = accuracy.rmse(predictions_svd, verbose=False)
    print('KNN RMSE: %.4f' % rmse_knn)
    print('SVD RMSE: %.4f' % rmse_svd)
    

Computing the cosine similarity matrix...
Done computing similarity matrix.
KNN RMSE: 1.1118
SVD RMSE: 0.9511


In [6]:
def get_top_5_knn(uid):
    """Return the five best ``(item, estimate)`` pairs for a user under the KNN model.

    Every item in ``not_watch[int(uid)]`` is scored with ``algo_knn.predict``
    (the item id is passed as a string); the pairs are ranked by estimate,
    highest first, and the first five are returned.
    """
    scored = [
        (candidate, algo_knn.predict(uid=uid, iid=str(candidate)).est)
        for candidate in not_watch[int(uid)]
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:5]


def get_top_5_movies_knn(uid):
    """Return the titles of the five items the KNN model ranks highest for a user.

    Titles are looked up by movie id rather than by list position: ``item_data``
    is a 0-based list while MovieLens movie ids start at 1, so indexing the list
    directly with an id returned the title of the *next* movie (and failed on the
    last id).

    Raises:
        KeyError: if a recommended item id has no entry in ``item_data``.
    """
    # Rows of item_data are [id, title] string pairs; rows without a title
    # (such as one produced by a blank trailing line) are ignored.
    title_by_id = {row[0]: row[1] for row in item_data if len(row) >= 2}
    return [title_by_id[str(item_id)] for item_id, _ in get_top_5_knn(uid)]

In [7]:
def get_top_5_svd(uid):
    """Return the five best ``(item, estimate)`` pairs for a user under the SVD model.

    Every item in ``not_watch[int(uid)]`` is scored with ``algo_svd.predict``
    (the item id is passed as a string); the pairs are ranked by estimate,
    highest first, and the first five are returned.
    """
    scored = [
        (candidate, algo_svd.predict(uid=uid, iid=str(candidate)).est)
        for candidate in not_watch[int(uid)]
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:5]


def get_top_5_movies_svd(uid):
    """Return the titles of the five items the SVD model ranks highest for a user.

    Titles are looked up by movie id rather than by list position: ``item_data``
    is a 0-based list while MovieLens movie ids start at 1, so indexing the list
    directly with an id returned the title of the *next* movie (and failed on the
    last id).

    Raises:
        KeyError: if a recommended item id has no entry in ``item_data``.
    """
    # Rows of item_data are [id, title] string pairs; rows without a title
    # (such as one produced by a blank trailing line) are ignored.
    title_by_id = {row[0]: row[1] for row in item_data if len(row) >= 2}
    return [title_by_id[str(item_id)] for item_id, _ in get_top_5_svd(uid)]

In [8]:
def get_top_5_neighbors(uid):
    """Return the raw ids of the five nearest neighbours of a user in the KNN model.

    The raw ``uid`` is translated to the training set's inner id, handed to
    ``algo_knn.get_neighbors`` with ``k=5``, and every returned inner id is
    translated back to its raw user id.
    """
    knn_trainset = algo_knn.trainset
    nearest = algo_knn.get_neighbors(iid=knn_trainset.to_inner_uid(uid), k=5)
    return list(map(knn_trainset.to_raw_uid, nearest))