In [1]:
import pandas as pd
import os
from surprise import Dataset, KNNBasic, Reader, accuracy, SVD
from surprise.model_selection import cross_validate, PredefinedKFold

In [2]:
items_stream = open('ml-100k/u.item', 'r')
item_data = items_stream.read().split('\n')
item_data = list(map(lambda item: item.split('|')[:2], item_data))
items_stream.close()

In [3]:
data_stream = open('ml-100k/u1.base', 'r')
data_list = data_stream.read().split('\n')
data_list = list(map(lambda case: case.split('\t')[:2], data_list))
data_list = data_list[:len(data_list) -1]
data_stream.close()

user_id, item_id = zip(*data_list)
user_set = set(user_id)
item_set = set(item_id)

database = pd.DataFrame({'user_id': user_id, 'item_id': item_id})
not_watch = {user: item_set.difference(database.query('user_id == %s' %(user)).item_id) for user in user_set}

In [4]:
# path to dataset folder
files_dir = os.path.expanduser('ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in [1]]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

sim_options = {
    'name': 'cosine',
    'user_based': True  # compute  similarities between items
}

algo = KNNBasic(sim_options=sim_options, k=4, min_k=2)

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [5]:
def get_top_5(uid):
    top = []
    items = not_watch[uid]
    
    for item in items:
        top.append((item, algo.predict(uid=uid, iid=item).est))
    
    return sorted(top, key=lambda item: item[1], reverse=True)[:5]

In [6]:
five = get_top_5('100')
print(five)

for item in five:
    print(item_data[int(item[0])])

[('98', 5), ('478', 5), ('479', 5), ('315', 5), ('89', 5)]
['99', 'Snow White and the Seven Dwarfs (1937)']
['479', 'Vertigo (1958)']
['480', 'North by Northwest (1959)']
['316', 'As Good As It Gets (1997)']
['90', 'So I Married an Axe Murderer (1993)']
