In [1]:
import pandas as pd
import os
from surprise import Dataset, KNNBasic, Reader, accuracy, SVD
from surprise.model_selection import cross_validate, PredefinedKFold

In [2]:
items_stream = open('ml-100k/u.item', 'r')
item_data = items_stream.read().split('\n')
item_data = list(map(lambda item: item.split('|')[:2], item_data))
items_stream.close()

In [3]:
database = pd.read_csv('ml-100k/u1.base.csv')
user_set = set(database.user_id)
item_set = set(database.item_id)
not_watch = {user: item_set.difference(database.query('user_id == %s' %(user)).item_id) for user in user_set}

In [4]:
# path to dataset folder
files_dir = os.path.expanduser('ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in [1]]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

sim_options = {
    'name': 'cosine',
    'user_based': True  # compute  similarities between items
}

algo = KNNBasic(sim_options=sim_options, k=4, min_k=2)

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [5]:
def get_top_5(uid):
    top = []
    items = not_watch[uid]
    
    
    for item in items:
        top.append((item, algo.predict(uid=str(uid), iid=str(item)).est))
    
    return sorted(top, key=lambda item: item[1], reverse=True)[:5]

In [6]:
five = get_top_5(339)
print(five)

for item in five:
    print(item_data[int(item[0])][1])

[(8, 5), (48, 5), (171, 5), (198, 5), (478, 5)]
Dead Man Walking (1995)
I.Q. (1994)
Empire Strikes Back, The (1980)
Bridge on the River Kwai, The (1957)
Vertigo (1958)


In [7]:
pred = algo.predict(uid='339', iid='524', verbose=True)
inner_iid = algo.trainset.to_inner_iid('524')
inner_uid = algo.trainset.to_inner_uid('943')
print(algo.get_neighbors(iid=inner_uid, k=4))

user: 339        item: 524        r_ui = None   est = 5.00   {'actual_k': 4, 'was_impossible': False}
[2, 73, 78, 97]
