# 这段代码是如何在协同过滤算法建模以后，根据一个item取回相似度最高的item，主要是用到algo.get_neighbors()这个函数

In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset

In [12]:
def read_item_names():
    """
    获取电影名到电影id 和 电影id到电影名的映射
    """
    #rid表示原始数据raw id，iid表示inner id，内部映射id
    file_name = (os.path.expanduser('~') +
                 '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid

In [8]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/wjj/.surprise_data/ml-100k


In [10]:
#因为数据库中都是libsvm的方式稠密储存，所以用build_full_trainset()将其还原为稀疏的可计算矩阵
trainset = data.build_full_trainset()
#CF参数，基于皮尔逊距离，用的item——based CF
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.train(trainset)



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x120d9fda0>

In [13]:

rid_to_name, name_to_rid = read_item_names()
#得到电影名字为'Toy Story (1995)'的原始ID
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_raw_id

'1'

In [14]:
#原始id转化为inner id
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id

24

In [28]:
#利用algo自带的get_neighbors方法得到10哥最相似的用户，这里的id是inner id
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id,k=10)
toy_story_neighbors

[433, 101, 302, 309, 971, 95, 26, 561, 816, 347]

In [29]:
#转成rid
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors)
toy_story_neighbors

<generator object <genexpr> at 0x1288ce6d8>

In [30]:
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)

In [31]:
for movie in toy_story_neighbors:
    print(movie)

Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)
