In [1]:
from math import sqrt

### 创建一个数据集，字典表示。内容为评价者对电影的评价

In [2]:
# Sample data set: critic name -> {movie title: rating}.
# Not every critic has rated every movie.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

### 欧式距离评价
$$sim(x,y) = \frac{1}{1+\sqrt{\lVert x-y \rVert_2^2}}$$

In [3]:
def sim_distance(person1_name, person2_name, prefs):
    """Euclidean-distance similarity between two people in ``prefs``.

    Args:
        person1_name: Key of the first person in ``prefs``.
        person2_name: Key of the second person in ``prefs``.
        prefs: Mapping of person name -> {item: rating}.

    Returns:
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between the two
        people's ratings over the items both have rated, or ``0`` when they
        share no rated item.
    """
    ratings_a = prefs[person1_name]
    ratings_b = prefs[person2_name]

    # Items rated by both people, in the first person's key order.
    shared = [title for title in ratings_a if title in ratings_b]
    if len(shared) == 0:
        return 0

    squared_distance = sum((ratings_a[title] - ratings_b[title]) ** 2 for title in shared)
    return 1 / (1 + sqrt(squared_distance))

In [4]:
# Euclidean-distance similarity between two critics from the sample data.
sim_distance('Lisa Rose', 'Gene Seymour', critics)

0.29429805508554946

### 皮尔森相关系数评价
$$ sim(x,y) = \frac{\sum{x_i y_i} - \frac{\sum{x_i}\sum{y_i}}{n}}{\sqrt{\sum{x_i^2}-\frac{(\sum{x_i})^2}{n}} \sqrt{\sum{y_i^2}-\frac{(\sum{y_i})^2}{n}}}$$

In [5]:
def sim_pearson(person1_name, person2_name, prefs):
    """Pearson correlation between two people's ratings of their shared items.

    Uses the single-pass "sum of products" form of the coefficient:
    ``(Sxy - Sx*Sy/n) / sqrt((Sxx - Sx^2/n) * (Syy - Sy^2/n))``.

    Args:
        person1_name: Key of the first person in ``prefs``.
        person2_name: Key of the second person in ``prefs``.
        prefs: Mapping of person name -> {item: rating}.

    Returns:
        The correlation coefficient, or ``0`` when it is undefined: the two
        people share no rated item, or either person's ratings over the
        shared items have no positive variance (e.g. a single shared item or
        all-equal ratings).
    """
    person1, person2 = prefs[person1_name], prefs[person2_name]
    # Items rated by both people, in person1's key order.
    common_items = [key for key in person1 if key in person2]
    n = len(common_items)
    if not n:
        return 0
    # Sums of the ratings.
    sum1 = sum(person1[key] for key in common_items)
    sum2 = sum(person2[key] for key in common_items)
    # Sums of the squared ratings.
    sum1_square = sum(person1[key]**2 for key in common_items)
    sum2_square = sum(person2[key]**2 for key in common_items)
    # Sum of the pairwise products.
    sum12 = sum(person1[key]*person2[key] for key in common_items)
    num = sum12 - sum1*sum2/n
    # Each term is n times a variance, so it is >= 0 in exact arithmetic.
    # Floating-point cancellation can still push it slightly below zero for
    # (near-)constant ratings; sqrt() of the product would then raise
    # ValueError, or two negative terms would yield a meaningless score.
    var1 = sum1_square - sum1**2/n
    var2 = sum2_square - sum2**2/n
    if var1 <= 0 or var2 <= 0:
        return 0
    den = sqrt(var1*var2)
    if not den:
        # The product of two tiny positive terms can underflow to zero.
        return 0
    return num / den

In [6]:
# Pearson-correlation similarity between the same two critics.
sim_pearson('Lisa Rose', 'Gene Seymour', critics)

0.39605901719066977

### 寻找相似用户

In [7]:
def peopleTopMatches(person_name, prefs, topk=5, similarity=sim_pearson):
    """Return the people in ``prefs`` most similar to ``person_name``.

    Args:
        person_name: Key in ``prefs`` of the person to find matches for.
        prefs: Mapping of person name -> {item: rating}.
        topk: Maximum number of matches to return.
        similarity: Callable ``(name_a, name_b, prefs) -> score`` where a
            higher score means more similar. Defaults to ``sim_pearson``.

    Returns:
        Up to ``topk`` ``(score, name)`` tuples sorted by descending score.
        ``person_name`` itself is never included.
    """
    # Exclude the person explicitly rather than dropping whatever sorts first:
    # their self-similarity is not guaranteed to be the unique maximum (the
    # similarity function may return 0 for a self-comparison, and another
    # person can tie at the top score), which would discard the best real
    # match and leave the person in their own results.
    scores = [(similarity(person_name, name, prefs), name)
              for name in prefs if name != person_name]
    scores.sort(key=lambda x: x[0], reverse=True)
    return scores[:max(topk, 0)]

In [8]:
# Critics most similar to Toby, using the default similarity (sim_pearson).
peopleTopMatches('Toby', critics)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig'),
 (0.66284898035987, 'Jack Matthews'),
 (0.38124642583151164, 'Gene Seymour')]

### 推荐物品