In [1]:
from math import sqrt
import codecs
import random as rd

### 创建一个数据集，字典表示。内容为评价者对电影的评价

In [2]:
# Sample data set: maps each critic's name to a dict of {movie title: rating}.
# Critics have not all rated the same movies; ratings in this sample run from 1.0 to 5.0.
critics = {'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
                         'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
                         'The Night Listener': 3.0},
           'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
                            'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
                            'You, Me and Dupree': 3.5},
           'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
                                'Superman Returns': 3.5, 'The Night Listener': 4.0},
           'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
                            'The Night Listener': 4.5, 'Superman Returns': 4.0,
                            'You, Me and Dupree': 2.5},
           'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                            'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
                            'You, Me and Dupree': 2.0},
           'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
                             'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
           'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0, 'Superman Returns': 4.0}}

## 1. &nbsp; 用户相似度

### 欧式距离评价
$$sim(x,y) = \frac{1}{1+\sqrt{\lVert x-y \rVert_2^2}}$$

In [3]:
def sim_distance(person1_name, person2_name, prefs):
    """Return a Euclidean-distance-based similarity between two entries of ``prefs``.

    Only items rated by both entries are compared. The score is
    ``1 / (1 + d)``, where ``d`` is the Euclidean distance between the two
    rating vectors restricted to the shared items, so identical ratings
    give 1.0 and the score shrinks towards 0 as the ratings diverge.

    Args:
        person1_name: Key of the first entry in ``prefs``.
        person2_name: Key of the second entry in ``prefs``.
        prefs: Mapping of name -> {item: rating}.

    Returns:
        0 when the two entries share no items, otherwise a float in (0, 1].

    Raises:
        KeyError: If either name is missing from ``prefs``.
    """
    ratings_a = prefs[person1_name]
    ratings_b = prefs[person2_name]
    # Items rated by both, kept in the first entry's iteration order.
    shared = [item for item in ratings_a if item in ratings_b]
    if not shared:
        return 0
    squared_gaps = sum((ratings_a[item] - ratings_b[item]) ** 2 for item in shared)
    return 1 / (1 + sqrt(squared_gaps))

In [4]:
# Example: Euclidean-distance similarity between two critics from the sample data.
sim_distance('Lisa Rose', 'Gene Seymour', critics)

0.29429805508554946

### 皮尔森相关系数评价
$$ sim(x,y) = \frac{\sum{x_i y_i} - \frac{\sum{x_i}\sum{y_i}}{n}}{\sqrt{\sum{x_i^2}-\frac{(\sum{x_i})^2}{n}} \sqrt{\sum{y_i^2}-\frac{(\sum{y_i})^2}{n}}}$$

In [5]:
def sim_pearson(person1_name, person2_name, prefs):
    """Return the Pearson correlation between two entries of ``prefs``.

    The correlation is computed over the items rated by both entries, using
    the single-pass sums formulation:

        (sum(xy) - sum(x)sum(y)/n) /
        sqrt((sum(x^2) - sum(x)^2/n) * (sum(y^2) - sum(y)^2/n))

    Args:
        person1_name: Key of the first entry in ``prefs``.
        person2_name: Key of the second entry in ``prefs``.
        prefs: Mapping of name -> {item: rating}.

    Returns:
        0 when the entries share no items or when either entry's shared
        ratings have no variance (the correlation is undefined there);
        otherwise the correlation coefficient as a float.

    Raises:
        KeyError: If either name is missing from ``prefs``.
    """
    person1, person2 = prefs[person1_name], prefs[person2_name]
    # Items rated by both, kept in the first entry's iteration order.
    common_items = [key for key in person1 if key in person2]
    n = len(common_items)
    if not n:
        return 0
    # Plain sums
    sum1 = sum(person1[key] for key in common_items)
    sum2 = sum(person2[key] for key in common_items)
    # Sums of squares
    sum1_square = sum(person1[key] ** 2 for key in common_items)
    sum2_square = sum(person2[key] ** 2 for key in common_items)
    # Sum of products
    sum12 = sum(person1[key] * person2[key] for key in common_items)
    num = sum12 - sum1 * sum2 / n
    # Each term is n times a variance and is mathematically >= 0, but the
    # subtraction can round to a small negative number (e.g. constant,
    # large-magnitude ratings). Clamp at 0 so sqrt() cannot raise
    # "math domain error" and such inputs fall into the zero-variance case.
    var1 = max(sum1_square - sum1 ** 2 / n, 0)
    var2 = max(sum2_square - sum2 ** 2 / n, 0)
    den = sqrt(var1 * var2)
    if not den:
        return 0
    return num / den

In [6]:
# Example: Pearson correlation between the same two critics.
sim_pearson('Lisa Rose', 'Gene Seymour', critics)

0.39605901719066977

### 寻找相似用户

In [7]:
def topMatches(person_name, prefs, topk=5, similarity=sim_pearson):
    """Return the ``topk`` entries of ``prefs`` most similar to ``person_name``.

    Args:
        person_name: Key in ``prefs`` to find matches for.
        prefs: Mapping of name -> {item: rating}.
        topk: Maximum number of matches to return.
        similarity: Callable ``(name_a, name_b, prefs) -> score``; higher
            scores mean more similar.

    Returns:
        A list of ``(score, name)`` tuples sorted by descending score. The
        queried entry itself is never included.
    """
    # Exclude the queried entry by name rather than assuming it sorts first:
    # ties at the top score, or a self-similarity of 0 (e.g. constant ratings
    # under Pearson), would otherwise drop the wrong entry and return self.
    scores = [(similarity(person_name, name, prefs), name)
              for name in prefs if name != person_name]
    scores.sort(key=lambda x: x[0], reverse=True)
    return scores[:topk]

In [8]:
# Example: the critics most similar to Toby (default topk=5, Pearson similarity).
topMatches('Toby', critics)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig'),
 (0.66284898035987, 'Jack Matthews'),
 (0.38124642583151164, 'Gene Seymour')]

## 2. &nbsp; 推荐物品

### 新物品评分
$$score(x,p)=\frac{\sum_{i,sim>0}{score(i,p)*sim(x,i)}}{\sum_{i,sim>0}{sim(x,i)}}$$

In [9]:
# Score the items a user has not rated yet, to produce recommendations.
def estimatePersonScoreItem(people_name, prefs, similarity=sim_pearson):
    """Estimate ratings for the items ``people_name`` has not rated.

    Each candidate item's estimate is the similarity-weighted average of the
    ratings given by the other users whose similarity to ``people_name`` is
    positive. An item counts as unrated when it is missing from the user's
    ratings or rated 0.

    Args:
        people_name: Key in ``prefs`` of the user to recommend for.
        prefs: Mapping of user name -> {item: rating}.
        similarity: Callable ``(name_a, name_b, prefs) -> score``.

    Returns:
        A list of ``(estimated_rating, item_name)`` tuples sorted by
        descending estimate.

    Raises:
        KeyError: If ``people_name`` is missing from ``prefs``.
    """
    person = prefs[people_name]
    score_sums = {}  # item -> sum of sim * rating
    sim_sums = {}    # item -> sum of sim (the normaliser)
    for other_name, other in prefs.items():
        # Skip the user themself by name. Testing ``sim == 1`` instead would
        # also throw away perfectly correlated neighbours, and would miss
        # self whenever rounding makes the self-similarity differ from 1.
        if other_name == people_name:
            continue
        sim = similarity(people_name, other_name, prefs)
        if sim <= 0:
            continue
        for item_name, rating in other.items():
            if item_name not in person or person[item_name] == 0:
                score_sums[item_name] = score_sums.get(item_name, 0) + sim * rating
                sim_sums[item_name] = sim_sums.get(item_name, 0) + sim
    # sim_sums[item] is always > 0 here because only positive sims are added.
    score = [(total / sim_sums[item_name], item_name)
             for item_name, total in score_sums.items()]
    score.sort(key=lambda x: x[0], reverse=True)
    return score

In [10]:
# Example: estimated ratings for the movies Toby has not rated, best first.
estimatePersonScoreItem('Toby', critics, sim_pearson)

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

### 将字典键值反转

In [11]:
def transformPrefs(prefs):
    """Swap the two key levels of a nested ratings mapping.

    Args:
        prefs: Mapping of person -> {item: rating}.

    Returns:
        A new dict of item -> {person: rating}. ``prefs`` is not modified.
    """
    flipped = {}
    for owner, ratings in prefs.items():
        for title, value in ratings.items():
            if title not in flipped:
                flipped[title] = {}
            flipped[title][owner] = value
    return flipped

In [12]:
# Example: the sample data re-keyed as {movie title: {critic: rating}}.
transformPrefs(critics)

{'Lady in the Water': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.0,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0},
 'Snakes on a Plane': {'Lisa Rose': 3.5,
  'Gene Seymour': 3.5,
  'Michael Phillips': 3.0,
  'Claudia Puig': 3.5,
  'Mick LaSalle': 4.0,
  'Jack Matthews': 4.0,
  'Toby': 4.5},
 'Just My Luck': {'Lisa Rose': 3.0,
  'Gene Seymour': 1.5,
  'Claudia Puig': 3.0,
  'Mick LaSalle': 2.0},
 'Superman Returns': {'Lisa Rose': 3.5,
  'Gene Seymour': 5.0,
  'Michael Phillips': 3.5,
  'Claudia Puig': 4.0,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 5.0,
  'Toby': 4.0},
 'You, Me and Dupree': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.5,
  'Claudia Puig': 2.5,
  'Mick LaSalle': 2.0,
  'Jack Matthews': 3.5,
  'Toby': 1.0},
 'The Night Listener': {'Lisa Rose': 3.0,
  'Gene Seymour': 3.0,
  'Michael Phillips': 4.0,
  'Claudia Puig': 4.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0}}

### 计算物品之间相似度

In [13]:
def calSimilarItem(prefs, n=5):
    """Build, for every item, the list of items most similar to it.

    The ratings are first re-keyed by item with ``transformPrefs``; each
    item is then passed to ``topMatches`` with Pearson similarity.

    Args:
        prefs: Mapping of person -> {item: rating}.
        n: Number of similar items to keep per item (passed as ``topk``).

    Returns:
        A dict of item -> list of ``(score, other_item)`` tuples as returned
        by ``topMatches``.
    """
    by_item = transformPrefs(prefs)
    return {
        title: topMatches(title, by_item, topk=n, similarity=sim_pearson)
        for title in by_item
    }

In [14]:
# Example: the most similar movies for every movie in the sample data (default n=5).
calSimilarItem(critics)

{'Lady in the Water': [(0.7637626158259785, 'Snakes on a Plane'),
  (0.4879500364742689, 'Superman Returns'),
  (0.3333333333333333, 'You, Me and Dupree'),
  (-0.6123724356957927, 'The Night Listener'),
  (-0.9449111825230676, 'Just My Luck')],
 'Snakes on a Plane': [(0.7637626158259785, 'Lady in the Water'),
  (0.11180339887498941, 'Superman Returns'),
  (-0.3333333333333333, 'Just My Luck'),
  (-0.5663521139548527, 'The Night Listener'),
  (-0.6454972243679047, 'You, Me and Dupree')],
 'Just My Luck': [(0.5555555555555556, 'The Night Listener'),
  (-0.3333333333333333, 'Snakes on a Plane'),
  (-0.42289003161103106, 'Superman Returns'),
  (-0.4856618642571827, 'You, Me and Dupree'),
  (-0.9449111825230676, 'Lady in the Water')],
 'Superman Returns': [(0.6579516949597695, 'You, Me and Dupree'),
  (0.4879500364742689, 'Lady in the Water'),
  (0.11180339887498941, 'Snakes on a Plane'),
  (-0.1798471947990544, 'The Night Listener'),
  (-0.42289003161103106, 'Just My Luck')],
 'You, Me and

## 3. &nbsp; 获取数据集

In [15]:
def loadData(path='./'):
    """Load ratings from ``u.item`` and ``u.data`` into a nested dict.

    ``u.item`` is read as ``|``-separated lines whose first two fields are
    the movie id and title. ``u.data`` is read as tab-separated lines of
    ``user id, movie id, rating, timestamp``. Both files are decoded as
    ISO-8859-1. Blank lines in either file are ignored.

    Args:
        path: Prefix prepended verbatim to the two file names, so a
            directory must end with a path separator (default ``'./'``).

    Returns:
        A dict of user id (str) -> {movie title: rating (float)}. Ratings
        are keyed by title, so movie ids that share a title share one slot.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If ``u.data`` references a movie id absent from ``u.item``.
        ValueError: If a non-blank line does not have the expected fields
            or a rating is not numeric.
    """
    # Get movie titles
    movies = {}
    # ``with`` closes the handle even if parsing fails part-way through.
    with codecs.open(path + 'u.item', 'r', encoding="ISO-8859-1") as item_file:
        for line in item_file:
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # Load data
    prefs = {}
    with codecs.open(path + 'u.data', 'r', encoding="ISO-8859-1") as data_file:
        for line in data_file:
            if not line.strip():
                continue
            user_id, movie_id, rating, _timestamp = line.split('\t')
            prefs.setdefault(user_id, {})[movies[movie_id]] = float(rating)
    return prefs

### 下载数据集，并转化为字典格式

In [16]:
# Download the MovieLens u.item and u.data files that loadData() reads.
!wget https://raw.githubusercontent.com/moneyDboat/Programming-Collective-Intelligence/master/data/movielens/u.item
!wget https://github.com/moneyDboat/Programming-Collective-Intelligence/raw/master/data/movielens/u.data

wget: /opt/conda/lib/libuuid.so.1: no version information available (required by wget)
--2019-07-29 03:41:48--  https://raw.githubusercontent.com/moneyDboat/Programming-Collective-Intelligence/master/data/movielens/u.item
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 236343 (231K) [text/plain]
Saving to: ‘u.item’


2019-07-29 03:41:48 (7.58 MB/s) - ‘u.item’ saved [236343/236343]

wget: /opt/conda/lib/libuuid.so.1: no version information available (required by wget)
--2019-07-29 03:41:49--  https://github.com/moneyDboat/Programming-Collective-Intelligence/raw/master/data/movielens/u.data
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response..

In [17]:
# Load the ratings using the default path './' as {user id: {movie title: rating}}.
personToItem = loadData()

## 4. &nbsp; 引用
[1] &nbsp;&nbsp; [集体智慧编程](https://book.douban.com/subject/3288908/)    
[2] &nbsp; [集体智慧编程笔记](https://github.com/moneyDboat/Programming-Collective-Intelligence)