In [64]:
import math
from operator import itemgetter


# Upper bound on how many of the most similar users predict() looks at.
K = 20

In [65]:
# Toy rating log: every row is [user, item, rating].
data = [
    # user A (has not rated item e)
    ["A", "a", 5], ["A", "b", 3], ["A", "c", 4], ["A", "d", 4],
    # user B
    ["B", "a", 3], ["B", "b", 1], ["B", "c", 2], ["B", "d", 2], ["B", "e", 2],
    # user C
    ["C", "a", 4], ["C", "b", 4], ["C", "c", 4], ["C", "d", 4], ["C", "e", 4],
    # user D (has not rated item d)
    ["D", "a", 3], ["D", "b", 2], ["D", "c", 2], ["D", "e", 3],
]


In [66]:
# Reshape the flat [user, item, rating] rows into a nested mapping
# user -> {item -> rating}.
train_data = dict()

for user, item, record in data:
    # setdefault creates the per-user dict on first sight of the user and
    # returns it either way, so the rating can be stored in one step.
    train_data.setdefault(user, {})[item] = record

train_data

{'A': {'a': 5, 'b': 3, 'c': 4, 'd': 4},
 'B': {'a': 3, 'b': 1, 'c': 2, 'd': 2, 'e': 2},
 'C': {'a': 4, 'b': 4, 'c': 4, 'd': 4, 'e': 4},
 'D': {'a': 3, 'b': 2, 'c': 2, 'e': 3}}

In [67]:
# Compute each user's mean rating: user -> mean of all ratings that user gave.
average_rating = {}
for u, items in train_data.items():
    # Add the ratings up first and divide once. Accumulating
    # rating / len(items) term by term rounds at every step, so a user who
    # always gives the same rating could end up with a mean that is not exactly
    # that rating, which leaks spurious non-zero deviations into the Pearson
    # similarity. A user with no ratings keeps the mean 0.
    average_rating[u] = sum(items.values()) / len(items) if items else 0
average_rating

{'A': 4.0, 'B': 2.0, 'C': 4.0, 'D': 2.5}

In [68]:
# Build the item -> users inverted index:
# each item maps to the set of users who rated it.
item_users = {}
for u, items in train_data.items():
    for i in items:
        # Create the user set the first time an item is seen, then record u.
        item_users.setdefault(i, set()).add(u)
item_users

{'a': {'A', 'B', 'C', 'D'},
 'b': {'A', 'B', 'C', 'D'},
 'c': {'A', 'B', 'C', 'D'},
 'd': {'A', 'B', 'C'},
 'e': {'B', 'C', 'D'}}

In [69]:
# Cosine similarity
def calCosineSimi(ratings=None, inverted_index=None):
    """Compute user-user cosine similarity restricted to co-rated items.

    For every ordered pair of distinct users (u, v) that share at least one
    item, the dot product and both squared norms are accumulated over the
    shared items only, and the similarity is dot / sqrt(norm_u * norm_v).

    Args:
        ratings: mapping user -> {item -> rating}. Defaults to the
            notebook-level ``train_data``.
        inverted_index: mapping item -> collection of users who rated it.
            Defaults to the notebook-level ``item_users``.

    Returns:
        Nested dict ``sim[u][v]``. Pairs with no item in common are absent.
        The value is 0 when the dot product or the denominator is zero.
    """
    if ratings is None:
        ratings = train_data
    if inverted_index is None:
        inverted_index = item_users

    # sums[u][v] = [dot(u, v), sum of u's squares, sum of v's squares],
    # all three taken over the items both users rated.
    sums = {}
    for i, users in inverted_index.items():
        for u in users:
            for v in users:
                if u == v:
                    continue
                r_u = ratings[u][i]
                r_v = ratings[v][i]
                acc = sums.setdefault(u, {}).setdefault(v, [0, 0, 0])
                acc[0] += r_u * r_v
                acc[1] += r_u * r_u
                acc[2] += r_v * r_v

    # Turn the accumulated sums into the final user similarity matrix.
    user_sim = {}
    for u, related_users in sums.items():
        user_sim[u] = {}
        for v, (dot, sq_u, sq_v) in related_users.items():
            denom = math.sqrt(sq_u * sq_v)
            # Guard the denominator as well as the numerator so the division
            # can never raise ZeroDivisionError.
            if dot == 0 or denom == 0:
                user_sim[u][v] = 0
            else:
                user_sim[u][v] = dot / denom
    return user_sim

In [70]:
# Pearson similarity
def calPearsonSimi(ratings=None, inverted_index=None, averages=None):
    """Compute user-user Pearson-style similarity restricted to co-rated items.

    Each rating is centred on the rating user's mean taken from ``averages``
    (the mean supplied there, not a mean recomputed over the shared items).
    For every ordered pair of distinct users (u, v) sharing at least one item,
    the covariance term and both variance terms are accumulated over the
    shared items, and the similarity is cov / sqrt(var_u * var_v).

    Args:
        ratings: mapping user -> {item -> rating}. Defaults to the
            notebook-level ``train_data``.
        inverted_index: mapping item -> collection of users who rated it.
            Defaults to the notebook-level ``item_users``.
        averages: mapping user -> mean rating. Defaults to the notebook-level
            ``average_rating``.

    Returns:
        Nested dict ``sim[u][v]``. Pairs with no item in common are absent.
        The value is 0 when the covariance term or the denominator is zero
        (for example when one user gave every shared item their mean rating).
    """
    if ratings is None:
        ratings = train_data
    if inverted_index is None:
        inverted_index = item_users
    if averages is None:
        averages = average_rating

    # sums[u][v] = [sum dev_u * dev_v, sum dev_u ** 2, sum dev_v ** 2],
    # all three taken over the items both users rated.
    sums = {}
    for i, users in inverted_index.items():
        for u in users:
            for v in users:
                if u == v:
                    continue
                dev_u = ratings[u][i] - averages[u]
                dev_v = ratings[v][i] - averages[v]
                acc = sums.setdefault(u, {}).setdefault(v, [0, 0, 0])
                acc[0] += dev_u * dev_v
                acc[1] += dev_u * dev_u
                acc[2] += dev_v * dev_v

    # Turn the accumulated sums into the final user similarity matrix.
    user_sim = {}
    for u, related_users in sums.items():
        user_sim[u] = {}
        for v, (cov, var_u, var_v) in related_users.items():
            denom = math.sqrt(var_u * var_v)
            # Guard the denominator as well as the numerator so the division
            # can never raise ZeroDivisionError.
            if cov == 0 or denom == 0:
                user_sim[u][v] = 0
            else:
                user_sim[u][v] = cov / denom
    return user_sim

In [71]:
def predict(user_sim, user, item, k=None, ratings=None, averages=None):
    """Predict ``user``'s rating of ``item`` from the ratings of similar users.

    The ``k`` most similar users are selected first; those among them who
    have not rated ``item`` are then skipped. The prediction is the user's
    mean rating plus the similarity-weighted average of the remaining
    neighbours' deviations from their own means.

    Args:
        user_sim: nested dict ``user_sim[u][v]`` of user-user similarities.
        user: the user to predict for.
        item: the item to predict a rating for.
        k: number of most similar users to consider. Defaults to the
            notebook-level ``K``.
        ratings: mapping user -> {item -> rating}. Defaults to the
            notebook-level ``train_data``.
        averages: mapping user -> mean rating. Defaults to the notebook-level
            ``average_rating``.

    Returns:
        The predicted rating, or 0 when no prediction can be made because no
        selected neighbour with non-zero similarity has rated ``item``.

    Raises:
        KeyError: if ``user`` has no entry in ``averages``.
    """
    if k is None:
        k = K
    if ratings is None:
        ratings = train_data
    if averages is None:
        averages = average_rating

    rui = averages[user]

    # A user who shares no item with anyone has no row in user_sim; treat that
    # as "no neighbours" instead of failing with KeyError.
    neighbours = sorted(user_sim.get(user, {}).items(),
                        key=itemgetter(1), reverse=True)[0:k]

    # Numerator and denominator of the weighted deviation average.
    weighted_deviation = 0
    total_weight = 0
    for similar_user, similarity_factor in neighbours:
        if item not in ratings[similar_user]:
            continue
        weighted_deviation += similarity_factor * (
            ratings[similar_user][item] - averages[similar_user])
        total_weight += math.fabs(similarity_factor)

    # Decide on the denominator, not the numerator: a zero numerator with a
    # positive weight just means the neighbours rated the item exactly at
    # their own means, so the right prediction is the user's mean, not 0.
    if total_weight == 0:
        return 0
    return rui + weighted_deviation / total_weight

In [72]:
# User-user similarity matrix using cosine similarity
user_sim1 = calCosineSimi()
user_sim1

{'A': {'D': 0.9946917938265513,
  'C': 0.9847319278346618,
  'B': 0.9864400504156211},
 'D': {'A': 0.9946917938265513,
  'C': 0.9805806756909202,
  'B': 0.9707253433941508},
 'C': {'A': 0.9847319278346618,
  'D': 0.9805806756909202,
  'B': 0.9534625892455924},
 'B': {'A': 0.9864400504156211,
  'D': 0.9707253433941508,
  'C': 0.9534625892455924}}

In [73]:
# User-user similarity matrix using Pearson similarity
user_sim2 = calPearsonSimi()
user_sim2

{'A': {'D': 0.8164965809277261, 'C': 0, 'B': 1.0},
 'D': {'A': 0.8164965809277261, 'C': 0, 'B': 0.7071067811865475},
 'C': {'A': 0, 'D': 0, 'B': 0},
 'B': {'A': 1.0, 'D': 0.7071067811865475, 'C': 0}}

In [74]:
# Predict user A's rating of item e using the cosine similarities
predict(user_sim1, 'A', 'e')

4.167690067762287

In [75]:
# Predict user A's rating of item e using the Pearson similarities
predict(user_sim2, 'A', 'e')

4.224744871391589