In [45]:
import math
from operator import itemgetter


# Neighbourhood size: predict() keeps only the K most similar items
# of each item the user has already rated.
K = 3

In [46]:
# Toy rating log: one [user, item, rating] triple per row.
data = [
    # user A
    ['A', 'a', 5],
    ['A', 'b', 3],
    ['A', 'c', 4],
    ['A', 'd', 4],
    # user B
    ['B', 'a', 3],
    ['B', 'b', 1],
    ['B', 'c', 2],
    ['B', 'd', 2],
    ['B', 'e', 2],
    # user C
    ['C', 'a', 4],
    ['C', 'b', 4],
    ['C', 'c', 4],
    ['C', 'd', 4],
    ['C', 'e', 4],
    # user D
    ['D', 'a', 3],
    ['D', 'b', 2],
    ['D', 'c', 2],
    ['D', 'e', 3],
]

In [47]:
# Pivot the flat rating log into a nested mapping: user -> {item: rating}.
train_data = {}
for user, item, record in data:
    # setdefault returns the user's (possibly freshly created) inner dict.
    train_data.setdefault(user, {})[item] = record

train_data

{'A': {'a': 5, 'b': 3, 'c': 4, 'd': 4},
 'B': {'a': 3, 'b': 1, 'c': 2, 'd': 2, 'e': 2},
 'C': {'a': 4, 'b': 4, 'c': 4, 'd': 4, 'e': 4},
 'D': {'a': 3, 'b': 2, 'c': 2, 'e': 3}}

In [48]:
# Count how many users rated each item (item popularity).
item_cnt = {}
for rated_items in train_data.values():
    for item_id in rated_items:
        item_cnt[item_id] = item_cnt.get(item_id, 0) + 1
item_cnt

{'a': 4, 'b': 4, 'c': 4, 'd': 3, 'e': 3}

In [49]:
# Average rating of each item.
# Each rating is divided by the item's rating count before being added,
# so the running total is already the mean once every user is processed.
average_rating = {}
for rated_items in train_data.values():
    for item_id, rating in rated_items.items():
        share = rating / item_cnt[item_id]
        average_rating[item_id] = average_rating.get(item_id, 0) + share
average_rating

{'a': 3.75, 'b': 2.5, 'c': 3.0, 'd': 3.333333333333333, 'e': 3.0}

In [50]:
# Average rating given by each user.
# Ratings are divided by the user's rating count term by term and added
# in iteration order; a user with no ratings keeps the initial 0.
user_average_rating = {}
for user_id, rated_items in train_data.items():
    n_rated = len(rated_items)
    running_mean = 0
    for rating in rated_items.values():
        running_mean += rating / n_rated
    user_average_rating[user_id] = running_mean
user_average_rating

{'A': 4.0, 'B': 2.0, 'C': 4.0, 'D': 2.5}

In [51]:
# Adjusted cosine similarity
def calCosineSimi(ratings=None, user_means=None):
    """Build the item-item adjusted cosine similarity table.

    For every ordered pair of distinct items (i, j) rated by the same user,
    each rating is centred on that user's mean rating. The similarity is

        sum(dev_i * dev_j) / sqrt(sum(dev_i ** 2) * sum(dev_j ** 2))

    where all three sums run over the users who rated both items.

    Args:
        ratings: mapping ``user -> {item: rating}``. When omitted, the
            notebook-level ``train_data`` is used.
        user_means: mapping ``user -> mean rating``. When omitted together
            with ``ratings``, the notebook-level ``user_average_rating`` is
            used; when omitted while ``ratings`` is given, the means are
            computed from ``ratings``.

    Returns:
        dict ``item -> {other_item: similarity}``. A pair whose numerator is
        zero gets the integer ``0``. Items that never co-occur with another
        item have no entry.
    """
    from_notebook = ratings is None
    if from_notebook:
        ratings = train_data
    if user_means is None:
        if from_notebook:
            user_means = user_average_rating
        else:
            # Same term-by-term averaging the notebook uses for its own table.
            user_means = {}
            for user, items in ratings.items():
                mean = 0
                for rating in items.values():
                    mean += rating / len(items)
                user_means[user] = mean

    # sums[i][j] == [sum(dev_i * dev_j), sum(dev_i ** 2), sum(dev_j ** 2)]
    sums = {}
    for user, items in ratings.items():
        if len(items) < 2:
            # A single rated item forms no (i, j) pair.
            continue
        mean = user_means[user]
        for i, rating_i in items.items():
            dev_i = rating_i - mean
            for j, rating_j in items.items():
                if i == j:
                    continue
                dev_j = rating_j - mean
                cell = sums.setdefault(i, {}).setdefault(j, [0, 0, 0])
                cell[0] += dev_i * dev_j
                cell[1] += dev_i * dev_i
                cell[2] += dev_j * dev_j

    # Turn the accumulated sums into the final item similarity matrix.
    item_sim = {}
    for i, row in sums.items():
        item_sim[i] = {}
        for j, (numerator, sq_i, sq_j) in row.items():
            denominator = math.sqrt(sq_i * sq_j)
            if numerator == 0 or denominator == 0:
                # A zero numerator is reported as 0; the denominator test
                # only protects against a division by zero from underflow.
                item_sim[i][j] = 0
            else:
                item_sim[i][j] = numerator / denominator
    return item_sim

In [52]:
# Pearson similarity
def calPearsonSimi(ratings=None, item_means=None):
    """Build the item-item Pearson-style similarity table.

    For every ordered pair of distinct items (i, j) rated by the same user,
    each rating is centred on the mean rating of its own item. The
    similarity is

        sum(dev_i * dev_j) / sqrt(sum(dev_i ** 2) * sum(dev_j ** 2))

    where all three sums run over the users who rated both items.

    Args:
        ratings: mapping ``user -> {item: rating}``. When omitted, the
            notebook-level ``train_data`` is used.
        item_means: mapping ``item -> mean rating``. When omitted together
            with ``ratings``, the notebook-level ``average_rating`` is used;
            when omitted while ``ratings`` is given, the means are computed
            from ``ratings``.

    Returns:
        dict ``item -> {other_item: similarity}``. A pair whose numerator is
        zero gets the integer ``0``. Items that never co-occur with another
        item have no entry.
    """
    from_notebook = ratings is None
    if from_notebook:
        ratings = train_data
    if item_means is None:
        if from_notebook:
            item_means = average_rating
        else:
            # Same two-pass, term-by-term averaging the notebook uses.
            counts = {}
            for items in ratings.values():
                for i in items:
                    counts[i] = counts.get(i, 0) + 1
            item_means = {}
            for items in ratings.values():
                for i, rating in items.items():
                    item_means[i] = item_means.get(i, 0) + rating / counts[i]

    # sums[i][j] == [sum(dev_i * dev_j), sum(dev_i ** 2), sum(dev_j ** 2)]
    sums = {}
    for items in ratings.values():
        if len(items) < 2:
            # A single rated item forms no (i, j) pair.
            continue
        for i, rating_i in items.items():
            dev_i = rating_i - item_means[i]
            for j, rating_j in items.items():
                if i == j:
                    continue
                dev_j = rating_j - item_means[j]
                cell = sums.setdefault(i, {}).setdefault(j, [0, 0, 0])
                cell[0] += dev_i * dev_j
                cell[1] += dev_i * dev_i
                cell[2] += dev_j * dev_j

    # Turn the accumulated sums into the final item similarity matrix.
    item_sim = {}
    for i, row in sums.items():
        item_sim[i] = {}
        for j, (numerator, sq_i, sq_j) in row.items():
            denominator = math.sqrt(sq_i * sq_j)
            if numerator == 0 or denominator == 0:
                # A zero numerator is reported as 0; the denominator test
                # only protects against a division by zero from underflow.
                item_sim[i][j] = 0
            else:
                item_sim[i][j] = numerator / denominator
    return item_sim

In [53]:
def predict(item_sim, user, item, ratings=None, k=None):
    """Predict the rating ``user`` would give ``item``.

    For every item the user has already rated, the ``k`` most similar items
    (highest similarity first) are looked up in ``item_sim``. Whenever
    ``item`` is among them, that rated item contributes
    ``similarity * rating`` to the numerator and ``abs(similarity)`` to the
    denominator of a weighted average.

    Args:
        item_sim: mapping ``item -> {other_item: similarity}``.
        user: key of the user in ``ratings``.
        item: item whose rating is predicted.
        ratings: mapping ``user -> {item: rating}``. When omitted, the
            notebook-level ``train_data`` is used.
        k: neighbourhood size. When omitted, the notebook-level ``K`` is used.

    Returns:
        The weighted-average rating, or ``0`` when ``item`` has no row in
        ``item_sim``, when ``user`` is unknown, or when the weighted sum is
        zero.
    """
    if ratings is None:
        ratings = train_data
    if k is None:
        k = K

    rui = 0
    if item not in item_sim:
        return rui

    # Numerator and denominator of the weighted average.
    weighted_sum = 0
    weight_total = 0
    # .get() keeps unknown users, and rated items without a similarity row,
    # from raising KeyError; they simply contribute nothing.
    for interacted_item, rating in ratings.get(user, {}).items():
        neighbours = sorted(
            item_sim.get(interacted_item, {}).items(),
            key=itemgetter(1),
            reverse=True,
        )[:k]
        for similar_item, similarity_factor in neighbours:
            if similar_item == item:
                weighted_sum += similarity_factor * rating
                weight_total += math.fabs(similarity_factor)

    # A non-zero numerator implies at least one non-zero weight, so the
    # division is safe.
    if weighted_sum != 0:
        rui = weighted_sum / weight_total
    return rui

In [54]:
# Adjusted cosine similarity table.
# Despite the "user_" prefix, the keys of the returned mapping are item ids.
user_sim1 = calCosineSimi()
user_sim1

{'a': {'b': -1.0, 'c': -0.3333333333333333, 'd': 0, 'e': 0.4472135954999579},
 'b': {'a': -1.0, 'c': 0.3333333333333333, 'd': 0, 'e': -0.4472135954999579},
 'c': {'a': -0.3333333333333333, 'b': 0.3333333333333333, 'd': 0, 'e': -1.0},
 'd': {'a': 0, 'b': 0, 'c': 0, 'e': 0},
 'e': {'a': 0.4472135954999579, 'b': -0.4472135954999579, 'c': -1.0, 'd': 0}}

In [55]:
# Pearson similarity table.
# Despite the "user_" prefix, the keys of the returned mapping are item ids.
user_sim2 = calPearsonSimi()
user_sim2

{'a': {'b': 0.674199862463242,
  'c': 0.9045340337332909,
  'd': 0.828078671210825,
  'e': 0.6488856845230502},
 'b': {'a': 0.674199862463242,
  'c': 0.8944271909999159,
  'd': 0.936585811581694,
  'e': 0.9733285267845753},
 'c': {'a': 0.9045340337332909,
  'b': 0.8944271909999159,
  'd': 0.9428090415820634,
  'e': 0.8164965809277261},
 'd': {'a': 0.828078671210825,
  'b': 0.936585811581694,
  'c': 0.9428090415820634,
  'e': 0.948683298050514},
 'e': {'a': 0.6488856845230502,
  'b': 0.9733285267845753,
  'c': 0.8164965809277261,
  'd': 0.948683298050514}}

In [56]:
# Predict user A's rating for item e with the adjusted cosine similarities.
predict(user_sim1, 'A', 'e')

1.0000000000000002

In [57]:
# Predict user A's rating for item e with the Pearson similarities.
predict(user_sim2, 'A', 'e')

3.4935886896179276