In [82]:

import numpy as np
import math
from operator import itemgetter


# Upper bound on the number of nearest neighbours taken into account.
K = 20

# Upper bound on the number of items recommended to a user.
N = 10

In [83]:
# Toy interaction log: one (user, item) pair per row, built directly as a
# 2-D string array so `data` is never bound to a plain list first.
data = np.array([
    ('A', 'b'), ('A', 'd'),
    ('B', 'a'), ('B', 'b'), ('B', 'c'),
    ('C', 'a'), ('C', 'b'), ('C', 'd'),
    ('D', 'a'), ('D', 'e'),
])

In [84]:
# Mapping user -> set of items that user interacted with.
train_data = {}
for user, item in data:
    # setdefault returns the (possibly freshly created) set, so the item can
    # be added in the same statement.
    train_data.setdefault(user, set()).add(item)

# Number of distinct users and of distinct items in the interaction log.
n_users = len(set(data[:, 0]))
n_items = len(set(data[:, 1]))

In [85]:
train_data

{'A': {'b', 'd'}, 'B': {'a', 'b', 'c'}, 'C': {'a', 'b', 'd'}, 'D': {'a', 'e'}}

In [86]:
# Inverted index: item -> set of users who interacted with that item.
item_users = {}
for user_id, owned_items in train_data.items():
    for item_id in owned_items:
        item_users.setdefault(item_id, set()).add(user_id)
item_users

{'d': {'A', 'C'},
 'b': {'A', 'B', 'C'},
 'c': {'B'},
 'a': {'B', 'C', 'D'},
 'e': {'D'}}

In [87]:
# Count the items shared by each pair of users: C holds the popularity-corrected weights, count the raw (uncorrected) counts.
C = {}
count = {}
for item_id, fans in item_users.items():
    # Popular items are penalised: the more users share an item, the smaller
    # its contribution. The weight is identical for every pair of this item.
    item_weight = math.log(n_users / len(fans))
    for u in fans:
        for v in fans:
            if u != v:
                weights_of_u = C.setdefault(u, {})
                weights_of_u[v] = weights_of_u.get(v, 0) + item_weight

                counts_of_u = count.setdefault(u, {})
                counts_of_u[v] = counts_of_u.get(v, 0) + 1




In [88]:
C

{'A': {'C': 0.9808292530117262, 'B': 0.28768207245178085},
 'C': {'A': 0.9808292530117262,
  'B': 0.5753641449035617,
  'D': 0.28768207245178085},
 'B': {'A': 0.28768207245178085,
  'C': 0.5753641449035617,
  'D': 0.28768207245178085},
 'D': {'C': 0.28768207245178085, 'B': 0.28768207245178085}}

In [89]:
count

{'A': {'C': 2, 'B': 1},
 'C': {'A': 2, 'B': 2, 'D': 1},
 'B': {'A': 1, 'C': 2, 'D': 1},
 'D': {'C': 1, 'B': 1}}

In [90]:
# Plain (uncorrected) cosine similarity between users.
def calCosineSimi(co_counts=None, user_items=None):
    """Return the cosine similarity of every pair of users sharing an item.

    For users u and v the similarity is
    ``shared(u, v) / sqrt(len(items(u)) * len(items(v)))``.

    Args:
        co_counts: nested dict ``{u: {v: number of shared items}}``.
            Defaults to the notebook-level ``count`` table.
        user_items: dict ``{user: set of items}``. Defaults to the
            notebook-level ``train_data``.

    Returns:
        Nested dict ``{u: {v: similarity}}`` with the same keys as
        ``co_counts``.
    """
    if co_counts is None:
        co_counts = count
    if user_items is None:
        user_items = train_data

    user_sim = {}
    # Iterate the raw count table itself: it is the table the formula reads.
    for u, neighbours in co_counts.items():
        size_u = len(user_items[u])
        user_sim[u] = {
            v: shared / math.sqrt(size_u * len(user_items[v]))
            for v, shared in neighbours.items()
        }
    return user_sim

In [91]:
# Popularity-corrected cosine similarity between users.
def calCorrectionCosineSimi(weights=None, user_items=None):
    """Return the corrected cosine similarity of every pair of users.

    Same normalisation as the plain cosine similarity, but the numerator is
    the accumulated per-item weight instead of the raw shared-item count:
    ``weight(u, v) / sqrt(len(items(u)) * len(items(v)))``.

    Args:
        weights: nested dict ``{u: {v: accumulated weight}}``. Defaults to
            the notebook-level ``C`` table.
        user_items: dict ``{user: set of items}``. Defaults to the
            notebook-level ``train_data``.

    Returns:
        Nested dict ``{u: {v: similarity}}`` with the same keys as
        ``weights``.
    """
    if weights is None:
        weights = C
    if user_items is None:
        user_items = train_data

    user_sim = {}
    for u, neighbours in weights.items():
        size_u = len(user_items[u])
        user_sim[u] = {
            v: weight / math.sqrt(size_u * len(user_items[v]))
            for v, weight in neighbours.items()
        }
    return user_sim

In [92]:
# Jaccard similarity between users.
def calJaccardSimi(co_counts=None, user_items=None):
    """Return the Jaccard similarity of every pair of users sharing an item.

    For users u and v the similarity is the number of shared items divided
    by the size of the union of their item sets:
    ``shared / (len(items(u)) + len(items(v)) - shared)``.

    Args:
        co_counts: nested dict ``{u: {v: number of shared items}}``.
            Defaults to the notebook-level ``count`` table.
        user_items: dict ``{user: set of items}``. Defaults to the
            notebook-level ``train_data``.

    Returns:
        Nested dict ``{u: {v: similarity}}`` with the same keys as
        ``co_counts``.
    """
    if co_counts is None:
        co_counts = count
    if user_items is None:
        user_items = train_data

    user_sim = {}
    # Iterate the raw count table itself: it is the table the formula reads.
    for u, neighbours in co_counts.items():
        size_u = len(user_items[u])
        user_sim[u] = {
            v: shared / (size_u + len(user_items[v]) - shared)
            for v, shared in neighbours.items()
        }
    return user_sim

In [93]:
# Recommend up to N items to a user.
def predict(user_sim, user, k=None, n=None, user_items=None):
    """Rank the items `user` has not interacted with, using similar users.

    The k most similar users are selected; each of their items that `user`
    has not seen is scored with the sum of the similarities of the
    neighbours who own it. The n best-scored items are returned.

    Args:
        user_sim: nested dict ``{u: {v: similarity}}``.
        user: the user to recommend for; must be a key of ``user_items``.
        k: maximum number of neighbours to use. Defaults to the
            notebook-level ``K``.
        n: maximum number of items to return. Defaults to the
            notebook-level ``N``.
        user_items: dict ``{user: set of items}``. Defaults to the
            notebook-level ``train_data``.

    Returns:
        List of ``[item, score]`` pairs sorted by descending score. The
        sort is stable, so equally scored items keep the order in which
        they were first encountered. The list is empty when the user has
        no neighbours or the neighbours offer nothing new.

    Raises:
        KeyError: if `user` is not present in ``user_items``.
    """
    if k is None:
        k = K
    if n is None:
        n = N
    if user_items is None:
        user_items = train_data

    interacted_items = user_items[user]

    # A user who shares no item with anybody has no row in the similarity
    # table; treat that as "no neighbours" instead of raising KeyError.
    neighbours = sorted(user_sim.get(user, {}).items(),
                        key=itemgetter(1), reverse=True)[:k]

    rank = {}
    for similar_user, similarity_factor in neighbours:
        for item in user_items[similar_user]:
            if item not in interacted_items:
                rank[item] = rank.get(item, 0) + similarity_factor

    top_items = sorted(rank.items(), key=itemgetter(1), reverse=True)[:n]
    return [[item, score] for item, score in top_items]

In [94]:
# Plain cosine similarity between users; the bare name on the last line
# displays the resulting table as the cell output.
user_sim1=calCosineSimi()
user_sim1

{'A': {'C': 0.8164965809277261, 'B': 0.4082482904638631},
 'C': {'A': 0.8164965809277261,
  'B': 0.6666666666666666,
  'D': 0.4082482904638631},
 'B': {'A': 0.4082482904638631,
  'C': 0.6666666666666666,
  'D': 0.4082482904638631},
 'D': {'C': 0.4082482904638631, 'B': 0.4082482904638631}}

In [95]:
# Popularity-corrected cosine similarity between users; the bare name on the
# last line displays the resulting table as the cell output.
user_sim2=calCorrectionCosineSimi()
user_sim2

{'A': {'C': 0.40042186577898503, 'B': 0.11744571427554072},
 'C': {'A': 0.40042186577898503,
  'B': 0.19178804830118723,
  'D': 0.11744571427554072},
 'B': {'A': 0.11744571427554072,
  'C': 0.19178804830118723,
  'D': 0.11744571427554072},
 'D': {'C': 0.11744571427554072, 'B': 0.11744571427554072}}

In [96]:
# Jaccard similarity between users; the bare name on the last line displays
# the resulting table as the cell output.
user_sim3=calJaccardSimi()
user_sim3

{'A': {'C': 0.6666666666666666, 'B': 0.25},
 'C': {'A': 0.6666666666666666, 'B': 0.5, 'D': 0.25},
 'B': {'A': 0.25, 'C': 0.5, 'D': 0.25},
 'D': {'C': 0.25, 'B': 0.25}}

In [97]:
# Recommend for every user with the plain cosine similarity.
for user in ('A', 'B', 'C', 'D'):
    rec_list = predict(user_sim1, user)
    print("给", user, "推荐：", rec_list)

给 A 推荐： [['a', 1.2247448713915892], ['c', 0.4082482904638631]]
给 B 推荐： [['d', 1.0749149571305296], ['e', 0.4082482904638631]]
给 C 推荐： [['c', 0.6666666666666666], ['e', 0.4082482904638631]]
给 D 推荐： [['b', 0.8164965809277261], ['d', 0.4082482904638631], ['c', 0.4082482904638631]]


In [98]:
# Recommend for every user with the popularity-corrected cosine similarity.
for user in ('A', 'B', 'C', 'D'):
    rec_list = predict(user_sim2, user)
    print("给", user, "推荐：", rec_list)

给 A 推荐： [['a', 0.5178675800545257], ['c', 0.11744571427554072]]
给 B 推荐： [['d', 0.30923376257672797], ['e', 0.11744571427554072]]
给 C 推荐： [['c', 0.19178804830118723], ['e', 0.11744571427554072]]
给 D 推荐： [['b', 0.23489142855108144], ['d', 0.11744571427554072], ['c', 0.11744571427554072]]


In [99]:
# Recommend for every user with the Jaccard similarity.
for user in ('A', 'B', 'C', 'D'):
    rec_list = predict(user_sim3, user)
    print("给", user, "推荐：", rec_list)

给 A 推荐： [['a', 0.9166666666666666], ['c', 0.25]]
给 B 推荐： [['d', 0.75], ['e', 0.25]]
给 C 推荐： [['c', 0.5], ['e', 0.25]]
给 D 推荐： [['b', 0.5], ['d', 0.25], ['c', 0.25]]
