In [26]:
import random
import pandas as pd
import numpy as np
import math
from operator import itemgetter
import argparse
# Fix NumPy's global random seed.
np.random.seed(1024)

# K: how many of the most similar neighbours predict() examines per item.
# N: maximum length of the recommendation list predict() returns.
K, N = 20, 10
# "cosine" selects the cosine-normalised similarity; any other value selects
# the variant normalised by the first item's count only.
similarityMeasure = "cosine"
# Interaction file, read with pd.read_table as (user_id, item_id) columns.
data_file = "../data/TopN-jupyter.txt"

In [27]:
# Load the (user_id, item_id) interaction pairs.
data_fields = ['user_id', 'item_id']
data = pd.read_table(data_file, names=data_fields)

# Nested dict: train_data[user][item] = 1 for every observed pair.
# Every row goes into train_data; no train/test split is performed here.
train_data = {}
for row in data.itertuples(index=False):
    user, item = row
    train_data.setdefault(user, {})[item] = 1

# Number of distinct users and items in the loaded file.
n_users = len(set(data['user_id'].values))
n_items = len(set(data['item_id'].values))
train_data

{'A': {'b': 1, 'd': 1},
 'B': {'a': 1, 'b': 1, 'c': 1},
 'C': {'a': 1, 'b': 1, 'd': 1},
 'D': {'a': 1, 'e': 1}}

In [28]:
# Item popularity: for each item, the number of users who interacted with it.
item_cnt = {}
for interacted in train_data.values():
    for item_id in interacted:
        item_cnt[item_id] = item_cnt.get(item_id, 0) + 1
item_cnt

{'b': 3, 'd': 2, 'a': 3, 'c': 1, 'e': 1}

In [29]:
 # Build two item-to-item co-occurrence tables from each user's item set:
 # `count` holds the raw pair counts, `C` the same pairs with a per-user weight.
C = {}
count = {}
for basket in train_data.values():
    for u in basket:
        for v in basket:
            if u != v:
                # Weighted co-occurrence: each user contributes
                # log(n_items / number of items that user has), so users
                # with many items add less to a pair than users with few.
                weighted_row = C.setdefault(u, {})
                weighted_row[v] = weighted_row.get(v, 0) + math.log(n_items/len(basket))

                # Raw co-occurrence: one per user who has both items.
                raw_row = count.setdefault(u, {})
                raw_row[v] = raw_row.get(v, 0) + 1


In [30]:
C

{'b': {'d': 1.4271163556401458,
  'a': 1.0216512475319814,
  'c': 0.5108256237659907},
 'd': {'b': 1.4271163556401458, 'a': 0.5108256237659907},
 'a': {'b': 1.0216512475319814,
  'c': 0.5108256237659907,
  'd': 0.5108256237659907,
  'e': 0.9162907318741551},
 'c': {'a': 0.5108256237659907, 'b': 0.5108256237659907},
 'e': {'a': 0.9162907318741551}}

In [31]:
count

{'b': {'d': 2, 'a': 2, 'c': 1},
 'd': {'b': 2, 'a': 1},
 'a': {'b': 2, 'c': 1, 'd': 1, 'e': 1},
 'c': {'a': 1, 'b': 1},
 'e': {'a': 1}}

In [32]:
# Final item-item similarity matrix, derived from the raw co-occurrence counts.
# The weighted table C only supplies the set of item pairs to fill in here.
use_cosine = similarityMeasure == "cosine"
item_sim = {}
for src, neighbours in C.items():
    sims = {}
    for dst in neighbours:
        co_occurrences = count[src][dst]
        if use_cosine:
            # Symmetric: normalise by the geometric mean of both popularities.
            denominator = math.sqrt(item_cnt[src] * item_cnt[dst])
        else:
            # Asymmetric: normalise by the source item's popularity only.
            denominator = item_cnt[src]
        sims[dst] = co_occurrences / denominator
    item_sim[src] = sims
item_sim

{'b': {'d': 0.8164965809277261,
  'a': 0.6666666666666666,
  'c': 0.5773502691896258},
 'd': {'b': 0.8164965809277261, 'a': 0.4082482904638631},
 'a': {'b': 0.6666666666666666,
  'c': 0.5773502691896258,
  'd': 0.4082482904638631,
  'e': 0.5773502691896258},
 'c': {'a': 0.5773502691896258, 'b': 0.5773502691896258},
 'e': {'a': 0.5773502691896258}}

In [33]:
def predict(user):
    """Recommend up to N unseen items for ``user`` with item-based CF.

    For every item the user has interacted with, the K most similar items
    (according to the global ``item_sim``) are examined. Each candidate the
    user has not interacted with accumulates that similarity as its score.

    Args:
        user: Key into the global ``train_data`` dict.

    Returns:
        A list of ``[item, score]`` pairs ordered by descending score, at
        most ``N`` entries long. Empty when no candidate is found.

    Raises:
        KeyError: If ``user`` is not present in ``train_data``.
    """
    interacted_items = train_data[user]
    rank = {}

    # For each interacted item, look at its K nearest items and accumulate scores.
    for item in interacted_items:
        # An item that never co-occurred with any other item has no row in
        # item_sim; treat it as having no neighbours rather than raising.
        neighbours = item_sim.get(item, {})
        top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:K]
        for similar_item, similarity_factor in top_k:
            if similar_item in interacted_items:
                continue
            # The stored interaction value is deliberately not used as a
            # weight; the score is the plain sum of similarities.
            rank[similar_item] = rank.get(similar_item, 0) + similarity_factor

    # Keep the N highest-scoring candidates.
    top_n = sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]
    return [[item, score] for item, score in top_n]

In [34]:
# Print the recommendation list for each of the four sample users.
for user in ('A', 'B', 'C', 'D'):
    rec_list = predict(user)
    print("给", user, "推荐：", rec_list)

给 A 推荐： [['a', 1.0749149571305296], ['c', 0.5773502691896258]]
给 B 推荐： [['d', 1.2247448713915892], ['e', 0.5773502691896258]]
给 C 推荐： [['c', 1.1547005383792517], ['e', 0.5773502691896258]]
给 D 推荐： [['b', 0.6666666666666666], ['c', 0.5773502691896258], ['d', 0.4082482904638631]]
