In [80]:
import random
import pandas as pd
import numpy as np
import math
from operator import itemgetter
import argparse
# Seed NumPy's global RNG. NOTE(review): none of the visible cells draws random
# numbers, so this is presumably kept for cells outside this view - confirm.
np.random.seed(1024)

# Neighbourhood size: predict() keeps only the K most similar items of each
# item the user has rated.
K=3
# Centering used when building the item similarities: "cosine" subtracts each
# user's mean rating, any other value (such as "pearson") subtracts each
# item's mean rating.
similarityMeasure="pearson"
# Path of the ratings file handed to pd.read_table.
data_file="../data/Rate-jupyter.txt"

In [81]:
# Each row of the ratings file is read as one (user_id, item_id, rating) triple.
data_fields = ['user_id', 'item_id', 'rating']
data = pd.read_table(data_file, names=data_fields)

# Two-level dict: train_data[user][item] -> rating.
# No train/test split is performed here; every row of the file is kept.
train_data = {}
for user_id, item_id, rating in data.itertuples(index=False):
    train_data.setdefault(user_id, {})[item_id] = rating

# Number of distinct users and distinct items found in the file.
n_users = len(set(data['user_id'].values))
n_items = len(set(data['item_id'].values))

train_data

{'A': {'a': 5, 'b': 3, 'c': 4, 'd': 4},
 'B': {'a': 3, 'b': 1, 'c': 2, 'd': 2, 'e': 2},
 'C': {'a': 4, 'b': 4, 'c': 4, 'd': 4, 'e': 4},
 'D': {'a': 3, 'b': 2, 'c': 2, 'e': 3}}

In [82]:
# Item popularity: item_cnt[item] is the number of users who rated that item.
item_cnt = {}
for rated_items in train_data.values():
    for item_id in rated_items:
        item_cnt[item_id] = item_cnt.get(item_id, 0) + 1
item_cnt

{'a': 4, 'b': 4, 'c': 4, 'd': 3, 'e': 3}

In [83]:
# Mean rating of every item, built up one rating at a time as
# rating / (number of users who rated the item).
average_rating = {}
for user_ratings in train_data.values():
    for item_id, rating in user_ratings.items():
        average_rating[item_id] = average_rating.get(item_id, 0) + rating / item_cnt[item_id]
average_rating

{'a': 3.75, 'b': 2.5, 'c': 3.0, 'd': 3.333333333333333, 'e': 3.0}

In [84]:
# Mean rating given by every user, built up one rating at a time as
# rating / (number of items the user rated).
user_average_rating = {}
for user_id, user_ratings in train_data.items():
    n_rated = len(user_ratings)
    running_mean = 0
    for rating in user_ratings.values():
        running_mean += rating / n_rated
    user_average_rating[user_id] = running_mean
user_average_rating

{'A': 4.0, 'B': 2.0, 'C': 4.0, 'D': 2.5}

In [85]:
# Accumulate, over every user who rated both items i and j, the three sums
# that the similarity of the pair (i, j) is built from:
#   C1[i][j] = sum of dev_i * dev_j   (numerator)
#   C2[i][j] = sum of dev_i * dev_i   (denominator term for i)
#   C3[i][j] = sum of dev_j * dev_j   (denominator term for j)
# where dev_x is the user's rating of x minus a mean. With
# similarityMeasure == "cosine" that mean is the rating user's own average;
# with any other value it is the average rating of item x.
C1 = {}
C2 = {}
C3 = {}
center_on_user = similarityMeasure == "cosine"
for user, ratings in train_data.items():
    # Centre this user's ratings once; the values are the same differences
    # the pairwise products below need.
    if center_on_user:
        user_mean = user_average_rating[user]
        deviation = {item: rating - user_mean for item, rating in ratings.items()}
    else:
        deviation = {item: rating - average_rating[item] for item, rating in ratings.items()}

    for i, dev_i in deviation.items():
        for j, dev_j in deviation.items():
            if i == j:
                continue
            numerator_row = C1.setdefault(i, {})
            square_i_row = C2.setdefault(i, {})
            square_j_row = C3.setdefault(i, {})
            numerator_row[j] = numerator_row.get(j, 0) + dev_i * dev_j
            square_i_row[j] = square_i_row.get(j, 0) + dev_i * dev_i
            square_j_row[j] = square_j_row.get(j, 0) + dev_j * dev_j

# Final item-item similarity matrix: numerator / sqrt(C2 * C3), with pairs
# whose numerator is zero mapped straight to 0 (which also avoids dividing
# by a zero denominator).
item_sim = {}
for i, numerators in C1.items():
    item_sim[i] = {
        j: 0 if numerator == 0 else numerator / math.sqrt(C2[i][j] * C3[i][j])
        for j, numerator in numerators.items()
    }
item_sim

{'a': {'b': 0.674199862463242,
  'c': 0.9045340337332909,
  'd': 0.828078671210825,
  'e': 0.6488856845230502},
 'b': {'a': 0.674199862463242,
  'c': 0.8944271909999159,
  'd': 0.936585811581694,
  'e': 0.9733285267845753},
 'c': {'a': 0.9045340337332909,
  'b': 0.8944271909999159,
  'd': 0.9428090415820634,
  'e': 0.8164965809277261},
 'd': {'a': 0.828078671210825,
  'b': 0.936585811581694,
  'c': 0.9428090415820634,
  'e': 0.948683298050514},
 'e': {'a': 0.6488856845230502,
  'b': 0.9733285267845753,
  'c': 0.8164965809277261,
  'd': 0.948683298050514}}

In [88]:
def predict(user, item):
    """Predict the rating `user` would give `item` from item-item similarities.

    Reads the module-level `train_data` (user -> {item: rating}), `item_sim`
    (item -> {other item: similarity}) and `K`.

    For every item the user has rated, the K most similar items of that rated
    item are looked up. Whenever `item` is among them, the user's rating of
    the rated item contributes to a similarity-weighted average:

        sum(similarity * rating) / sum(|similarity|)

    Parameters
    ----------
    user : key of `train_data`
    item : key of `item_sim`

    Returns
    -------
    The weighted average described above, or 0 when no prediction can be
    made: unknown user, item without any similarity row, or `item` not in
    the top-K neighbourhood of anything the user rated.
    """
    # Unknown item or unknown user: nothing to base a prediction on.
    if item not in item_sim or user not in train_data:
        return 0

    weighted_sum = 0
    weight_total = 0
    for rated_item, rating in train_data[user].items():
        # An item only ever rated on its own has no row in item_sim; treat it
        # as having no neighbours instead of raising KeyError.
        neighbours = sorted(
            item_sim.get(rated_item, {}).items(), key=itemgetter(1), reverse=True
        )[:K]
        for neighbour, similarity in neighbours:
            if neighbour == item:
                weighted_sum += similarity * rating
                weight_total += math.fabs(similarity)
                break  # a neighbour appears at most once per row

    # Guard on the denominator, which is what the division actually needs.
    if weight_total == 0:
        return 0
    return weighted_sum / weight_total

In [89]:
# Example: predicted rating of user 'A' for item 'e'.
predict('A','e')

3.4935886896179276