In [25]:
import random
import pandas as pd
import numpy as np
import math
from operator import itemgetter
import argparse
# Seed NumPy's global RNG (nothing in the visible cells draws random numbers).
np.random.seed(1024)

# Number of most-similar users considered when predicting a rating.
K=20
# NOTE(review): N is not referenced in the visible cells — presumably the
# length of a top-N recommendation list; confirm before relying on it.
N=10
# "cosine" selects raw-rating cosine similarity; any other value selects the
# mean-centred (Pearson-style) variant.
similarityMeasure="pearson"
# Ratings file, read with pd.read_table as user_id / item_id / rating columns.
data_file="../data/Rate-jupyter.txt"

In [26]:
# Column names assigned to the header-less ratings file.
data_fields = ['user_id', 'item_id', 'rating']
data = pd.read_table(data_file, names=data_fields)

# Nested dict: train_data[user][item] -> rating.
# Every row of the file goes into train_data; no train/test split is
# performed in this cell.
train_data = {}
for row in data.itertuples(index=False):
    uid, iid, score = row
    train_data.setdefault(uid, {})[iid] = score

# Number of distinct users / items present in the file.
n_users = len({*data['user_id'].values})
n_items = len({*data['item_id'].values})

train_data

{'A': {'a': 5, 'b': 3, 'c': 4, 'd': 4},
 'B': {'a': 3, 'b': 1, 'c': 2, 'd': 2, 'e': 2},
 'C': {'a': 4, 'b': 4, 'c': 4, 'd': 4, 'e': 4},
 'D': {'a': 3, 'b': 2, 'c': 2, 'e': 3}}

In [27]:
# Mean rating of every user: average_rating[user] -> float.
# Each rating is divided by the count before being added, so the running
# total is already the mean once the inner loop finishes.
average_rating = {}
for uid, rated in train_data.items():
    n_rated = len(rated)
    running = 0
    for score in rated.values():
        running += score / n_rated
    average_rating[uid] = running
average_rating

{'A': 4.0, 'B': 2.0, 'C': 4.0, 'D': 2.5}

In [28]:
# Inverted index: item_users[item] -> set of users who rated that item.
item_users = {}
for uid, rated in train_data.items():
    for iid in rated:
        item_users.setdefault(iid, set()).add(uid)
item_users

{'a': {'A', 'B', 'C', 'D'},
 'b': {'A', 'B', 'C', 'D'},
 'c': {'A', 'B', 'C', 'D'},
 'd': {'A', 'B', 'C'},
 'e': {'B', 'C', 'D'}}

In [29]:
# Accumulate, for every ordered pair of distinct users (u, v) that rated a
# common item, the three sums the similarity is built from:
#   C1[u][v] = sum of x_u * x_v   (numerator)
#   C2[u][v] = sum of x_u * x_u   (denominator, u part)
#   C3[u][v] = sum of x_v * x_v   (denominator, v part)
# over the co-rated items. x is the raw rating when similarityMeasure is
# "cosine"; for any other value it is the rating minus the user's average.
use_raw_ratings = similarityMeasure == "cosine"

C1 = {}
C2 = {}
C3 = {}
for item, raters in item_users.items():
    for u in raters:
        for v in raters:
            if u != v:
                x_u = train_data[u][item]
                x_v = train_data[v][item]
                if not use_raw_ratings:
                    x_u = x_u - average_rating[u]
                    x_v = x_v - average_rating[v]
                for table, term in ((C1, x_u * x_v), (C2, x_u * x_u), (C3, x_v * x_v)):
                    row = table.setdefault(u, {})
                    row[v] = row.get(v, 0) + term

# Final user-user similarity matrix: user_sim[u][v].
# A zero numerator yields similarity 0 without touching the denominator,
# which may itself be zero (e.g. a user whose ratings all equal their mean).
user_sim = {}
for u, numerators in C1.items():
    sims = user_sim[u] = {}
    for v, dot in numerators.items():
        if dot == 0:
            sims[v] = 0
        else:
            sims[v] = dot / math.sqrt(C2[u][v] * C3[u][v])
user_sim

{'A': {'B': 1.0, 'D': 0.8164965809277261, 'C': 0},
 'B': {'A': 1.0, 'D': 0.7071067811865475, 'C': 0},
 'D': {'A': 0.8164965809277261, 'B': 0.7071067811865475, 'C': 0},
 'C': {'A': 0, 'B': 0, 'D': 0}}

In [32]:
def predict(user, item):
    """Predict ``user``'s rating for ``item`` from the ratings of similar users.

    The prediction is the user's average rating plus the similarity-weighted
    mean of the neighbours' deviations from their own averages, taken over
    those of the ``K`` most similar users (by ``user_sim``) who rated ``item``.

    Args:
        user: key into ``average_rating`` (and normally ``user_sim``).
        item: item key to look up in each neighbour's ``train_data`` row.

    Returns:
        The predicted rating, or ``0`` when no neighbour with non-zero
        similarity has rated ``item`` (no evidence to base a prediction on).

    Raises:
        KeyError: if ``user`` has no entry in ``average_rating``.
    """
    baseline = average_rating[user]

    # A user who shares no item with anyone has no row in user_sim; treat
    # that as "no neighbours" instead of raising KeyError.
    neighbours = sorted(user_sim.get(user, {}).items(),
                        key=itemgetter(1), reverse=True)[0:K]

    weighted_deviation = 0  # numerator
    weight_total = 0        # denominator
    for similar_user, similarity_factor in neighbours:
        if item not in train_data[similar_user]:
            continue
        weighted_deviation += similarity_factor * (
            train_data[similar_user][item] - average_rating[similar_user])
        weight_total += math.fabs(similarity_factor)

    # Guard on the denominator, not the numerator: a zero numerator with a
    # positive weight sum is a legitimate "neighbours rate this item exactly
    # at their own average" case and must predict the user's baseline, not 0.
    if weight_total == 0:
        return 0
    return baseline + weighted_deviation / weight_total

In [33]:
# Example: predicted rating of user 'A' for item 'e'.
predict('A','e')

4.224744871391589