In [1]:
import pandas as pd
import autograd.numpy as np
from autograd import grad
import random
from tqdm import tqdm

In [2]:
# Training and test sets
def get_train_data(path='/Users/chao/workspace/d2l/data/ml-100k/u1.base'):
    """Load the training split of the rating data as a DataFrame.

    Args:
        path: Location of the tab-separated ratings file. Defaults to the
            original hard-coded location so existing zero-argument calls keep
            working; pass another path to run on a different machine or split.

    Returns:
        A ``pandas.DataFrame`` with the columns ``userId``, ``itemId``,
        ``rating`` and ``timestamp`` (names are supplied explicitly, so the
        first line of the file is read as data, not as a header).
    """
    return pd.read_csv(path, sep='\t', names=['userId', 'itemId', 'rating', 'timestamp'])

def get_test_data(path='/Users/chao/workspace/d2l/data/ml-100k/u1.test'):
    """Load the test split of the rating data as a DataFrame.

    Args:
        path: Location of the tab-separated ratings file. Defaults to the
            original hard-coded location so existing zero-argument calls keep
            working; pass another path to evaluate on a different file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``userId``, ``itemId``,
        ``rating`` and ``timestamp`` (names are supplied explicitly, so the
        first line of the file is read as data, not as a header).
    """
    return pd.read_csv(path, sep='\t', names=['userId', 'itemId', 'rating', 'timestamp'])

In [3]:
# User-item matrices
user_num = 943
item_num = 1682
# Both matrices get one extra row and column because the ids read from the
# file are used directly as indices (index 0 stays empty if ids start at 1).
ratings = np.zeros((user_num + 1, item_num + 1), int)  # rating value per (user, item)
y_ui = np.zeros((user_num + 1, item_num + 1), int)     # 1 where a rating was read, else 0

with open('/Users/chao/workspace/d2l/data/ml-100k/u1.base', 'r') as file:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in file:
        # A blank (e.g. trailing) line would otherwise break the 4-way unpack.
        if not line.strip():
            continue
        # Explicit '\t' escape: same delimiter as before, but visible in the
        # source and safe from editors that convert tabs to spaces.
        user, item, rating, timestamp = line.split('\t')
        ratings[int(user), int(item)] = int(rating)
        y_ui[int(user), int(item)] = 1

p = y_ui.sum()                        # number of (user, item) pairs with a rating
density = p / (user_num * item_num)   # fraction of the user x item grid that is filled
r_ = ratings.sum() / p                # mean over all recorded ratings

In [4]:
# Per-user and per-item mean ratings, and the bias terms derived from them
r_1 = np.sum(ratings, axis=0)   # rating totals down each column (indexed by item below)
r_2 = np.sum(ratings, axis=1)   # rating totals along each row (indexed by user below)
r_3 = np.sum(y_ui, axis=0)      # indicator counts per column
r_4 = np.sum(y_ui, axis=1)      # indicator counts per row

# Mean rating of every user; r_ is the fallback when the user has no entries.
r_u = np.zeros(user_num + 1, float)
for i in range(1, user_num + 1):
    if r_4[i] == 0:
        r_u[i] = r_
    else:
        r_u[i] = r_2[i] / r_4[i]

# Mean rating of every item, with the same fallback.
r_i = np.zeros(item_num + 1, float)
for i in range(1, item_num + 1):
    if r_3[i] == 0:
        r_i[i] = r_
    else:
        r_i[i] = r_1[i] / r_3[i]

# User bias: average of (rating - item mean) over the user's flagged entries.
# Entries without any flagged rating keep the 0.0 they were initialised with.
b_u = np.zeros(user_num + 1, float)
for i in range(1, user_num + 1):
    if r_4[i] != 0:
        b_u[i] = (y_ui[i] * (ratings[i] - r_i)).sum() / r_4[i]

# Item bias: average of (rating - user mean) over the item's flagged entries.
b_i = np.zeros(item_num + 1, float)
for i in range(1, item_num + 1):
    if r_3[i] != 0:
        b_i[i] = (y_ui[:, i] * (ratings[:, i] - r_u)).sum() / r_3[i]

In [5]:
# Loss functions
def MAE(predict_rule, test_data=None):
    """Mean absolute error of ``predict_rule`` over a set of test ratings.

    Args:
        predict_rule: Callable ``(userId, itemId) -> predicted rating``. Its
            result is passed through ``postProcess`` before being compared.
        test_data: Optional frame with ``userId``, ``itemId`` and ``rating``
            columns. When omitted, the frame returned by ``get_test_data()``
            is used, which is the original behaviour.

    Returns:
        The sum of ``abs(prediction - rating)`` divided by the number of rows.

    Raises:
        ZeroDivisionError: If the frame has no rows.
    """
    if test_data is None:
        test_data = get_test_data()
    data_num = test_data.shape[0]
    # Pull each column out once; the previous per-row ``iloc`` lookups built
    # three temporary row Series for every single test rating.
    users = test_data['userId'].to_numpy()
    items = test_data['itemId'].to_numpy()
    truths = test_data['rating'].to_numpy()
    loss = 0.0
    for userId, itemId, rating in zip(users, items, truths):
        y_hat = postProcess(predict_rule(userId, itemId))
        loss += abs(y_hat - rating)
    return loss / data_num

def RMSE(predict_rule, test_data=None):
    """Root mean squared error of ``predict_rule`` over a set of test ratings.

    Args:
        predict_rule: Callable ``(userId, itemId) -> predicted rating``. Its
            result is passed through ``postProcess`` before being compared.
        test_data: Optional frame with ``userId``, ``itemId`` and ``rating``
            columns. When omitted, the frame returned by ``get_test_data()``
            is used, which is the original behaviour.

    Returns:
        The square root of the mean of ``(prediction - rating) ** 2``. With
        no rows the loop never runs and the result is ``0.0``.
    """
    if test_data is None:
        test_data = get_test_data()
    data_num = test_data.shape[0]
    # Pull each column out once; the previous per-row ``iloc`` lookups built
    # three temporary row Series for every single test rating.
    users = test_data['userId'].to_numpy()
    items = test_data['itemId'].to_numpy()
    truths = test_data['rating'].to_numpy()
    loss = 0.0
    for userId, itemId, rating in zip(users, items, truths):
        y_hat = postProcess(predict_rule(userId, itemId))
        # Each term is scaled by 1/data_num as it is added, so ``loss`` is
        # already the mean once the loop ends.
        loss += ((y_hat - rating) ** 2) / data_num
    return loss ** 0.5

In [6]:
# All items that user u has rated
def I_u(u):
    """Return the item indices flagged for user ``u``.

    Reads row ``u`` of the module-level indicator matrix ``y_ui`` and returns
    the positions whose entry equals 1, as a 1-D index array.
    """
    rated_mask = y_ui[u] == 1
    return np.nonzero(rated_mask)[0]

# All users who have rated item j
def U_j(j):
    """Return the user indices flagged for item ``j``.

    Reads column ``j`` of the module-level indicator matrix ``y_ui`` and
    returns the positions whose entry equals 1, as a 1-D index array.
    """
    rater_mask = y_ui[:, j] == 1
    return np.nonzero(rater_mask)[0]

# All distinct rating values that user u has given
def R_u(u):
    """Return the sorted distinct non-zero ratings found in row ``u``.

    Row ``u`` of ``ratings`` is multiplied element-wise by row ``u`` of
    ``y_ui`` so that unflagged positions contribute 0; the unique values are
    then taken and the 0 entry is dropped.
    """
    observed = np.unique(y_ui[u] * ratings[u])
    return observed[np.nonzero(observed)]

# Items rated by both user u and user w
def I_u_w(u, w):
    """Return the sorted item indices present in both ``I_u(u)`` and ``I_u(w)``."""
    items_of_u = I_u(u)
    items_of_w = I_u(w)
    return np.intersect1d(items_of_u, items_of_w)

# Users who rated both item k and item j
def U_k_j(k, j):
    """Return the sorted user indices present in both ``U_j(k)`` and ``U_j(j)``."""
    raters_of_k = U_j(k)
    raters_of_j = U_j(j)
    return np.intersect1d(raters_of_k, raters_of_j)

# All items to which user u gave the rating r
def I_u_r(u, r):
    """Return the item indices that user ``u`` rated with exactly ``r``.

    A position qualifies only if row ``u`` of ``ratings`` equals ``r`` there
    *and* row ``u`` of ``y_ui`` is 1. The indicator check keeps empty cells,
    which hold 0 in ``ratings``, from being returned when ``r`` is 0.

    Args:
        u: Row index into ``ratings`` / ``y_ui``.
        r: Rating value to look for.

    Returns:
        A 1-D array of matching column indices (possibly empty).
    """
    return np.where((ratings[u] == r) & (y_ui[u] == 1))[0]

In [7]:
# Post-processing of predictions
def postProcess(num):
    """Clamp ``num`` to the closed interval [1.0, 5.0].

    Values that are not strictly below 5.0 become 5.0, and values that are
    not strictly above 1.0 become 1.0; anything in between is returned
    unchanged.
    """
    capped = num if num < 5.0 else 5.0
    return capped if capped > 1.0 else 1.0

# Randomly shuffle the rows of the data
def shuffle_data(data):
    """Return the rows of ``data`` in a random order.

    A list of row positions is shuffled in place with ``random.shuffle``
    (so the result depends on the ``random`` module's global state) and then
    used to select rows positionally via ``data.iloc``. ``data`` itself is
    not reordered.
    """
    n_rows = len(data)
    permutation = [position for position in range(n_rows)]
    random.shuffle(permutation)
    shuffled = data.iloc[permutation, :]
    return shuffled

In [8]:
# Evaluate prediction rules
def predict(*predict_rules):
    """Print the RMSE and MAE of every prediction rule passed in.

    Each positional argument is handed to ``RMSE`` and then to ``MAE``; one
    line per rule is printed with both scores formatted to four decimals.
    Nothing is returned.
    """
    for rule in predict_rules:
        rmse = RMSE(rule)
        mae = MAE(rule)
        print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")