In [35]:
import math

import pandas as pd
import numpy as np
import random

from tqdm import tqdm

#读入数据集
u1_base = pd.read_csv('ml-100k/u1.base', sep='\t', names=['uid', 'iid', 'rate', 'timestamp'])
u1_test = pd.read_csv('ml-100k/u1.test', sep='\t', names=['uid', 'iid', 'rate', 'timestamp'])

#初始化
user_num = u1_base['uid'].max()
item_num = u1_base['iid'].max()

rating_matrix = np.zeros((user_num, item_num), float)
y_ui = np.zeros((user_num, item_num), int)

#base记录转化为matrix
for index, row in u1_base.iterrows():
    user_id = row['uid']
    item_id = row['iid']
    rating = row['rate']
    rating_matrix[user_id - 1, item_id - 1] = rating
    y_ui[user_id - 1, item_id - 1] = 1

R = y_ui.sum()
#全局平均
GlobalAverage = rating_matrix.sum() / R

#计算四个参数 user_means item_means user_bias item_bias
rating_sum_row = [sum(row) for row in rating_matrix]
y_sum_row = [sum(row) for row in y_ui]

rating_sum_col = [sum(column) for column in zip(*rating_matrix)]
y_sum_col = [sum(column) for column in zip(*y_ui)]

user_means = []
for i in range(user_num):
    if y_sum_row[i] == 0:
        user_means.append(GlobalAverage)
    else:
        user_means.append(rating_sum_row[i] / y_sum_row[i])

item_means = []
for i in range(item_num):
    if y_sum_col[i] == 0:
        item_means.append(GlobalAverage)
    else:
        item_means.append(rating_sum_col[i] / y_sum_col[i])

user_bias = []
for i in range(user_num):
    if y_sum_row[i] == 0:
        user_bias.append(0)
    else:
        sum_bias = 0
        for j in range(item_num):
            sum_bias += y_ui[i][j] * (rating_matrix[i][j] - GlobalAverage)
        user_bias.append(sum_bias / y_sum_row[i])

item_bias = []
for i in range(item_num):
    if y_sum_col[i] == 0:
        item_bias.append(0)
    else:
        sum_bias = 0
        for j in range(user_num):
            sum_bias += y_ui[j][i] * (rating_matrix[j][i] - GlobalAverage)
        item_bias.append(sum_bias / y_sum_col[i])


In [52]:
#test
I_u_preferred = {}
U_te = set()
for index, row in u1_test.iterrows():
    user_id = row['uid'] - 1
    item_id = row['iid'] - 1
    rating = row['rate']
    if rating >= 4:
        if user_id not in I_u_preferred:
            I_u_preferred[user_id] = set()
        I_u_preferred[user_id].add(item_id)
        U_te.add(user_id)

In [36]:
#用户u评价过的物品集
def I_u(user_id):
    return np.where(y_ui[user_id] == 1)[0]
#用户u未评价过的物品集
def I_unob_u(user_id):
    return np.where(y_ui[user_id] == 0)[0]


In [37]:
#为用户u推荐的物品
def get_rec_items(u,R_HAT,k):
    unob_items = I_unob_u(u)
    items_pred = {i:R_HAT(u,i) for i in unob_items }
    # 根据预测评分降序排序
    sorted_items = sorted(items_pred.items(), key=lambda x: x[1], reverse=True)
    # 输出前 k 个物品及其评分
    top_k_items = sorted_items[:k]
    return [k for k,v in top_k_items]


In [38]:
# 正负样本集
S_Pos = set()
S_Neg = set()
for index, row in u1_base.iterrows():
    user_id = row['uid'] - 1 
    item_id = row['iid'] - 1
    rating = row['rate']
    if rating >= user_means[user_id]:
        tri = (user_id, item_id, 1)
        S_Pos.add(tri)
    else:
        tri = (user_id, item_id, -1)
        S_Neg.add(tri)
        

In [39]:
d = 20
def items_u_r(u, item, r):
    items = np.where(rating_matrix[u] == r)[0]
    items = items[items != item]
    return items


In [40]:
def U_MPC(u, i, M):
    U_mpc = np.zeros(d, float)
    for rating_class in range(5):
        items = items_u_r(u, i, rating_class + 1)
        items_len = len(items)
        if items_len == 0:
            continue
        U_mpc += np.sum(M[rating_class, items, :], axis=0) / items_len ** 0.5
    return U_mpc

def R_HAT(u, i):
    return miu + user_bias[u] + item_bias[i] + np.dot(U[u], V[i]) + np.dot(U_MPC(u, i, M), V[i])


In [53]:
U = np.random.rand(user_num + 1, d)
V = np.random.rand(item_num + 1, d)
M = np.random.rand(5, item_num, d)
U = (U - 0.5) * 0.01
V = (V - 0.5) * 0.01
M = (M - 0.5) * 0.01
miu = GlobalAverage


In [42]:
T = 20
p = 3


In [43]:
def sample_unobserved_positions(matrix, n):
    # 获取所有值为0的元素的坐标
    unobserved_positions = np.argwhere(matrix == 0)
    
    # 检查n是否超过零元素数量
    if n > len(unobserved_positions):
        raise ValueError(f"n={n} 超过了矩阵中元素0的数量：{len(unobserved_positions)}")
    
    # 随机选择n个坐标
    sampled_positions = unobserved_positions[np.random.choice(len(unobserved_positions), n, replace=False)]
    S_Unob = set()
    for i in range(n):
        tri = (sampled_positions[i][0], sampled_positions[i][1],-1)
        S_Unob.add(tri)

    # 返回采样的集合
    return S_Unob


In [44]:
def sigmoid(x):
    return 1/(1+math.exp(-x))

In [45]:
lr = 0.01
lmda = 0.01
for t in range(T):
    S_Unob = sample_unobserved_positions(y_ui, p * R)
    S_New = S_Neg | S_Unob | S_Pos
    S_Num = len(S_New)
    S_list = list(S_New)
    for i in range(S_Num):
        # print(f'\repoch:{t + 1},{i}:{S_Num}',end='')
        sampled_triplet = random.sample(S_list, 1)[0]
        user_id = sampled_triplet[0]
        item_id = sampled_triplet[1]
        yui = sampled_triplet[2]
        U_mpc = U_MPC(user_id, item_id, M)
        r_pred = R_HAT(user_id, item_id)
        print(r_pred)
        e = -yui * (sigmoid(-yui * r_pred))
        #计算梯度
        delta_Uu = e * V[item_id] + lmda * U[user_id]
        delta_Vi = e * (U[user_id] + U_mpc) + lmda * V[item_id]
        delta_miu = e
        delta_bu = e + lmda * user_bias[user_id]
        delta_bi = e + lmda * item_bias[item_id]

        for r in range(5):
            i_pie = items_u_r(user_id, item_id, r + 1)
            i_num = len(i_pie)
            if i_num == 0:
                continue
            for i in i_pie:
                delta_M = e * V[item_id] / i_num ** 0.5 + lmda * M[r][i]
                M[r][i] -= lr * delta_M
        #update 

        miu -= lr * delta_miu
        user_bias[user_id] -= lr * delta_bu
        item_bias[item_id] -= lr * delta_bi
        U[user_id] -= lr * delta_Uu
        V[item_id] -= lr * delta_Vi

2.34511700659671
3.277958651977678
3.33828783017674
3.72272333992431
2.9573888032839277
3.462823030939517
3.9116801724342185
1.9953262942699233
4.280095503860184
3.9924788368190294
3.2197152651653558
3.2140039754647702
3.366758110081139
2.7245221618546305
3.609830865010462
3.5556039795410346
2.2727728759732457
2.742019598639493
3.5375838681719247
2.749085379646243
3.5325418959593606
3.89136932239993
1.7498759887367952
2.3627013103112393
0.7069895562333471
2.5147168395384183
3.3041232509422813
2.64052260380207
3.042412221518175
0.4630819286562183
3.1274350476705357
4.047042503137995
3.15048101666271
2.6843454233713815
3.3677514119519842
3.684255174966473
3.438600275593598
1.5754585993042778
2.0793724626455035
2.5395172961081363
3.5448086824819827
2.585647005041006
3.1082192974859923
1.1496203483278762
2.4157841173300336
2.4080801354405246
2.7340734102397612
1.9796238201513894
0.9043311832175633
1.915241313582703
3.9272940946826878
1.1747674023412231
3.0360811719810608
2.3032630206868445

KeyboardInterrupt: 

In [46]:
def Pre_at_k(R_HAT, k):
    Pre = 0
    for u in U_te:
        rec_items = get_rec_items(u, R_HAT, k)
        pre_items = I_u_preferred[u]  #用户实际喜欢项目 
        Pre_u = len(set(rec_items).intersection(set(pre_items))) / k
        Pre += Pre_u
    return Pre / len(U_te)


def Rec_at_k(R_HAT, k):
    Rec = 0
    for u in U_te:
        rec_items = get_rec_items(u, R_HAT, k)
        pre_items = I_u_preferred[u]
        Rec_u = len(set(rec_items).intersection(set(pre_items))) / len(pre_items)
        Rec += Rec_u
    return Rec / len(U_te)


def NDCG_at_k(R_HAT, k):
    NDCG = 0
    for u in U_te:
        rec_items = get_rec_items(u, R_HAT, k)
        pre_items = I_u_preferred[u]
        DCG_u = 0
        for pos in range(k):
            if rec_items[pos] in pre_items:
                DCG_u += 1 / math.log(pos + 2)
        IDCG_u = sum(1.0 / math.log(pos + 2) for pos in range(min(len(pre_items), k)))  #Zu
        NDCG += DCG_u / IDCG_u
    return NDCG / len(U_te)

In [54]:
print(f"Pre : {Pre_at_k(R_HAT, 5):.4f}")
print(f"Rec : {Rec_at_k(R_HAT, 5):.4f}")
print(f"NDCG: {NDCG_at_k(R_HAT, 5):.4f}")

Pre : 0.2105


KeyboardInterrupt: 