In [1]:
import numpy as np
import random

In [2]:
# 读取训练集
user_num = 943
item_num = 1682
k = 50
lambda_ = 0.5

ratings = np.zeros((user_num + 1, item_num + 1), int)
y_ui = np.zeros((user_num + 1, item_num + 1), int)

with open('../data/ml-100k/u1.base', 'r') as file:
    for line in file.readlines():
        user, item, rating, timestamp = line.split('	')
        ratings[int(user)][int(item)] = int(rating)
        y_ui[int(user)][int(item)] = 1

p = y_ui.sum()
density = p / (user_num * item_num)
r_ = ratings.sum() / p
n = y_ui.sum()

r_1 = ratings.sum(axis=0)
r_2 = ratings.sum(axis=1)
r_3 = y_ui.sum(axis=0)
r_4 = y_ui.sum(axis=1)

In [3]:
# 求四个参数
r_u = np.zeros(user_num + 1, float)
for i in range(1, user_num + 1):
    r_u[i] = (r_ if r_4[i] == 0 else r_2[i] / r_4[i])

r_i = np.zeros(item_num + 1, float)
for i in range(1, item_num + 1):
    r_i[i] = (r_ if r_3[i] == 0 else r_1[i] / r_3[i])

b_u = np.zeros(user_num + 1, float)
for i in range(1, user_num + 1):
    b_u[i] = (0 if r_4[i] == 0 else (y_ui[i] * (ratings[i] - r_i)).sum() / r_4[i])

b_i = np.zeros(item_num + 1, float)
for i in range(1, item_num + 1):
    b_i[i] = (0 if r_3[i] == 0 else (y_ui[:, i] * (ratings[:, i] - r_u)).sum() / r_3[i])

In [4]:
# 相邻算法
def PCC_user(u, w):
    # 求u和w的交集
    intersect = y_ui[u] * y_ui[w]
    if intersect.sum() == 0:
        return 0
    s1, s2, s3 = 0.0, 0.0, 0.0
    for k in range(1, item_num + 1):
        if intersect[k] == 1:
            s1 += (ratings[u][k] - r_u[u]) * (ratings[w][k] - r_u[w])
            s2 += (ratings[u][k] - r_u[u]) ** 2
            s3 += (ratings[w][k] - r_u[w]) ** 2
    if s2 == 0.0 or s3 == 0.0:
        return 0.0
    return s1 / ((s2 ** 0.5) * (s3 ** 0.5))

def PCC_item(k, j):
    # 求k和j的交集
    intersect = y_ui[:, k] * y_ui[:, j]
    if intersect.sum() == 0:
        return 0
    s1, s2, s3 = 0.0, 0.0, 0.0
    for u in range(1, user_num + 1):
        if intersect[u] == 1:
            s1 += (ratings[u][k] - r_u[u]) * (ratings[u][j] - r_u[u])
            s2 += (ratings[u][k] - r_u[u]) ** 2
            s3 += (ratings[u][j] - r_u[u]) ** 2
    if s2 == 0.0 or s3 == 0.0:
        return 0.0
    return s1 / ((s2 ** 0.5) * (s3 ** 0.5))

In [5]:
# 求用户和物品的相邻系数矩阵
s_uw = np.zeros((user_num + 1, user_num + 1), float)
for i in range(1, user_num + 1):
    for j in range(1, i + 1):
        if j == i:
            s_uw[i][j] = 1.0
        else:
            s_uw[i][j] = PCC_user(i, j)
            s_uw[j][i] = PCC_user(i, j)
            
s_kj = np.zeros((item_num + 1, item_num + 1), float)
for i in range(1, item_num + 1):
    for j in range(1, i + 1):
        if j == i:
            s_kj[i][j] = 1.0
        else:
            s_kj[i][j] = PCC_item(i, j)
            s_kj[j][i] = PCC_item(i, j)

In [14]:
# 获取k个邻居
def get_user_neighbors(u, j):
    # 获取所有对物品j评分了的用户
    users = s_uw[u] * y_ui[:, j]
    users[u] = 0
    neighbors = np.argpartition(users, -k)[-k:]
    # 去掉相关系数为0的邻居
    neighbors = neighbors[neighbors != 0]
    return neighbors

def get_item_neighbors(u, j):
    # 获取所有被用户u评分过的物品
    items = s_kj[j] * y_ui[u]
    items[j] = 0
    neighbors = np.argpartition(items, -k)[-k:]
    # 去掉相关系数为0的邻居
    neighbors = neighbors[neighbors != 0]
    return neighbors

In [38]:
# 预测规则
def UCF(ratings_test, y_ui_test):
    ratings_hat = np.zeros_like(ratings_test, float)
    for u in range(1, user_num + 1):
        for j in range(1, item_num + 1):
            if y_ui_test[u][j] == 1:
                # 获取邻居
                neighbors = get_user_neighbors(u, j)
                rating_hat = 0.0
                if len(neighbors) == 0:
                    rating_hat = r_u[u]
                else:
                    s1, s2 = 0.0, 0.0
                    for w in neighbors:
                        s1 += s_uw[w][u] * (ratings[w][j] - r_u[w])
                        s2 += s_uw[w][u]
                    if s2 == 0.0:
                        rating_hat = r_u[u]
                    else:
                        rating_hat = r_u[u] + (s1 / s2)
                        rating_hat = min(5.0, rating_hat)
                        rating_hat = max(1.0, rating_hat)
                ratings_hat[u][j] = rating_hat
    return ratings_hat

def ICF(ratings_test, y_ui_test):
    ratings_hat = np.zeros_like(ratings_test, float)
    for u in range(1, user_num + 1):
        for j in range(1, item_num + 1):
            if y_ui_test[u][j] == 1:
                # 获取邻居
                neighbors = get_item_neighbors(u, j)
                rating_hat = 0.0
                if len(neighbors) == 0:
                    rating_hat = r_u[u]
                else:
                    s1, s2 = 0.0, 0.0
                    for w in neighbors:
                        s1 += s_kj[w][j] * ratings[u][w]
                        s2 += s_kj[w][j]
                    if s2 == 0.0:
                        rating_hat = r_u[u]
                    else:
                        rating_hat = s1 / s2
                        rating_hat = min(5.0, rating_hat)
                        rating_hat = max(1.0, rating_hat)
                ratings_hat[u][j] = rating_hat
    return ratings_hat

def HCF(u, j):
    return lambda_ * UCF(u, j) + (1 - lambda_) * ICF(u, j)

In [39]:
# 损失函数
def MAE(ratings_hat, ratings, y_ui, n):
    return abs(ratings_hat * y_ui - ratings).sum() / n

def RMSE(ratings_hat, ratings, y_ui, n):
    return (((ratings_hat * y_ui - ratings) ** 2).sum() / n) ** 0.5

In [40]:
# 预测
ratings_test = np.zeros((user_num + 1, item_num + 1), int)
y_ui_test = np.zeros((user_num + 1, item_num + 1), int)

with open('../data/ml-100k/u1.test', 'r') as file:
    for line in file.readlines():
        user, item, rating, timestamp = line.split('	')
        ratings_test[int(user)][int(item)] = int(rating)
        y_ui_test[int(user)][int(item)] = 1

test_num = y_ui_test.sum()

# 评分规则
ratings_hat1 = UCF(ratings_test, y_ui_test)
ratings_hat2 = ICF(ratings_test, y_ui_test)
ratings_hat3 = HCF(ratings_test, y_ui_test)

print(f"{RMSE(ratings_hat1, ratings_test, y_ui_test, test_num):.4f}, {MAE(ratings_hat1, ratings_test, y_ui_test, test_num):.4f}")
print(f"{RMSE(ratings_hat2, ratings_test, y_ui_test, test_num):.4f}, {MAE(ratings_hat2, ratings_test, y_ui_test, test_num):.4f}")
print(f"{RMSE(ratings_hat3, ratings_test, y_ui_test, test_num):.4f}, {MAE(ratings_hat3, ratings_test, y_ui_test, test_num):.4f}")

1.1456, 0.8691
1.4560, 1.0790
1.1374, 0.8950
