In [1]:
import pandas as pd
import numpy as np
import math

#读入数据集
u1_base = pd.read_csv('ml-100k/u1.base', sep='\t', names=['uid', 'iid', 'rate', 'timestamp'])
u1_test = pd.read_csv('ml-100k/u1.test', sep='\t', names=['uid', 'iid', 'rate', 'timestamp'])


#定义误差函数
def ERR(rating_matrix, test):
    cnt = 0
    abs_err = 0
    squ_err = 0

    for index, row in test.iterrows():
        user_id = row['uid']
        item_id = row['iid']
        true_rating = row['rate']

        predicted_rating = rating_matrix[user_id - 1][item_id - 1]

        # 计算绝对误差/平方误差
        absolute_error = abs(predicted_rating - true_rating)
        abs_err += absolute_error

        square_error = pow(predicted_rating - true_rating, 2)
        squ_err += square_error

        cnt += 1
    # 计算平均绝对误差/平方误差
    mae = abs_err / cnt
    rmse = (squ_err / cnt) ** 0.5
    return mae, rmse


#初始化
user_num = u1_base['uid'].max()
item_num = u1_base['iid'].max()

rating_matrix = np.zeros((user_num, item_num), float)
y_ui = np.zeros((user_num, item_num), int)
user_num, item_num

#base记录转化为matrix
for index, row in u1_base.iterrows():
    user_id = row['uid']
    item_id = row['iid']
    rating = row['rate']
    rating_matrix[user_id - 1, item_id - 1] = rating
    y_ui[user_id - 1, item_id - 1] = 1

#全局平均
GlobalAverage = rating_matrix.sum() / y_ui.sum()

#计算两个参数 user_means item_means
rating_sum_row = [sum(row) for row in rating_matrix]
y_sum_row = [sum(row) for row in y_ui]

rating_sum_col = [sum(column) for column in zip(*rating_matrix)]
y_sum_col = [sum(column) for column in zip(*y_ui)]

user_means = []
for i in range(user_num):
    if y_sum_row[i] == 0:
        user_means.append(GlobalAverage)
    else:
        user_means.append(rating_sum_row[i] / y_sum_row[i])

item_means = []
for i in range(item_num):
    if y_sum_col[i] == 0:
        item_means.append(GlobalAverage)
    else:
        item_means.append(rating_sum_col[i] / y_sum_col[i])


#两用户都评分过的item id 
def con_items(u, w):
    con_item = []
    for i in range(item_num):
        if rating_matrix[u][i] != 0 and rating_matrix[w][i] != 0:
            con_item.append(i)
    return con_item


#共同评过u,w的 user id
def con_users(u, w):
    con_user = []
    for i in range(user_num):
        if rating_matrix[i][u] != 0 and rating_matrix[i][w] != 0:
            con_user.append(i)
    return con_user


def PCC_u(u, w):
    con_item = con_items(u, w)
    if len(con_item) == 0:
        return 0
    sum = 0
    sum_u = 0
    sum_w = 0
    for i in con_item:
        sum += (rating_matrix[u][i] - user_means[u]) * (rating_matrix[w][i] - user_means[w])
        sum_u += (rating_matrix[u][i] - user_means[u]) ** 2
        sum_w += (rating_matrix[w][i] - user_means[w]) ** 2
    if sum_u == 0 or sum_w == 0:
        return 0
    return sum / (math.sqrt(sum_u) * math.sqrt(sum_w))


def PCC_i(k, j):
    con_user = con_users(k, j)
    if len(con_user) == 0:
        return 0
    sum = 0
    sum_u = 0
    sum_w = 0
    for i in con_user:
        sum += (rating_matrix[i][k] - user_means[i]) * (rating_matrix[i][j] - user_means[i])
        sum_u += (rating_matrix[i][k] - user_means[i]) ** 2
        sum_w += (rating_matrix[i][j] - user_means[i]) ** 2
    if sum_u == 0 or sum_w == 0:
        return 0
    return sum / (math.sqrt(sum_u) * math.sqrt(sum_w))

In [2]:
PCC_matrix_u = np.zeros((user_num, user_num), float)
for i in range(user_num):
    PCC_matrix_u[i][i] = 1
    for k in range(i):
        coefficient = PCC_u(i, k)
        PCC_matrix_u[i][k] = coefficient
        PCC_matrix_u[k][i] = coefficient
    print(f'\r{i / (user_num - 1) * 100:.2f}%', end='')

100.00%

In [3]:
PCC_matrix_i = np.zeros((item_num, item_num), float)
for i in range(item_num):
    PCC_matrix_i[i][i] = 1
    for k in range(i):
        coefficient = PCC_i(i, k)
        PCC_matrix_i[i][k] = coefficient
        PCC_matrix_i[k][i] = coefficient
    print(f'\r{i / (item_num - 1) * 100:.2f}%', end='')

100.00%

In [4]:
#相似度非0 的user 邻居
def non_zero_neighbors(u):
    ret = np.where(PCC_matrix_u[u] != 0)[0]
    return ret[ret != u]


def users_rated_j(j):
    return np.where(y_ui[:, j] == 1)[0]


k = 20


def predict_rate_u(u, j):
    neighbors = np.intersect1d(non_zero_neighbors(u), users_rated_j(j))
    neighbors_PCCs = [(neighbor, PCC_matrix_u[u][neighbor]) for neighbor in neighbors]
    # 按相似度排序（降序）
    sorted_neighbors = sorted(neighbors_PCCs, key=lambda x: x[1], reverse=True)
    # 获取前k个邻居
    top_k = sorted_neighbors[:k]

    if len(top_k) == 0:
        return user_means[u]
    sum1 = 0
    sum2 = 0
    for neighbor, pcc in top_k:
        sum1 += pcc * (rating_matrix[neighbor][j] - user_means[neighbor])
        sum2 += pcc
    if sum2 == 0:
        return user_means[u]
    predict_rate = user_means[u] + sum1 / sum2
    if predict_rate < 1:
        return 1
    if predict_rate > 5:
        return 5
    return user_means[u] + sum1 / sum2


#相似度非0 的item邻居
def non_zero_neighbors_i(i):
    ret = np.where(PCC_matrix_i[i] != 0)[0]
    return ret[ret != i]


def items_rated_u(u):
    return np.where(y_ui[u] == 1)[0]


def predict_rate_i(u, j):
    neighbors = np.intersect1d(non_zero_neighbors_i(j), items_rated_u(u))
    neighbors_PCCs = [(neighbor, PCC_matrix_i[j][neighbor]) for neighbor in neighbors]
    # 按相似度排序（降序）
    sorted_neighbors = sorted(neighbors_PCCs, key=lambda x: x[1], reverse=True)
    # 获取前k个邻居
    top_k = sorted_neighbors[:k]

    if len(top_k) == 0:
        return user_means[u]
    sum1 = 0
    sum2 = 0
    for neighbor, pcc in top_k:
        sum1 += pcc * rating_matrix[u][neighbor]
        sum2 += pcc
    if sum2 == 0:
        return user_means[u]
    predict_rate = sum1 / sum2
    if predict_rate < 1:
        return 1
    if predict_rate > 5:
        return 5
    return predict_rate

In [5]:
filled_matrix_u = np.copy(rating_matrix)
for idx, row in u1_test.iterrows():
    user_id = row['uid'] - 1
    item_id = row['iid'] - 1
    filled_matrix_u[user_id][item_id] = predict_rate_u(user_id, item_id)
mae1, rmse1 = ERR(filled_matrix_u, u1_test)

In [6]:
filled_matrix_i = np.copy(rating_matrix)
for idx, row in u1_test.iterrows():
    user_id = row['uid'] - 1
    item_id = row['iid'] - 1
    filled_matrix_i[user_id][item_id] = predict_rate_i(user_id, item_id)
mae2, rmse2 = ERR(filled_matrix_i, u1_test)

In [7]:
filled_matrix_ui = np.copy(rating_matrix)
for idx, row in u1_test.iterrows():
    user_id = row['uid'] - 1
    item_id = row['iid'] - 1
    filled_matrix_ui[user_id][item_id] = 0.5 * filled_matrix_u[user_id, item_id] + 0.5 * filled_matrix_i[user_id, item_id]
mae3, rmse3 = ERR(filled_matrix_ui, u1_test)

In [8]:
print(f'(RMSE:{rmse1:.4f},MAE:{mae1:.4f})')
print(f'(RMSE:{rmse2:.4f},MAE:{mae2:.4f})')
print(f'(RMSE:{rmse3:.4f},MAE:{mae3:.4f})')

(RMSE:0.9819,MAE:0.7642)
(RMSE:1.0770,MAE:0.8335)
(RMSE:0.9877,MAE:0.7771)
