In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

# 读取数据集
u1_base = pd.read_csv('ml-100k/u1.base', sep='\t', names=['uid', 'iid', 'rate', 'timestamp'])
u1_test = pd.read_csv('ml-100k/u1.test', sep='\t', names=['uid', 'iid', 'rate', 'timestamp'])


#定义误差函数
def ERR(model, test):
    cnt = 0
    abs_err = 0
    squ_err = 0

    for index, row in test.iterrows():
        user_id = row['uid'] - 1
        item_id = row['iid'] - 1
        true_rating = row['rate']

        predicted_rating = model(user_id, item_id)
        if predicted_rating > 5:
            predicted_rating = 5
        if predicted_rating < 1:
            predicted_rating = 1

        absolute_error = abs(predicted_rating - true_rating)
        abs_err += absolute_error

        square_error = (predicted_rating - true_rating) ** 2
        squ_err += square_error

        cnt += 1

    mae = abs_err / cnt
    rmse = (squ_err / cnt) ** 0.5
    return mae, rmse


user_num = u1_base['uid'].max()
item_num = u1_base['iid'].max()

rating_matrix = np.zeros((user_num, item_num), float)
y_ui = np.zeros((user_num, item_num), int)

# 记录转换为矩阵
for index, row in u1_base.iterrows():
    user_id = row['uid']
    item_id = row['iid']
    rating = row['rate']
    rating_matrix[user_id - 1, item_id - 1] = rating
    y_ui[user_id - 1, item_id - 1] = 1

# Global average
GlobalAverage = rating_matrix.sum() / y_ui.sum()

#计算四个参数:user_means, item_means, user_bias, item_bias
rating_sum_row = [sum(row) for row in rating_matrix]
y_sum_row = [sum(row) for row in y_ui]

rating_sum_col = [sum(column) for column in zip(*rating_matrix)]
y_sum_col = [sum(column) for column in zip(*y_ui)]

user_means = []
for i in range(user_num):
    if y_sum_row[i] == 0:
        user_means.append(GlobalAverage)
    else:
        user_means.append(rating_sum_row[i] / y_sum_row[i])

item_means = []
for i in range(item_num):
    if y_sum_col[i] == 0:
        item_means.append(GlobalAverage)
    else:
        item_means.append(rating_sum_col[i] / y_sum_col[i])

user_bias = []
for i in range(user_num):
    if y_sum_row[i] == 0:
        user_bias.append(0)
    else:
        sum_bias = 0
        for j in range(item_num):
            sum_bias += y_ui[i][j] * (rating_matrix[i][j] - GlobalAverage)
        user_bias.append(sum_bias / y_sum_row[i])

item_bias = []
for i in range(item_num):
    if y_sum_col[i] == 0:
        item_bias.append(0)
    else:
        sum_bias = 0
        for j in range(user_num):
            sum_bias += y_ui[j][i] * (rating_matrix[j][i] - GlobalAverage)
        item_bias.append(sum_bias / y_sum_col[i])

In [2]:
# RSVD
class RSVD(nn.Module):
    def __init__(self, user_num, item_num, d, GlobalAverage, user_bias, item_bias):
        super(RSVD, self).__init__()
        self.U = nn.Parameter((torch.rand(user_num, d) - 0.5) * 0.01)
        self.V = nn.Parameter((torch.rand(item_num, d) - 0.5) * 0.01)
        self.user_bias = nn.Parameter(torch.tensor(user_bias, dtype=torch.float32))
        self.item_bias = nn.Parameter(torch.tensor(item_bias, dtype=torch.float32))
        self.GlobalAverage = GlobalAverage

    def forward(self, user_id, item_id):
        U_u = self.U[user_id]
        V_i = self.V[item_id]
        bu_u = self.user_bias[user_id]
        bi_i = self.item_bias[item_id]
        pred = torch.sum(U_u * V_i) + bu_u + bi_i + self.GlobalAverage
        return pred

In [3]:
d = 20
lr = 0.01
epochs = 100

#转换到tensor
train_data = torch.tensor(u1_base[['uid', 'iid']].values - 1, dtype=torch.long)
train_ratings = torch.tensor(u1_base['rate'].values, dtype=torch.float32)

# Dataset
train_dataset = TensorDataset(train_data, train_ratings)

# batch size = 2000
batch_size = 2000
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 定义模型、优化器、loss
model = RSVD(user_num, item_num, d, GlobalAverage, user_bias, item_bias)
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=0.01)
criterion = nn.MSELoss()

In [5]:
import time
# train
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    start_time = time.time()

    for batch_data, batch_ratings in train_loader:
        user_id, item_id = batch_data[:, 0], batch_data[:, 1]
        optimizer.zero_grad()

        predictions = model(user_id, item_id)

        loss = criterion(predictions, batch_ratings) #单个batch的loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    end_time = time.time()
    print(f'Epoch {epoch + 1}, Loss: {total_loss},Time:{end_time-start_time:.2f} secs')#一个epoch总loss
    if total_loss < 34.219:
        break
    lr *= 0.9

Epoch 1, Loss: 34.72365915775299,Time:0.29 secs
Epoch 2, Loss: 34.696198761463165,Time:0.40 secs
Epoch 3, Loss: 34.66837006807327,Time:0.31 secs
Epoch 4, Loss: 34.64253216981888,Time:0.31 secs
Epoch 5, Loss: 34.617886662483215,Time:0.31 secs
Epoch 6, Loss: 34.592787742614746,Time:0.30 secs
Epoch 7, Loss: 34.56941229104996,Time:0.31 secs
Epoch 8, Loss: 34.54748594760895,Time:0.31 secs
Epoch 9, Loss: 34.525861620903015,Time:0.31 secs
Epoch 10, Loss: 34.50492179393768,Time:0.31 secs
Epoch 11, Loss: 34.48442578315735,Time:0.31 secs
Epoch 12, Loss: 34.46638524532318,Time:0.31 secs
Epoch 13, Loss: 34.447903871536255,Time:0.31 secs
Epoch 14, Loss: 34.43037247657776,Time:0.31 secs
Epoch 15, Loss: 34.41255176067352,Time:0.32 secs
Epoch 16, Loss: 34.39700537919998,Time:0.31 secs
Epoch 17, Loss: 34.38122671842575,Time:0.31 secs
Epoch 18, Loss: 34.367148756980896,Time:0.31 secs
Epoch 19, Loss: 34.35424906015396,Time:0.30 secs
Epoch 20, Loss: 34.34106719493866,Time:0.31 secs
Epoch 21, Loss: 34.3289

In [6]:
#评价指标 ERR
mae, rmse = ERR(model,u1_test)
print(f'(RMSE:{rmse:.4f},MAE:{mae:.4f})')

(RMSE:0.9682,MAE:0.7671)
