# 说明
1. 实验目标  
算法内容：手动实现 MF 矩阵分解的推荐系统算法  
数据来源：使用 RecBole 中提供的ml-100k的数据  
误差标准：使用 RMSE 均方根误差作为误差的衡量标准  
  
2. 本ipynb内容  
编写 MF 类，包括训练与测试  
考虑了论文中提到的三种偏置，包括 mu, bu 和 bi。实验时确实发现加了bu, bi后误差变小，再加mu后进一步变小。  
代码编写完成后，进行调参与对比实验

3. 训练集与测试集划分  
3/5作为训练数据，2/5作为测试数据  
没有将整个的 R 矩阵作为类的参数，因为不太好根据矩阵 R 划分训练和测试集  

4. 做对比实验，绘制表格  
首先，对比是否引入L2正则化的效果，即lmbda值是否取0  
其次，对embedding_size调参，对比实验效果

In [1]:
import pandas as pd
import torch
from torch.distributions import normal

## 1. MF模型
1. init()函数：初始化各个参数，训练集和测试集在调用模型前划分好，传入类的是训练和测试的评分矩阵。可调参数包括：迭代次数epochs，嵌入空间维数embed_size，学习率gamma，学习率衰减系数weight_decay，正则化系数lmbda。
2. compute_mu()函数：根据训练集计算平均评分值，在SGD_train()函数训练前调用，计算好self.mu
3. compute_loss()函数：计算每轮迭代后的损失值，便于验证是否沿着下降的趋势，也可以判断何时停止迭代
4. SGD_train()函数：使用SGD随机梯度下降更新参数
5. test(K)函数：使用训练好的参数在测试集上测试，并计算RMSE, MAE, Hit数值。参数K代表预测top-k的item，计算Hit时需要
6. see()函数：返回训练好的各个参数，不然不好直接访问

In [13]:
class MF():
    
    def __init__(self, R_train, R_test, epochs, embed_size=10, gamma=0.05, weight_decay=0.8, lmbda=0.05):
        # 各个参数
        self.epochs = epochs  # 迭代次数
        self.embed_size = embed_size  # 映射的空间维度
        self.gamma = gamma  # 学习率
        self.weight_decay = weight_decay  # 学习率衰减
        self.lmbda = lmbda  # 正则化系数
        
        # P, Q, R分别对应user, item和rating矩阵
        self.R_train = R_train  # 用于训练的评分矩阵
        self.R_test = R_test  # 用于测试的评分矩阵
        self.n_P = R_train.size()[0]  # user数目
        self.n_Q = R_train.size()[1]  # item数目
        #self.P = torch.rand(self.n_P, embed_size)  # 初始化user参数矩阵
        #self.Q = torch.rand(self.n_Q, embed_size)  # 初始化item参数矩阵
        m = normal.Normal(0.0, 0.1)  # 均值为0，标准差为0.5的正态分布
        self.P = m.sample((self.n_P, embed_size))  # user参数矩阵
        self.Q = m.sample((self.n_Q, embed_size))  # item参数矩阵
        
        # 偏置
        self.mu = 0  # 评分均值
        #self.bu = torch.rand(self.n_P, 1)  # 初始化user偏置
        #self.bi = torch.rand(self.n_Q, 1)  # 初始化item偏置
        self.bu = m.sample((self.n_P, 1))
        self.bi = m.sample((self.n_Q, 1))
        
    
    
    
    # 计算训练集的评分均值 mu
    def compute_mu(self):
        n_R = 0  # 非零元素个数，即已经评分的个数
        sum_R = 0  # 评分总和
        for u in range(1, self.n_P):
            for i in range(1, self.n_Q):
                rui = self.R_train[u][i]
                if rui != 0:
                    n_R += 1
                    sum_R += rui
        self.mu = sum_R / n_R
    
    
    
    # 定义计算损失值的函数，用于判断每次迭代是否有效
    def compute_loss(self):
        
        # 提前计算好参数的二范数，不然在循环中可能重复计算
        P_norm = 0  # 总的user向量范数
        Q_norm = 0  # 总的item向量范数
        for u in range(1, self.n_P):
            pu = self.P[u]
            P_norm += torch.norm(pu)**2
        for i in range(1, self.n_Q):
            qi = self.Q[i]
            Q_norm += torch.norm(qi)**2
        loss = self.lmbda*(P_norm + Q_norm)  # 初始化误差
        
        # 遍历评分矩阵，计算平均误差
        n = 0
        for u in range(1, self.n_P):
            pu = self.P[u]  # 提取该用户的f维向量
            for i in range(1, self.n_Q):
                rui = self.R_train[u][i]  # 真实评分
                if rui != 0:  # 判断是否有评分
                    n += 1
                    qi = self.Q[i]  # 提取该物品的f维向量
                    rui_pred = sum(pu*qi) + self.mu + self.bu[u] + self.bi[i]
                    eui = rui - rui_pred  # 真实评分与预测评分的偏差
                    loss = loss + eui**2  # 累加误差
        return loss/n
    
   

    # 使用 SGD 随机梯度下降训练参数
    def SGD_train(self):
        loss_list = []  # 存储每轮迭代后的总误差
        self.compute_mu()  # 计算好评分均值mu
        
        # 计算初始误差值
        loss = self.compute_loss()
        print('初始平均误差为 ', loss)
        loss_list.append(loss)
        
        # 多次迭代
        for k in range(self.epochs):
            # 遍历评分，根据每个 rui，按照 SGD 更新参数
            for u in range(1, self.n_P):
                for i in range(1, self.n_Q):
                    rui = self.R_train[u][i]
                    if rui != 0:  # 判断是否已评分
                        pu = self.P[u]
                        qi = self.Q[i]
                        rui_pred = sum(pu*qi) + self.mu + self.bu[u] + self.bi[i]
                        eui = rui - rui_pred  # 用更新后的来计算误差，因为SGD是对每个样本都计算更新
                
                        # 更新参数
                        self.P[u] += self.gamma * (eui*qi - self.lmbda*pu)
                        self.Q[i] += self.gamma * (eui*pu - self.lmbda*qi)
                        self.bu[u] += self.gamma * (eui - self.lmbda*self.bu[u])
                        self.bi[i] += self.gamma * (eui - self.lmbda*self.bi[i])
            self.gamma *= self.weight_decay  # 学习率逐渐减小
            
            # 一次迭代完成后，计算平均误差并测试
            loss = self.compute_loss()
            loss_list.append(loss)
            rmse, mae, hit = self.test(20)
            print('epoch %d: avg_loss=%.4f, RMSE=%.4f, MAE=%.4f, Hit=%.4f' % (k+1, loss, rmse, mae, hit))
        return loss_list
        

        
    # 在测试集上预测，并计算RMSE, MAE和Hit
    def test(self, K):
        # 预测结果
        R_pred = torch.matmul(self.P, self.Q.t()) + self.mu + self.bu + self.bi.t()
        R_pred_rated = torch.Tensor(self.n_P, self.n_Q).zero_()  # 提取已评分item的预测值矩阵
        
        # 计算 RMSE 和 MAE
        rmse = 0  # 初始化均方根误差
        mae = 0  # 初始化平均绝对误差
        n = 0  # 测试集已评分个数
        # 遍历每个评分
        for u in range(1, self.n_P):
            for i in range(1, self.n_Q):
                rui = self.R_test[u][i]
                rui_pred = R_pred[u][i]
                if rui != 0:
                    n += 1
                    rmse += (rui - rui_pred)**2
                    mae += torch.abs(rui - rui_pred)
                    R_pred_rated[u][i] = rui_pred
        rmse = torch.sqrt(torch.tensor([rmse/n]))
        mae /= n
        
        # 计算 Hit 指标
        hit = 0  # 初始化命中率
        # 首先，计算矩阵 R_pred_rated，即从预测矩阵中提取出之前已评分的部分
        # 否则按评分值降序排序的话一定会有未评分的预测值排到前面，而测试集中未评分的值为0，一定排在最后面  
        # 其次，测试集评分矩阵R_test和预测结果R_pred_rated按照评分降序排序
        # 再统计R_pred_rated中前k个item索引有哪些是真的评分排在R_test前k的
        sort_result1, indices1 = torch.sort(self.R_test, descending=True)
        sort_result2, indices2 = torch.sort(R_pred_rated, descending=True)
        # 计算top k的 hit 值
        for u in range(1, self.n_P):
            for i in range(K):
                if indices2[u][i] in indices1[u][:K]:
                    hit += 1
        hit /= ((self.n_P-1)*K)
        
        # 返回三个指标
        return rmse, mae, hit
    
    
    def see(self):
        return self.P, self.Q, self.mu, self.bu, self.bi

## 2. 数据预处理
1. 读取评分文件
2. **随机打乱数据**，至关重要，否则会导致测试性能很差
3. 划分训练集和测试集，3/5作为训练集，2/5作为测试集，并建立相应的评分矩阵

In [10]:
# 读取评分文件
inter = pd.read_csv('../dataset/ml-100k/ml-100k.inter', delimiter='\t', engine='python')

# 创建评分矩阵，分为训练矩阵和测试矩阵
n_user = 943
n_item = 1682
R_train = torch.Tensor(n_user+1, n_item+1).zero_()
R_test = torch.Tensor(n_user+1, n_item+1).zero_()
print(R_train.size())
print(R_test.size())

# 填充训练集和测试集的评分矩阵值
# 先随机打乱数据
df = pd.DataFrame(inter)
df = df.sample(frac=1).reset_index(drop=True)
# 前3/5作为训练数据
df1 = df[:60000]
for index, row in df.iterrows():
    R_train[row['user_id:token']][row['item_id:token']] = row['rating:float']
# 后2/5作为测试数据
df2 = df[60000:100000]
for index, row in df.iterrows():
    R_test[row['user_id:token']][row['item_id:token']] = row['rating:float']

torch.Size([944, 1683])
torch.Size([944, 1683])


## 3. 运行与测试

In [12]:
# 正态分布N(0,0.1)初始化参数
# lmbda=0.05, embed_size=8, weight_decay=0.8

mf = MF(R_train, R_test, 10, 8, 0.05, 0.8, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.2956])
epoch 1: avg_loss=0.9707, RMSE=0.9852, MAE=0.7872, Hit=0.5086
epoch 2: avg_loss=0.8719, RMSE=0.9337, MAE=0.7399, Hit=0.5199
epoch 3: avg_loss=0.8111, RMSE=0.9005, MAE=0.7125, Hit=0.5355
epoch 4: avg_loss=0.7658, RMSE=0.8749, MAE=0.6919, Hit=0.5459
epoch 5: avg_loss=0.7328, RMSE=0.8558, MAE=0.6766, Hit=0.5528
epoch 6: avg_loss=0.7085, RMSE=0.8414, MAE=0.6652, Hit=0.5582
epoch 7: avg_loss=0.6906, RMSE=0.8307, MAE=0.6567, Hit=0.5635
epoch 8: avg_loss=0.6775, RMSE=0.8228, MAE=0.6504, Hit=0.5665
epoch 9: avg_loss=0.6679, RMSE=0.8169, MAE=0.6458, Hit=0.5697
epoch 10: avg_loss=0.6609, RMSE=0.8126, MAE=0.6424, Hit=0.5704


In [14]:
# 正态分布N(0,0.1)初始化参数
# lmbda=0.05, embed_size=16, weight_decay=0.8

mf = MF(R_train, R_test, 10, 16, 0.05, 0.8, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.2938])
epoch 1: avg_loss=0.9598, RMSE=0.9796, MAE=0.7827, Hit=0.5134
epoch 2: avg_loss=0.8370, RMSE=0.9147, MAE=0.7247, Hit=0.5330
epoch 3: avg_loss=0.7605, RMSE=0.8718, MAE=0.6901, Hit=0.5525
epoch 4: avg_loss=0.7025, RMSE=0.8378, MAE=0.6633, Hit=0.5666
epoch 5: avg_loss=0.6595, RMSE=0.8117, MAE=0.6427, Hit=0.5773
epoch 6: avg_loss=0.6287, RMSE=0.7924, MAE=0.6275, Hit=0.5846
epoch 7: avg_loss=0.6068, RMSE=0.7784, MAE=0.6164, Hit=0.5910
epoch 8: avg_loss=0.5910, RMSE=0.7682, MAE=0.6084, Hit=0.5948
epoch 9: avg_loss=0.5797, RMSE=0.7607, MAE=0.6025, Hit=0.5970
epoch 10: avg_loss=0.5713, RMSE=0.7552, MAE=0.5982, Hit=0.5992


In [None]:
# 正态分布N(0,0.1)初始化参数
# lmbda=0.05, embed_size=32, weight_decay=1

mf = MF(R_train, R_test, 10, 32, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.3027])
epoch 1: avg_loss=0.9404, RMSE=0.9695, MAE=0.7746, Hit=0.5219
epoch 2: avg_loss=0.7822, RMSE=0.8841, MAE=0.7004, Hit=0.5576
epoch 3: avg_loss=0.6513, RMSE=0.8064, MAE=0.6383, Hit=0.5945
epoch 4: avg_loss=0.5465, RMSE=0.7382, MAE=0.5841, Hit=0.6175
epoch 5: avg_loss=0.4768, RMSE=0.6891, MAE=0.5442, Hit=0.6344
epoch 6: avg_loss=0.4319, RMSE=0.6555, MAE=0.5163, Hit=0.6459
epoch 7: avg_loss=0.4020, RMSE=0.6321, MAE=0.4963, Hit=0.6532
epoch 8: avg_loss=0.3813, RMSE=0.6153, MAE=0.4817, Hit=0.6589
epoch 9: avg_loss=0.3662, RMSE=0.6028, MAE=0.4705, Hit=0.6642
epoch 10: avg_loss=0.3548, RMSE=0.5932, MAE=0.4618, Hit=0.6668


In [None]:
# 正态分布N(0,0.1)初始化参数
# lmbda=0.05, embed_size=48, weight_decay=1

mf = MF(R_train, R_test, 10, 48, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.2894])
epoch 1: avg_loss=0.9125, RMSE=0.9549, MAE=0.7621, Hit=0.5346
epoch 2: avg_loss=0.7310, RMSE=0.8544, MAE=0.6762, Hit=0.5775
epoch 3: avg_loss=0.5863, RMSE=0.7648, MAE=0.6047, Hit=0.6160
epoch 4: avg_loss=0.4768, RMSE=0.6891, MAE=0.5444, Hit=0.6429
