# 内容说明
1. 实验目标  
算法内容：借助SGD随机梯度下降，手动实现 MF 矩阵分解的推荐系统算法  
数据来源：使用 RecBole 中提供的ml-100k的数据  
误差标准：使用 RMSE 均方根误差作为误差的衡量标准  
  
2. 本ipynb内容  
编写 MF 类，包括训练与测试  
考虑了论文中提到的三种偏置，包括 mu, bu 和 bi。实验时确实发现加了bu, bi后误差变小，再加mu后进一步变小。  
代码编写完成后，进行调参与对比实验

3. 训练集与测试集划分  
没有将整个的评分矩阵 R 作为类的参数，因为不太好根据矩阵 R 划分训练和测试集  
评分文件的3/5作为训练数据，2/5作为测试数据，**注意一定要先随机打乱**，提前构造好训练的评分矩阵和测试的评分矩阵作为参数传入MF类中

In [1]:
import pandas as pd
import torch
from torch.distributions import normal

## 1. MF模型
1. init()函数：使用正态分布N(0,0.1)来初始化各个参数，训练集和测试集在调用模型前划分好，传入类的是训练和测试的评分矩阵。可调参数包括：迭代次数epochs，嵌入空间维数embed_size，学习率gamma，学习率衰减系数weight_decay，正则化系数lmbda。
2. compute_mu()函数：根据训练集计算平均评分值，在SGD_train()函数训练前调用，计算好self.mu
3. compute_loss()函数：计算每轮迭代后的损失值，便于验证是否沿着下降的趋势，也可以判断何时停止迭代
4. SGD_train()函数：使用SGD随机梯度下降更新参数
5. test(K)函数：使用训练好的参数在测试集上测试，并计算RMSE, MAE, Hit数值。参数K代表预测top-k的item，计算Hit时需要
6. see()函数：返回训练好的各个参数，不然不好直接访问，以防想要计算其它的一些东西

In [2]:
class MF():
    
    def __init__(self, R_train, R_test, epochs, embed_size=10, gamma=0.05, weight_decay=0.8, lmbda=0.05):
        # 各个参数
        self.epochs = epochs  # 迭代次数
        self.embed_size = embed_size  # 映射的空间维度
        self.gamma = gamma  # 学习率
        self.weight_decay = weight_decay  # 学习率衰减
        self.lmbda = lmbda  # 正则化系数
        
        # 初始化训练和测试的评分矩阵以及P, Q参数矩阵
        self.R_train = R_train  # 用于训练的评分矩阵
        self.R_test = R_test  # 用于测试的评分矩阵
        self.n_P = R_train.size()[0]  # user数目
        self.n_Q = R_train.size()[1]  # item数目
        m = normal.Normal(0.0, 0.1)  # 均值为0，标准差为0.1的正态分布
        self.P = m.sample((self.n_P, embed_size))  # 初始化user参数矩阵
        self.Q = m.sample((self.n_Q, embed_size))  # 初始化item参数矩阵
        
        # 偏置
        self.mu = 0  # 评分均值
        self.bu = m.sample((self.n_P, 1))
        self.bi = m.sample((self.n_Q, 1))
        
    
    # 计算训练集的评分均值 mu
    def compute_mu(self):
        n_R = 0  # 非零元素个数，即已经评分的个数
        sum_R = 0  # 评分总和
        for u in range(1, self.n_P):
            for i in range(1, self.n_Q):
                rui = self.R_train[u][i]
                if rui != 0:
                    n_R += 1
                    sum_R += rui
        self.mu = sum_R / n_R
    
    
    # 定义计算损失值的函数，用于判断每次迭代是否有效
    def compute_loss(self):
        
        # 提前计算好参数的二范数，不然在循环中可能重复计算
        P_norm = 0  # 总的user向量范数
        Q_norm = 0  # 总的item向量范数
        for u in range(1, self.n_P):
            pu = self.P[u]
            P_norm += torch.norm(pu)**2
        for i in range(1, self.n_Q):
            qi = self.Q[i]
            Q_norm += torch.norm(qi)**2
        loss = self.lmbda*(P_norm + Q_norm)  # 初始化误差
        
        # 遍历评分矩阵，计算平均误差
        n = 0
        for u in range(1, self.n_P):
            pu = self.P[u]  # 提取该用户的f维向量
            for i in range(1, self.n_Q):
                rui = self.R_train[u][i]  # 真实评分
                if rui != 0:  # 判断是否有评分
                    n += 1
                    qi = self.Q[i]  # 提取该物品的f维向量
                    rui_pred = sum(pu*qi) + self.mu + self.bu[u] + self.bi[i]
                    eui = rui - rui_pred  # 真实评分与预测评分的偏差
                    loss = loss + eui**2  # 累加误差
        return loss/n
    
   

    # 使用 SGD 随机梯度下降训练参数
    def SGD_train(self):
        loss_list = []  # 存储每轮迭代后的总误差
        self.compute_mu()  # 计算好评分均值mu
        
        # 计算初始误差值
        loss = self.compute_loss()
        print('初始平均误差为 ', loss)
        loss_list.append(loss)
        
        # 多次迭代
        for k in range(self.epochs):
            # 遍历评分，根据每个 rui，按照 SGD 更新参数
            for u in range(1, self.n_P):
                for i in range(1, self.n_Q):
                    rui = self.R_train[u][i]
                    if rui != 0:  # 判断是否已评分
                        pu = self.P[u]
                        qi = self.Q[i]
                        rui_pred = sum(pu*qi) + self.mu + self.bu[u] + self.bi[i]
                        eui = rui - rui_pred  # 用更新后的来计算误差，因为SGD是对每个样本都计算更新
                
                        # 更新参数
                        self.P[u] += self.gamma * (eui*qi - self.lmbda*pu)
                        self.Q[i] += self.gamma * (eui*pu - self.lmbda*qi)
                        self.bu[u] += self.gamma * (eui - self.lmbda*self.bu[u])
                        self.bi[i] += self.gamma * (eui - self.lmbda*self.bi[i])
            self.gamma *= self.weight_decay  # 学习率逐渐减小
            
            # 一次迭代完成后，计算平均误差并测试性能
            loss = self.compute_loss()
            loss_list.append(loss)
            rmse, mae, hit = self.test(20)
            print('epoch %d: avg_loss=%.4f, RMSE=%.4f, MAE=%.4f, Hit=%.4f' % (k+1, loss, rmse, mae, hit))
        return loss_list
        
        
    # 在测试集上预测，并计算RMSE, MAE和Hit性能
    def test(self, K):
        # 预测结果
        R_pred = torch.matmul(self.P, self.Q.t()) + self.mu + self.bu + self.bi.t()
        R_pred_rated = torch.Tensor(self.n_P, self.n_Q).zero_()  # 提取已评分item的预测值矩阵
        
        # 计算 RMSE 和 MAE
        rmse = 0  # 初始化均方根误差
        mae = 0  # 初始化平均绝对误差
        n = 0  # 测试集已评分个数
        # 遍历每个评分
        for u in range(1, self.n_P):
            for i in range(1, self.n_Q):
                rui = self.R_test[u][i]
                rui_pred = R_pred[u][i]
                if rui != 0:
                    n += 1
                    rmse += (rui - rui_pred)**2
                    mae += torch.abs(rui - rui_pred)
                    R_pred_rated[u][i] = rui_pred
        rmse = torch.sqrt(torch.tensor([rmse/n]))
        mae /= n
        
        # 计算 Hit 指标
        hit = 0  # 初始化命中率
        # 首先，计算矩阵 R_pred_rated，即从预测矩阵中提取出之前已评分的部分
        # 否则按评分值降序排序的话一定会有未评分的预测值排到前面，而测试集中未评分的值为0，一定排在最后面  
        # 其次，测试集评分矩阵R_test和预测结果R_pred_rated按照评分降序排序
        # 再统计R_pred_rated中前k个item索引有哪些是真的评分排在R_test前k的
        sort_result1, indices1 = torch.sort(self.R_test, descending=True)
        sort_result2, indices2 = torch.sort(R_pred_rated, descending=True)
        # 计算top k的 hit 值
        for u in range(1, self.n_P):
            for i in range(K):
                if indices2[u][i] in indices1[u][:K]:
                    hit += 1
        hit /= ((self.n_P-1)*K)
        
        # 返回三个指标
        return rmse, mae, hit
    
    
    # 返回各个参数矩阵
    def see(self):
        return self.P, self.Q, self.mu, self.bu, self.bi

## 2. 数据预处理
1. 读取评分文件
2. **随机打乱数据**，至关重要，否则会导致测试性能很差，甚至比将所有评分预测为3分的性能要差
3. 划分训练集和测试集，3/5作为训练集，2/5作为测试集，并建立相应的评分矩阵

In [3]:
# 读取评分文件
inter = pd.read_csv('../dataset/ml-100k/ml-100k.inter', delimiter='\t', engine='python')

# 创建评分矩阵，分为训练矩阵和测试矩阵
n_user = 943
n_item = 1682
R_train = torch.Tensor(n_user+1, n_item+1).zero_()
R_test = torch.Tensor(n_user+1, n_item+1).zero_()
print(R_train.size())
print(R_test.size())

# 填充训练集和测试集的评分矩阵值
# 先随机打乱数据
df = pd.DataFrame(inter)
df = df.sample(frac=1).reset_index(drop=True)
# 前3/5作为训练数据
df1 = df[:60000]
for index, row in df.iterrows():
    R_train[row['user_id:token']][row['item_id:token']] = row['rating:float']
# 后2/5作为测试数据
df2 = df[60000:100000]
for index, row in df.iterrows():
    R_test[row['user_id:token']][row['item_id:token']] = row['rating:float']

torch.Size([944, 1683])
torch.Size([944, 1683])


## 2. 运行与测试 
在MF模型中，参数P, Q, bu和bi都是按照正态分布N(0,0.1)初始化的  
下面实验中，正则化系数lmbda=0.05，学习率gamma=0.05，如果出现过拟合的话，可以适当减小weight_decay。  
取embed_size = 8, 16, 32, 64, 128, 256, 512，分别进行实验，发现RMSE误差逐渐减小

In [23]:
# epochs=10, embed_size=8, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 10, 8, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.2885])
epoch 1: avg_loss=0.9714, RMSE=0.9856, MAE=0.7875, Hit=0.5085
epoch 2: avg_loss=0.8761, RMSE=0.9359, MAE=0.7420, Hit=0.5249
epoch 3: avg_loss=0.7948, RMSE=0.8913, MAE=0.7054, Hit=0.5448
epoch 4: avg_loss=0.7332, RMSE=0.8559, MAE=0.6770, Hit=0.5593
epoch 5: avg_loss=0.6936, RMSE=0.8324, MAE=0.6580, Hit=0.5688
epoch 6: avg_loss=0.6689, RMSE=0.8173, MAE=0.6454, Hit=0.5744
epoch 7: avg_loss=0.6527, RMSE=0.8072, MAE=0.6368, Hit=0.5787
epoch 8: avg_loss=0.6414, RMSE=0.8001, MAE=0.6307, Hit=0.5815
epoch 9: avg_loss=0.6330, RMSE=0.7948, MAE=0.6261, Hit=0.5818
epoch 10: avg_loss=0.6266, RMSE=0.7907, MAE=0.6225, Hit=0.5822


In [24]:
# epochs=15, embed_size=16, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 10, 16, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.3008])
epoch 1: avg_loss=0.9569, RMSE=0.9781, MAE=0.7814, Hit=0.5165
epoch 2: avg_loss=0.8298, RMSE=0.9107, MAE=0.7216, Hit=0.5397
epoch 3: avg_loss=0.7311, RMSE=0.8547, MAE=0.6764, Hit=0.5670
epoch 4: avg_loss=0.6500, RMSE=0.8056, MAE=0.6375, Hit=0.5861
epoch 5: avg_loss=0.5958, RMSE=0.7711, MAE=0.6097, Hit=0.5971
epoch 6: avg_loss=0.5613, RMSE=0.7482, MAE=0.5908, Hit=0.6051
epoch 7: avg_loss=0.5383, RMSE=0.7325, MAE=0.5775, Hit=0.6085
epoch 8: avg_loss=0.5221, RMSE=0.7213, MAE=0.5678, Hit=0.6108
epoch 9: avg_loss=0.5102, RMSE=0.7129, MAE=0.5604, Hit=0.6135
epoch 10: avg_loss=0.5011, RMSE=0.7064, MAE=0.5546, Hit=0.6167


In [15]:
# epochs=10, embed_size=32, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 10, 32, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.3027])
epoch 1: avg_loss=0.9404, RMSE=0.9695, MAE=0.7746, Hit=0.5219
epoch 2: avg_loss=0.7822, RMSE=0.8841, MAE=0.7004, Hit=0.5576
epoch 3: avg_loss=0.6513, RMSE=0.8064, MAE=0.6383, Hit=0.5945
epoch 4: avg_loss=0.5465, RMSE=0.7382, MAE=0.5841, Hit=0.6175
epoch 5: avg_loss=0.4768, RMSE=0.6891, MAE=0.5442, Hit=0.6344
epoch 6: avg_loss=0.4319, RMSE=0.6555, MAE=0.5163, Hit=0.6459
epoch 7: avg_loss=0.4020, RMSE=0.6321, MAE=0.4963, Hit=0.6532
epoch 8: avg_loss=0.3813, RMSE=0.6153, MAE=0.4817, Hit=0.6589
epoch 9: avg_loss=0.3662, RMSE=0.6028, MAE=0.4705, Hit=0.6642
epoch 10: avg_loss=0.3548, RMSE=0.5932, MAE=0.4618, Hit=0.6668


In [18]:
# epochs=10, embed_size=64, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 10, 64, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.2837])
epoch 1: avg_loss=0.8987, RMSE=0.9476, MAE=0.7566, Hit=0.5410
epoch 2: avg_loss=0.6950, RMSE=0.8329, MAE=0.6595, Hit=0.5891
epoch 3: avg_loss=0.5368, RMSE=0.7315, MAE=0.5787, Hit=0.6316
epoch 4: avg_loss=0.4255, RMSE=0.6505, MAE=0.5135, Hit=0.6603
epoch 5: avg_loss=0.3548, RMSE=0.5934, MAE=0.4664, Hit=0.6769
epoch 6: avg_loss=0.3098, RMSE=0.5539, MAE=0.4332, Hit=0.6879
epoch 7: avg_loss=0.2801, RMSE=0.5261, MAE=0.4095, Hit=0.6955
epoch 8: avg_loss=0.2595, RMSE=0.5060, MAE=0.3921, Hit=0.7007
epoch 9: avg_loss=0.2447, RMSE=0.4910, MAE=0.3791, Hit=0.7063
epoch 10: avg_loss=0.2336, RMSE=0.4795, MAE=0.3690, Hit=0.7099


In [25]:
# epochs=15, embed_size=128, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 15, 128, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.2966])
epoch 1: avg_loss=0.8257, RMSE=0.9078, MAE=0.7235, Hit=0.5703
epoch 2: avg_loss=0.5719, RMSE=0.7548, MAE=0.5964, Hit=0.6327
epoch 3: avg_loss=0.4070, RMSE=0.6358, MAE=0.5012, Hit=0.6762
epoch 4: avg_loss=0.3093, RMSE=0.5531, MAE=0.4338, Hit=0.6977
epoch 5: avg_loss=0.2541, RMSE=0.5005, MAE=0.3899, Hit=0.7125
epoch 6: avg_loss=0.2216, RMSE=0.4666, MAE=0.3611, Hit=0.7203
epoch 7: avg_loss=0.2012, RMSE=0.4440, MAE=0.3417, Hit=0.7236
epoch 8: avg_loss=0.1876, RMSE=0.4283, MAE=0.3281, Hit=0.7276
epoch 9: avg_loss=0.1780, RMSE=0.4169, MAE=0.3183, Hit=0.7291
epoch 10: avg_loss=0.1711, RMSE=0.4085, MAE=0.3110, Hit=0.7302
epoch 11: avg_loss=0.1658, RMSE=0.4019, MAE=0.3054, Hit=0.7307
epoch 12: avg_loss=0.1617, RMSE=0.3968, MAE=0.3010, Hit=0.7317
epoch 13: avg_loss=0.1585, RMSE=0.3926, MAE=0.2975, Hit=0.7322
epoch 14: avg_loss=0.1558, RMSE=0.3892, MAE=0.2947, Hit=0.7321
epoch 15: avg_loss=0.1536, RMSE=0.3863, MAE=0.2923, Hit=0.7326


In [27]:
# epochs=15, embed_size=256, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 15, 256, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.3195])
epoch 1: avg_loss=0.6986, RMSE=0.8339, MAE=0.6613, Hit=0.6167
epoch 2: avg_loss=0.4182, RMSE=0.6437, MAE=0.5047, Hit=0.6843
epoch 3: avg_loss=0.2861, RMSE=0.5307, MAE=0.4133, Hit=0.7131
epoch 4: avg_loss=0.2241, RMSE=0.4685, MAE=0.3618, Hit=0.7243
epoch 5: avg_loss=0.1933, RMSE=0.4341, MAE=0.3328, Hit=0.7297
epoch 6: avg_loss=0.1762, RMSE=0.4140, MAE=0.3156, Hit=0.7321
epoch 7: avg_loss=0.1660, RMSE=0.4013, MAE=0.3048, Hit=0.7333
epoch 8: avg_loss=0.1593, RMSE=0.3929, MAE=0.2976, Hit=0.7341
epoch 9: avg_loss=0.1546, RMSE=0.3870, MAE=0.2926, Hit=0.7354
epoch 10: avg_loss=0.1512, RMSE=0.3826, MAE=0.2889, Hit=0.7359
epoch 11: avg_loss=0.1487, RMSE=0.3792, MAE=0.2861, Hit=0.7360
epoch 12: avg_loss=0.1466, RMSE=0.3765, MAE=0.2840, Hit=0.7362
epoch 13: avg_loss=0.1450, RMSE=0.3744, MAE=0.2823, Hit=0.7361
epoch 14: avg_loss=0.1436, RMSE=0.3726, MAE=0.2809, Hit=0.7359
epoch 15: avg_loss=0.1425, RMSE=0.3711, MAE=0.2797, Hit=0.7356


In [28]:
# epochs=15, embed_size=512, gamma=0.05, weight_decay=1, mbda=0.05

mf = MF(R_train, R_test, 15, 512, 0.05, 1, 0.05)
loss_list = mf.SGD_train()  # 训练

初始平均误差为  tensor([1.3439])
epoch 1: avg_loss=0.4979, RMSE=0.7010, MAE=0.5471, Hit=0.6806
epoch 2: avg_loss=0.2586, RMSE=0.5018, MAE=0.3840, Hit=0.7250
epoch 3: avg_loss=0.1946, RMSE=0.4335, MAE=0.3285, Hit=0.7329
epoch 4: avg_loss=0.1710, RMSE=0.4055, MAE=0.3056, Hit=0.7350
epoch 5: avg_loss=0.1601, RMSE=0.3920, MAE=0.2946, Hit=0.7356
epoch 6: avg_loss=0.1541, RMSE=0.3844, MAE=0.2885, Hit=0.7357
epoch 7: avg_loss=0.1503, RMSE=0.3796, MAE=0.2848, Hit=0.7352
epoch 8: avg_loss=0.1477, RMSE=0.3763, MAE=0.2822, Hit=0.7358
epoch 9: avg_loss=0.1457, RMSE=0.3738, MAE=0.2804, Hit=0.7363
epoch 10: avg_loss=0.1442, RMSE=0.3719, MAE=0.2790, Hit=0.7361
epoch 11: avg_loss=0.1429, RMSE=0.3703, MAE=0.2779, Hit=0.7357
epoch 12: avg_loss=0.1419, RMSE=0.3690, MAE=0.2770, Hit=0.7360
epoch 13: avg_loss=0.1410, RMSE=0.3679, MAE=0.2763, Hit=0.7354
epoch 14: avg_loss=0.1402, RMSE=0.3669, MAE=0.2757, Hit=0.7355
epoch 15: avg_loss=0.1396, RMSE=0.3661, MAE=0.2752, Hit=0.7355
