# 内容说明
背景：之前自己手动实现 MF 算法的整个过程，便于掌握 MF 算法的基本原理，但是运行速度非常缓慢  
目的：希望在掌握原理之后借用Pytorch中的已有库，同时采取改进措施优化时间性能  
改进方向：  
1. 分batch，同一个batch可以并行计算，极大改善时间性能
2. 调用Pytorch中自带的优化器  
3. 减少对df文件以及评分矩阵的遍历，尽可能改善时间性能

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error

## 1. 编写MF类
1. init(): 初始化各个参数，包括：  
1) 学习率lr  
2) L2系数weight_decay（注意：不用自己计算正则化范数，直接利用优化器的weight_decay参数即可）  
3) 嵌入维数embed_size  
4) 批大小batch_size  
5) 训练文件df_train，测试文件df_test  
6) 训练评分矩阵R_train，测试评分矩阵R_test  
7) 参数矩阵P和Q，偏置mu, bu和bi。  
2. preprocess(df, ratio_train): 参数为文件df以及训练集所占比例ratio_train。首先，划分训练与测试数据；其次，根据df文件，获取n_P和n_Q值，便于根据这个维度初始化各个变量与矩阵；计算mu值。  
3. generate_train(): 生成批训练集，得到每一批数据中user和item对应的编号矩阵，同时可以填充R_train矩阵  
4. train(): 多次迭代训练，优化参数  
5. performance(): 计算RMSE和MAE的值

In [7]:
class MF():
    
    # 初始化参数
    def __init__(self, lr=0.005, weight_decay=0.05, embed_size=10, batch_size=2000):
        
        # 常用参数
        self.lr = lr  # 优化器参数
        self.weight_decay = weight_decay  # 优化器参数
        self.embed_size = embed_size
        self.batch_size = batch_size
        
        # 训练测试的df文件与评分矩阵
        self.df_train = None
        self.df_test = None
        self.R_train = None
        self.R_test = None

        # user和item的参数矩阵
        self.P = None
        self.Q = None
        
        # 三种偏置
        self.mu = 0
        self.bu = None
        self.bi = None
    
    
    # 读取文件，划分训练测试集，完善初始化参数
    def preprocess(self, df, ratio_train):
        # 划分数据
        n_train = int(ratio_train * df.shape[0])
        self.df_train = df[:n_train]
        self.df_test = df[n_train:df.shape[0]]
        
        # 获取n_P, n_Q值，计算mu值
        n_P = df['user_id:token'].max() + 1
        n_Q = df['item_id:token'].max() + 1
        self.mu = self.df_train['rating:float'].sum(axis=0) / self.df_train.shape[0]
        print('n_P = %d, n_Q = %d, mu = %.4f' % (n_P, n_Q, self.mu))
        
        # 补充初始化
        self.R_train = torch.zeros(n_P, n_Q)
        self.R_test = torch.zeros(n_P, n_Q)
        self.P = nn.Parameter(torch.empty(n_P, self.embed_size))
        self.Q = nn.Parameter(torch.empty(n_Q, self.embed_size))
        self.bu = nn.Parameter(torch.empty(n_P, 1))
        self.bi = nn.Parameter(torch.empty(n_Q, 1))
        nn.init.xavier_normal_(self.P.data)
        nn.init.xavier_normal_(self.Q.data)
        nn.init.xavier_normal_(self.bu.data)
        nn.init.xavier_normal_(self.bi.data)
        
    
    # 生成批训练集的 u, i索引，同时填充评分矩阵R_train
    def generate_train_batch(self):
        n_train = self.df_train.shape[0]
        n_batch = n_train // self.batch_size
        
        # 针对每个batch，分别生成user和item索引
        for k in range(n_batch):
            u_batch = []
            i_batch = []
            df_batch = self.df_train[k*self.batch_size: (k+1)*self.batch_size]
            for index, row in df_batch.iterrows():
                u = row['user_id:token']
                i = row['item_id:token']
                rui = row['rating:float']
                u_batch.append(u)
                i_batch.append(i)
                self.R_train[u][i] = rui  # 填充评分矩阵
            yield np.asarray(u_batch), np.asarray(i_batch)
    
    
    # 迭代训练，优化参数
    def train(self, epochs):
        # 定义优化器
        optimizer = optim.Adam([self.P, self.Q, self.bu, self.bi], lr=self.lr, weight_decay=self.weight_decay)
        print("\nstart training......")
        
        # 多次迭代
        for k in range(epochs):
            sum_loss = 0
            # 针对每个小batch
            for u_batch, i_batch in self.generate_train_batch():
                pu = self.P[u_batch, :]
                qi = self.Q[i_batch, :]
                rui = self.R_train[u_batch, i_batch]  # 真实评分
                rui_pred = torch.mul(pu, qi).sum(dim=1) + self.bu[u_batch].sum(dim=1) + self.bi[i_batch].sum(dim=1) + self.mu
                # 计算误差
                square = (rui-rui_pred).pow(2).sum()
                #L2 = self.lmbda * (pu.norm(dim=1).pow(2).sum() + qi.norm(dim=1).pow(2).sum() + self.bu.norm(dim=0).pow(2) + self.bi.norm(dim=0).pow(2))
                loss = square
                sum_loss += loss
                # 优化参数
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # 输出均差，评估性能
            avg_loss = sum_loss/self.df_train.shape[0]
            rmse, mae = self.test_RMSE()
            ndcg = self.test_NDCG(10)
            print('epoch %d：avg_loss = %.4f;   RMSE = %.4f, MAE = %.4f, NDCG = %.4f' % (k+1, avg_loss, rmse, mae, ndcg))
    
    
    # 评估性能RMSE，MAE
    def test_RMSE(self):
        u_test = []
        i_test = []
        
        # 遍历测试数据，记录user和item索引以及评分
        for index, row in self.df_test.iterrows():
            u = row['user_id:token']
            i = row['item_id:token']
            rui = row['rating:float']
            u_test.append(u)
            i_test.append(i)
            self.R_test[u][i] = rui
        
        # 提取user和item向量
        u_test = np.asarray(u_test)
        i_test = np.asarray(i_test)
        pu = self.P[u_test, :]
        qi = self.Q[i_test, :]
        
        # 获取真实评分并计算预测评分（不是U*I矩阵形式，而是一行数值，没有0值）
        rui = self.R_test[u_test, i_test]  # 真实评分
        rui_pred = torch.mul(pu, qi).sum(dim=1) + self.bu[u_test].sum(dim=1) + self.bi[i_test].sum(dim=1) + self.mu
        
        # 计算RMSE与MAE
        r = rui.data.numpy()
        r_pred = rui_pred.data.numpy()
        rmse = np.sqrt(mean_squared_error(r, r_pred))
        mae = mean_absolute_error(r, r_pred)
        
        # 返回
        return rmse, mae
    
    
    # 评估性能NDCG
    def test_NDCG(self, K):
        n_P = self.R_test.shape[0]
        n_Q = self.R_test.shape[1]
        R_pred = torch.matmul(self.P, self.Q.t()) + self.mu + self.bu + self.bi.t()  # 预测矩阵
        R_pred -= 100*self.R_train  # 去除用于训练集的评分数据，减去100则排序后处于最后，不影响排序与计算
        '''
        # 遍历测试数据，记录实际评分，计算预测评分
        for index, row in self.df_test.iterrows():
            u = row['user_id:token']
            i = row['item_id:token']
            rui = row['rating:float']
            pu = self.P[u]
            qi = self.Q[i]
            self.R_test[u][i] = rui  # 实际评分
            R_pred[u][i] = sum(pu*qi) + self.bu[u] + self.bi[i] + self.mu  # 预测评分。没有生成整个预测的矩阵再提取测试部分，否则遍历矩阵浪费时间
        '''
        # 排序
        sort_results1, indices1 = torch.sort(self.R_test, descending=True)
        sort_results2, indices2 = torch.sort(R_pred, descending=True)
        
        # 计算DCG，使用真实评分与预测的排序
        # 计算IDCG，使用真实评分与真实排序
        
        ndcg = 0
        n = n_P - 1
        for u in range(1, n_P):
            dcg = 0
            idcg = 0
            for idx in range(K):
                a = torch.tensor([idx+2])
                i = indices2[u][idx]
                dcg += (2**self.R_test[u][i]-1)/(torch.log2(a))
                idcg += (2**sort_results1[u][idx]-1)/(torch.log2(a))
            # 有的用户在测试数据中没评过分，idcg=0不可以做分母
            if idcg == 0:
                n -= 1
            else:
                ndcg += dcg / idcg  # 各用户的ndcg求和
        ndcg /= n  # 计算每个用户的平均ndcg
        
        return ndcg

### 一些实现心得
改进时间性能非常重要，否则可能相差几十上百倍的时间  
1. 划分batch：划分成一定大小的batch后，如果仍依次遍历一个batch中的样本，对时间没有任何改进。而应该利用矩阵或者向量的方式，同时对一个batch中的所有样本进行计算。  
2. 尽量减少对df文件的遍历：实验中df文件有十万行，以计算评分均值mu为例，入股遍历df每一行求评分和，将会十分缓慢。相反，可以直接用df['rating:float'].sum()相当快速求和。
3. 尽量减少对评分矩阵的遍历：实验中评分矩阵维度为(944, 1683)，需要遍历158万个数据，非常缓慢。以计算RMSE指标为例，不能直接对944×1683的R_test矩阵调用mean_squared_error()，因为矩阵中只有20000个是测试数据，其它很多0是未评分的。如果遍历矩阵，判断rui是否为0，再累加计算RMSE，非常缓慢。为了改进时间复杂度，可以根据df_test文件获取测试数据的user和item编号矩阵，借鉴batch中的并行方式，构建只包含测试数据的评分矩阵，再直接调用mean_squared_error()。

## 2. 运行与测试
分别使用显式评分文件和隐式评分文件来运行测试  
显式数据集：就是原始ml-100k数据集中的.inter文件得到的，已评分的分值在1-5之间，还有未评分的0  
隐式数据集1：为了跟BPR对比性能，将4分和5分记为1，1-3分以及未评分的记为0  
隐式数据集2：将已评分的都记为1，未评分的都记为0

In [19]:
# 显式数据集

# 读取文件并随机打乱
inter = pd.read_csv('../dataset/ml-100k/inter_explicit.csv')
df = pd.DataFrame(inter)
df = df.sample(frac=1).reset_index(drop=True)

# 建立MF模型并训练测试
mf = MF(lr=0.003, weight_decay=0.1, embed_size=10, batch_size=500)
mf.preprocess(df, ratio_train=0.8)
mf.train(epochs=15)

n_P = 944, n_Q = 1683, mu = 3.5295

start training......
epoch 1：avg_loss = 1.1723;   RMSE = 1.0329, MAE = 0.8481, NDCG = 0.1169
epoch 2：avg_loss = 1.0002;   RMSE = 0.9626, MAE = 0.7682, NDCG = 0.1351
epoch 3：avg_loss = 0.8852;   RMSE = 0.9314, MAE = 0.7396, NDCG = 0.1288
epoch 4：avg_loss = 0.8286;   RMSE = 0.9179, MAE = 0.7278, NDCG = 0.1257
epoch 5：avg_loss = 0.7960;   RMSE = 0.9110, MAE = 0.7216, NDCG = 0.1240
epoch 6：avg_loss = 0.7730;   RMSE = 0.9069, MAE = 0.7178, NDCG = 0.1225
epoch 7：avg_loss = 0.7540;   RMSE = 0.9042, MAE = 0.7151, NDCG = 0.1217
epoch 8：avg_loss = 0.7368;   RMSE = 0.9022, MAE = 0.7131, NDCG = 0.1223
epoch 9：avg_loss = 0.7206;   RMSE = 0.9009, MAE = 0.7117, NDCG = 0.1239
epoch 10：avg_loss = 0.7054;   RMSE = 0.9001, MAE = 0.7109, NDCG = 0.1252
epoch 11：avg_loss = 0.6917;   RMSE = 0.8998, MAE = 0.7104, NDCG = 0.1255
epoch 12：avg_loss = 0.6794;   RMSE = 0.8998, MAE = 0.7103, NDCG = 0.1250
epoch 13：avg_loss = 0.6687;   RMSE = 0.9001, MAE = 0.7103, NDCG = 0.1260
epo

In [22]:
# 隐式数据集1

# 读取文件并随机打乱
inter = pd.read_csv('../dataset/ml-100k/inter_implicit1.csv')
df = pd.DataFrame(inter)
df = df.sample(frac=1).reset_index(drop=True)

# 建立MF模型并训练测试
mf = MF(lr=0.003, weight_decay=0.1, embed_size=10, batch_size=500)
mf.preprocess(df, ratio_train=0.8)
mf.train(epochs=15)

n_P = 944, n_Q = 1683, mu = 0.5538

start training......
epoch 1：avg_loss = 0.2206;   RMSE = 0.4510, MAE = 0.4280, NDCG = 0.1102
epoch 2：avg_loss = 0.1953;   RMSE = 0.4412, MAE = 0.4035, NDCG = 0.0913
epoch 3：avg_loss = 0.1882;   RMSE = 0.4372, MAE = 0.3934, NDCG = 0.0858
epoch 4：avg_loss = 0.1846;   RMSE = 0.4354, MAE = 0.3889, NDCG = 0.0851
epoch 5：avg_loss = 0.1827;   RMSE = 0.4346, MAE = 0.3867, NDCG = 0.0867
epoch 6：avg_loss = 0.1817;   RMSE = 0.4341, MAE = 0.3856, NDCG = 0.0872
epoch 7：avg_loss = 0.1810;   RMSE = 0.4338, MAE = 0.3848, NDCG = 0.0877
epoch 8：avg_loss = 0.1805;   RMSE = 0.4336, MAE = 0.3844, NDCG = 0.0879
epoch 9：avg_loss = 0.1801;   RMSE = 0.4334, MAE = 0.3840, NDCG = 0.0889
epoch 10：avg_loss = 0.1798;   RMSE = 0.4333, MAE = 0.3837, NDCG = 0.0891
epoch 11：avg_loss = 0.1794;   RMSE = 0.4331, MAE = 0.3835, NDCG = 0.0904
epoch 12：avg_loss = 0.1791;   RMSE = 0.4330, MAE = 0.3833, NDCG = 0.0910
epoch 13：avg_loss = 0.1789;   RMSE = 0.4329, MAE = 0.3832, NDCG = 0.0913
epo

In [8]:
# 隐式数据集2

# 读取文件并随机打乱
inter = pd.read_csv('../dataset/ml-100k/inter_implicit2.csv')
df = pd.DataFrame(inter)
df = df.sample(frac=1).reset_index(drop=True)

# 建立MF模型并训练测试
mf = MF(lr=0.001, weight_decay=0.1, embed_size=10, batch_size=500)
mf.preprocess(df, ratio_train=0.8)
mf.train(epochs=5)

n_P = 944, n_Q = 1683, mu = 1.0000

start training......
epoch 1：avg_loss = 0.0008;   RMSE = 0.0082, MAE = 0.0041, NDCG = 0.0058
epoch 2：avg_loss = 0.0000;   RMSE = 0.0017, MAE = 0.0005, NDCG = 0.0049
epoch 3：avg_loss = 0.0000;   RMSE = 0.0004, MAE = 0.0001, NDCG = 0.0044
epoch 4：avg_loss = 0.0000;   RMSE = 0.0001, MAE = 0.0000, NDCG = 0.0034
epoch 5：avg_loss = 0.0000;   RMSE = 0.0000, MAE = 0.0000, NDCG = 0.0025
