In [2]:
import numpy as np
import pandas as pd


class BaselineCFBySGD(object):
    """Baseline collaborative-filtering predictor trained with SGD.

    A rating is predicted as ``global_mean + bu[uid] + bi[iid]``, where
    ``bu`` and ``bi`` are per-user and per-item bias terms learned by
    stochastic gradient descent with L2-style shrinkage.
    """

    def __init__(self, number_epochs, alpha, reg, columns: list):
        """Store the hyper-parameters.

        :param number_epochs: number of full passes over the dataset
        :param alpha: learning rate used in each bias update
        :param reg: regularisation coefficient applied to each bias
        :param columns: names of the user-id, item-id and rating columns,
            in that order
        """
        # Maximum number of gradient-descent passes
        self.number_epochs = number_epochs
        # Learning rate
        self.alpha = alpha
        # Regularisation parameter
        self.reg = reg
        # Dataset field names: [user id, item id, rating]
        self.columns = columns

    def fit(self, dataset):
        """Train the model.

        Sets ``dataset``, ``user_ratings``, ``item_ratings``,
        ``global_mean``, ``bu`` and ``bi`` on the instance.

        :param dataset: DataFrame containing at least the three columns
            named in ``self.columns`` (uid, iid, rating); any other
            columns are ignored
        :return: None
        """
        self.dataset = dataset
        uid_col = self.columns[0]
        iid_col = self.columns[1]
        rating_col = self.columns[2]
        # Per-user lists of rated items and the ratings given.  The columns
        # are selected before aggregating so unrelated fields are not
        # collected into lists.
        self.user_ratings = dataset.groupby(uid_col)[[iid_col, rating_col]].agg([list])
        # Per-item lists of rating users and the ratings received
        self.item_ratings = dataset.groupby(iid_col)[[uid_col, rating_col]].agg([list])
        # Global mean rating
        self.global_mean = dataset[rating_col].mean()
        # Learn the bias terms with SGD
        self.bu, self.bi = self.sgd()

    def sgd(self):
        """Optimise ``bu`` and ``bi`` with stochastic gradient descent.

        Reads ``dataset``, ``user_ratings``, ``item_ratings`` and
        ``global_mean``, so it is expected to be called from :meth:`fit`.

        :return: ``(bu, bi)`` - dicts mapping user id / item id to its bias
        """
        # Initialise every bias to zero
        bu = dict(zip(self.user_ratings.index, np.zeros(len(self.user_ratings.index))))
        bi = dict(zip(self.item_ratings.index, np.zeros(len(self.item_ratings.index))))

        # Pick the (uid, iid, rating) fields by name rather than by position,
        # so extra or re-ordered columns in the dataset cannot break the
        # unpacking or be mistaken for ids/ratings.  The rows are
        # materialised once because they do not change between epochs.
        fields = [self.columns[0], self.columns[1], self.columns[2]]
        samples = list(self.dataset[fields].itertuples(index=False, name=None))

        # Gradient-descent parameter updates
        for epoch in range(self.number_epochs):
            print("Epoch: %d" % epoch)
            for uid, iid, real_rating in samples:
                # The same error (computed before either update) drives
                # both bias updates for this sample.
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                bu[uid] += self.alpha * (error - self.reg * bu[uid])
                bi[iid] += self.alpha * (error - self.reg * bi[iid])
        return bu, bi

    def predict(self, uid, iid):
        """Predict the rating of user ``uid`` for item ``iid``.

        :param uid: user id seen during :meth:`fit`
        :param iid: item id seen during :meth:`fit`
        :return: ``global_mean + bu[uid] + bi[iid]``
        :raises KeyError: if ``uid`` or ``iid`` was not in the training data
        """
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating


if __name__ == '__main__':
    # Demo: train the baseline model on MovieLens (ml-latest-small) ratings
    # and print one predicted rating.

    # Names of the user-id, item-id and rating fields
    columns = ["userId", "movieId", "rating"]

    # Hyper-parameters: epochs, learning rate, regularisation strength
    number_epochs, alpha, reg = 100, 0.05, 0.01

    # Read only the first three CSV columns, with compact dtypes
    dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
    dataset = pd.read_csv(
        "../dataset/ml-latest-small/ratings.csv",
        dtype=dtype,
        usecols=range(3),
    )

    # Build and train the model
    model = BaselineCFBySGD(number_epochs, alpha, reg, columns)
    model.fit(dataset)

    # Predict the rating of user 1 for movie 5
    predict_rating = model.predict(uid=1, iid=5)
    print(predict_rating)


Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Epoch: 56
Epoch: 57
Epoch: 58
Epoch: 59
Epoch: 60
Epoch: 61
Epoch: 62
Epoch: 63
Epoch: 64
Epoch: 65
Epoch: 66
Epoch: 67
Epoch: 68
Epoch: 69
Epoch: 70
Epoch: 71
Epoch: 72
Epoch: 73
Epoch: 74
Epoch: 75
Epoch: 76
Epoch: 77
Epoch: 78
Epoch: 79
Epoch: 80
Epoch: 81
Epoch: 82
Epoch: 83
Epoch: 84
Epoch: 85
Epoch: 86
Epoch: 87
Epoch: 88
Epoch: 89
Epoch: 90
Epoch: 91
Epoch: 92
Epoch: 93
Epoch: 94
Epoch: 95
Epoch: 96
Epoch: 97
Epoch: 98
Epoch: 99
3.52347984