In [6]:
import numpy as np
import pandas as pd

# 数据集的拆分
def split_dataset(dataset_path, x=0.8, random=False):
    """
    Split a ratings CSV into a training set and a test set, user by user.

    The first three columns of the file are read (the dtype mapping names them
    ``userId``, ``movieId`` and ``rating``). For every user, the first
    ``round(n * x)`` of that user's ``n`` rows go to the training set and the
    remaining rows go to the test set.

    Note: because the cut point is rounded, a user with very few ratings can end
    up with no test rows at all (e.g. 1 or 2 ratings with ``x=0.8``).

    :param dataset_path: path or file-like object accepted by ``pd.read_csv``
    :param x: fraction of each user's ratings assigned to the training set,
              must lie in [0, 1]
    :param random: if True, shuffle each user's rows with ``np.random.shuffle``
                   (global NumPy RNG) before cutting; default False keeps file order
    :return: ``(trainset, testset)`` DataFrames that keep the original index labels
    :raises ValueError: if ``x`` is outside [0, 1]
    """
    if not 0 <= x <= 1:
        # A negative x would turn the slice below into "last k rows", silently
        # producing a nonsensical split; reject it up front.
        raise ValueError("x must be within [0, 1], got %r" % (x,))

    dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
    dataset = pd.read_csv(dataset_path, dtype=dtype, usecols=range(3))

    testset_index = []
    # A single groupby pass hands out each user's rows (users in ascending id
    # order) instead of re-scanning the whole frame once per user.
    for _, user_ratings in dataset.groupby('userId'):
        all_index = list(user_ratings.index)
        if random:
            np.random.shuffle(all_index)
        cut_point = round(len(all_index) * x)
        testset_index.extend(all_index[cut_point:])

    # testset follows the per-user (possibly shuffled) order; trainset keeps file order.
    testset = dataset.loc[testset_index]
    trainset = dataset.drop(testset_index)
    return trainset, testset

In [7]:
# Relative path to the ratings CSV consumed by split_dataset.
dataset_path = '../dataset/ml-latest-small/ratings.csv'
# Default split (x=0.8, random=False): each user's trailing rows form the test set.
trainset, testset = split_dataset(dataset_path)

In [8]:
# Display the training split produced by the ordered (non-random) split.
trainset

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100571,610,96610,5.0
100572,610,96811,4.0
100573,610,96815,3.5
100574,610,96832,5.0


In [9]:
# Display the test split produced by the ordered (non-random) split.
testset

Unnamed: 0,userId,movieId,rating
186,1,2899,5.0
187,1,2916,4.0
188,1,2944,5.0
189,1,2947,5.0
190,1,2948,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [10]:
# Redo the split with per-user shuffling. No RNG seed is set in the visible
# cells, so the resulting split presumably differs from run to run.
trainset, testset = split_dataset(dataset_path, random=True)

In [11]:
# Display the training split produced by the shuffled split.
trainset

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100827,610,163937,3.5
100828,610,163981,3.5
100830,610,166528,4.0
100832,610,168248,5.0


In [12]:
# Display the test split produced by the shuffled split (rows appear in shuffled order).
testset

Unnamed: 0,userId,movieId,rating
109,1,1777,4.0
149,1,2353,5.0
92,1,1377,3.0
25,1,457,5.0
170,1,2617,2.0
...,...,...,...
100585,610,97938,4.0
100330,610,60333,5.0
100500,610,86295,3.0
99649,610,1573,3.5


In [19]:
import numpy as np
import pandas as pd


# 数据集的拆分
def split_dataset(dataset_path, x=0.8, random=False):
    """
    Split a ratings CSV into a training set and a test set, user by user.

    The first three columns of the file are read (the dtype mapping names them
    ``userId``, ``movieId`` and ``rating``). For every user, the first
    ``round(n * x)`` of that user's ``n`` rows go to the training set and the
    remaining rows go to the test set. Progress messages are printed before and
    after the split.

    Note: because the cut point is rounded, a user with very few ratings can end
    up with no test rows at all (e.g. 1 or 2 ratings with ``x=0.8``).

    :param dataset_path: path or file-like object accepted by ``pd.read_csv``
    :param x: fraction of each user's ratings assigned to the training set,
              must lie in [0, 1]
    :param random: if True, shuffle each user's rows with ``np.random.shuffle``
                   (global NumPy RNG) before cutting; default False keeps file order
    :return: ``(trainset, testset)`` DataFrames that keep the original index labels
    :raises ValueError: if ``x`` is outside [0, 1]
    """
    if not 0 <= x <= 1:
        # A negative x would turn the slice below into "last k rows", silently
        # producing a nonsensical split; reject it up front.
        raise ValueError("x must be within [0, 1], got %r" % (x,))

    print("正在切分数据集···")
    dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
    dataset = pd.read_csv(dataset_path, dtype=dtype, usecols=range(3))

    testset_index = []
    # A single groupby pass hands out each user's rows (users in ascending id
    # order) instead of re-scanning the whole frame once per user.
    for _, user_ratings in dataset.groupby('userId'):
        all_index = list(user_ratings.index)
        if random:
            np.random.shuffle(all_index)
        cut_point = round(len(all_index) * x)
        testset_index.extend(all_index[cut_point:])

    # testset follows the per-user (possibly shuffled) order; trainset keeps file order.
    testset = dataset.loc[testset_index]
    trainset = dataset.drop(testset_index)
    print("数据集切分完毕！！！")
    return trainset, testset


def accuracy(predict_results, method="all"):
    """
    Compute accuracy metrics (RMSE and/or MAE) over a set of predictions.

    The predictions are consumed in a single pass, so ``predict_results`` may be
    a one-shot iterator such as a generator.

    :param predict_results: iterable whose elements unpack into
        ``(uid, iid, real_rating, pred_rating)``
    :param method: metric selector, case-insensitive string: ``"rmse"`` or
        ``"mae"`` returns that single metric; any other value returns both
    :return: the selected metric rounded to 4 decimals, or a ``(rmse, mae)``
        tuple when both are requested
    :raises ZeroDivisionError: if ``predict_results`` yields no elements
    """
    metric = method.lower()

    count = 0
    squared_error_sum = 0
    absolute_error_sum = 0
    for _uid, _iid, real_rating, pred_rating in predict_results:
        diff = real_rating - pred_rating
        count += 1
        # Accumulate (not overwrite) the squared error across all samples.
        squared_error_sum += diff ** 2
        absolute_error_sum += abs(diff)

    if metric == 'rmse':
        return round(np.sqrt(squared_error_sum / count), 4)
    if metric == 'mae':
        return round(absolute_error_sum / count, 4)
    # Any other selector (e.g. "all") reports both metrics.
    return round(np.sqrt(squared_error_sum / count), 4), round(absolute_error_sum / count, 4)


class BaselineCFBySGD(object):
    """
    Baseline rating predictor whose user and item biases are fitted by SGD.

    A rating is predicted as ``global_mean + bu[uid] + bi[iid]``, where ``bu``
    and ``bi`` are per-user and per-item bias dictionaries learned in ``sgd``.
    """

    def __init__(self, number_epochs, alpha, reg, columns: list):
        """
        :param number_epochs: number of passes over the training data made by ``sgd``
        :param alpha: learning rate (step size of each update)
        :param reg: regularization coefficient applied to the biases
        :param columns: column names in the order ``[user, item, rating]``
        """
        # Number of passes over the training data
        self.number_epochs = number_epochs
        # Learning rate
        self.alpha = alpha
        # Regularization coefficient
        self.reg = reg
        # Dataset column names: [user, item, rating]
        self.columns = columns

    def fit(self, dataset):
        """
        Train the model on a ratings DataFrame.

        :param dataset: DataFrame containing at least the columns named in
            ``self.columns``; extra columns and any column order are tolerated
        :return: None (sets ``dataset``, ``user_ratings``, ``item_ratings``,
            ``global_mean``, ``bu`` and ``bi`` on the instance)
        """
        self.dataset = dataset
        # Per-user lists of rated items and ratings
        self.user_ratings = self.dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        # Per-item lists of rating users and ratings
        self.item_ratings = self.dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]
        # Global mean rating
        self.global_mean = self.dataset[self.columns[2]].mean()
        # Learn the bias terms with SGD
        self.bu, self.bi = self.sgd()

    def sgd(self):
        """
        Optimize the user biases ``bu`` and item biases ``bi`` by stochastic
        gradient descent over the training rows, in their stored order.

        :return: ``(bu, bi)`` dictionaries keyed by user id and item id
        """
        # All biases start at zero.
        bu = dict(zip(self.user_ratings.index, np.zeros(len(self.user_ratings.index))))
        bi = dict(zip(self.item_ratings.index, np.zeros(len(self.item_ratings.index))))
        # Project onto the configured columns once, so the positional unpacking
        # below is correct regardless of the frame's column order or extra columns.
        samples = self.dataset[list(self.columns[:3])]
        for epoch in range(self.number_epochs):
            print("Epoch: %d" % epoch)
            for uid, iid, real_rating in samples.itertuples(index=False):
                # The same residual drives both updates for this sample.
                error = real_rating - (self.global_mean + bu[uid] + bi[iid])
                bu[uid] += self.alpha * (error - self.reg * bu[uid])
                bi[iid] += self.alpha * (error - self.reg * bi[iid])
        return bu, bi

    def predict(self, uid, iid):
        """
        Predict the rating of item ``iid`` by user ``uid``.

        :raises KeyError: if the user or the item has no learned bias
        """
        predict_rating = self.global_mean + self.bu[uid] + self.bi[iid]
        return predict_rating

    def test(self, testset):
        """
        Predict every (user, item) pair of a test DataFrame.

        Pairs whose user or item was not seen during training cannot be
        predicted; they are reported with ``print`` and skipped.

        :param testset: DataFrame containing at least the columns named in
            ``self.columns``
        :return: generator of ``(uid, iid, real_rating, pred_rating)`` tuples
        """
        samples = testset[list(self.columns[:3])]
        for uid, iid, real_rating in samples.itertuples(index=False):
            try:
                pred_rating = self.predict(uid, iid)
            except KeyError as e:
                # Only the cold-start case is skipped; other errors should surface.
                print(e,"出现异常~")
            else:
                yield uid, iid, real_rating, pred_rating



if __name__ == '__main__':

    # Per-user train/test split of the ratings file (default 80/20, unshuffled).
    trainset, testset = split_dataset("../dataset/ml-latest-small/ratings.csv")

    # Bias-only baseline model trained with SGD.
    model = BaselineCFBySGD(
        number_epochs=100,
        alpha=0.05,
        reg=0.01,
        columns=["userId", "movieId", "rating"],
    )
    model.fit(trainset)

    # test() is a generator; accuracy() consumes it in one pass.
    predict_results = model.test(testset)
    rmse, mae = accuracy(predict_results, method="all")
    print("rmse: %f,    mae: %f" % (rmse, mae))


正在切分数据集···
数据集切分完毕！！！
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Epoch: 56
Epoch: 57
Epoch: 58
Epoch: 59
Epoch: 60
Epoch: 61
Epoch: 62
Epoch: 63
Epoch: 64
Epoch: 65
Epoch: 66
Epoch: 67
Epoch: 68
Epoch: 69
Epoch: 70
Epoch: 71
Epoch: 72
Epoch: 73
Epoch: 74
Epoch: 75
Epoch: 76
Epoch: 77
Epoch: 78
Epoch: 79
Epoch: 80
Epoch: 81
Epoch: 82
Epoch: 83
Epoch: 84
Epoch: 85
Epoch: 86
Epoch: 87
Epoch: 88
Epoch: 89
Epoch: 90
Epoch: 91
Epoch: 92
Epoch: 93
Epoch: 94
Epoch: 95
Epoch: 96
Epoch: 97
Epoch: 9