In [6]:
import pandas as pd
import numpy as np
dtype = [("userId", np.string_), ("movieId", np.string_), ("rating", np.int32)]

In [7]:
train_dataset = pd.read_csv('train.csv',dtype=dict(dtype))
train_dataset.head(5)

Unnamed: 0,userId,productId,Rating
0,AOPE42H34R0EC,B00000DM9W,5.0
1,A1GI09JC6L0NF7,B00004SABJ,4.0
2,AZLZII4AFX56R,B00000J579,3.0
3,A34AHNT6GD9FWW,9888002198,5.0
4,A2PXRAO5C1XTLW,0972683275,5.0


In [8]:

# 评分预测    1-5
class LFM(object):

    def __init__(self, alpha, reg_p, reg_q, number_LatentFactors=10, number_epochs=10, columns=["uid", "iid", "rating"]):
        self.alpha = alpha # 学习率
        self.reg_p = reg_p    # P矩阵正则
        self.reg_q = reg_q    # Q矩阵正则
        self.number_LatentFactors = number_LatentFactors  # 隐式类别数量
        self.number_epochs = number_epochs    # 最大迭代次数
        self.columns = columns

    def fit(self, dataset):
        '''
        fit dataset
        :param dataset: uid, iid, rating
        :return:
        '''

        self.dataset = pd.DataFrame(dataset)

        self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]]
        self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]]

        self.globalMean = self.dataset[self.columns[2]].mean()

        self.P, self.Q = self.sgd()

    def _init_matrix(self):
        '''
        初始化P和Q矩阵，同时为设置0，1之间的随机值作为初始值
        :return:
        '''
        # User-LF
        P = dict(zip(
            self.users_ratings.index,
            np.random.rand(len(self.users_ratings), self.number_LatentFactors).astype(np.float32)
        ))
        # Item-LF
        Q = dict(zip(
            self.items_ratings.index,
            np.random.rand(len(self.items_ratings), self.number_LatentFactors).astype(np.float32)
        ))
        return P, Q

    def sgd(self):
        '''
        使用随机梯度下降，优化结果
        :return:
        '''
        P, Q = self._init_matrix()

        for i in range(self.number_epochs):
            print("iter%d"%i)
            error_list = []
            for uid, iid, r_ui in self.dataset.itertuples(index=False):
                # User-LF P
                ## Item-LF Q
                v_pu = P[uid] #用户向量
                v_qi = Q[iid] #物品向量
                err = np.float32(r_ui - np.dot(v_pu, v_qi))

                v_pu += self.alpha * (err * v_qi - self.reg_p * v_pu)
                v_qi += self.alpha * (err * v_pu - self.reg_q * v_qi)
                
                P[uid] = v_pu 
                Q[iid] = v_qi

                # for k in range(self.number_of_LatentFactors):
                #     v_pu[k] += self.alpha*(err*v_qi[k] - self.reg_p*v_pu[k])
                #     v_qi[k] += self.alpha*(err*v_pu[k] - self.reg_q*v_qi[k])

                error_list.append(err ** 2)
            print(np.sqrt(np.mean(error_list)))
        return P, Q

    def predict(self, uid, iid):
        # 如果uid或iid不在，我们使用全剧平均分作为预测结果返回
        if uid not in self.users_ratings.index or iid not in self.items_ratings.index:
            return self.globalMean

        p_u = self.P[uid]
        q_i = self.Q[iid]

        return np.dot(p_u, q_i)

    def test(self,testset):
        '''预测测试集数据'''
        df=pd.DataFrame({"Keys":[],"Rating":[]})
        i=0;
        for uid, iid, _ in testset.itertuples(index=False):
            pred_rating = self.predict(uid, iid)
            pred_rating = round(pred_rating)
            pred_rating = round(pred_rating,2)
            new_row=pd.Series({"Keys":(uid,iid),"Rating":pred_rating})
            df = pd.concat([df, new_row.to_frame().T])
            i =i +1
        df.to_csv('submission.csv',index=False)
                
           

In [9]:
test_dataset = pd.read_csv('test.csv',dtype=dict(dtype))
lfm = LFM(0.02, 0.01, 0.01, 10, 50, ["userId","productId","Rating"])
lfm.fit(train_dataset)
lfm.test(test_dataset)

iter0
1.604405452756288
iter1
1.2966198346209163
iter2
1.1034607918710309
iter3
0.94095647386949
iter4
0.7999848815615712
iter5
0.6781447261452648
iter6
0.5741586901129242
iter7
0.48660083576358076
iter8
0.4137240972360539
iter9
0.353603446559387
iter10
0.3043162007511621
iter11
0.26407234417313086
iter12
0.23128198718734636
iter13
0.20457563200442824
iter14
0.18279745121372695
iter15
0.16498618374126015
iter16
0.1503517974685476
iter17
0.1382517496513333
iter18
0.12816859443035447
iter19
0.11968936379673818
iter20
0.11248715601560794
iter21
0.1063049471636652
iter22
0.1009416264161949
iter23
0.0962403599081662
iter24
0.09207893518123421
iter25
0.08836200465955661
iter26
0.08501497330885494
iter27
0.0819791647149307
iter28
0.07920812887108244
iter29
0.07666475532348471
iter30
0.0743191490383135
iter31
0.07214694054439577
iter32
0.07012808654827399
iter33
0.06824586103220263
iter34
0.06648624103505986
iter35
0.06483720933323706
iter36
0.06328848597174247
iter37
0.06183116052518935
iter3