In [1]:
import numpy as np
import pandas as pd

In [2]:
'''
@输入参数
R:M*N的评分矩阵
K:隐特征向量维度
max_iter:最大迭代次数
alpha:步长
lamda:正则化系数

@输出
分解之后的P、Q
P:初始化用户特征矩阵M*k
Q：初始化物品特征矩阵N*K
'''

# Hyperparameters handed to LFM_grad_desc when the model is trained.
# Note that alpha and lamda here differ from that function's default values.

K = 5             # dimension of the latent feature vectors
max_iter = 20     # maximum number of iterations
alpha = 0.002     # step size
lamda = 0.004     # regularisation coefficient

# Core algorithm
def _lfm_observed_cost(R, P, Q, rows, cols, lamda):
    """Regularised squared error summed over the observed ratings.

    ``Q`` is expected in its K x N layout. For every observed cell (u, i) the
    squared prediction error plus ``lamda * (|P[u]|^2 + |Q[:, i]|^2)`` is added.
    Returns 0.0 when there are no observed ratings.
    """
    if rows.size == 0:
        return 0.0
    p_obs = P[rows]           # one row of user factors per observed rating
    q_obs = Q[:, cols].T      # one row of item factors per observed rating
    residual = np.sum(p_obs * q_obs, axis=1) - R[rows, cols]
    penalty = lamda * (np.sum(p_obs ** 2) + np.sum(q_obs ** 2))
    return float(np.sum(residual ** 2) + penalty)


def LFM_grad_desc(R, K, max_iter, alpha=0.0002, lamda=0.002, verbose=True):
    """Factorise a rating matrix with a latent factor model trained by SGD.

    Parameters
    ----------
    R : array-like, shape (M, N)
        Rating matrix. Only entries that compare ``> 0`` are treated as
        observed ratings; zeros, negatives and NaN are skipped.
    K : int
        Dimension of the latent feature vectors.
    max_iter : int
        Maximum number of passes over the observed ratings.
    alpha : float
        Step size of the gradient update.
    lamda : float
        Regularisation coefficient.
    verbose : bool
        When True (the default) the index of each pass is printed.

    Returns
    -------
    P : ndarray, shape (M, K)
        User feature matrix.
    Q : ndarray, shape (N, K)
        Item feature matrix, so that ``P.dot(Q.T)`` is the predicted matrix.
    cost : float
        Regularised squared error over the observed ratings for the returned
        factors (for ``max_iter <= 0`` this is the cost of the random start).

    Raises
    ------
    ValueError
        If ``R`` is not two-dimensional.
    """
    R = np.asarray(R, dtype=float)
    if R.ndim != 2:
        raise ValueError("R must be a 2-D rating matrix")
    M, N = R.shape

    # Random initial factors; Q is kept as K x N while training.
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K).T

    # Collect the observed cells once. NaN > 0 is False, so missing ratings
    # are skipped; np.nonzero yields them in row-major (user, then item) order.
    with np.errstate(invalid="ignore"):
        observed = R > 0
    rows, cols = np.nonzero(observed)

    # Defined up front so the function still returns a cost when max_iter <= 0.
    cost = _lfm_observed_cost(R, P, Q, rows, cols, lamda)

    for step in range(max_iter):
        if verbose:
            print(step)

        # One stochastic gradient step per observed rating.
        for u, i in zip(rows, cols):
            p_u = P[u]
            q_i = Q[:, i]
            eui = np.dot(p_u, q_i) - R[u, i]

            # Compute both new vectors from the *old* values before storing
            # either, so the Q step does not see the already-updated P row.
            new_p = p_u - alpha * (2 * eui * q_i + 2 * lamda * p_u)
            new_q = q_i - alpha * (2 * eui * p_u + 2 * lamda * q_i)
            P[u] = new_p
            Q[:, i] = new_q

        # Loss after this pass; stop early once it is negligible.
        cost = _lfm_observed_cost(R, P, Q, rows, cols, lamda)
        if cost < 0.001:
            break

    return P, Q.T, cost

In [3]:
# Setup: import libraries and configure the environment.
# The __future__ import has to stay the first statement of this cell.
from __future__ import division
import os, sys
# Import the libraries under their usual aliases
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
br = '\n'  # newline string; not referenced anywhere in the visible cells

In [4]:
# Preview the training data; read_csv splits on commas by default.
pd.read_csv(filepath_or_buffer='train.csv')

Unnamed: 0,user_id,business_id,date,stars
0,A2JGzkvNjckSmps_4FbKWw,Xg5qEQiB-7L6kGJ5F4K3bQ,2014-03-18 01:14:10,5.0
1,rypcWiSNGM0suWsiSLh9xA,4RoTEeqB_MNn6yaqZmlZHg,2015-08-29 18:32:15,4.0
2,Dgk0Wdoh7HPjhKQEPBU_jQ,ZOmf-3NN4Z59b2Fw6VAM7g,2015-09-14 16:33:03,3.0
3,FIk4lQQu1eTe2EpzQ4xhBA,HK2Ki-PvnNN-YMTlX1uSVA,2012-09-29 02:03:42,4.0
4,VizhcyMWWPz3UDXEBeix4w,UPIYuRaZvknINOd1w8kqRQ,2011-06-10 20:35:42,3.0
...,...,...,...,...
7927,1O638BDK_fWuxgTVJwff-A,ZIUs7gncPOX0OXr1ZYviAQ,2008-05-01 22:52:19,5.0
7928,rypcWiSNGM0suWsiSLh9xA,sk0stgY4NDJYOX1MbNJ3Pg,2016-07-13 00:18:24,4.0
7929,qibGLHABNReGeJr2w4_8yQ,LtNgP4FqXp5nMFOHErK8cw,2012-06-16 02:15:50,3.0
7930,1dWLN4Mr4hKhu8MQUCKqXQ,o597EK6uvR5RuPMZEwYCUg,2013-12-06 16:57:33,4.0


In [5]:
data_path = 'train.csv'
# Read both id columns as plain Python strings and the rating as float32.
# np.string_ was removed in NumPy 2.0, so str is used to stay portable.
dtype = {"user_id": str, "business_id": str, "stars": np.float32}
# Load only the three columns that are needed: user id, business id, rating.
# They are selected by name so the cell does not depend on the column order.
ratings = pd.read_csv(data_path, dtype=dtype, usecols=["user_id", "business_id", "stars"])
# User x business table of ratings; pairs without a rating become NaN.
ratings_matrix = ratings.pivot_table(index=["user_id"], columns=["business_id"], values="stars")
ratings_matrix

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-3zffZUHoY8bQjGfPSoBKQ,-4TMQnQJW1yd6NqGRDvAeA,-6tvduBzjLI1ISfs3F_qTg,-7EwIdxcRC5McO35DVfeSQ,-95mbLJsa0CxXhpaNL4LvA,-AD5PiuJHgdUcAK-Vxao2A,-Bdw-5H5C4AYSMGnAvmnzw,-BxWyEIQ6wypT-37MzZizQ,...,zcScEL0WEdFkROcnz5379g,zfQ855VX3SMA_54oVSN5Cw,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zqNgwQjj0_XAll-neGikIw,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zuVvDYJkKAbXQTTBauAqJQ,zvQIEpJUmLLmMMffNntHXQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-267Yx8RmdP6io2-qI4UcQ,,,,,,,,,,,...,,,,,,,,,,
-50XWnmQGqBgEI-9ANvLlg,,,,,,,,,,,...,,,,,,,,,,
-594af_E7Z9VVjQc9pJK3g,,,,,,,,,,,...,,,,,,,,,,
-8rSnT5ztVk6vmTDkxTqsQ,,,,,,,,,,,...,,,,,,,,,,
-C-l8EHSLXtZZVfUAUhsPA,,,,,,,,,,,...,3.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zwLq4aVDSH7HyMbVjKfkRg,,,,,,,,,,,...,,,,,,,,,,
zwhty_ZmxbHAHoDfMjNbag,,,,,,,,,,,...,,,,,,,,,,
zx5rdBK9NFZrAglIXF1LiQ,,,,,,,,,,,...,,,,,,,,,,
zy4A7504SezncCAcotMv4g,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Dense ndarray copy of the rating table (rows: users, columns: businesses).
R = ratings_matrix.to_numpy(copy=True)

In [8]:
# Train the model. LFM_grad_desc draws its initial factors from np.random,
# so the seed is fixed to make this cell give the same result on every run.
SEED = 42
np.random.seed(SEED)
P, Q, cost = LFM_grad_desc(R, K, max_iter, alpha, lamda)

# Predicted rating matrix (rows: users, columns: businesses).
predR = P.dot(Q.T)
predR

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


array([[3.61013154, 3.73791632, 2.18383209, ..., 2.68809424, 2.38213611,
        4.43713252],
       [3.67313102, 3.98714733, 2.4790668 , ..., 2.89656642, 2.40643788,
        4.36899995],
       [2.43303821, 2.41892682, 1.4272693 , ..., 1.69828521, 1.55638154,
        3.0818087 ],
       ...,
       [3.44241782, 3.48832035, 1.82213554, ..., 1.65566015, 2.06892647,
        3.87768377],
       [3.50202486, 3.88993101, 2.09208339, ..., 2.4377571 , 2.46789221,
        3.92471072],
       [2.90053975, 3.0686216 , 1.91026389, ..., 2.07868024, 1.81563839,
        3.45706305]])

In [9]:
# Load the test set, keeping only its first two columns.
data_path = 'test.csv'
ratings2 = pd.read_csv(filepath_or_buffer=data_path, usecols=[0, 1], dtype=dtype)
ratings2

Unnamed: 0,user_id,business_id
0,PfpRvMAESbC2bC8FUIMdNg,Kbbm6Vd5UdbP10dwjBghRw
1,oaaEXgQ3x51cXE3GTXrT1Q,2GmGT-7QjowR1ihup3FbVA
2,yT_QCcnq-QGipWWuzIpvtw,pOEL97ld-FJMKO8Ki8JmYg
3,fRVNHAl2RjosC67Y67G3cA,UkWme3kwg6L9rd4tCNB15w
4,48vRThjhuhiSQINQ2KV8Sw,LNGBEEelQx4zbfWnlc66cw
...,...,...
1957,UL5K2rnSYIPD1LcqPgbmDQ,CauQnqZ5eowyrr7oWF_p3Q
1958,C4OkiPljZ3z2XUa7onmihQ,GI-CAiZ_Gg3h21PwrANB4Q
1959,N3oNEwh0qgPqPP3Em6wJXw,2UgRg5a6KmpbD_SZfhNrKg
1960,xAWA2aheTP6YwcFWgmeLaQ,f-2pMptlB6cWaWnU7zYE_A


In [10]:
# Mean rating of each user (one value per row of the rating table), meant as
# a fallback prediction for items that do not occur in the training set.
ratings_matrix_mean = ratings_matrix.mean(axis="columns")
ratings_matrix_mean

user_id
-267Yx8RmdP6io2-qI4UcQ    4.285714
-50XWnmQGqBgEI-9ANvLlg    4.000000
-594af_E7Z9VVjQc9pJK3g    3.250000
-8rSnT5ztVk6vmTDkxTqsQ    3.250000
-C-l8EHSLXtZZVfUAUhsPA    3.666667
                            ...   
zwLq4aVDSH7HyMbVjKfkRg    4.750000
zwhty_ZmxbHAHoDfMjNbag    4.187500
zx5rdBK9NFZrAglIXF1LiQ    4.500000
zy4A7504SezncCAcotMv4g    4.750000
zzPvEodjvLPe-5NvV0U_cg    3.285714
Length: 1103, dtype: float32

In [11]:
# Transposed rating table: businesses as rows, users as columns.
T = ratings_matrix.transpose()

In [12]:
# Write the predictions to result.csv
import csv

# Position of every test user / business inside the training rating table.
# get_indexer returns -1 for ids that never occurred in the training data,
# instead of raising KeyError the way get_loc does.
user_pos = ratings_matrix.index.get_indexer(ratings2["user_id"])
item_pos = ratings_matrix.columns.get_indexer(ratings2["business_id"])

# Last-resort value for users that are unknown as well: mean of all ratings.
global_mean = np.nanmean(ratings_matrix.to_numpy())

# The with-block closes the file even if a row fails to be written.
with open('result.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)

    # Header row
    csv_writer.writerow(["user_id", "business_id", "stars"])

    # One output row per test pair
    for uid, iid, u, i in zip(ratings2["user_id"], ratings2["business_id"], user_pos, item_pos):
        if u != -1 and i != -1:
            # Both ids were seen in training: use the model prediction.
            stars = predR[u][i]
        elif u != -1:
            # Unknown business: fall back to the user's historical mean.
            stars = ratings_matrix_mean[uid]
        else:
            # Unknown user: fall back to the overall mean rating.
            stars = global_mean
        csv_writer.writerow([uid, iid, stars])