In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
#读取和转换ml100k数据集
def read_data(path):
    '''
    param path：数据集地址
    return data：ml100k数据的DataFrame格式
    '''
    #ml100k数据集只取前三列，不需要时间戳
    data = pd.read_table( path , header=None , usecols=[0,1,2] )
    data.columns=pd.Series(['userid','itemid','score'])
    return data

data = read_data(r'C:\Users\Administrator\Desktop\ml100k.txt')

In [3]:
#划分训练集和测试集
def train_test_split(data , N , M , seednum):
    '''
    param data: ml100k全部数据
    param N: 划分的份数
    param M: 测试集数据取第M份数据
    param seednum: 随机数种子
    return train_data: 训练集数据
           test_data: 测试集数据
    '''
    #设定随机数种子
    random.seed(seednum)
    #data的DataFrame中新建一个列用来存储每条数据所归属的份数
    data['copies'] = 0
    
    for ml in range(data.shape[0]):
        #随机获取一个整数，作为该条数据归属第几份
        data.iloc[ml,3] = random.randint(1,N)
    
    #根据M，取出训练集和测试集
    train_data = data[data['copies'] != M]
    test_data = data[data['copies'] == M]
    
    #取完训练集和测试集后，删除份数字段
    train_data.drop(columns=['copies'] , inplace=True)
    test_data.drop(columns=['copies'] , inplace=True)
    
    return train_data , test_data

train_data , test_data = train_test_split(data , 8 , 1 , 10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [4]:
#数据转化为pvoit格式
def tranform_pvoit(data):
    '''
    param: data数据
    return: data数据的pivot格式
    '''
    data = data.pivot(index='userid',columns='itemid',values='score')
    data.fillna(value=0 , inplace=True) #null用0填充
    
    #取出行列索引值
    data_index = data.index
    data_columns = data.columns
    
    return np.array(data) , data_index , data_columns

R , data_index , data_columns = tranform_pvoit(train_data)

In [5]:
#LFM矩阵分解
def LFM_model(R,fators,alpha,lamda,max_step,min_eui):
    '''
    param R: 用户-物品评分共现矩阵
    param fator: 隐因子个数
    param alpha: 学习率
    param lamda: 正则化项系数
    param max_step: 迭代最大步数
    param min_eui: 允许的最小误差
    return P: 用户-隐因子矩阵(m*k)，Q: 隐因子-物品矩阵(k*n)，preR: 用户-物品评分共现矩阵估计值(m*n)
    '''
    
    # 取R矩阵的维度
    m,n = R.shape
    
    # P，Q矩阵随机初始化
    P = np.random.rand(m,fators)
    Q = np.random.rand(fators,n)
    
    #算法迭代
    for step in range(max_step):
        #遍历R矩阵的每个值
        for u in range(m):
            for i in range(n):
                if R[u][i] != 0:
                    #计算R矩阵中每个值和估计值之间的误差
                    error = R[u][i] - np.dot( P[u,:] , Q[:,i] ) 
                    #梯度下降迭代
                    for f in range(fators):
                        P[u][f] = P[u][f] - alpha * ( -2 * error * Q[f][i] + 2 * lamda * P[u][f])
                        Q[f][i] = Q[f][i] - alpha * ( -2 * error * P[u][f] + 2 * lamda * Q[f][i])
        
        #在当前迭代步数下，P,Q矩阵迭代完成，计算R矩阵估计值preR
        preR = np.dot( P , Q)
        
        #计算当前迭代步数下，平方损失函数值eui
        eui = 0
        for u in range(m):
            for i in range(n):
                if R[u][i] != 0:
                    eui += ( np.dot(P[u,:] , Q[:,i]) - R[u][i] ) ** 2

        #加上正则化项
        eui += lamda * (np.linalg.norm(P) ** 2 + np.linalg.norm(Q) ** 2)
        
        #判断是否达到最小误差内,是则跳出循环，若一直没达到，则迭代到最大迭代次数
        if eui <= min_eui:
            break
    return P , Q , preR , eui

P , Q , preR , eui = LFM_model(R,20,0.0001,0.004,50,1)

In [6]:
#对测试集数据产生推荐（TopN）
def item_recommend(test_data,data,predata,data_index,data_columns,k):
    '''
    param test_data: 测试数据集（DataFrame）
    param data：需要进行推荐的user_item矩阵（ndarray）
    param predata: 需要进行推荐的user_item矩阵估计值（ndarray）
    param data_index: data和predata矩阵的列索引
    param data_columns: data和predata矩阵的行索引
    param k: 为每位用户推荐产品的个数
    retrun test_presorce: 测试集用户对对应产品的预测评分
         ，mse：测试集的评分与预测评分的均方误差
         ，recommend：测试集用户的推荐列表
    '''
    #把训练集的user_item矩阵转化为DataFrame格式
    data_df = pd.DataFrame(data , index = data_index , columns = data_columns)
    predata_df = pd.DataFrame(predata , index = data_index , columns = data_columns)
    
    #把user_item中用户没有评分过的（元素为0）替换为1 ，用户有评分过的(元素不为1)替换为0
    data_df01 = data_df == 0
    data_df01[data_df01 == True] = 1
    
    #计算出每个用户对于所有没有过评分产品的预测评分
    prescore = pd.DataFrame(np.array(data_df01) * np.array(predata_df)\
    ,index=data_index,columns=data_columns)
    
    #行转列,只需取出预测评分不0的部分
    prescore_stack = prescore.stack().reset_index()
    prescore_stack0 = prescore_stack[prescore_stack[0] != 0]
    
    #重新定义columns
    prescore_stack0.columns = pd.Series(['userid','itemid','predictscore'])
    
    #匹配测试集的预测评分
    test_presorce = pd.merge(test_data,prescore_stack0,how='left'\
                             ,left_on=['userid','itemid'],right_on=['userid','itemid'])
    test_presorce['predictscore'] = test_presorce['predictscore'].apply(lambda x:5 if x>5 else x)
    
    #计算mse均方误差
    mse = np.mean(np.abs(test_presorce['score'] - test_presorce['predictscore']))
    
    #对测试集的用户进行TopN推荐
    recommend = pd.DataFrame()
    for ui in prescore_stack0['userid'].drop_duplicates().tolist():
        if ui in test_data['userid'].tolist():
            user_pre = prescore_stack0[prescore_stack0['userid']==ui]
            #降序取出TopN
            user_pre.sort_values(by=['predictscore'],ascending=False,inplace=True)
            recommend = recommend.append(user_pre.head(k),ignore_index=True)

    #预测评分超过5分的,则替换为5分
    recommend['predictscore'] = recommend['predictscore'].apply(lambda x:5 if x>5 else x)
    
    return test_presorce , mse , recommend

test_presorce , mse , recommend = item_recommend(test_data,R,preR,data_index,data_columns,5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
test_presorce

Unnamed: 0,userid,itemid,score,predictscore
0,196,242,3,3.739893
1,244,51,2,3.069925
2,6,86,3,3.372425
3,303,785,3,2.673560
4,225,193,4,4.469874
...,...,...,...,...
12388,650,479,5,3.615273
12389,936,766,3,4.267297
12390,487,291,3,3.521728
12391,880,476,3,3.254706


In [8]:
mse

0.7998608919808573

In [9]:
recommend

Unnamed: 0,userid,itemid,predictscore
0,1,1570,5.0
1,1,1507,5.0
2,1,1488,5.0
3,1,987,5.0
4,1,1339,5.0
...,...,...,...
4620,943,1570,5.0
4621,943,599,5.0
4622,943,957,5.0
4623,943,1216,5.0
