In [12]:
import time
import random
import numpy as np

In [13]:
def load_data(train_path='data/train.txt'):
    """
    Load the raw rating file, print summary statistics and return the ratings.

    The file is read as a sequence of records. Each record starts with a
    header line ``<user>|<count>`` followed by ``<count>`` lines of
    ``<item> <rate>`` (whitespace separated). All fields are parsed as ints.
    Blank header lines are skipped.

    :param train_path: path of the rating file (defaults to 'data/train.txt')
    :return: tuple ``(set_users, set_items, sparse_matrix)`` where
        ``set_users`` is the ascending list of distinct user ids,
        ``set_items`` is the ascending list of distinct item ids and
        ``sparse_matrix`` maps user id -> {item id: rate}
    :raises ValueError: if the file ends before a user's announced number of
        rating lines has been read, or a line cannot be parsed
    """
    start_time=time.time()
    sparse_matrix=dict()
    seen_items=set()
    # Total number of rating lines read; only the count is needed for the
    # sparsity statistic, so the individual ratings are not kept in a list.
    n_ratings=0
    with open(train_path,'r',encoding='utf-8') as file:
        top_line=file.readline()
        while top_line:
            if top_line.strip():
                user,nums=top_line.split('|')
                user=int(user)
                rate_of_curruser=dict()
                for _ in range(int(nums)):
                    rate_line=file.readline()
                    if not rate_line:
                        raise ValueError(
                            'unexpected end of file while reading ratings of user {}'.format(user))
                    item,rate=rate_line.split()
                    item=int(item)
                    rate_of_curruser[item]=int(rate)
                    seen_items.add(item)
                    n_ratings+=1
                sparse_matrix[user]=rate_of_curruser
            top_line=file.readline()

    # Summary statistics
    set_users=sorted(sparse_matrix)
    set_items=sorted(seen_items)
    print('关于用户:')
    print('实际的用户数量:{}'.format(len(set_users)))
    if set_users:
        print('用户的编号范围: {} 至 {}'.format(set_users[0],set_users[-1]))
    print('关于物品:')
    print('实际的物品数量:{}'.format(len(set_items)))
    if set_items:
        print('物品的编号范围: {} 至 {}'.format(set_items[0], set_items[-1]))
    # The ratio is undefined for an empty matrix, so it is only reported
    # when there is at least one user and one item.
    if set_users and set_items:
        print('矩阵中的空闲率:{}'.format(1-n_ratings/(len(set_items)*len(set_users))))
    end_time=time.time()
    print('加载原始数据，用时{}秒'.format(end_time-start_time))
    return set_users,set_items,sparse_matrix


In [14]:
def train_test_spilt(matrix,sample_rate=0.2,seed=None):
    """
    Split every user's ratings into a training part and a held-out test part.

    For each user, ``int(len(ratings) * sample_rate)`` items are drawn at
    random without replacement and moved to the test dictionary; the rest
    stay in the training dictionary. Every user of ``matrix`` appears in
    both results (possibly with an empty dict).

    :param matrix: rating dictionary, user id -> {item id: rate}
    :param sample_rate: fraction of each user's ratings to hold out (default 20%)
    :param seed: optional seed; when given, the split is reproducible.
        When None the module-level ``random`` generator is used.
    :return: tuple ``(train_data, test_data)`` with the same structure as ``matrix``
    """
    start_time=time.time()
    rng=random.Random(seed) if seed is not None else random
    train_data=dict()
    test_data=dict()
    for user,rate_dict in matrix.items():
        sample_num=int(len(rate_dict)*sample_rate)

        test_keys=rng.sample(list(rate_dict),sample_num)
        # Membership is tested once per rating, so use a set; testing
        # against the sampled list made the split quadratic per user.
        held_out=set(test_keys)
        test_data[user]={key:rate_dict[key] for key in test_keys}
        train_data[user]={key:rate for key,rate in rate_dict.items() if key not in held_out}
    end_time=time.time()
    print('训练集数据划分，用时{}秒'.format(end_time-start_time))
    return train_data,test_data

In [15]:
class fit_model:
    """
    Latent-factor rating model with biases and an implicit-feedback term
    (SVD++ style), updated one rating at a time.

    State kept on the instance:
      train          -- user id -> {item id: rate}; also defines which items
                        contribute to a user's implicit-feedback term
      k              -- number of latent dimensions
      mean           -- global mean rate of ``train``
      bias_u/bias_i  -- dicts of scalar user / item biases
      pu/qi          -- dicts of length-k user / item factor vectors
      item_hidden    -- array of shape (max item id + 1, k) holding the
                        implicit-feedback vector of each item, indexed by item id
      curr_mean_item -- length-k implicit term computed by the most recent
                        ``predict_score`` call; ``gradient_desc`` reuses it
    """

    def __init__(self,train,set_users,set_items,k):
        """
        Initialise biases to zero, factors to uniform random values in [0, 1)
        and implicit-feedback vectors to zero.

        :param train: training set, user id -> {item id: rate}
        :param set_users: collection of all user ids
        :param set_items: collection of all item ids
        :param k: number of latent dimensions
        """
        self.train=train
        self.k=k
        self.mean = get_mean_of_train(train)
        self.bias_u = dict()
        self.bias_i = dict()
        self.pu = dict()
        self.qi = dict()
        # Rows are addressed directly by item id, so the table must reach the
        # largest id; max() avoids relying on set_items being sorted.
        self.item_hidden=np.zeros((max(set_items,default=-1)+1,self.k))
        for user_no in set_users:
            self.bias_u[user_no] = 0
            self.pu[user_no] = np.random.rand(k)
        for item_no in set_items:
            self.bias_i[item_no] = 0
            self.qi[item_no] = np.random.rand(k)

        # Always a flat length-k vector (see predict_score / gradient_desc).
        self.curr_mean_item=np.zeros(k)

    def predict_score(self,user_no,item_no):
        """
        Predict the rate of ``user_no`` for ``item_no``.

        Also stores the user's implicit-feedback term in
        ``self.curr_mean_item`` so that a following ``gradient_desc`` call
        for the same user can reuse it.

        :param user_no: user id
        :param item_no: item id
        :return: predicted rate
        """
        items=self.train[user_no]
        num_of_items=len(items)
        if num_of_items==0:
            # No rated items: fall back to a constant 0.1 vector.
            res=np.full(self.k,0.1)
        else:
            item_list=list(items)
            avg=np.mean(self.item_hidden[item_list],axis=0)
            # NOTE(review): this is mean/sqrt(n); the usual SVD++ formulation
            # uses sum/sqrt(n) -- confirm the extra 1/n is intended.
            res=avg/np.sqrt(num_of_items)
        self.curr_mean_item=res
        basic=self.mean+self.bias_u[user_no]+self.bias_i[item_no]
        basic+=np.sum(self.qi[item_no]*(self.pu[user_no]+res))
        return basic

    def gradient_desc(self,user_no,item_no,error,lr,lamb):
        """
        Apply one gradient step for a single (user, item) rating.

        Must be called right after ``predict_score(user_no, item_no)``: the
        implicit-feedback term is taken from ``self.curr_mean_item``.

        :param user_no: user id
        :param item_no: item id
        :param error: real rate minus predicted rate
        :param lr: learning rate
        :param lamb: regularisation weight
        """
        self.bias_u[user_no] += lr * (error - lamb * self.bias_u[user_no])
        self.bias_i[item_no] += lr * (error - lamb * self.bias_i[item_no])
        # Copy before updating: ``+=`` on a numpy array works in place, so a
        # plain reference would already hold the new pu when qi is updated.
        old_pu = self.pu[user_no].copy()
        old_qi = self.qi[item_no].copy()
        # Flatten so the whole length-k vector is used whatever shape was
        # stored; indexing [0] on a flat vector would pick a single scalar.
        implicit = np.ravel(self.curr_mean_item)
        self.pu[user_no] += lr * (error * old_qi - lamb * old_pu)
        self.qi[item_no] += lr * (error * (old_pu+implicit) - lamb * old_qi)
        # Update the implicit-feedback vectors of every item the user rated.
        items=self.train[user_no]
        if not items:
            # Nothing to update (and sqrt(0) would be used as a divisor).
            return
        sqrt_len=np.sqrt(len(items))
        item_list=list(items)
        # NOTE(review): each row uses that item's own qi; the usual SVD++
        # update uses the qi of the item being rated -- confirm intent.
        tmp_array=np.array([self.qi[no] for no in item_list])
        self.item_hidden[item_list]+=lr*(error*tmp_array/sqrt_len-lamb*self.item_hidden[item_list])


In [16]:
def get_mean_of_train(train):
    """
    Return the mean of all rates stored in ``train``.

    :param train: rating dictionary, user id -> {item id: rate}
    :return: sum of every rate divided by the number of rates
    """
    total=0
    n_rates=0
    for ratings in train.values():
        for rate in ratings.values():
            total+=rate
            n_rates+=1
    return total/n_rates

In [17]:
def svdpp_eval(test,fit_model):
    """
    Compute the root-mean-square error of a model over a rating dictionary.

    :param test: rating dictionary, user id -> {item id: rate}
    :param fit_model: object providing ``predict_score(user_no, item_no)``
    :return: square root of the mean squared difference between the stored
        rate and the predicted rate
    """
    squared_error_total=0
    n_rates=0
    for user_no in test:
        for item_no,real_rate in test[user_no].items():
            residual=real_rate-fit_model.predict_score(user_no,item_no)
            squared_error_total+=residual**2
            n_rates+=1

    mean_squared_error=squared_error_total/n_rates
    return np.sqrt(mean_squared_error)

In [18]:
def svdpp_train(train,test,set_users,set_items,n_epoch,lr,k,lamb,log_every=5):
    """
    Train a ``fit_model`` by per-rating gradient steps and report the RMSE on
    the training and test sets after every epoch.

    :param train: training set, user id -> {item id: rate}
    :param test: test set, same structure as ``train``
    :param set_users: collection of all user ids
    :param set_items: collection of all item ids
    :param n_epoch: number of passes over the training set
    :param lr: learning rate
    :param k: number of latent dimensions
    :param lamb: regularisation weight used in the gradient step
    :param log_every: print a progress line every ``log_every`` users
        (counted by position in ``train``); 0 or None disables progress output
    :return: the trained model
    """
    model=fit_model(train,set_users,set_items,k)
    n_users=len(train)
    for epoch in range(n_epoch):
        # Progress is counted by position so it stays meaningful even when
        # user ids are not the contiguous range 0..n_users-1.
        for position,(user_no,items) in enumerate(train.items()):
            for item_no,real_rate in items.items():
                predict_rate=model.predict_score(user_no,item_no)
                error=real_rate-predict_rate
                # Gradient step; relies on the predict_score call just above.
                model.gradient_desc(user_no,item_no,error,lr,lamb)
            if log_every and position % log_every==0:
                print('user progress:[{}/{}]'.format(position, n_users))
        # End of one epoch: evaluate on both sets.
        rmse_in_train = svdpp_eval(train, model)
        rmse_in_test = svdpp_eval(test, model)
        print('epoch:[{}/{}],RMSE in train is :{} , and RMSE in test is {}'.format(epoch, n_epoch, rmse_in_train,
                                                                                   rmse_in_test))
    # Hand the model back to the caller; without this the training result is lost.
    return model


In [19]:
# Load the raw ratings, then hold out part of every user's ratings as a test
# set (train_test_spilt is called with its default sample_rate of 0.2).
set_users,set_items,sparse_matrix=load_data()
train,test=train_test_spilt(sparse_matrix)

关于用户:
实际的用户数量:19835
用户的编号范围: 0 至 19834
关于物品:
实际的物品数量:455705
物品的编号范围: 0 至 624960
矩阵中的空闲率:0.9994466691522359
加载原始数据，用时9.357588768005371秒
训练集数据划分，用时35.851792335510254秒


In [20]:
# Positional hyper-parameters, in svdpp_train's order:
# n_epoch=20, lr=5e-4, k=3, lamb=0.02.
svdpp_train(train,test,set_users,set_items,20,5e-4,3,0.02)

user progress:[0/19835]
user progress:[5/19835]
user progress:[10/19835]
user progress:[15/19835]
user progress:[20/19835]
user progress:[25/19835]
user progress:[30/19835]
user progress:[35/19835]
user progress:[40/19835]
user progress:[45/19835]
user progress:[50/19835]
user progress:[55/19835]
user progress:[60/19835]
user progress:[65/19835]
user progress:[70/19835]
user progress:[75/19835]
user progress:[80/19835]
user progress:[85/19835]
user progress:[90/19835]
user progress:[95/19835]
user progress:[100/19835]
user progress:[105/19835]
user progress:[110/19835]
user progress:[115/19835]
user progress:[120/19835]
user progress:[125/19835]
user progress:[130/19835]
user progress:[135/19835]
user progress:[140/19835]
user progress:[145/19835]
user progress:[150/19835]
user progress:[155/19835]
user progress:[160/19835]
user progress:[165/19835]
user progress:[170/19835]
user progress:[175/19835]
user progress:[180/19835]
user progress:[185/19835]
user progress:[190/19835]
user pro

KeyboardInterrupt: 

In [1]:
# Scratch check: ``@`` applied to two 1-D arrays gives their inner product.
# Needs numpy bound to ``np`` in the running kernel; the recorded NameError
# came from a kernel where that import had not been executed.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
inner_product = a @ b
print(inner_product)

NameError: name 'np' is not defined

In [6]:
# Scratch experiment: index a plain nested Python list with another list.
# The recorded run of this cell ended in
# "TypeError: list indices must be integers or slices, not list".
mydict={}
mydict[0]=[1, 2,3,4,5]
y=[[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]]
print(y[mydict[0]])

TypeError: list indices must be integers or slices, not list

In [20]:
import numpy as np

# Scratch check: indexing a 2-D array with a list of row numbers selects
# those rows, and a mean over axis 0 averages the selected rows column-wise.
arr = np.array([[1, 2], [3, 4], [5, 6]])
idx = [0, 2]
result = arr[idx]
column_means = np.mean(result, axis=0)
print(result)
print(column_means)

[[1 2]
 [5 6]]
[3. 4.]


In [11]:
# Scratch check: stacking the values of a dict of equal-length lists gives a
# 2-D array with one row per key, in insertion order.
mydict={0:[1,2,3],1:[4,5,6],2:[7,8,9]}

carray=list(mydict.values())
x=np.array(carray)
print(x)


[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [4]:
import numpy as np

# Scratch check: a list used as an index on a 2-D array picks whole rows.
my_array = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
mylist = [1, 2]
selected_rows = my_array[mylist]
print(selected_rows)

[[4 5 6]
 [7 8 9]]
