In [82]:
import time
import random

import numpy as np
from scipy.spatial import KDTree
import pickle


In [77]:
def load_data():
    """
    加载文件，从data/train.txt中加载原始训练数据，统计原始训练数据中的相关信息，进行打印输出
    形成字典sparse_matrix，sparse_matrix的键为用户编号，sparse_matrix的值为另一个字典rate_of_curruser
    字典rate_of_curruser的键为物品的编号，值为当前用户对该物品的评分。
    :return:sparse_matrix
    """
    start_time=time.time()
    train_path='data/train.txt'
    users=[]
    items=[]
    rates=[]
    with open(train_path,'r') as file:
        top_line=file.readline()
        sparse_matrix=dict()
        while top_line:
            user,nums=top_line.split('|')
            user=int(user)
            nums=int(nums)
            users.append(user)
            rate_of_curruser=dict()
            for i in range(nums):
                rate_line=file.readline()
                item,rate=rate_line.split()
                item=int(item)
                rate=int(rate)
                items.append(item)
                rates.append(rate)
                rate_of_curruser[item]=rate
            sparse_matrix[user]=rate_of_curruser
            top_line=file.readline()

        # 数据统计输出
        set_users = sorted(list(set(users)))
        set_items = sorted(list(set(items)))
        print('关于用户:')
        print('实际的用户数量:{}'.format(len(set_users)))
        print('用户的编号范围: {} 至 {}'.format(set_users[0],set_users[-1]))
        print('关于物品:')
        print('实际的物品数量:{}'.format(len(set_items)))
        print('物品的编号范围: {} 至 {}'.format(set_items[0], set_items[-1]))
        print('矩阵中的空闲率:{}'.format(1-len(items)/(len(set_items)*len(set_users))))
        end_time=time.time()
        print('加载原始数据，用时{}秒'.format(end_time-start_time))
        return set_users,set_items,sparse_matrix


In [78]:
def train_test_spilt(matrix,sample_rate=0.2):
    """

    :param matrix:原始的训练数据字典
    :param sample_rate:测试集划分比率，默认为20%
    :return:训练集字典和测试集字典
    """
    start_time=time.time()
    train_data=dict()
    test_data=dict()
    for user,rate_dict in matrix.items():
        sample_num=int(len(rate_dict)*sample_rate)

        test_keys=random.sample(list(rate_dict),sample_num)
        tmp_test_data={key:rate_dict[key] for key in test_keys}
        tmp_train_data={key:rate_dict[key] for key in rate_dict if key not in test_keys}
        train_data[user]=tmp_train_data
        test_data[user]=tmp_test_data
    end_time=time.time()
    print('训练集数据划分，用时{}秒'.format(end_time-start_time))
    return train_data,test_data

In [79]:
def load_attribute(bi):
    start=time.time()
    file_path='data/itemAttribute.txt'
    attr_dict=dict()
    with open(file_path,'r') as f:
        line=f.readline()
        debug=10000
        while line:
            item,att1,att2=line.split('|')
            if int(item)>debug:
                debug+=10000
                print(item)
            if 'None' in att1:
                att1=-1
            if 'None' in att2:
                att2=-1
            if int(item) in bi:
                attr_dict[int(item)]=[int(att1),int(att2)]
            line=f.readline()
    index2no=dict()
    no2index=dict()
    attr_array=np.zeros((len(attr_dict),2))
    index=0
    for item,attr in attr_dict.items():
        index2no[index]=item
        no2index[item]=index
        attr_array[index] = attr
        index+=1

    end=time.time()
    print('加载属性，用时{} s'.format(end-start))
    return index2no,no2index,attr_array

In [80]:
def k_neighbour(item_no,no2index,index2no,attr_array,kdtree,k):
    if item_no not in no2index:
        return []
    else:
        index=no2index[item_no]
        dist,ind=kdtree.query(attr_array[index],k)
        item_list=[index2no[i] for i in ind ]
        return item_list

In [81]:
def neighbour_item(bi):
    res=dict()
    index2no,no2index,attr_array=load_attribute(bi)
    kdtree=KDTree(attr_array)
    for item in bi.keys():
        # 对于当前的每一个物品
        if item not in no2index:
            res[item]=[]
        else:
            index = no2index[item]
            dist, ind = kdtree.query(attr_array[index], 5)
            item_list = [index2no[i] for i in ind]
            res[item]=item_list
    return res

In [89]:
class fit_model:

    def __init__(self,mean,bias_u,bias_i,pu,qi):
        self.mean=mean
        self.bias_u=bias_u
        self.bias_i=bias_i
        self.pu=pu
        self.qi=qi
        self.neighbour=neighbour_item(bias_i)
        print(self.neighbour)

    def predict_score(self,user_no,item_no):
        basic=self.pu[user_no]@self.qi[item_no]
        return basic+self.mean+self.bias_u[user_no]+self.bias_i[item_no]

    def pred_attribute(self,user_no,item_no):
        basic = self.pu[user_no] @ self.qi[item_no]+self.bias_i[item_no]
        item_list=self.neighbour[item_no]
        for item in item_list:
            basic+=self.pu[user_no] @ self.qi[item]+self.bias_i[item]
        return basic/(len(item_list)+1)+self.bias_u[user_no]+self.mean

    def gradient_desc(self,user_no,item_no,error,lr,lamb):
        self.bias_u[user_no] += lr * (error - lamb * self.bias_u[user_no])
        self.bias_i[item_no] += lr * (error - lamb * self.bias_i[item_no])
        old_pu=self.pu[user_no]
        old_qi=self.qi[item_no]
        self.pu[user_no] += lr * (error * old_qi - lamb * old_pu)
        self.qi[item_no] += lr * (error * old_pu - lamb * old_qi)

In [84]:
def get_mean_of_train(train):
    sum_rate=0
    count=0
    for user,items in train.items():
        for item_no in items.keys():
            sum_rate+=items[item_no]
            count+=1
    return sum_rate/count

In [85]:
def funk_svd_train(train,test,set_users,set_items,n_epoch,lr,k,lamb):
    mean = get_mean_of_train(train)
    bias_u = dict()
    bias_i = dict()
    pu = dict()
    qi = dict()
    for user_no in set_users:
        bias_u[user_no]=0
        pu[user_no]=np.random.normal(0, .1, k)
    for item_no in set_items:
        bias_i[item_no]=0
        qi[item_no]=np.random.normal(0, .1, k)

    model=fit_model(mean,bias_u,bias_i,pu,qi)
    for epoch in range(n_epoch):
        for user_no,items in train.items():
            for item_no,real_rate in items.items():
                predict_rate=model.pred_attribute(user_no,item_no)
                error=real_rate-predict_rate
                # 梯度下降
                model.gradient_desc(user_no,item_no,error,lr,lamb)
            if user_no % 2000 == 0:
                print('user progress:[{}/{}]'.format(user_no, len(train)))
        # 完成一轮迭代
        rmse_in_train=funk_svd_eval(train,model)
        rmse_in_test=funk_svd_eval(test,model)
        print('epoch:[{}/{}],RMSE in train is :{} , and RMSE in test is {}'.format(epoch,n_epoch,rmse_in_train,rmse_in_test))
        pickle_path='models/fit_model'+str(epoch)+'.pkl'
        with open(pickle_path,'wb') as f:
            pickle.dump(fit_model,f)

In [86]:
def funk_svd_eval(test,fit_model):
    sum_error=0
    count=0
    for user_no,items in test.items():
        for item_no,real_rate in items.items():
            predict_rate=fit_model.predict_score(user_no,item_no)
            sum_error+=(real_rate-predict_rate)**2
            count+=1

    return np.sqrt(sum_error/count)

In [87]:
set_users,set_items,sparse_matrix=load_data()
train,test=train_test_spilt(sparse_matrix)

关于用户:
实际的用户数量:19835
用户的编号范围: 0 至 19834
关于物品:
实际的物品数量:455705
物品的编号范围: 0 至 624960
矩阵中的空闲率:0.9994466691522359
加载原始数据，用时6.222656965255737秒
训练集数据划分，用时36.37710094451904秒


In [88]:
k=30
mean = get_mean_of_train(train)
bias_u = dict()
bias_i = dict()
pu = dict()
qi = dict()
for user_no in set_users:
    bias_u[user_no]=0
    pu[user_no]=np.random.normal(0, .1, k)
for item_no in set_items:
    bias_i[item_no]=0
    qi[item_no]=np.random.normal(0, .1, k)

In [90]:
model=fit_model(mean,bias_u,bias_i,pu,qi)

10001
20001
30001
40001
50001
60001
70002
80003
90001
100001
110001
120001
130002
140001
150002
160001
170001
180001
190001
200001
210001
220001
230001
240001
250001
260002
270001
280001
290001
300002
310001
320001
330001
340001
350001
360001
370001
380001
390001
400001
410001
420001
430001
440001
450001
460001
470001
480001
490001
500001
510001
520001
530002
540001
550002
560001
570002
580002
590002
600003
610001
620002
加载属性，用时2.3667335510253906 s


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [92]:
file_path='data/itemAttribute.txt'
attr_dict=dict()
with open(file_path,'r') as f:
    line=f.readline()
    while line:
        item,att1,att2=line.split('|')
        if 'None' in att1:
            att1=-1
        if 'None' in att2:
            att2=-1
        attr_dict[int(item)]=[int(att1),int(att2)]
        line=f.readline()
res=sorted(list(set(attr_dict.keys())))
print(len(res))
print(res[0])
print(res[-1])

507172
0
624960
