In [1]:
'''
备注：
使用算法：baseline算法  
数据集切分 ： 先加载全部数据，然后选取 movie_id能被n整除且 user_id能被m整除的数据作为训练集
m=4,n=4时 大约600W条评分数据   probe  RMSE=0.944
m=3,n=3时 大约1100W条评分数据   probe  RMSE=0.964

'''
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold
import pandas as pd
import numpy as np


In [2]:
# Cleaning function for the training files
def data_cleaning(data):
    """Turn a raw one-column ratings frame into (movie_id, user_id, rating) rows.

    Column ``0`` of ``data`` is read line by line. A line containing ``':'``
    is treated as a movie header (everything before its last character is the
    movie id). Every other line is split on ``','``: the first field is the
    user id and the second field is the rating; any further fields are ignored.
    Each rating line inherits the id of the closest preceding header.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column ``0`` holds the raw text lines. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id`` (float), ``user_id`` (int) and ``rating`` (float),
        one row per rating line, keeping the index labels of those lines.
        Rating lines that appear before any header get ``NaN`` as movie_id.

    Raises
    ------
    ValueError
        If a header, user id or rating cannot be parsed as a number.
    IndexError
        If a rating line has no ``','`` separated second field.
    """
    # Work on a string copy so the caller's frame is never touched.
    lines = data[0].astype(str)
    is_header = lines.str.contains(':', regex=False)

    # Header lines carry the movie id; forward-fill it onto the rating lines.
    movie_id = (
        pd.to_numeric(lines.str[:-1].where(is_header))
        .astype(float)
        .ffill()
    )

    rating_lines = lines[~is_header]
    # Builtin int/float replace the np.int/np.float aliases removed from NumPy.
    user_id = rating_lines.map(lambda line: line.split(',')[0]).astype(int)
    rating = rating_lines.map(lambda line: line.split(',')[1]).astype(float)

    return pd.DataFrame({
        'movie_id': movie_id[~is_header],
        'user_id': user_id,
        'rating': rating,
    })

In [3]:
# Cleaning function for the probe file
def probe_cleaning(data):
    """Turn a raw one-column probe frame into (movie_id, user_id) rows.

    Column ``0`` of ``data`` is read line by line. A line containing ``':'``
    is treated as a movie header (everything before its last character is the
    movie id). Every other line is taken as a bare user id and inherits the id
    of the closest preceding header.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column ``0`` holds the raw text lines. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id`` (float) and ``user_id`` (int), in that order, one
        row per user line, keeping the index labels of those lines. User lines
        that appear before any header get ``NaN`` as movie_id.

    Raises
    ------
    ValueError
        If a header or a user line cannot be parsed as a number.
    """
    # Work on a string copy so the caller's frame is never touched.
    lines = data[0].astype(str)
    is_header = lines.str.contains(':', regex=False)

    # Header lines carry the movie id; forward-fill it onto the user lines.
    movie_id = (
        pd.to_numeric(lines.str[:-1].where(is_header))
        .astype(float)
        .ffill()
    )

    return pd.DataFrame({
        'movie_id': movie_id[~is_header],
        # Builtin int replaces the np.int alias removed from NumPy.
        'user_id': lines[~is_header].astype(int),
    })

In [4]:
# Load the training set: read each of the four raw files and clean it.
# sep=' ' is presumably there so that every whole line lands in column 0,
# the only column data_cleaning reads -- TODO confirm the files contain no spaces.
data1, data2, data3, data4 = [
    data_cleaning(pd.read_csv(f'./combined_data_{part}.txt', sep=' ', header=None))
    for part in range(1, 5)
]

In [5]:
# Stack the four cleaned parts into one frame and renumber the rows 0..N-1.
data_all = pd.concat(objs=[data1, data2, data3, data4], ignore_index=True)

In [13]:
# Sub-sample the data: the training set keeps only rows whose movie_id is
# divisible by n and whose user_id is divisible by m.
# The original note says n = m = 3 keeps roughly 11 million of the ~100 million ratings.
n = 3
m = 3
# Movie filter only; the user filter is applied in a separate step.
data_c = data_all.loc[data_all['movie_id'].mod(n).eq(0)]

In [19]:
# Second half of the sub-sampling: keep only users whose id is divisible by m.
data_c = data_c.loc[data_c['user_id'].mod(m).eq(0)]
data_c

Unnamed: 0,movie_id,user_id,rating
694,3.0,1331154,4.0
695,3.0,2632461,3.0
696,3.0,44937,5.0
698,3.0,439011,1.0
700,3.0,1644750,3.0
...,...,...,...
100479568,17769.0,1699236,2.0
100479574,17769.0,359157,3.0
100479576,17769.0,1642092,1.0
100479582,17769.0,77664,2.0


In [8]:
# load_from_df only consults the reader's rating_scale; file-parsing options
# (line_format, sep, skip_lines) have no effect on a DataFrame, so they are
# dropped and the scale that was implicitly in use is spelled out instead.
reader = Reader(rating_scale=(1, 5))
# Build the dataset directly from the DataFrame.
# NOTE(review): load_from_df takes the columns positionally as
# (user, item, rating). With movie_id first, movies play the "user" role inside
# Surprise and users play the "item" role. The probe tuples built later use the
# same (movie_id, user_id, rating) order, so the order is kept as is here.
data = Dataset.load_from_df(data_c[['movie_id', 'user_id', 'rating']], reader=reader)

In [9]:
# Full trainset over every rating in `data` (not used by the cells shown here).
train_set = data.build_full_trainset()
# Baseline estimates fitted with ALS: 5 epochs, regularisation 12 for the
# "user" side and 5 for the "item" side.
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
algo = BaselineOnly(bsl_options=bsl_options)
# KFold shuffles the ratings before splitting; a fixed seed makes the folds,
# and therefore every score derived from them, repeatable across runs.
kf = KFold(n_splits=3, random_state=0)


In [10]:
# 3-fold cross-validation: refit on each training split, score the held-out split.
for trainset, testset in kf.split(data):
    # Fit and predict in one chained call.
    predictions = algo.fit(trainset).test(testset)
    # Print the RMSE of this fold.
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
RMSE: 0.9285
Estimating biases using als...
RMSE: 0.9283
Estimating biases using als...
RMSE: 0.9283


In [11]:
# Read the probe file as a single text column, then reshape it into
# (movie_id, user_id) rows.
probe = pd.read_csv('./probe.txt', header=None, sep=' ')
probe_data = probe_cleaning(data=probe)

In [14]:
# Apply the same divisibility sub-sampling to the probe pairs:
# movie_id divisible by n and user_id divisible by m.
probe_data_c = probe_data.loc[
    probe_data['movie_id'].mod(n).eq(0) & probe_data['user_id'].mod(m).eq(0)
]
# Display only; the sorted frame is not stored.
probe_data_c.sort_values('movie_id')

Unnamed: 0,movie_id,user_id
739896,3.0,2297310
739895,3.0,2251677
739900,3.0,177678
739879,3.0,475143
739888,3.0,349821
...,...,...
564537,17769.0,2586954
564534,17769.0,720417
564526,17769.0,2002893
564556,17769.0,2270286


In [20]:
# accuracy.rmse(predictions, verbose=True) is used to score the probe set, so
# the probe pairs first have to be brought into the same shape as a testset.

# Complete the probe pairs with their ratings by inner-joining on the two id
# columns shared with the training frame.
# NOTE(review): the ratings come from data_c, the same frame the dataset was
# built from, so these rows may also have been seen during fitting -- confirm
# whether the probe rows should be excluded from training.
p_data = probe_data_c.merge(data_c, how='inner', on=['movie_id', 'user_id'])

In [21]:
# Convert to the testset layout: a list of 3-tuples taken from the first three
# columns of each merged row (movie_id, user_id, rating).
probe_test = [(row[0], row[1], row[2]) for row in p_data.values]


In [22]:
# Predict every probe tuple with `algo`. When the cells are run top to bottom,
# `algo` was last fitted inside the cross-validation loop, i.e. on the training
# split of the final fold rather than on the full trainset.
probe_predictions = algo.test(probe_test)

In [23]:
# Print the probe RMSE; the returned value is also shown as the cell output.
accuracy.rmse(probe_predictions, verbose=True)   # compute RMSE

RMSE: 0.9645


0.9645267124259107