In [19]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.models import WDL
from deepctr.inputs import SparseFeat,get_feature_names

data = pd.read_csv("movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# 使用WDL进行训练
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=50, verbose=True, validation_split=0.2, )
# 使用WDL进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

Train on 128 samples, validate on 32 samples
Epoch 1/50


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
test RMSE 1.156027681329474


In [20]:
print(pred_ans)

[[3.0026321]
 [3.0903184]
 [3.03966  ]
 [3.0843148]
 [3.0797138]
 [2.9566696]
 [3.1370401]
 [3.0260105]
 [3.0259323]
 [3.0716891]
 [3.0483646]
 [3.1066575]
 [3.026027 ]
 [3.1008344]
 [3.084531 ]
 [3.0805583]
 [3.3434436]
 [2.8718736]
 [3.0027812]
 [3.0259295]
 [3.0626311]
 [2.9215183]
 [3.1344042]
 [3.0583985]
 [3.079704 ]
 [3.0066743]
 [2.9998665]
 [3.0288336]
 [3.0259192]
 [2.9700506]
 [3.086366 ]
 [2.8737323]
 [3.002696 ]
 [3.0452206]
 [3.0260174]
 [3.493758 ]
 [3.0924222]
 [2.984532 ]
 [3.4188316]
 [2.9830027]]


In [10]:
from surprise import KNNWithMeans
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import KFold

# 数据读取
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
#trainset = data.build_full_trainset()

# ItemCF 计算得分
# 取最相似的用户计算时，只取最相似的k个
# KNNWithMeans
algo = KNNWithMeans(k=50, sim_options={'user_based': False, 'verbose': 'True'})
#algo.fit(trainset)

# 定义K折交叉验证迭代器，K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # 训练并预测
    algo.fit(trainset)
    predictions = algo.test(testset)
    # 计算RMSE
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions)

# uid = str(196)
# iid = str(302)

# pred = algo.predict(uid, iid)
# print(pred)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8555
MAE:  0.6541
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8579
MAE:  0.6566
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8558
MAE:  0.6541


In [11]:
from surprise import KNNBasic, KNNWithZScore, KNNBaseline
print('KNNBasic method\n')
algo_basic = KNNBasic(k=50, sim_options={'user_based': False, 'verbose': 'True'})
# 定义K折交叉验证迭代器，K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # 训练并预测
    algo_basic.fit(trainset)
    predictions = algo.test(testset)
    # 计算RMSE
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions)
print("-"*20)

print('KNNWithZScore method\n')
algo_z = KNNWithZScore(k=50, sim_options={'user_based': False, 'verbose': 'True'})
# 定义K折交叉验证迭代器，K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # 训练并预测
    algo_z.fit(trainset)
    predictions = algo.test(testset)
    # 计算RMSE
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions)
print("-"*20)

print('KNNBasic method\n')
algo_base = KNNBaseline(k=50, sim_options={'user_based': False, 'verbose': 'True'})
# 定义K折交叉验证迭代器，K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # 训练并预测
    algo_base.fit(trainset)
    predictions = algo.test(testset)
    # 计算RMSE
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions)
print("-"*20)

KNNBasic method

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7341
MAE:  0.5529
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7326
MAE:  0.5516
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7328
MAE:  0.5520
--------------------
KNNWithZScore method

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7342
MAE:  0.5532
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7325
MAE:  0.5510
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7328
MAE:  0.5523
--------------------
KNNBasic method

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7327
MAE:  0.5519
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7332
MAE:  0.5524
Estimating biases using als...
Computing the msd