In [1]:
import pandas

In [49]:
training_set = pandas.read_csv('./ml-100k/u1.base', sep = '\t', 
                               names = ['userid', 'itemid', 'rating', 'tm'])
training_set.head()

Unnamed: 0,userid,itemid,rating,tm
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [7]:
test_set = pandas.read_csv('./ml-100k/u1.test', sep = '\t', 
                           names = ['userid', 'itemid', 'rating', 'tm'])
test_set.head()

Unnamed: 0,userid,itemid,rating,tm
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


# 计算电影与使用者数量

In [8]:
n_movies = max(max(training_set.itemid.tolist()), max(test_set.itemid.tolist()))
n_movies

1682

In [9]:
n_users = max(max(training_set.userid.tolist()), max(test_set.userid.tolist()))
n_users

943

# 建立训练数据集矩阵

In [12]:
import numpy as np

In [63]:
training_m = np.zeros((n_users, n_movies))

# 当我们在需要遍历行数据的时候，就可以使用 iterrows()
for rec in training_set.iterrows():
    training_m[rec[1].userid - 1, rec[1].itemid - 1] = 1

In [64]:
training_m.shape

(943, 1682)

In [65]:
training_m

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

# 建立测试数据集矩阵

In [66]:
test_m = np.zeros((n_users, n_movies))
for rec in test_set.iterrows():
    test_m[rec[1].userid - 1, rec[1].itemid - 1] = 1

In [67]:
test_m.shape

(943, 1682)

In [68]:
test_m

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 建立 Autoencoders

In [69]:
from keras.layers import Input, Dense
from keras.models import Model

Using TensorFlow backend.


In [71]:
# 把数据压缩成 50 个神经元
encoding_dim = 50
input_data = Input(shape = (n_users, ))
encoded = Dense(encoding_dim, activation='softmax')(input_data)
decoded = Dense(n_users)(encoded)

autoencoder = Model(input_data, decoded)
autoencoder.compile(optimizer = 'adam', loss = 'mean_absolute_error')

# 训练 Autoencoders

In [72]:
autoencoder.fit(training_m.T, training_m.T,
               epochs = 100,
               batch_size=32,
               shuffle=True,
               validation_data=(test_m.T, test_m.T))

Train on 1682 samples, validate on 1682 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x640fce390>

# 建立 Encoder

In [73]:
encoder = Model(input_data, encoded)

In [74]:
encoded_data = encoder.predict(training_m.T)

In [75]:
encoded_data.shape

(1682, 50)

# 建立 Decoder 

In [76]:
encoded_input = Input(shape = (encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(encoded_input, decoder_layer(encoded_input))

In [77]:
pred = decoder.predict(encoded_data)

In [80]:
pred

array([[ 3.56280711e-04,  1.69288833e-04,  2.48612370e-04, ...,
         1.05819921e-03, -7.61309639e-05,  3.59571576e-01],
       [ 3.25496658e-04,  1.66346901e-04,  2.35552900e-04, ...,
         1.02376193e-03, -9.75769944e-05,  3.43295574e-01],
       [ 1.07762287e-04, -1.78302871e-04, -3.64128500e-05, ...,
         3.04965302e-04, -4.96692955e-05,  5.65834343e-05],
       ...,
       [ 5.25601208e-05, -2.74326187e-04,  6.53453171e-05, ...,
         2.43362738e-04, -1.10612717e-04,  1.39027834e-04],
       [ 7.62974378e-05, -2.59951164e-04,  5.87571412e-05, ...,
         2.43650516e-04, -1.03760045e-04,  2.00811774e-05],
       [ 6.95738709e-05, -2.58432876e-04,  4.31053340e-05, ...,
         2.54149316e-04, -9.89309046e-05,  2.39424407e-04]], dtype=float32)

# 检视原始数据和重建数据

In [84]:
# 先比较计算第一行的数据准确度
sum((pred[:, 0] > 0.001).astype(int) == training_m.T[:, 0]) / len(training_m.T[:, 0])

0.9197384066587396

In [85]:
# 比较计算整个矩阵的数据准确度
((pred > 0.001).astype(int) == training_m.T).sum() / (1682 * 943)

0.9623598629617067

In [88]:
# 计算均方差
## 把没看过的电影遮罩掉
pred[training_m.T == 0] = 0

# power(x, y) 函数，计算 x 的 y 次方
mse = np.mean(np.power(training_m.T.flatten() - pred.flatten(), 2))
mse

0.0446388878131016

# 套用到测试数据集

In [89]:
encoded_test_data = encoder.predict(test_m.T)

In [90]:
pred_test = decoder.predict(encoded_test_data)

In [91]:
((pred_test > 0.001).astype(int) == test_m.T).sum() / (1682 * 943)

0.9645614535036939

In [93]:
pred_test[test_m.T == 0] = 0
mse = np.mean(np.power(test_m.T.flatten() - pred_test.flatten(), 2))
mse

0.012591080014690026