### 用 TensorFlow 构建 DAE 推荐系统

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Raw ratings.dat rows have the form UserID::MovieID::Rating::Timestamp.
# Rewrite every '::' separator to a tab, editing the file in place; perl's
# -i.bak switch saves the previous file content as ratings.dat.bak.
# NOTE(review): running this cell a second time replaces the backup with the
# already-converted file.
!perl -pi.bak -e 's/::/\t/g' ratings.dat

In [3]:
# Load the tab-separated ratings; the file has no header row, so the column
# names are supplied explicitly and the unused timestamp is dropped right away.
df = pd.read_csv(
    'ratings.dat',
    sep='\t',
    header=None,
    names=['user', 'item', 'rating', 'timestamp'],
).drop('timestamp', axis=1)

# Distinct user/item counts; num_items later sizes the network input layer.
num_users = df['user'].nunique()
num_items = df['item'].nunique()

print("USERS: {} ITEMS: {}".format(num_users, num_items))

USERS: 6040 ITEMS: 3706


In [5]:
# Min-max normalize the rating column before building the user-item matrix
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
r = df['rating'].values.astype(float)
# The scaler works on 2-D input, so the ratings are passed as a single column.
x_scaled = min_max_scaler.fit_transform(r.reshape(-1, 1))
# Write the scaled values back over the original ratings (aligned on the row index).
df_normalized = pd.DataFrame(x_scaled)
df['rating'] = df_normalized

In [6]:
# Reshape the long (user, item, rating) table into a user x item matrix and
# put 0 wherever a user has not rated an item.
# NOTE(review): after min-max scaling the lowest rating is also 0, so unrated
# cells and lowest-rated cells become indistinguishable - confirm this is intended.
matrix = df.pivot(index='user', columns='item', values='rating').fillna(0)

In [7]:
# Keep the row/column labels so matrix positions can be mapped back to the
# original user and item ids once predictions have been computed.
users = matrix.index.tolist()
items = matrix.columns.tolist()
# .values replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; it returns the same underlying ndarray.
matrix = matrix.values

In [8]:
# Network hyperparameters: one input unit per item, then two narrowing hidden layers
num_input = num_items
num_hidden_1 = 10
num_hidden_2 = 5

# Each fed row is one user's rating vector; the batch dimension is left open.
X = tf.placeholder(tf.float64, [None, num_input])

# Weight matrices, drawn from a normal distribution. The decoder shapes mirror
# the encoder shapes so the output has the same width as the input.
weights = {
    layer: tf.Variable(tf.random_normal(shape, dtype=tf.float64))
    for layer, shape in [
        ('encoder_h1', [num_input, num_hidden_1]),
        ('encoder_h2', [num_hidden_1, num_hidden_2]),
        ('decoder_h1', [num_hidden_2, num_hidden_1]),
        ('decoder_h2', [num_hidden_1, num_input]),
    ]
}

# Bias vectors, one per layer, sized to that layer's output width.
biases = {
    layer: tf.Variable(tf.random_normal(shape, dtype=tf.float64))
    for layer, shape in [
        ('encoder_b1', [num_hidden_1]),
        ('encoder_b2', [num_hidden_2]),
        ('decoder_b1', [num_hidden_1]),
        ('decoder_b2', [num_input]),
    ]
}

In [9]:
# Build the encoder
def encoder(x):
    """Encode `x` with two fully connected ReLU layers.

    Applies relu(activation @ W + b) twice, first with the 'encoder_h1' /
    'encoder_b1' parameters and then with 'encoder_h2' / 'encoder_b2', read
    from the module-level `weights` and `biases` dicts.

    Returns the output of the second layer.
    """
    activation = x
    for w_key, b_key in (('encoder_h1', 'encoder_b1'), ('encoder_h2', 'encoder_b2')):
        pre_activation = tf.add(tf.matmul(activation, weights[w_key]), biases[b_key])
        activation = tf.nn.relu(pre_activation)
    return activation


# Build the decoder
def decoder(x):
    """Decode `x` with two fully connected ReLU layers.

    Applies relu(activation @ W + b) twice, first with the 'decoder_h1' /
    'decoder_b1' parameters and then with 'decoder_h2' / 'decoder_b2', read
    from the module-level `weights` and `biases` dicts.

    Returns the output of the second layer.
    """
    activation = x
    for w_key, b_key in (('decoder_h1', 'decoder_b1'), ('decoder_h2', 'decoder_b2')):
        pre_activation = tf.add(tf.matmul(activation, weights[w_key]), biases[b_key])
        activation = tf.nn.relu(pre_activation)
    return activation


# The autoencoder is trained to reproduce its own input, so the target is X itself
y_true = X

# Wire the full model: input -> encoder -> decoder
encoder_op = encoder(X)

# The decoder output is the reconstruction, which doubles as the prediction
decoder_op = y_pred = decoder(encoder_op)

In [10]:
# Loss and optimizer: minimize the mean squared reconstruction error with RMSProp
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.03).minimize(loss)

# Empty frame for now; the training cell assigns the reconstructed ratings to it
predictions = pd.DataFrame()

# Evaluation criterion: a precision metric fed through two integer placeholders
eval_x = tf.placeholder(dtype=tf.int32)
eval_y = tf.placeholder(dtype=tf.int32)
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

In [11]:
# Create the variable-initializer ops. They are built only after the model,
# optimizer and metric ops above, so the variables those ops created are included.
# Nothing is initialized yet: the ops run inside the training session.
init = tf.global_variables_initializer()
# Local variables cover the counters kept by the tf.metrics.precision op.
local_init = tf.local_variables_initializer()

In [12]:
# Train the autoencoder inside a session, then reconstruct the whole rating
# matrix and reshape the result into a long (user, item, rating) table.
with tf.Session() as session:
    epochs = 100
    batch_size = 250

    session.run(init)
    session.run(local_init)

    # Always use at least one batch: with fewer than batch_size rows the old
    # computation gave 0, which breaks np.array_split and the loss average.
    num_batches = max(1, matrix.shape[0] // batch_size)
    # Keep the batches under their own name so `matrix` stays a 2-D array and
    # the cell can be re-run even after an interrupted training run.
    batches = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in batches:
            # One optimizer step per batch; `l` is that batch's loss value.
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        # Mean of the per-batch losses for this epoch.
        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

    print("Predictions...")

    # Reconstruct every user's rating vector in a single forward pass.
    preds = session.run(decoder_op, feed_dict={X: matrix})

    # Build the frame directly; DataFrame.append was removed in pandas 2.0.
    predictions = pd.DataFrame(preds)

    # Long format: one (row position, column position, rating) record per cell.
    predictions = predictions.stack().reset_index(name='rating')
    predictions.columns = ['user', 'item', 'rating']
    # Map matrix positions back to the original ids with vectorized lookups
    # instead of a Python-level lambda call per row.
    predictions['user'] = np.asarray(users)[predictions['user'].values]
    predictions['item'] = np.asarray(items)[predictions['item'].values]

Epoch: 1 Loss: 59.0580499371
Epoch: 2 Loss: 3.0786729902
Epoch: 3 Loss: 0.97855501622
Epoch: 4 Loss: 0.513937832167
Epoch: 5 Loss: 0.402825277299
Epoch: 6 Loss: 0.268313886598
Epoch: 7 Loss: 0.112210400092
Epoch: 8 Loss: 0.035147917845
Epoch: 9 Loss: 0.0213621712755
Epoch: 10 Loss: 0.0205971047981
Epoch: 11 Loss: 0.0206085862204
Epoch: 12 Loss: 0.0206078271537
Epoch: 13 Loss: 0.0206037946822
Epoch: 14 Loss: 0.0205983311947
Epoch: 15 Loss: 0.0205973804307
Epoch: 16 Loss: 0.0205953549206
Epoch: 17 Loss: 0.0205961550819
Epoch: 18 Loss: 0.0205949976031
Epoch: 19 Loss: 0.0205929138853
Epoch: 20 Loss: 0.0205930756638
Epoch: 21 Loss: 0.02059213821
Epoch: 22 Loss: 0.0205901029985
Epoch: 23 Loss: 0.0205905007509
Epoch: 24 Loss: 0.0205895710193
Epoch: 25 Loss: 0.0205876573455
Epoch: 26 Loss: 0.0205875455479
Epoch: 27 Loss: 0.0205855603563
Epoch: 28 Loss: 0.0205855794096
Epoch: 29 Loss: 0.0205850776595
Epoch: 30 Loss: 0.0205833382206
Epoch: 31 Loss: 0.020582481171
Epoch: 32 Loss: 0.0205818848141


In [13]:
# Compute the top-10 recommendations for every user
print("Filtering out items in training set")
keys = ['user', 'item']
# Compare (user, item) pairs through MultiIndexes to find pairs already rated.
i1 = predictions.set_index(keys).index
i2 = df.set_index(keys).index

# Drop already-rated pairs, rank each user's remaining items by predicted
# rating (highest first) and keep the first ten per user.
recs = (
    predictions[~i1.isin(i2)]
    .sort_values(['user', 'rating'], ascending=[True, False])
    .groupby('user')
    .head(10)
)
recs.to_csv('recs.tsv', sep='\t', index=False, header=False)

Filtering out items in training set
