In [1]:
from __future__ import absolute_import, division, print_function
import numpy as np
import pandas as pd

In [2]:
from collections import deque
from six import next

import tensorflow as tf
import numpy as np
import readers
import time

In [4]:
np.random.seed(42)#指定随机种子,方便复现

u_num = 6040 #用户数
i_num = 3952 #电影数

batch_size = 1000 #批尺寸，每次训练部分
dims = 5 #暂时指定隐含因子的纬度 m*5 5*n
max_epochs = 50 #1个epoch等于使用训练集中的全部样本训练一次；

place_device = "/cpu:0"


In [5]:
def get_data():
    df = read_file("./ml-1m/ratings.dat",sep="::")
    rows = len(df)
    
    #重定位进行乱序洗牌操作，消失用户顺序的影响
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    
    #指定训练集，测试集的比例 ，0.9训练
    splite_index = int(rows * 0.9)
    df_train = df[0:splite_index]
    df_test = df[splite_index:].reset_index(drop=True)
    
    return df_train, df_test

def clip(x):
    return np.clip(x,1.0,5.0)

In [6]:
def model(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        # 全局大环境
        bias_global = tf.get_variable("bias_global", shape=[])
        # 用户与电影偏差
        w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
        w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
        # 训练其中一部分 ，参数定义100个，不会再更新
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
        # User and item weight variables
        w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        # Weight embeddings for user and items, given a batch
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    
    with tf.device(device):
        # reduce_sum: Computes the sum of elements across dimensions of a tensor
        infer = tf.reduce_sum(tf.mul(embd_user, embd_item), 1) #矩阵乘积
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        # l2正则化惩罚项，防止过拟合
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), 
                             name="svd_regularizer")
    return infer, regularizer

In [7]:
#损失函数 = 正则loss，data_loss
def loss(infer,regularizer,rate_batch,learning_rate=0.001, reg=0.1,device="/cpu:0"):
    with tf.device(device):
        #真实值与结果值平方项差异,data_loss
        cost_l2 = tf.nn.l2_loss(tf.subtract(infer,rate_batch))
        #正则化惩罚力度
        penalty = tf.constant(reg, dtype=tf.float32,shape=[],name="l2")
        #组装损失函数
        cost = tf.add(cost_l2,tf.multiply(regularizer,penalty))
        #梯度下降优化器
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    return cost, train_op

In [8]:
df_train, df_test = get_data()

samples_per_batch = len(df_train)
print("训练集:%d , 测试集:%d, 迭代次数:%d" % (len(df_train),len(df_test),samples_per_batch))

训练集:900188 , 测试集:100021, 迭代次数:900188


In [9]:
print(df_train["user"].head())
print("-----------------------")
print(df_test["user"].head())

0    5411
1    5439
2     367
3     424
4    4941
Name: user, dtype: int32
-----------------------
0    1696
1    5448
2    2242
3    5629
4     423
Name: user, dtype: int32


In [10]:
print(df_train["item"].head())
print("-----------------------")
print(df_test["item"].head())

0    2682
1     903
2    3716
3    1720
4    3696
Name: item, dtype: int32
-----------------------
0    3113
1    1195
2     749
3    3623
4    2899
Name: item, dtype: int32


In [11]:
print(df_train["rate"].head())
print("-----------------------")
print(df_test["rate"].head())

0    2.0
1    5.0
2    4.0
3    4.0
4    1.0
Name: rate, dtype: float32
-----------------------
0    5.0
1    5.0
2    5.0
3    2.0
4    2.0
Name: rate, dtype: float32


In [12]:
# 洗牌格式化数据
iter_train = readers.ShuffleIterator([df_train["user"],
                                     df_train["item"],
                                     df_train["rate"]],
                                     batch_size=batch_size)

# S测试数据
iter_test = readers.OneEpochIterator([df_test["user"],
                                     df_test["item"],
                                     df_test["rate"]],
                                     batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

#运行
infer, regularizer = model(user_batch, item_batch, user_num=u_num, item_num=i_num, dim=dims, device=place_device)
_, train_op = loss(infer, regularizer, rate_batch, learning_rate=0.10, reg=0.05, device=place_device)

In [13]:
#保存模型
saver = tf.train.Saver()
#全局变量初始化
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    print("%s\t%s\t%s\t%s" % ("Epoch", "Train Error", "Val Error", "Elapsed Time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(max_epochs * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            
            print("%02d\t%.3f\t\t%.3f\t\t%.3f secs" % (i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)), end - start))
            start = end

    saver.save(sess, './save/model')

Epoch	Train Error	Val Error	Elapsed Time


ValueError: negative dimensions are not allowed

In [2]:
# 运行
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    #sess.run(init_op)
    new_saver = tf.train.import_meta_graph('./save/model.meta')
    new_saver.restore(sess, tf.train.latest_checkpoint('./save/'))
    test_err2 = np.array([])
    for users, items, rates in iter_test:
        pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                item_batch: items})
        pred_batch = clip(pred_batch)
        print("Pred\tActual")
        for ii in range(10):
            print("%.3f\t%.3f" % (pred_batch[ii], rates[ii]))
        test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
        print(np.sqrt(np.mean(test_err2)))

NameError: name 'tf' is not defined