In [2]:
import tensorflow as tf
import numpy as np

RNN要求输入的样本具有相同的长度（即feature的维度相同）。但训练数据不一定如此，例如，不同句子的长度不一样。如果将所有样本都填充到全部样本中的最大长度，则会大大浪费存储空间，也会影响处理效率。

TensorFlow提供的tf.train.batch（设置dynamic_pad=True，即dynamic padding）只需将当前batch内的样本填充到该batch中最长样本的长度，而不必把所有训练样本都填充到同一长度。

用“0”填充时需要注意一个问题：要避免填充的“0”与样本数据中本身的“0”无法区分（例如，让样本数据中的有效取值从1开始）。

In [14]:
# Example with tf.train.batch dynamic padding
# ==================================================
#
# Builds variable-length int32 sequences, batches them three at a time with
# tf.train.batch(dynamic_pad=True), and prints four batches so the
# per-batch padding is visible.

tf.reset_default_graph()

# x = [1, 2, ..., 9]. Values start at 1, so real data never contains the
# 0 that is used as the padding value.
x = tf.range(1, 10, name="x")

# A FIFOQueue that yields 0, 1, ..., limit-1 in order (shuffle=False).
# The printed batches show the sequence starting over after limit-1.
range_q = tf.train.range_input_producer(limit=5, shuffle=False)
print(range_q)
slice_end = range_q.dequeue()

# Take the first `slice_end` elements of x, i.e.
# [], [1], [1, 2], [1, 2, 3], [1, 2, 3, 4].
# NOTE: slice_end == 0 yields an empty sequence, which shows up as an
# all-zero row once it has been padded.
y = tf.slice(x, [0], [slice_end], name="y")

print(y.dtype)

# Batch the variable-length tensor; dynamic_pad=True pads every sequence
# in a batch up to the longest sequence of that same batch.
batched_data = tf.train.batch(
    tensors=[y],
    batch_size=3,
    dynamic_pad=True,
    name="y_batch"
)

print(batched_data)

# Run the graph.
# tf.contrib.learn.run_n is deprecated ("Use tf.train.* utilities instead"),
# so the queue runners are started and stopped explicitly here.
with tf.Session() as sess:
    # Harmless no-op when the graph holds no variables.
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = []
    try:
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # Fetch 4 batches; each result is a dict {"y": ndarray}.
        res = [sess.run({"y": batched_data}) for _ in range(4)]
    finally:
        # Always stop the background enqueue threads, even on error.
        coord.request_stop()
        coord.join(threads)

print("length of res: {}".format(len(res)))
# The effect of dynamic padding is visible below: the second dimension of
# each batch equals the longest sequence in that batch.
for batch in res:
    print("Batch shape: {}".format(batch["y"].shape))
    print(batch["y"])


<tensorflow.python.ops.data_flow_ops.FIFOQueue object at 0x7f73b07a9550>
<dtype: 'int32'>
Tensor("y_batch:0", shape=(3, ?), dtype=int32)
Instructions for updating:
graph_actions.py will be deleted. Use tf.train.* utilities instead. You can use learn/estimators/estimator.py as an example.
Instructions for updating:
graph_actions.py will be deleted. Use tf.train.* utilities instead. You can use learn/estimators/estimator.py as an example.
Instructions for updating:
graph_actions.py will be deleted. Use tf.train.* utilities instead. You can use learn/estimators/estimator.py as an example.
length of res: 4
Batch shape: (3, 2)
[[0 0]
 [1 0]
 [1 2]]
Batch shape: (3, 4)
[[1 2 3 0]
 [1 2 3 4]
 [0 0 0 0]]
Batch shape: (3, 3)
[[1 0 0]
 [1 2 0]
 [1 2 3]]
Batch shape: (3, 4)
[[1 2 3 4]
 [0 0 0 0]
 [1 0 0 0]]


In [29]:
# Example with PaddingFIFOQueue
# ==================================================
#
# Same data as the tf.train.batch example, but the padding is done by an
# explicit tf.PaddingFIFOQueue: variable-length sequences are enqueued one
# by one and dequeue_many pads them to a common length per batch.

tf.reset_default_graph()

# x = [1, 2, ..., 9]. Values start at 1, so real data never contains the
# 0 that is used as the padding value.
x = tf.range(1, 10, name="x")

# A queue that yields 0, 1, ..., limit-1 in order (shuffle=False).
# The printed batches show the sequence starting over after limit-1.
range_q = tf.train.range_input_producer(limit=5, shuffle=False)
slice_end = range_q.dequeue()

# Take the first `slice_end` elements of x, i.e.
# [], [1], [1, 2], [1, 2, 3], [1, 2, 3, 4].
# NOTE: slice_end == 0 yields an empty sequence, which shows up as an
# all-zero row once it has been padded.
y = tf.slice(x, [0], [slice_end], name="y")

# Queue holding rank-1 int32 tensors of unknown length; shape [None]
# marks the dimension that is padded on dequeue_many.
padding_q = tf.PaddingFIFOQueue(
    capacity=10,
    dtypes=[tf.int32],
    shapes=[[None]])

# Enqueue the examples
enqueue_op = padding_q.enqueue([y])
print(enqueue_op)

# Register a queue runner so the enqueue op is driven by a background
# thread once the queue runners are started.
qr = tf.train.QueueRunner(padding_q, [enqueue_op])
tf.train.add_queue_runner(qr)

# Dequeue padded data: 3 samples per batch.
batched_data = padding_q.dequeue_many(3)

print(batched_data)

# Run the graph.
# tf.contrib.learn.run_n is deprecated ("Use tf.train.* utilities instead"),
# so the queue runners are started and stopped explicitly here.
with tf.Session() as sess:
    # Harmless no-op when the graph holds no variables.
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = []
    try:
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # Fetch 4 batches (4 dequeue_many calls, not 4 epochs); each
        # result is a dict {"y": ndarray}.
        res = [sess.run({"y": batched_data}) for _ in range(4)]
    finally:
        # Always stop the background enqueue threads, even on error.
        coord.request_stop()
        coord.join(threads)

# The effect of dynamic padding is visible below: the second dimension of
# each batch equals the longest sequence in that batch.
for batch in res:
    print("Batch shape: {}".format(batch["y"].shape))
    print(batch["y"])
    

name: "padding_fifo_queue_enqueue"
op: "QueueEnqueueV2"
input: "padding_fifo_queue"
input: "y"
attr {
  key: "Tcomponents"
  value {
    list {
      type: DT_INT32
    }
  }
}
attr {
  key: "timeout_ms"
  value {
    i: -1
  }
}

Tensor("padding_fifo_queue_DequeueMany:0", shape=(3, ?), dtype=int32)
Instructions for updating:
graph_actions.py will be deleted. Use tf.train.* utilities instead. You can use learn/estimators/estimator.py as an example.
Instructions for updating:
graph_actions.py will be deleted. Use tf.train.* utilities instead. You can use learn/estimators/estimator.py as an example.
Instructions for updating:
graph_actions.py will be deleted. Use tf.train.* utilities instead. You can use learn/estimators/estimator.py as an example.
Batch shape: (3, 2)
[[0 0]
 [1 0]
 [1 2]]
Batch shape: (3, 4)
[[1 2 3 0]
 [1 2 3 4]
 [0 0 0 0]]
Batch shape: (3, 3)
[[1 0 0]
 [1 2 0]
 [1 2 3]]
Batch shape: (3, 4)
[[1 2 3 4]
 [0 0 0 0]
 [1 0 0 0]]
