In [1]:
import tensorflow as tf
import tensorflow.data as data

def print_dict(dict_, only_callable_key=False):
    """Print the entries of a mapping to standard output.

    Args:
        dict_: Mapping whose ``items()`` are walked in iteration order.
        only_callable_key: When true, print only the keys whose values are
            callable, one key per line. When false (the default), print
            every key followed by its value on the next line, with a blank
            line after each pair.
    """
    for name, value in dict_.items():
        if not only_callable_key:
            # Full dump: key, value, then an empty separator line.
            print(name, value, sep="\n", end="\n\n")
        elif callable(value):
            print(name)

# tf.data
更多相关指导参见[这里](https://tensorflow.org/guide/data)

**PACKAGE CONTENTS**
- experimental

**CLASSES**

    DatasetSpec
    Dataset
    Options
    Iterator
    IteratorSpec
    FixedLengthRecordDataset
    TFRecordDataset
    TextLineDataset

**DATA**

    AUTOTUNE = -1
    INFINITE_CARDINALITY = -1
    UNKNOWN_CARDINALITY = -2

**FILE**: \tensorflow\_api\v2\data\\\_\_init__.py

# 

# 

In [None]:
# Bare expression: as the last statement of a notebook cell, the class object
# itself becomes the cell's displayed result.
tf.data.Dataset

# tf.data.Dataset()
`tf.data.Dataset(variant_tensor)`

创建一个`DatasetV2`对象，与`DatasetV1`不将数据传入构造函数不同，`DatasetV2`及其子类均会将所接收的数据或直接或通过`super`方法传递给`DatasetV2`的构造函数；对`Dataset`的使用主要包括：
1. 创建数据集；最简单的创建数据集的方式便是将列表传递给`.from_tensor_slices()`方法；更多的创建数据集的方式参见`.list_files()`、`.from_generator()`方法和`TextLineDataset`、`TFRecordDataset`、`FixedLengthRecordDataset`类；

2. 对数据进行预处理；例如使用`dataset.map(lambda x: x*2)`方法将数据集每个元素乘二；

3. 遍历并生成数据；遍历以流式方式进行，因此不需要将整个数据集一次性加载至内存中；

参数`variant_tensor`指能够表示数据集的一个`DT_VARIANT`张量；

这里需要辨识一下官方英文文档及教程中的 element 和 component 的关系；element 指对`Dataset`的迭代器调用`next`函数时产生的单个输出，其可以是包含了众多 component 的嵌套结构，嵌套结构可以是元组、namedtuple、字典；需要注意的是，列表在这里并不被视为嵌套结构，而是会被整体转换为张量并作为一个 component；相对的，component 则是该嵌套结构中的每个叶结点，其类型是能够用`tf.TypeSpec`表示的任何类型，包括`tf.Tensor`、`tf.data.Dataset`、`tf.SparseTensor`、`tf.RaggedTensor`、`tf.TensorArray`等；例如 element `(1, (3, "apple"), [1, 2])`所含的 component 为`1`、`3`、`"apple"`、`[1, 2]`；



**File**:   \tensorflow\python\data\ops\dataset_ops.py

**Type**:           ABCMeta

## tf.data.Dataset.from_tensor_slices()

`tf.data.Dataset.from_tensor_slices(tensors)`

__Docstring__

创建一个`Dataset`，其元素为给定张量的切片；`tensors`为张量或由张量构成的嵌套结构，其中所有张量第一维的大小必须相同；该函数沿着它们的第一维度将`tensors`进行切片；这个操作保留了输入张量的结构，移除了每个张量的第一个维度，并将其作为数据集的维度；

注意：若`tensors`包含 NumPy 数组并且未启用即时执行（eager execution），这些值会作为一个或多个`tf.constant`操作嵌入至计算图中；对于大型数据集(> 1 GB)，这可能会浪费内存并触及计算图序列化的字节限制；若`tensors`包含一个或多个大型 NumPy 数组，请考虑[这份指南](https://tensorflow.org/guide/data#consuming_numpy_arrays)中介绍的替代方案。

__Type__: function

In [None]:
# A 1D input is sliced along its first axis, so each element is a scalar tensor.
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
# Dump every instance attribute: name, value, then a blank separator line.
for attr_name, attr_value in vars(dataset).items():
    print(attr_name, attr_value, sep="\n", end="\n\n")
# Materialise the whole dataset as NumPy values for a compact view.
print(list(dataset.as_numpy_iterator()))

In [None]:
# Slicing a 2D tensor produces 1D tensor elements.
dataset = tf.data.Dataset.from_tensor_slices([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
# `_tensors` is a private attribute; it is printed here only for inspection.
print(dataset._tensors)
# Elements converted to NumPy values.
for element in dataset.as_numpy_iterator():
    print(element)
# Elements as yielded by iterating the dataset directly. The loop variable is
# not named `data`, because that would rebind the `tensorflow.data` module
# alias imported at the top of the file.
for tensor in dataset:
    print(tensor)

In [None]:
# Slicing a tuple of 1D tensors produces tuple elements containing
# scalar tensors.
dataset = tf.data.Dataset.from_tensor_slices(([1, 2, 3], [3, 4, 5], [5, 6, 7]))
# `_tensors` is a private attribute; it is printed here only for inspection.
print(dataset._tensors, end="\n\n")
# Elements converted to NumPy values.
for element in dataset.as_numpy_iterator():
    print(element)
# Elements as yielded by iterating the dataset directly. The loop variable is
# not named `data`, because that would rebind the `tensorflow.data` module
# alias imported at the top of the file.
for tensors in dataset:
    print(tensors)
    print()

In [None]:
# Dictionary structure is also preserved.
dataset = tf.data.Dataset.from_tensor_slices({"a": [1, 2, 3], "b": [3, 4, 5]})
# `_tensors` is a private attribute; it is printed here only for inspection.
print(dataset._tensors, end="\n\n")
# Elements converted to NumPy values.
for element in dataset.as_numpy_iterator():
    print(element)
# Elements as yielded by iterating the dataset directly. The loop variable is
# not named `data`, because that would rebind the `tensorflow.data` module
# alias imported at the top of the file.
for tensors in dataset:
    print(tensors, end="\n\n")
# Dump every instance attribute: name, value, then a blank separator line.
for k, v in dataset.__dict__.items():
    print(k, v, sep="\n", end="\n\n")

In [50]:
# Two tensors can be combined into one Dataset object.
features = tf.constant([[1, 3], [2, 1], [3, 3]])
labels = tf.constant(['A', 'B', 'A'])
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

# Elements as yielded by iterating the dataset directly. The loop variable is
# not named `data`, because that would rebind the `tensorflow.data` module
# alias imported at the top of the file.
for pair in dataset:
    print(pair, end="\n\n")
# The same elements converted to NumPy values.
for element in dataset.as_numpy_iterator():
    print(element, end="\n\n")
# `_tensors` is a private attribute; it is printed here only for inspection.
print(dataset._tensors)

(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 3])>, <tf.Tensor: shape=(), dtype=string, numpy=b'A'>)

(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 1])>, <tf.Tensor: shape=(), dtype=string, numpy=b'B'>)

(<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 3])>, <tf.Tensor: shape=(), dtype=string, numpy=b'A'>)

(array([1, 3]), b'A')

(array([2, 1]), b'B')

(array([3, 3]), b'A')

[<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[1, 3],
       [2, 1],
       [3, 3]])>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'A', b'B', b'A'], dtype=object)>]


In [47]:
# Both the features and the labels tensors can be converted to a Dataset object separately and combined after.
# `features` and `labels` are the tensors created in the previous cell.
# The class is referenced as `tf.data.Dataset`, like the other cells; a bare
# `Dataset` name is never imported in this file.
features_dataset = tf.data.Dataset.from_tensor_slices(features)
labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
dataset = tf.data.Dataset.zip((features_dataset, labels_dataset))

for element in dataset.as_numpy_iterator():
    print(element)

[(array([1, 3]), b'A'), (array([2, 1]), b'B'), (array([3, 3]), b'A')]

(array([1, 3]), b'A')
(array([2, 1]), b'B')
(array([3, 3]), b'A')


In [46]:
# A batched feature and label set can be converted to a Dataset in similar fashion.
batched_features = tf.constant([[[1, 3], [2, 3]],
                                [[2, 1], [1, 2]],
                                [[3, 3], [3, 2]]], shape=(3, 2, 2))
batched_labels = tf.constant([['A', 'A'],
                              ['B', 'B'],
                              ['A', 'B']], shape=(3, 2, 1))
# The class is referenced as `tf.data.Dataset`, like the other cells; a bare
# `Dataset` name is never imported in this file.
dataset = tf.data.Dataset.from_tensor_slices((batched_features, batched_labels))
for element in dataset.as_numpy_iterator():
    print(element)

(array([[1, 3],
       [2, 3]]), array([[b'A'],
       [b'A']], dtype=object))
(array([[2, 1],
       [1, 2]]), array([[b'B'],
       [b'B']], dtype=object))
(array([[3, 3],
       [3, 2]]), array([[b'A'],
       [b'B']], dtype=object))


# Go through `tf.data` rather than the `data` alias: earlier cells reuse
# `data` as a loop variable, so by this point it no longer names the module.
help(tf.data.Dataset)

Help on class DatasetV2 in module tensorflow.python.data.ops.dataset_ops:



**Methods**

apply(self, transformation_func)


as_numpy_iterator(self)


batch(self, batch_size, drop_remainder=False)

cache(self, filename='')


cardinality(self)


concatenate(self, dataset)


enumerate(self, start=0)


filter(self, predicate)


flat_map(self, map_func)


interleave(self, map_func, cycle_length=None, block_length=None, num_parallel_calls=None, deterministic=None)


map(self, map_func, num_parallel_calls=None, deterministic=None)


options(self)

padded_batch(self, batch_size, padded_shapes=None, padding_values=None, drop_remainder=False)

prefetch(self, buffer_size)


reduce(self, initial_state, reduce_func)

repeat(self, count=None)


shard(self, num_shards, index)


shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None)


skip(self, count)


take(self, count)


unbatch(self)


window(self, size, shift=None, stride=1, drop_remainder=False)


with_options(self, options)



**Static methods**

from_generator(generator, output_types=None, output_shapes=None, args=None, output_signature=None)


from_tensor_slices(tensors)


from_tensors(tensors)


list_files(file_pattern, shuffle=None, seed=None)


range(*args, **kwargs)


zip(datasets)


**Readonly properties**

element_spec
