In [1]:
import os
import pprint
import sys
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.datasets import fetch_california_housing
from tensorflow import keras

%matplotlib inline

# Record the runtime environment: the TensorFlow and interpreter versions
# first, then the name and version of each library imported above.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


2.0.0-beta1
sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)
matplotlib 3.0.3
numpy 1.18.1
pandas 0.24.1
sklearn 0.21.2
tensorflow 2.0.0-beta1
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [2]:
# TFRecord file layout:
# -> tf.train.Example
#    -> tf.train.Features -> {'key': tf.train.Feature}
#       -> tf.train.Feature -> tf.train.BytesList/FloatList/Int64List
#
# The titles are encoded to UTF-8 bytes here because they are later stored
# in a bytes list.
favorite_books = [
    title.encode('utf-8')
    for title in ('machine leraning', 'docker', 'spark')
]
favorite_books

[b'machine leraning', b'docker', b'spark']

In [4]:
# String (bytes) values: wrap the UTF-8 encoded book titles in a BytesList
# message and print its text representation.
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

value: "machine leraning"
value: "docker"
value: "spark"



In [5]:
# Floating-point values: build a FloatList message from three numbers and
# print its text representation.
hours_floatlist = tf.train.FloatList(value=[1,2,3])
print(hours_floatlist)

value: 1.0
value: 2.0
value: 3.0



In [6]:
# Integer values: build an Int64List message from three integers and print
# its text representation.
age_int64list = tf.train.Int64List(value=[45,3,2])
print(age_int64list)

value: 45
value: 3
value: 2



In [7]:
# Map each feature name to a tf.train.Feature wrapping the matching typed
# list, then bundle the whole mapping into a single Features message.
feature_by_name = {
    'favorite_books': tf.train.Feature(bytes_list=favorite_books_bytelist),
    'hours': tf.train.Feature(float_list=hours_floatlist),
    'age': tf.train.Feature(int64_list=age_int64list),
}
features = tf.train.Features(feature=feature_by_name)

print(features)  # a Features message holds a dictionary of Feature entries

feature {
  key: "age"
  value {
    int64_list {
      value: 45
      value: 3
      value: 2
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine leraning"
      value: "docker"
      value: "spark"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 1.0
      value: 2.0
      value: 3.0
    }
  }
}



In [8]:
# Wrap the Features message in a tf.train.Example; this is the message that
# gets serialized and written to the TFRecord file further down.
example = tf.train.Example(features=features)
print(example) # an Example contains a single `features` field

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 45
        value: 3
        value: 2
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine leraning"
        value: "docker"
        value: "spark"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 1.0
        value: 2.0
        value: 3.0
      }
    }
  }
}



In [9]:
# Serialize the Example message to a byte string, the form in which it is
# written to the TFRecord file.
serialized_example = example.SerializeToString()
print(serialized_example)

b'\nb\n5\n\x0efavorite_books\x12#\n!\n\x10machine leraning\n\x06docker\n\x05spark\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03-\x03\x02\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00\x80?\x00\x00\x00@\x00\x00@@'


In [10]:
# Make sure the output directory exists. os.makedirs(..., exist_ok=True) does
# nothing when the directory is already present, so there is no gap between
# an existence check and the creation, and the cell can be re-run safely.
output_dir = 'tfrecords'
os.makedirs(output_dir, exist_ok=True)
file_name = 'test_20200124.tfrecords'
file_name_path = os.path.join(output_dir, file_name)
print(file_name_path)

# Write the same serialized Example three times with tf.io.TFRecordWriter, so
# the resulting TFRecord file holds three identical records. The context
# manager closes the writer when the block exits.
with tf.io.TFRecordWriter(file_name_path) as writer:
    for _ in range(3):
        writer.write(serialized_example)

tfrecords/test_20200124.tfrecords


In [11]:
# Read the TFRecord file back through tf.data.TFRecordDataset; each element
# of the dataset is one serialized record.
dataset = tf.data.TFRecordDataset(filenames=file_name_path)

In [12]:
# Show the dataset's summary (element shapes and types) without iterating it.
print(dataset)

<TFRecordDatasetV2 shapes: (), types: tf.string>


In [13]:
# Iterate the dataset eagerly: each element is a scalar string tensor holding
# the serialized bytes of one record.
for raw_record in dataset:
    print(raw_record)

tf.Tensor(b'\nb\n5\n\x0efavorite_books\x12#\n!\n\x10machine leraning\n\x06docker\n\x05spark\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03-\x03\x02\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00\x80?\x00\x00\x00@\x00\x00@@', shape=(), dtype=string)
tf.Tensor(b'\nb\n5\n\x0efavorite_books\x12#\n!\n\x10machine leraning\n\x06docker\n\x05spark\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03-\x03\x02\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00\x80?\x00\x00\x00@\x00\x00@@', shape=(), dtype=string)
tf.Tensor(b'\nb\n5\n\x0efavorite_books\x12#\n!\n\x10machine leraning\n\x06docker\n\x05spark\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03-\x03\x02\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00\x80?\x00\x00\x00@\x00\x00@@', shape=(), dtype=string)


In [15]:
# Declare how each feature is parsed. Every feature is read as a
# variable-length feature of the given dtype, which parse_single_example
# returns as a SparseTensor.
expected_features = {
    'favorite_books': tf.io.VarLenFeature(dtype=tf.string),
    'hours': tf.io.VarLenFeature(dtype=tf.float32),
    'age': tf.io.VarLenFeature(dtype=tf.int64)
}
dataset = tf.data.TFRecordDataset(filenames=file_name_path)
for serialized_example_tensor in dataset:
    # Parse one serialized record into a dict keyed by feature name. The
    # result gets its own name: rebinding `example` would overwrite the
    # tf.train.Example message built earlier and break a re-run of the
    # serialization cell.
    parsed_example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    print(parsed_example)

{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab43c2fd0>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab433ff98>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab435bba8>}
{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab435be48>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab435bf28>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab435bd68>}
{'age': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab43c2780>, 'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab43c2710>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f4ab435b7f0>}


In [17]:
# Parse every record and print each favorite book as a scalar string tensor.
for serialized_example_tensor in dataset:
    # Use a dedicated name so the tf.train.Example message bound to
    # `example` earlier in the notebook is not overwritten by a dict.
    parsed_example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    # The parsed feature is sparse; convert it to a dense tensor, using an
    # empty byte string for any missing entry.
    books = tf.sparse.to_dense(parsed_example['favorite_books'], default_value=b'')
    for book in books:
        print(book)

tf.Tensor(b'machine leraning', shape=(), dtype=string)
tf.Tensor(b'docker', shape=(), dtype=string)
tf.Tensor(b'spark', shape=(), dtype=string)
tf.Tensor(b'machine leraning', shape=(), dtype=string)
tf.Tensor(b'docker', shape=(), dtype=string)
tf.Tensor(b'spark', shape=(), dtype=string)
tf.Tensor(b'machine leraning', shape=(), dtype=string)
tf.Tensor(b'docker', shape=(), dtype=string)
tf.Tensor(b'spark', shape=(), dtype=string)


In [19]:
# Parse every record and print each favorite book as a plain Python string.
for serialized_example_tensor in dataset:
    # Use a dedicated name so the tf.train.Example message bound to
    # `example` earlier in the notebook is not overwritten by a dict.
    parsed_example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    # Convert the sparse feature to a dense tensor, using an empty byte
    # string for any missing entry.
    books = tf.sparse.to_dense(parsed_example['favorite_books'], default_value=b'')
    for book in books:
        # .numpy() gives the raw bytes; decode them back to text.
        print(book.numpy().decode('utf-8'))

machine leraning
docker
spark
machine leraning
docker
spark
machine leraning
docker
spark


In [20]:
# Parse every record and show all three features as dense tensors.
# `pprint` is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.
for serialized_example_tensor in dataset:
    # Use a dedicated name so the tf.train.Example message bound to
    # `example` earlier in the notebook is not overwritten by a dict.
    parsed_example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    # String feature: missing entries are filled with an empty byte string.
    books = tf.sparse.to_dense(parsed_example['favorite_books'], default_value=b'')
    print('favorite_books: ')
    pprint.pprint(books)
    # Numeric features: to_dense is called with its default fill value.
    hours = tf.sparse.to_dense(parsed_example['hours'])
    print('hours:')
    pprint.pprint(hours)
    age = tf.sparse.to_dense(parsed_example['age'])
    print('age: ')
    pprint.pprint(age)

favorite_books: 
<tf.Tensor: id=379, shape=(3,), dtype=string, numpy=array([b'machine leraning', b'docker', b'spark'], dtype=object)>
hours:
<tf.Tensor: id=382, shape=(3,), dtype=float32, numpy=array([1., 2., 3.], dtype=float32)>
age: 
<tf.Tensor: id=385, shape=(3,), dtype=int64, numpy=array([45,  3,  2])>
favorite_books: 
<tf.Tensor: id=398, shape=(3,), dtype=string, numpy=array([b'machine leraning', b'docker', b'spark'], dtype=object)>
hours:
<tf.Tensor: id=401, shape=(3,), dtype=float32, numpy=array([1., 2., 3.], dtype=float32)>
age: 
<tf.Tensor: id=404, shape=(3,), dtype=int64, numpy=array([45,  3,  2])>
favorite_books: 
<tf.Tensor: id=417, shape=(3,), dtype=string, numpy=array([b'machine leraning', b'docker', b'spark'], dtype=object)>
hours:
<tf.Tensor: id=420, shape=(3,), dtype=float32, numpy=array([1., 2., 3.], dtype=float32)>
age: 
<tf.Tensor: id=423, shape=(3,), dtype=int64, numpy=array([45,  3,  2])>


In [21]:
# Write the same three records again, this time as a compressed file.
# NOTE(review): the records are GZIP-compressed but the path is given a
# '.zip' suffix; confirm whether '.gz' was intended.
file_name_zip_path = file_name_path + '.zip'
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(file_name_zip_path, options=options) as writer:
    for _ in range(3):
        writer.write(serialized_example)

In [None]:
# Read the compressed file back. The compression type is passed to the
# reader and matches the one used when the file was written.
dataset_zip = tf.data.TFRecordDataset(filenames=file_name_zip_path, compression_type='GZIP')
for serialized_example_tensor in dataset_zip:
    # Parse one serialized record into a dict keyed by feature name. The
    # result gets its own name so the tf.train.Example message bound to
    # `example` earlier in the notebook is not overwritten.
    parsed_example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    print(parsed_example)