In [21]:
# Saving and reading TFRecord data

In [None]:
import tensorflow as tf
import os
import numpy as np

In [22]:
# One TFRecord entry is one tf.train.Example, i.e. one sample. Each Example
# wraps a dict-like mapping whose keys are field names and whose values are
# tf.train.Feature objects holding that field's data.
#
# Nesting of the TFRecord building blocks, outermost first:
# -> tf.train.Example
#    -> tf.train.Features -> {"key": tf.train.Feature}
#       -> tf.train.Feature -> tf.train.BytesList/FloatList/Int64List

# BytesList stores bytes, so the titles are UTF-8 encoded first; at this
# point favorite_books is still a plain Python list of bytes objects.
favorite_books = [title.encode('utf-8') for title in ("machine learning", "cc150")]
print(favorite_books, '-' * 50, sep='\n')

# Wrap the encoded strings in a BytesList, then show its type and contents.
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(type(favorite_books_bytelist), '-' * 50, favorite_books_bytelist, sep='\n')

[b'machine learning', b'cc150']
--------------------------------------------------
<class 'tensorflow.core.example.feature_pb2.BytesList'>
--------------------------------------------------
value: "machine learning"
value: "cc150"



In [23]:
# Float-valued field: build a FloatList, then show its type and contents,
# each followed by a separator line.
hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
print(type(hours_floatlist), '-' * 50, hours_floatlist, '-' * 50, sep='\n')

# Integer-valued field: same display pattern for an Int64List with one value.
age_int64list = tf.train.Int64List(value=[22])
print(type(age_int64list), '-' * 50, age_int64list, '-' * 50, sep='\n')

<class 'tensorflow.core.example.feature_pb2.FloatList'>
--------------------------------------------------
value: 15.5
value: 9.5
value: 7.0
value: 8.0

--------------------------------------------------
<class 'tensorflow.core.example.feature_pb2.Int64List'>
--------------------------------------------------
value: 22

--------------------------------------------------


In [24]:
# Map each field name to a tf.train.Feature wrapping the typed list that was
# built for it in the earlier cells.
feature_map = {
    'favorite_books': tf.train.Feature(bytes_list=favorite_books_bytelist),
    'hours': tf.train.Feature(float_list=hours_floatlist),
    'age': tf.train.Feature(int64_list=age_int64list),
}
features = tf.train.Features(feature=feature_map)

# Show the type, then the contents; the printed form is a nested key/value
# layout that reads much like JSON or a dict.
print(type(features), features, sep='\n')

<class 'tensorflow.core.example.feature_pb2.Features'>
feature {
  key: "age"
  value {
    int64_list {
      value: 22
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



In [25]:
# Features --> Example: wrap the feature mapping in a tf.train.Example.
example = tf.train.Example(features=features)
print(type(example), example, sep='\n')

# Serialize the Example into a byte string so that it can be written to a file.
serialized_example = example.SerializeToString()

# Show the type, the raw bytes and the byte count, each preceded by a
# separator line.
for shown in (type(serialized_example), serialized_example, len(serialized_example)):
    print('-' * 50)
    print(shown)

<class 'tensorflow.core.example.example_pb2.Example'>
features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 22
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

--------------------------------------------------
<class 'bytes'>
--------------------------------------------------
b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x16'
--------------------------------------------------
94


In [26]:
# Write the test.tfrecords file.
output_dir = 'tfrecord_basic'
# makedirs(exist_ok=True) creates the directory if needed and is a no-op when
# it already exists, avoiding the check-then-create race of exists() + mkdir().
os.makedirs(output_dir, exist_ok=True)
filename = 'test.tfrecords'
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    # Write the same serialized Example three times, just to have several
    # records in the file.
    for i in range(3):
        writer.write(serialized_example)

In [27]:
# Read the records back and print each one as-is; every element yielded by
# the dataset is a scalar string tensor holding one serialized Example.
record_paths = [filename_fullpath]
dataset = tf.data.TFRecordDataset(record_paths)
for raw_record in dataset:
    print(raw_record)

tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x16', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x16', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x16', shape=(), dtype=string)


In [28]:
# Parsing spec for the records written above.
# - VarLenFeature is variable-length and parses to a SparseTensor, which has
#   to be converted with tf.sparse.to_dense to get a regular Tensor.
# - FixedLenFeature parses straight to a Tensor, but must be given the shape
#   the data was saved with. The fixed-length alternatives for the two
#   variable-length fields here would be
#   FixedLenFeature([2], dtype=tf.string) for 'favorite_books' and
#   FixedLenFeature([4], dtype=tf.float32) for 'hours'.
expected_features = {
    'favorite_books': tf.io.VarLenFeature(dtype=tf.string),
    'hours': tf.io.VarLenFeature(dtype=tf.float32),
    'age': tf.io.FixedLenFeature([], dtype=tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath])

# Sparse tensors are an efficient representation for sparse matrices.
for serialized_example_tensor in dataset:
    # Decode one serialized record according to the spec above.
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    print(example, '-' * 50, sep='\n')

    # Densify the book titles, show the tensor, then each title as text.
    books = tf.sparse.to_dense(example['favorite_books'], default_value=b'')
    print(books, '-' * 50, sep='\n')
    for raw_title in books.numpy():
        print(raw_title.decode('utf-8'))
    print('-' * 50)

    # Densify the hours and print them one value per line.
    hours = tf.sparse.to_dense(example['hours'])
    for hour_value in hours.numpy():
        print(hour_value)
    print('-' * 50)

    # 'age' was parsed as a fixed-length scalar, so it is already dense.
    print(example['age'].numpy())

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000001C809A69E50>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000001C809ADBCD0>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=22>}
--------------------------------------------------
tf.Tensor([b'machine learning' b'cc150'], shape=(2,), dtype=string)
--------------------------------------------------
machine learning
cc150
--------------------------------------------------
15.5
9.5
7.0
8.0
--------------------------------------------------
22
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000001C809AD99A0>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000001C809AD9D00>, 'age': <tf.Tensor: shape=(), dtype=int64, numpy=22>}
--------------------------------------------------
tf.Tensor([b'machine learning' b'cc150'], shape=(2,), dtype=string)
--------------------------------------------------
machi

In [29]:
# Save the same records as a GZIP-compressed TFRecord file.
# The stream is GZIP, not a ZIP archive, so the file gets a '.gz' suffix; a
# '.zip' suffix would mislead archive tools into treating it as a ZIP file.
# The variable keeps its '_zip' name because the reading cell refers to it.
filename_fullpath_zip = filename_fullpath + '.gz'
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    # Write the same serialized Example three times.
    for i in range(3):
        writer.write(serialized_example)

In [30]:
# Fixed-length parsing spec: (field name, saved shape, dtype) for each field.
# With FixedLenFeature every field parses directly to a dense Tensor.
fixed_specs = (
    ("favorite_books", [2], tf.string),
    ("hours", [4], tf.float32),
    ("age", [], tf.int64),
)
expected_features = {
    field: tf.io.FixedLenFeature(shape, dtype=dtype)
    for field, shape, dtype in fixed_specs
}

# Reading a compressed file: the compression type has to be passed
# explicitly to TFRecordDataset.
dataset_zip = tf.data.TFRecordDataset(
    [filename_fullpath_zip],
    compression_type="GZIP",
)
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    print(example)

{'age': <tf.Tensor: shape=(), dtype=int64, numpy=22>, 'favorite_books': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'machine learning', b'cc150'], dtype=object)>, 'hours': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([15.5,  9.5,  7. ,  8. ], dtype=float32)>}
{'age': <tf.Tensor: shape=(), dtype=int64, numpy=22>, 'favorite_books': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'machine learning', b'cc150'], dtype=object)>, 'hours': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([15.5,  9.5,  7. ,  8. ], dtype=float32)>}
{'age': <tf.Tensor: shape=(), dtype=int64, numpy=22>, 'favorite_books': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'machine learning', b'cc150'], dtype=object)>, 'hours': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([15.5,  9.5,  7. ,  8. ], dtype=float32)>}
