In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [2]:
# Encode each book title to UTF-8 bytes (these become the BytesList values below).
favorite_books = list(
    map(lambda title: title.encode('utf-8'), ("machine learning", "cc150"))
)
print(favorite_books)


[b'machine learning', b'cc150']


In [3]:
# String data is stored in a BytesList message built from the encoded titles.
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
# Show the message type first, then its contents, one per line.
print(type(favorite_books_bytelist), favorite_books_bytelist, sep="\n")

<class 'tensorflow.core.example.feature_pb2.BytesList'>
value: "machine learning"
value: "cc150"



In [4]:
# Float data is stored in a FloatList; the sample hour values are kept in the 0-24 range.
hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
# Show the message type first, then its contents, one per line.
print(type(hours_floatlist), hours_floatlist, sep="\n")

<class 'tensorflow.core.example.feature_pb2.FloatList'>
value: 15.5
value: 9.5
value: 7.0
value: 8.0



In [5]:
# Integer data is stored in an Int64List; here it holds a single value.
age_int64list = tf.train.Int64List(value=[42])
# Show the message type first, then its contents, one per line.
print(type(age_int64list), age_int64list, sep="\n")

<class 'tensorflow.core.example.feature_pb2.Int64List'>
value: 42



In [6]:
# Next step: wrap each typed list in a tf.train.Feature and collect them,
# keyed by feature name, into a single tf.train.Features message.
features = tf.train.Features(feature={
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
})
# The printed form of the message looks similar to JSON.
print(type(features), features, sep="\n")

<class 'tensorflow.core.example.feature_pb2.Features'>
feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



In [7]:
# An Example is one more wrapper layer placed around the Features message.
example = tf.train.Example(features=features)
# Show the message type first, then its contents, one per line.
print(type(example), example, sep="\n")

<class 'tensorflow.core.example.example_pb2.Example'>
features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}



In [8]:
# The Example object has to be serialized into a byte string before it can be
# written to a file.
serialized_example = example.SerializeToString()
# Print the type, the raw bytes, and the byte count, one per line.
print(
    type(serialized_example),
    serialized_example,
    len(serialized_example),
    sep="\n",
)

<class 'bytes'>
b'\n\\\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150'
94


In [10]:
# Shell escape: list the contents of the current working directory.
!ls

dataset.ipynb
generate_csv
tfrecord.ipynb


In [11]:

# Generate the test.tfrecords file inside the output directory.
output_dir = 'tfrecord_basic'
# makedirs(exist_ok=True) creates the directory only when it is missing; it keeps
# the cell re-runnable and avoids the check-then-create race of
# `os.path.exists` followed by `os.mkdir`.
os.makedirs(output_dir, exist_ok=True)
filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    # Write the same serialized Example three times, producing three identical records.
    for i in range(3):
        writer.write(serialized_example)

In [12]:

# Shell escape: long-format listing of the tfrecord_basic directory.
!ls -l tfrecord_basic

total 1
-rw-r--r-- 1 深爱主义 197121 330  7月 28 15:42 test.tfrecords


In [13]:
# Read the records back from the TFRecord file and print each serialized record.
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)


tf.Tensor(b'\n\\\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150', shape=(), dtype=string)


In [None]:


# Feature spec used to parse the serialized Examples.
# - VarLenFeature is variable-length and yields a SparseTensor, which has to be
#   converted with tf.sparse.to_dense before use.
# - FixedLenFeature yields a dense Tensor and must be given the shape the data
#   was saved with.
# "hours" could equally be declared as tf.io.FixedLenFeature([4], dtype=tf.float32);
# "favorite_books" could be declared as VarLenFeature, in which case it would need
# tf.sparse.to_dense(..., default_value=b"") before iterating.
expected_features = {
    "favorite_books": tf.io.FixedLenFeature([2], dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath])
# Sparse tensors are the more efficient representation for sparse matrices.
for serialized_example_tensor in dataset:
    # Bind the result to `parsed_example` instead of `example`: `example` holds the
    # tf.train.Example proto built earlier, and overwriting it with the parsed
    # mapping would break re-running the cell that calls example.SerializeToString().
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,  # one serialized record
        expected_features,          # how each key should be decoded
    )
    print(parsed_example)
    # "favorite_books" is fixed-length, so it can be iterated directly;
    # each element is a byte string that is decoded back to text.
    for book in parsed_example["favorite_books"]:
        print(book.numpy().decode("UTF-8"))
    print('-'*50)
    # "hours" was declared as VarLenFeature, so densify it before iterating.
    hours = tf.sparse.to_dense(parsed_example["hours"])
    print(hours)
    for hour in hours:
        print(hour.numpy())
    print('-'*50)
    print(parsed_example["age"].numpy())
    

#%%

# Store the TFRecord data as a compressed file.
# The records are GZIP-compressed (not a ZIP archive), so the file gets a ".gz"
# suffix; a ".zip" suffix would misdescribe the format to any tool that trusts
# the extension. The variable name is kept so later cells keep working.
filename_fullpath_zip = filename_fullpath + '.gz'
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    # Write the same serialized Example three times.
    for i in range(3):
        writer.write(serialized_example)

#%%

# Shell escape: long-format listing of the tfrecord_basic directory.
!ls -l tfrecord_basic

#%%

# Parsing spec in which every feature is fixed-length; each entry declares the
# shape and dtype the feature was saved with.
expected_features = {
    "favorite_books": tf.io.FixedLenFeature(shape=[2], dtype=tf.string),
    "hours": tf.io.FixedLenFeature(shape=[4], dtype=tf.float32),
    "age": tf.io.FixedLenFeature(shape=[], dtype=tf.int64),
}

#%%

# Reading the compressed file: the reader is given the same compression type
# ("GZIP") that was used when the file was written.
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip],
                                      compression_type="GZIP")
for serialized_example_tensor in dataset_zip:
    # Bind the result to `parsed_example` instead of `example`, so the
    # tf.train.Example proto held by `example` earlier in the notebook is not
    # overwritten by the parsed mapping (which would break re-running the
    # serialization cell).
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    print(parsed_example)
