In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os

In [2]:
from tensorflow import keras
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
x = tf.range(10)

In [4]:
def prints(data):
    for item in data:
        print(item)

In [5]:
dataset = tf.data.Dataset.from_tensor_slices(x)

In [6]:
prints(dataset)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [7]:
dataset = dataset.repeat(3).batch(7)

In [8]:
prints(dataset)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [9]:
dataset =dataset.map(lambda x:x*2)

In [10]:
prints(dataset)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [11]:
dataset = dataset.apply(tf.data.experimental.unbatch())

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.


In [12]:
prints(dataset)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, sh

In [13]:
dataset=dataset.filter(lambda x: x<10)

In [14]:
prints(dataset)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [15]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=2020).batch(7)

In [16]:
prints(dataset)

tf.Tensor([3 5 2 0 1 7 0], shape=(7,), dtype=int64)
tf.Tensor([4 6 8 9 2 3 6], shape=(7,), dtype=int64)
tf.Tensor([8 1 7 9 1 3 5], shape=(7,), dtype=int64)
tf.Tensor([0 2 7 6 5 9 4], shape=(7,), dtype=int64)
tf.Tensor([8 4], shape=(2,), dtype=int64)


In [17]:
filepath = 'datasets/housing/housing.csv'

In [18]:
dataset = tf.data.TextLineDataset(filepath).skip(1)

# for line in dataset:
#     print(line.numpy())

In [19]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join('datasets', 'housing')
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, 'my_{}_{:02d}.csv')
    
    file_paths = []
    
    m = len(data)
    
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        file_paths.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header)
                f.write('\n')
                
            for row_idx in row_indices:
                f.write(','.join([repr(col) for col in data[row_idx]]))
                f.write('\n')
    
    return file_paths
    

In [20]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None, shuffle_buffer_size=10000,
                      n_parse_threads = 5, batch_size=32):
    dataset = tf.data.Dataset.list_files(train_filepaths).repeat(repeat)
    dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                            cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls = n_parse_threads)
    return dataset.batch(batch_size).prefetch(1)

In [21]:
housing = fetch_california_housing()

In [22]:
x_train_full, x_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1,1), random_state=2020)

In [23]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train_full, y_train_full, random_state=2020)

In [24]:
scaler = StandardScaler()
scaler.fit(x_train)
x_mean = scaler.mean_
x_std = scaler.scale_

In [25]:
train_data = np.c_[x_train, y_train]
valid_data = np.c_[x_valid, y_valid]
test_data = np.c_[x_test, y_test]
header_cols = housing.feature_names + ['MedianHouseValue']
header = ','.join(header_cols)

In [26]:
train_filepaths = save_to_multiple_csv_files(train_data, 'train', header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, 'valid', header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, 'test', header, n_parts=10)

In [27]:
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.0068,23.0,5.230635,0.971279,2710.0,2.358573,39.76,-121.82,1.073
1,4.0873,18.0,5.452781,1.047865,2104.0,2.721863,32.84,-116.9,1.469
2,2.0575,21.0,4.901354,1.071567,1297.0,2.508704,35.41,-119.02,0.67
3,4.9044,24.0,4.88189,1.165354,513.0,2.019685,33.72,-118.07,4.85
4,4.9662,36.0,6.062598,1.026604,1826.0,2.85759,37.25,-121.92,2.585


In [28]:
with open(train_filepaths[0]) as f:
    for i in range(5):
        print(f.readlines(), end= ' ')

['MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue\n', '3.0068,23.0,5.230635335073977,0.9712793733681462,2710.0,2.3585726718885986,39.76,-121.82,1.073\n', '4.0873,18.0,5.452781371280724,1.0478654592496766,2104.0,2.721862871927555,32.84,-116.9,1.469\n', '2.0575,21.0,4.901353965183753,1.0715667311411992,1297.0,2.508704061895551,35.41,-119.02,0.67\n', '4.9044,24.0,4.881889763779528,1.1653543307086613,513.0,2.0196850393700787,33.72,-118.07,4.85\n', '4.9662,36.0,6.062597809076682,1.02660406885759,1826.0,2.8575899843505477,37.25,-121.92,2.585\n', '1.0467,33.0,3.8062200956937797,1.1148325358851674,1000.0,2.3923444976076556,38.61,-121.44,0.701\n', '6.9664,11.0,7.592705167173253,1.0395136778115501,4495.0,3.4156534954407296,32.66,-117.01,2.939\n', '2.925,19.0,5.649321266968326,1.1176470588235294,748.0,1.6923076923076923,38.55,-121.4,1.424\n', '6.0661,24.0,6.329787234042553,0.9817629179331308,2194.0,3.3343465045592704,34.27,-119.17,2.348\n', '3.4539,35.0,

In [29]:
train_filepaths

['datasets\\housing\\my_train_00.csv',
 'datasets\\housing\\my_train_01.csv',
 'datasets\\housing\\my_train_02.csv',
 'datasets\\housing\\my_train_03.csv',
 'datasets\\housing\\my_train_04.csv',
 'datasets\\housing\\my_train_05.csv',
 'datasets\\housing\\my_train_06.csv',
 'datasets\\housing\\my_train_07.csv',
 'datasets\\housing\\my_train_08.csv',
 'datasets\\housing\\my_train_09.csv',
 'datasets\\housing\\my_train_10.csv',
 'datasets\\housing\\my_train_11.csv',
 'datasets\\housing\\my_train_12.csv',
 'datasets\\housing\\my_train_13.csv',
 'datasets\\housing\\my_train_14.csv',
 'datasets\\housing\\my_train_15.csv',
 'datasets\\housing\\my_train_16.csv',
 'datasets\\housing\\my_train_17.csv',
 'datasets\\housing\\my_train_18.csv',
 'datasets\\housing\\my_train_19.csv']

In [30]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=2020)

In [31]:
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets\\housing\\my_train_08.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_06.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_03.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_15.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_11.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_13.csv', sh

In [32]:
n_readers = 5
dataset = filepath_dataset.interleave(
lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

In [33]:
for line in dataset.take(5):
    print(line.numpy())

b'6.3197,33.0,6.131578947368421,1.0382775119617225,1230.0,2.9425837320574164,37.31,-122.02,3.401'
b'0.7473,22.0,3.116650987770461,1.1618062088428975,2381.0,2.239887111947319,34.03,-118.29,1.688'
b'2.3011,18.0,4.856823266219239,1.0738255033557047,1527.0,3.4161073825503356,39.11,-121.56,0.575'
b'1.6645,20.0,5.671814671814672,1.16988416988417,1031.0,3.9806949806949805,36.35,-119.42,0.48'
b'6.0783,23.0,5.633986928104576,0.8660130718954249,934.0,3.052287581699346,34.27,-118.65,2.292'


In [34]:
record_defaults = [0, np.nan, tf.constant(np.nan, dtype=tf.float64), 'Hello', tf.constant([])]
parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)
parsed_fields

[<tf.Tensor: shape=(), dtype=int32, numpy=1>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2.0>,
 <tf.Tensor: shape=(), dtype=float64, numpy=3.0>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'4'>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]

In [35]:
parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)
parsed_fields

[<tf.Tensor: shape=(), dtype=int32, numpy=0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=nan>,
 <tf.Tensor: shape=(), dtype=float64, numpy=nan>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Hello'>,
 <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]

In [36]:
try:
    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [37]:
try:
    parsed_fields = tf.io.decode_csv('1, 2, 3, 4, 5, 6, 7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]


In [38]:
n_inputs = 8 # x_train.shape[-1]

@tf.function
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    
    return (x - x_mean) / x_std,y

In [39]:
train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print('x =', x_batch)
    print('y =', y_batch)
    print()

x = tf.Tensor(
[[-0.23697248  1.2158908  -0.13687335 -0.25607702 -0.67110455 -0.01157876
   0.8724     -0.6079566 ]
 [-0.11491377 -0.6041829  -0.2907079  -0.13762008 -0.08762251  0.00971273
  -1.419928    1.237728  ]
 [ 0.45706406 -1.237252   -0.32740474 -0.18151961  0.51026654 -0.0683156
  -0.91730714  0.7887775 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.607]
 [1.13 ]
 [2.736]], shape=(3, 1), dtype=float32)

x = tf.Tensor(
[[-0.9574372  -1.0789847  -0.21925478 -0.07403632  0.3878073  -0.07896465
   2.8687918  -1.5158325 ]
 [-0.2930663  -0.9207175  -0.01093933 -0.14170012  0.23563376 -0.0449592
   1.3750209  -0.8773269 ]
 [-0.19276595 -0.366782    0.19436398 -0.09880608  0.2239281  -0.01895878
  -0.7106225   1.1828561 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[0.557]
 [1.094]
 [1.139]], shape=(3, 1), dtype=float32)



In [40]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [41]:
keras.backend.clear_session()

In [42]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=x_train.shape[1:]),
    keras.layers.Dense(1),
])

In [43]:
model.compile(loss = 'mse',
             optimizer = keras.optimizers.SGD(1e-3))

In [44]:
batch_size = 32
model.fit(train_set, epochs=10, validation_data=valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25dfdc31f48>

In [45]:
model.evaluate(test_set)

    363/Unknown - 1s 2ms/step - loss: 0.5274

0.5274440651918574

In [46]:
new_set = test_set.take(3).map(lambda x, y: x)

In [47]:
print(model.predict(new_set))

[[2.7264075 ]
 [1.4143496 ]
 [1.2954478 ]
 [1.3723887 ]
 [3.4038458 ]
 [1.1765496 ]
 [2.700584  ]
 [1.5781198 ]
 [2.2624693 ]
 [1.6654685 ]
 [1.4642131 ]
 [1.2665484 ]
 [2.2498085 ]
 [1.9902084 ]
 [1.0313913 ]
 [4.5407147 ]
 [2.8821821 ]
 [1.6315236 ]
 [1.6099365 ]
 [3.2038043 ]
 [1.4299262 ]
 [1.8764856 ]
 [3.8301075 ]
 [1.8530257 ]
 [1.1724572 ]
 [2.4590178 ]
 [3.271871  ]
 [1.517028  ]
 [1.4448864 ]
 [1.4845127 ]
 [1.8635576 ]
 [0.68995494]
 [2.647474  ]
 [1.8083823 ]
 [1.8530586 ]
 [0.98254734]
 [0.7849967 ]
 [2.3271177 ]
 [2.3187509 ]
 [2.3228204 ]
 [2.8309236 ]
 [1.200787  ]
 [1.1900487 ]
 [0.9428574 ]
 [1.8227458 ]
 [2.3194866 ]
 [1.0574241 ]
 [2.0407398 ]
 [1.8645127 ]
 [3.7467668 ]
 [2.321295  ]
 [1.374466  ]
 [2.400399  ]
 [4.3426676 ]
 [0.46418852]
 [1.3146137 ]
 [0.9788694 ]
 [0.9136567 ]
 [2.1014051 ]
 [2.7256548 ]
 [3.3530211 ]
 [2.1778219 ]
 [1.8424964 ]
 [1.958816  ]
 [2.5366106 ]
 [2.560691  ]
 [2.5450027 ]
 [2.3932204 ]
 [4.529123  ]
 [1.0983729 ]
 [2.61274   ]
 [2.00

In [48]:
optimizer = keras.optimizers.Nadam(0.01)
loss_fn = keras.losses.mean_squared_error

In [49]:
n_epochs = 5
batch_size = 32
n_steps_per_epoch = len(x_train) // batch_size
total_steps = n_epochs * n_steps_per_epoch
global_step = 0

In [50]:
for x_batch, y_batch in train_set.take(total_steps):
    global_step += 1
    print('\rGlobal step {}/{}'.format(global_step, total_steps), end='')
    with tf.GradientTape() as tape:
        y_pred = model(x_batch)
        main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
        loss = tf.add_n([main_loss] + model.losses)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

Global step 363/1810

In [51]:
optimizer = keras.optimizers.Nadam(0.01)
loss_fn = keras.losses.mean_squared_error

In [52]:
@tf.function
def train(model, n_epochs, batch_size = 32, n_readers=5, n_read_threads = 5,
          shuffle_buffer_size = 10000, n_parse_threads=5):
    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,
                                  n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,
                                  n_parse_threads=n_parse_threads, batch_size=batch_size)
    for x_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            y_pred = model(x_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
train(model ,5)

In [53]:
keras.backend.clear_session()

In [54]:
optimizer = keras.optimizers.Nadam(0.01)
loss_fn = keras.losses.mean_squared_error

In [55]:
@tf.function
def train(model, n_epochs, batch_size = 32, n_readers=5, n_read_threads = 5,
          shuffle_buffer_size = 10000, n_parse_threads=5):
    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,
                                  n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,
                                  n_parse_threads=n_parse_threads, batch_size=batch_size)
    n_steps_per_epoch = len(x_train) // batch_size
    total_steps = n_epochs * n_steps_per_epoch
    global_step = 0
    for x_batch, y_batch in train_set:
        global_step += 1
        if tf.equal(global_step % 100, 0):
            tf.print("\rGlobal step", global_step, "/", total_steps)
        with tf.GradientTape() as tape:
            y_pred = model(x_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
train(model ,5)

Global step 100 / 1810
Global step 200 / 1810
Global step 300 / 1810


In [56]:
for m in dir(tf.data.Dataset):
    if not (m.startswith("_") or m.endswith("_")):
        func = getattr(tf.data.Dataset, m)
        if hasattr(func, "__doc__"):
            print("● {:21s}{}".format(m + "()", func.__doc__.split("\n")[0]))

● apply()              Applies a transformation function to this dataset.
● as_numpy_iterator()  Returns an iterator which converts all elements of the dataset to numpy.
● batch()              Combines consecutive elements of this dataset into batches.
● cache()              Caches the elements in this dataset.
● concatenate()        Creates a `Dataset` by concatenating the given dataset with this dataset.
● element_spec()       The type specification of an element of this dataset.
● enumerate()          Enumerates the elements of this dataset.
● filter()             Filters this dataset according to `predicate`.
● flat_map()           Maps `map_func` across this dataset and flattens the result.
● from_generator()     Creates a `Dataset` whose elements are generated by `generator`.
● from_tensor_slices() Creates a `Dataset` whose elements are slices of the given tensors.
● from_tensors()       Creates a `Dataset` with a single element, comprising the given tensors.
● interleave()      

In [57]:
with tf.io.TFRecordWriter('my_data.tfrecord') as f:
    f.write(b'This is the first record')
    f.write(b'And this is the second record')

## TFRecord

In [58]:
filepaths=['my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [59]:
filepaths = ['my_test_{}.tfrecord'.format(i) for i in range(5)]
for i, filepath in enumerate(filepaths):
    with tf.io.TFRecordWriter(filepath) as f:
        for j in range(3):
            f.write('File {} record {}'.format(i, j).encode('utf-8'))

dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)
for item in dataset:
    print(item)

tf.Tensor(b'File 0 record 0', shape=(), dtype=string)
tf.Tensor(b'File 1 record 0', shape=(), dtype=string)
tf.Tensor(b'File 2 record 0', shape=(), dtype=string)
tf.Tensor(b'File 0 record 1', shape=(), dtype=string)
tf.Tensor(b'File 1 record 1', shape=(), dtype=string)
tf.Tensor(b'File 2 record 1', shape=(), dtype=string)
tf.Tensor(b'File 0 record 2', shape=(), dtype=string)
tf.Tensor(b'File 1 record 2', shape=(), dtype=string)
tf.Tensor(b'File 2 record 2', shape=(), dtype=string)
tf.Tensor(b'File 3 record 0', shape=(), dtype=string)
tf.Tensor(b'File 4 record 0', shape=(), dtype=string)
tf.Tensor(b'File 3 record 1', shape=(), dtype=string)
tf.Tensor(b'File 4 record 1', shape=(), dtype=string)
tf.Tensor(b'File 3 record 2', shape=(), dtype=string)
tf.Tensor(b'File 4 record 2', shape=(), dtype=string)


In [60]:
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_compressed_tfrecord', options) as f:
    f.write(b'This is the first record')
    f.write(b'And this is the second record')

In [61]:
dataset = tf.data.TFRecordDataset(['my_compressed_tfrecord'], compression_type='GZIP')

for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)
