In [1]:
import tensorflow as tf
from tensorflow import keras

In [87]:
# Build a dataset with one element per entry of X (the integers 0 through 9).
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [88]:
# Iterating the dataset yields its elements one at a time, each as a scalar tensor.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [89]:
# Chaining transformations: repeat(3) goes through the source items three times,
# then batch(7) groups the resulting stream into batches of 7 items (the final
# batch only holds the 2 leftover items).
# NOTE: this rebinds `dataset`, so re-running the cell repeats and batches again.
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [90]:
# map() applies the function to every item of the dataset. The items here are
# the batches produced by the previous cell, so each batch is doubled element-wise.
dataset = dataset.map(lambda x: x * 2)
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [91]:
# apply() hands the whole dataset to a transformation function and uses the
# dataset that function returns, whereas map() works item by item. The
# transformation used here splits the batches back into individual items.
# Dataset.unbatch() replaces tf.data.experimental.unbatch(), which is deprecated.
dataset = dataset.apply(lambda ds: ds.unbatch())
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [92]:
# filter() keeps only the items for which the predicate is true (values below 10 here).
dataset = dataset.filter(lambda x: x < 10)
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [93]:
# Shuffling the dataset (using a buffer): the integers 0..9 repeated three times
# are shuffled through a 5-item buffer with a fixed seed, then grouped into batches of 7.
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


In [94]:
# Reading data from multiple files and interleaving their lines.
# list_files() turns the glob pattern into a dataset of matching file paths
# (a seed is given so the file order is repeatable).
train_filepaths = ['data/housing/my_train_*.csv']
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

# Each file path is turned into a dataset of its text lines with the first line
# skipped (presumably the CSV header -- confirm against the files); interleave()
# then cycles over n_readers of these per-file datasets at a time.
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

In [95]:
# Peek at the first five raw CSV lines produced by the interleaved readers.
for item in dataset.take(5):
    print(item)

tf.Tensor(b'3.6471,19.0,6.479338842975206,0.9628099173553719,1631.0,3.3698347107438016,37.32,-120.45,1.014', shape=(), dtype=string)
tf.Tensor(b'3.6471,19.0,6.479338842975206,0.9628099173553719,1631.0,3.3698347107438016,37.32,-120.45,1.014', shape=(), dtype=string)
tf.Tensor(b'3.6471,19.0,6.479338842975206,0.9628099173553719,1631.0,3.3698347107438016,37.32,-120.45,1.014', shape=(), dtype=string)
tf.Tensor(b'3.6471,19.0,6.479338842975206,0.9628099173553719,1631.0,3.3698347107438016,37.32,-120.45,1.014', shape=(), dtype=string)
tf.Tensor(b'3.6471,19.0,6.479338842975206,0.9628099173553719,1631.0,3.3698347107438016,37.32,-120.45,1.014', shape=(), dtype=string)


# Preprocessing data

In [103]:
# Assume the mean and standard deviation of every input feature are known;
# here they are obtained by fitting a StandardScaler on the training split.
# fetch_california_housing is imported from sklearn.datasets directly because
# the private sklearn.datasets.california_housing module is not importable in
# recent scikit-learn releases.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X = housing['data']
y = housing['target']
X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_   # per-feature mean of the training split
X_std = scaler.scale_   # per-feature scale (standard deviation) of the training split
print(X_mean, X_std)

[ 3.87614072e+00  2.86598191e+01  5.43676547e+00  1.09636817e+00
  1.43038979e+03  3.07711721e+00  3.56146835e+01 -1.19552319e+02] [1.89840947e+00 1.26200248e+01 2.56781628e+00 4.90943971e-01
 1.13369286e+03 1.11416114e+01 2.13471721e+00 2.00299702e+00]


In [114]:
n_inputs = 8

def preprocess(line):
    """Parse one CSV line into a standardised feature vector and its target.

    Args:
        line: one CSV record holding `n_inputs` feature values followed by
            the target value.

    Returns:
        A pair (x, y): x is the stacked feature values after subtracting
        X_mean and dividing by X_std; y is a one-element tensor with the target.
    """
    # The n_inputs feature columns default to 0. when a value is missing. The
    # target column is given an empty float32 constant instead of a default
    # value, which makes it a required field: decoding fails if it is missing.
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

# Try the parser on a single record. The function defined above is named
# `preprocess` (calling `preprocessing` raises a NameError).
preprocess(b'3.6471,19.0,6.479338842975206,0.9628099173553719,1631.0,3.3698347107438016,37.32,-120.45,1.014')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-0.12064882, -0.7654358 ,  0.40601543, -0.27204382,  0.17695288,
         0.02627245,  0.79884803, -0.4481657 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.014], dtype=float32)>)

In [115]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a shuffled, batched dataset of preprocessed records from CSV files.

    Args:
        filepaths: file paths or glob patterns passed to Dataset.list_files.
        repeat: count passed to Dataset.repeat after shuffling.
        n_readers: cycle_length of the interleave over the per-file datasets.
        n_read_threads: num_parallel_calls for the interleave step.
        shuffle_buffer_size: buffer size passed to Dataset.shuffle.
        n_parse_threads: num_parallel_calls for the `preprocess` map step.
        batch_size: number of records per batch.

    Returns:
        The batched dataset with prefetch(1) applied.
    """
    def read_lines(filepath):
        # One dataset of text lines per file, without its first line
        # (presumably the CSV header -- confirm against the files).
        return tf.data.TextLineDataset(filepath).skip(1)

    files = tf.data.Dataset.list_files(filepaths)
    lines = files.interleave(read_lines,
                             cycle_length=n_readers,
                             num_parallel_calls=n_read_threads)
    records = lines.map(preprocess, num_parallel_calls=n_parse_threads)
    shuffled = records.shuffle(shuffle_buffer_size)
    repeated = shuffled.repeat(repeat)
    batches = repeated.batch(batch_size)
    return batches.prefetch(1)

In [117]:
# Locations of the CSV files: the training data is matched by a glob pattern,
# validation and test data each live in a single file.
train_filepaths = ['data/housing/my_train_*.csv']
valid_filepaths = ['data/housing/my_valid.csv']
test_filepaths = ['data/housing/my_test.csv']

# All three sets go through the same pipeline with its default settings.
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [128]:
# Regression network: two 30-unit hidden blocks with ELU activations and batch
# normalisation, followed by a single output unit.
# NOTE(review): batch normalisation comes before the activation in the first
# hidden block and after it in the second -- confirm the asymmetry is intended.
model = keras.Sequential([
    keras.layers.Dense(30),
    keras.layers.BatchNormalization(),
    keras.layers.Activation(activation='elu'),
    keras.layers.Dense(30),
    keras.layers.Activation(activation='elu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1),
])
model.compile(loss='mean_squared_error')

# Train for at most 1000 epochs; the EarlyStopping callback (patience=10)
# is what ends the run well before that limit.
model.fit(
    train_set,
    epochs=1000,
    validation_data=valid_set,
    callbacks=[keras.callbacks.EarlyStopping(patience=10)],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000


<tensorflow.python.keras.callbacks.History at 0x13d672ac8>

In [129]:
# Report the loss (mean squared error, as configured in compile()) on the test set.
model.evaluate(test_set)

     97/Unknown - 1s 6ms/step - loss: 0.3617

0.36165607574674274

In [130]:
# Predict on the first 3 batches of the test set; the map() drops the targets
# so that only the input features are passed to the model.
new_set = test_set.take(3).map(lambda X, y: X)
model.predict(new_set)

array([[1.3529316 ],
       [1.2954531 ],
       [3.1000936 ],
       [2.3362093 ],
       [1.7327207 ],
       [1.5806928 ],
       [1.5339705 ],
       [1.5658405 ],
       [1.7004864 ],
       [1.3354988 ],
       [1.2801796 ],
       [4.461221  ],
       [2.2197528 ],
       [2.3680706 ],
       [1.8634199 ],
       [1.9733782 ],
       [1.5613539 ],
       [3.0820854 ],
       [1.2606753 ],
       [2.427564  ],
       [0.29964465],
       [2.3101547 ],
       [3.324477  ],
       [2.7751498 ],
       [2.2630048 ],
       [1.863718  ],
       [2.4575977 ],
       [1.3613247 ],
       [3.9319975 ],
       [2.5511432 ],
       [1.1018442 ],
       [3.5492716 ],
       [1.2775346 ],
       [0.68823963],
       [1.7904631 ],
       [1.7566411 ],
       [0.8878375 ],
       [1.5761279 ],
       [2.2615128 ],
       [2.7767522 ],
       [1.4034616 ],
       [3.625371  ],
       [2.4840877 ],
       [1.5432407 ],
       [1.492744  ],
       [1.5696832 ],
       [1.2224617 ],
       [1.743

# TFRecord format

In [6]:
# Creating a GZIP-compressed TFRecord file and reading it back.

# Write two raw byte-string records; the `with` block closes the writer.
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_data.tfrecord", options) as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

# The file is read back with the same compression type it was written with.
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths, compression_type="GZIP")

for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


# Preprocessing MNIST dataset (protobuf)

In [17]:
from sklearn.datasets import fetch_openml
import numpy as np
import tensorflow as tf

In [13]:
# Download MNIST from OpenML. as_frame=False requests plain NumPy arrays:
# recent scikit-learn versions may return pandas objects by default, and the
# cells below slice and iterate `data` / `target` row by row as arrays.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [21]:
# Cast pixels and labels to uint8, then split by position: the first 50,000
# rows for training, the next 10,000 for validation and the remainder for testing.
X, y = mnist['data'].astype(np.uint8), mnist['target'].astype(np.uint8)
X_train, X_valid, X_test = X[:50000], X[50000:60000], X[60000:]
y_train, y_valid, y_test = y[:50000], y[50000:60000], y[60000:]

In [84]:
def _image_to_protobuf(image, label):
    '''Wrap one MNIST image and its label in a tf.train.Example protobuf.

    The image values are stored as an int64 list under the "image" key and
    the label as a single-element int64 list under the "label" key.
    '''
    pixel_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=image))
    label_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
    features = tf.train.Features(
        feature={"image": pixel_feature, "label": label_feature}
    )
    return tf.train.Example(features=features)

def images_to_protobuf(images, labels, filepaths):
    '''Write images and labels as serialized Examples, sharded across `filepaths`.

    The samples are split into len(filepaths) contiguous shards, written in
    order. When the number of samples is not a multiple of the number of
    shards, the first len(images) % len(filepaths) shards receive one extra
    sample each, so no sample is dropped. With an exact multiple the shard
    ranges are simply consecutive equal-sized slices.

    Args:
        images: sequence of images; images[i] is stored together with labels[i].
        labels: sequence of labels aligned with `images`.
        filepaths: output TFRecord paths, one per shard.

    Raises:
        ValueError: if `filepaths` is empty.
    '''
    if len(filepaths) == 0:
        raise ValueError("filepaths must contain at least one output path")

    n_obs, n_extra = divmod(len(images), len(filepaths))

    start = 0
    for i, filepath in enumerate(filepaths):
        # The first n_extra shards absorb the remainder, one sample each.
        stop = start + n_obs + (1 if i < n_extra else 0)
        with tf.io.TFRecordWriter(filepath) as f:
            print(f"saving range {start} : {stop}")
            for image, label in zip(images[start:stop], labels[start:stop]):
                f.write(_image_to_protobuf(image, label).SerializeToString())
        start = stop
            
def load_images(filepaths):
    '''Read serialized Examples from TFRecord files back into images and labels.

    Args:
        filepaths: TFRecord file path(s) passed to tf.data.TFRecordDataset.

    Returns:
        A pair of lists (images, labels) with one parsed tensor per record:
        a 784-value int64 image and a scalar int64 label.
    '''
    # Parsing schema; a feature absent from a record falls back to zeros.
    schema = {
        "image": tf.io.FixedLenFeature([784], tf.int64, default_value=[0] * 784),
        "label": tf.io.FixedLenFeature([], tf.int64, default_value=0)
    }

    images = []
    labels = []
    records = tf.data.TFRecordDataset(filepaths)
    for record in records:
        example = tf.io.parse_single_example(record, schema)
        images.append(example["image"])
        labels.append(example["label"])

    return images, labels

In [83]:
# Write the training set as 10 TFRecord shards under data/mnist/.
filepaths = [f"data/mnist/trainset_{i}.tfrecord" for i in range(10)]
images_to_protobuf(X_train, y_train, filepaths)

saving range 0 : 5000
saving range 5000 : 10000
saving range 10000 : 15000
saving range 15000 : 20000
saving range 20000 : 25000
saving range 25000 : 30000
saving range 30000 : 35000
saving range 35000 : 40000
saving range 40000 : 45000
saving range 45000 : 50000


In [85]:
# Read the shards back.
# NOTE: this rebinds X_train / y_train to the lists returned by load_images.
X_train, y_train = load_images(filepaths)

# Classifying movie reviews

In [88]:
import tensorflow_datasets as tfds

In [99]:
# Load the "imdb_reviews" dataset from TensorFlow Datasets in batches of 32.
# NOTE(review): as_supervised=True presumably yields (input, label) pairs --
# confirm against the tensorflow_datasets documentation.
dataset = tfds.load("imdb_reviews", batch_size=32, as_supervised=True)

In [100]:
# The IMDB data loaded in the previous cell is bound to `dataset`; `ratings`
# was never defined, so bind it here before picking the splits.
ratings = dataset
ratings_train = ratings["train"].prefetch(1)
ratings_other = ratings["test"]