In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
# Build a 1-D tensor holding the integers 0..9 and print each scalar element.
X = tf.range(10)
for item in X:
    print(item)
# Wrap the tensor in a tf.data.Dataset whose elements are the individual slices of X.
dataset = tf.data.Dataset.from_tensor_slices(X)
# Bare last expression so the notebook displays the dataset's repr.
dataset

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [3]:
def square(x):
    """Return ``x`` raised to the power of two.

    Works on any object that supports the ``**`` operator (Python numbers,
    tensors, arrays), so it can be passed directly to ``Dataset.map``.
    """
    return x ** 2


# Backward-compatible alias: the helper was originally named ``sqrt`` even
# though it squares its argument. Existing callers keep working; prefer
# ``square`` in new code.
sqrt = square

In [4]:
# Equivalent ways of applying the same element-wise transformation:
# dataset = dataset.map(lambda x: sqrt(x))
# dataset = dataset.map(lambda x: x ** 2)
# NOTE: despite its name, `sqrt` returns x ** 2, so every element is squared.
# This cell rebinds `dataset`, so running it a second time squares the values again.
dataset = dataset.map(sqrt)
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(25, shape=(), dtype=int32)
tf.Tensor(36, shape=(), dtype=int32)
tf.Tensor(49, shape=(), dtype=int32)
tf.Tensor(64, shape=(), dtype=int32)
tf.Tensor(81, shape=(), dtype=int32)


In [5]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

In [6]:
# Load the California housing data; the targets are reshaped into a column vector.
# NOTE: this rebinds `dataset` and `X`, which held tf.data / tensor objects above.
dataset = fetch_california_housing()
X, y = dataset.data, dataset.target.reshape(-1, 1)
# Two successive splits with the same seed: full-train/test, then train/validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training features only and keep the per-feature mean and
# scale; `preprocess` reads X_mean / X_std to standardize parsed CSV rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [7]:
def split_dataset_to_csv(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files under ``datasets/housing``.

    Row indices are distributed with ``np.array_split``. Files are named
    ``my_<name_prefix>_<NN>.csv`` and are overwritten if they already exist.

    Args:
        data: indexable 2-D collection of numeric rows (e.g. a NumPy array).
        name_prefix: label embedded in each file name (e.g. ``"train"``).
        header: optional header line written at the top of every part.
        n_parts: number of files to create.

    Returns:
        List of the written file paths, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    file_paths = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        file_paths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_idx in row_indices:
                # str() rather than repr(): under NumPy >= 2.0, repr() of a scalar
                # is e.g. "np.float64(3.5)", which would corrupt the CSV, whereas
                # str() still yields the plain number.
                f.write(",".join(str(col) for col in data[row_idx]))
                f.write("\n")

    return file_paths

In [8]:
# Append the target as the last column of each split so every CSV row is
# "features..., target".
train_data = np.c_[X_train, y_train]
test_data = np.c_[X_test, y_test]
val_data = np.c_[X_val, y_val]
# `dataset` must still be the scikit-learn housing bunch here (it is rebound
# to tf.data objects in later cells).
header_cols = dataset.feature_names + ["MedianHouseValue"]
header = ','.join(header_cols)

# The training split is written as 20 parts; validation and test use the default of 10.
train_file_paths = split_dataset_to_csv(train_data, "train", header, n_parts=20)
valid_file_paths = split_dataset_to_csv(val_data, "valid", header)
test_file_paths = split_dataset_to_csv(test_data, "test", header)

In [9]:
import pandas as pd

In [10]:
pd.read_csv(train_file_paths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


In [11]:
train_file_paths[0]

'datasets\\housing\\my_train_00.csv'

In [12]:
# Dataset of the training CSV paths. The printed order differs from the order of
# `train_file_paths` because list_files shuffles file names unless shuffle=False.
filepath_dataset = tf.data.Dataset.list_files(train_file_paths)
for file in filepath_dataset:
    print(file)

tf.Tensor(b'datasets\\housing\\my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_15.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_08.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_03.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_04.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_16.csv', sh

In [13]:
# Each path becomes a TextLineDataset whose first line (the CSV header written by
# split_dataset_to_csv) is skipped; interleave pulls lines from `cycle_length`
# such per-file datasets at a time.
# NOTE: rebinds `dataset`, which previously held the scikit-learn housing bunch.
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=5)

In [14]:
# Peek at the first five interleaved lines; they are still raw, undecoded CSV strings.
for line in dataset.take(5):
    print(line)

tf.Tensor(b'4.7361,7.0,7.464968152866242,1.1178343949044587,846.0,2.694267515923567,34.49,-117.27,1.745', shape=(), dtype=string)
tf.Tensor(b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782', shape=(), dtype=string)
tf.Tensor(b'5.9522,26.0,6.196521739130435,1.0069565217391305,1479.0,2.5721739130434784,34.5,-119.75,4.384', shape=(), dtype=string)
tf.Tensor(b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526', shape=(), dtype=string)
tf.Tensor(b'1.6571,34.0,4.454976303317536,1.0876777251184835,1358.0,3.2180094786729856,37.94,-122.35,1.052', shape=(), dtype=string)


In [15]:
n_inputs = 8
def preprocess(line):
    """Decode one CSV record into a standardized feature vector and its target.

    Reads the notebook-level globals ``n_inputs``, ``X_mean`` and ``X_std``.

    Args:
        line: a single CSV record with ``n_inputs`` feature columns followed by
            one target column.

    Returns:
        A ``(features, target)`` pair, where ``features`` is
        ``(x - X_mean) / X_std`` and ``target`` is the last column stacked into
        a one-element tensor.
    """
    # One default per column: 0.0 for each feature and an empty float32 constant
    # for the target (per the tf.io.decode_csv docs, an empty default marks the
    # column as required).
    record_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    columns = tf.io.decode_csv(line, record_defaults=record_defaults)
    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    standardized = (features - X_mean) / X_std
    return standardized, target
preprocess(b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.36618188, -0.998705  ,  0.00781878, -0.00675364, -0.06140145,
         0.0072037 , -0.94465536,  0.9367464 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.418], dtype=float32)>)

In [16]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a shuffled, batched, prefetched dataset from header-prefixed CSV files.

    Pipeline: list files -> repeat -> interleave the lines of ``n_readers`` files
    (dropping each file's first line) -> shuffle -> parse with the notebook-level
    ``preprocess`` -> batch -> prefetch one batch.

    Args:
        filepaths: file paths or pattern accepted by ``tf.data.Dataset.list_files``.
        repeat: forwarded to ``Dataset.repeat`` on the file-path dataset.
        n_readers: ``cycle_length`` of the interleave step.
        n_read_threads: ``num_parallel_calls`` of the interleave step.
        shuffle_buffer_size: buffer size of the shuffle step.
        n_parse_threads: ``num_parallel_calls`` of the parsing ``map`` step.
        batch_size: number of parsed records per batch.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    def read_data_lines(filepath):
        # Skip the first line of every file (the CSV header).
        return tf.data.TextLineDataset(filepath).skip(1)

    return (
        tf.data.Dataset.list_files(filepaths)
        .repeat(repeat)
        .interleave(read_data_lines,
                    cycle_length=n_readers,
                    num_parallel_calls=n_read_threads)
        .shuffle(shuffle_buffer_size)
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .batch(batch_size)
        .prefetch(1)
    )

In [17]:
# repeat=None is forwarded to Dataset.repeat(), i.e. the training set repeats
# indefinitely; this is why model.fit is given an explicit steps_per_epoch.
train_set = csv_reader_dataset(train_file_paths, repeat=None)
# Test and validation sets use the default repeat=1.
test_set = csv_reader_dataset(test_file_paths)
val_set = csv_reader_dataset(valid_file_paths)

In [18]:
# Reset Keras' global state and fix the NumPy / TensorFlow seeds before building the model.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# Small regression network: 8 input features -> 30 ReLU units -> 1 linear output.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[8,]),
    keras.layers.Dense(1)
    
])

In [19]:
print(len(X_train))
# Mean-squared-error loss optimized with plain SGD.
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))
# Should match the batch_size used by csv_reader_dataset when train_set was built
# (its default is also 32); it is only used here to derive steps_per_epoch.
batch_size = 32
# train_set was created with repeat=None, so the epoch length is set explicitly.
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=val_set)

11610
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2bce0468820>

# TFRecord

In [20]:
# Write two raw byte-string records to a TFRecord file in the working directory.
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"First Record")
    f.write(b"Second Record")

In [21]:
# Read the records back; each dataset element is one record as a string tensor.
# NOTE: rebinds `dataset` once more.
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'First Record', shape=(), dtype=string)
tf.Tensor(b'Second Record', shape=(), dtype=string)


# Compressing TFRecord Files

In [22]:
# Write two GZIP-compressed records, then open the file with the same
# compression_type that was used for writing.
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
    f.write(b"Babingoblingo")
    f.write(b"Zingobenzingo")
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"], compression_type="GZIP")

In [23]:
# Each element is one decompressed record (despite the loop variable's name, not a file).
for file in dataset:
    print(file)

tf.Tensor(b'Babingoblingo', shape=(), dtype=string)
tf.Tensor(b'Zingobenzingo', shape=(), dtype=string)


# Protocol Buffers

In [24]:
%%writefile person.proto
syntax = "proto3";
message Person {
  string name = 1;
  int32 id = 2;
  repeated string email = 3;
}

Overwriting person.proto


In [25]:
!protoc person.proto --python_out=. --descriptor_set_out=person.desc --include_imports

In [26]:
!ls person*

person.desc
person.proto
person_pb2.py


In [27]:
from person_pb2 import Person

# Instantiate the generated protobuf message; `email` is a repeated field, so it takes a list.
person = Person(name="al", id=2, email=["a@b.com"])

In [28]:
print(person)

name: "al"
id: 2
email: "a@b.com"



In [29]:
# Mutates `person` in place: re-running this cell appends the address again.
person.email.append("c@d.com")

In [30]:
person.email

['a@b.com', 'c@d.com']

In [31]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

In [32]:
# Build a tf.train.Example with three single-valued features: two bytes lists
# ("name", "email") and one int64 list ("id").
instance_1 = Example(
    features = Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"John"])),
            "id": Feature(int64_list=Int64List(value=[1])),
            "email": Feature(bytes_list=BytesList(value=[b"john@bitch.com"]))
        }

    )
)

In [33]:
instance_1

features {
  feature {
    key: "name"
    value {
      bytes_list {
        value: "John"
      }
    }
  }
  feature {
    key: "id"
    value {
      int64_list {
        value: 1
      }
    }
  }
  feature {
    key: "email"
    value {
      bytes_list {
        value: "john@bitch.com"
      }
    }
  }
}

In [34]:
# Serialize the Example protobuf to bytes and store it as a single TFRecord.
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(instance_1.SerializeToString())

In [35]:
# Parsing schema for the Example written to my_contacts.tfrecord: "name" and "id"
# are scalar fixed-length features with defaults; "email" is variable-length and
# is returned as a SparseTensor when parsed.
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "email": tf.io.VarLenFeature(tf.string)
}

In [36]:
# Parse every serialized Example in the file. Only the result of the last
# iteration stays bound to `parsed_instance` (the file holds a single record).
for instance in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_instance = tf.io.parse_single_example(instance, feature_description)

In [37]:
parsed_instance

{'email': SparseTensor(indices=tf.Tensor([[0]], shape=(1, 1), dtype=int64), values=tf.Tensor([b'john@bitch.com'], shape=(1,), dtype=string), dense_shape=tf.Tensor([1], shape=(1,), dtype=int64)),
 'id': <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'John'>}

# Images in TFRecords

In [38]:
from sklearn.datasets import load_sample_images

In [39]:
# Take the first of the sample images returned by scikit-learn.
img = load_sample_images()["images"][0]

In [40]:
# JPEG-encode the image; the encoded bytes are read back with data.numpy() below.
data = tf.io.encode_jpeg(img)

In [41]:
# Wrap the JPEG bytes in an Example as a single-valued bytes feature named "image".
image_1 = Example(
    features = Features(
        feature = {
            "image": Feature(bytes_list=BytesList(value=[data.numpy()]))
        }
    
    )
)

### Handling Sequential Data Using SequenceExample

In [42]:
# Context (non-sequential) features of the SequenceExample: a scalar author id,
# the title as a list of tokens, and a three-integer publication date.
context = Features(feature={
    "author_id": Feature(int64_list=Int64List(value=[123])),
    "title": Feature(bytes_list=BytesList(value=[b"A", b"desert", b"place", b"."])),
    "pub_date": Feature(int64_list=Int64List(value=[1623, 12, 25]))
})

In [43]:
# Tokenized text: each inner list is one sentence (content) or one comment,
# and the inner lists have different lengths.
content = [["When", "shall", "we", "three", "meet", "again", "?"],
           ["In", "thunder", ",", "lightning", ",", "or", "in", "rain", "?"]]
comments = [["When", "the", "hurlyburly", "'s", "done", "."],
            ["When", "the", "battle", "'s", "lost", "and", "won", "."]]

In [44]:
def words_to_feature(words):
    """Pack an iterable of ``str`` tokens into a bytes-list ``Feature``.

    Each word is UTF-8 encoded, and the encoded values keep the input order.
    """
    encoded_words = [token.encode("utf-8") for token in words]
    return Feature(bytes_list=BytesList(value=encoded_words))

# One Feature per sentence / comment, ready to be wrapped in FeatureList objects.
content_features = [words_to_feature(sentence) for sentence in content]
comments_features = [words_to_feature(comment) for comment in comments]

In [45]:
from tensorflow.train import SequenceExample, FeatureLists, FeatureList

In [46]:
# Combine the context features with two named feature lists ("content" and
# "comments"), each holding one Feature per sentence / comment.
sequence_example = SequenceExample(
    context=context,
    feature_lists=FeatureLists(feature_list={
        "content": FeatureList(feature=content_features),
        "comments": FeatureList(feature=comments_features)
    }))

In [47]:
sequence_example

context {
  feature {
    key: "title"
    value {
      bytes_list {
        value: "A"
        value: "desert"
        value: "place"
        value: "."
      }
    }
  }
  feature {
    key: "pub_date"
    value {
      int64_list {
        value: 1623
        value: 12
        value: 25
      }
    }
  }
  feature {
    key: "author_id"
    value {
      int64_list {
        value: 123
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "content"
    value {
      feature {
        bytes_list {
          value: "When"
          value: "shall"
          value: "we"
          value: "three"
          value: "meet"
          value: "again"
          value: "?"
        }
      }
      feature {
        bytes_list {
          value: "In"
          value: "thunder"
          value: ","
          value: "lightning"
          value: ","
          value: "or"
          value: "in"
          value: "rain"
          value: "?"
        }
      }
    }
  }
  feature_list {
    key: "c

### Parsing serialized data

In [48]:
# Serialize the SequenceExample protobuf to bytes and display the raw result.
serialized_sequence_example = sequence_example.SerializeToString()
serialized_sequence_example

b"\nL\n\x14\n\x08pub_date\x12\x08\x1a\x06\n\x04\xd7\x0c\x0c\x19\n\x12\n\tauthor_id\x12\x05\x1a\x03\n\x01{\n \n\x05title\x12\x17\n\x15\n\x01A\n\x06desert\n\x05place\n\x01.\x12\xd0\x01\nb\n\x08comments\x12V\n&\n$\n\x04When\n\x03the\n\nhurlyburly\n\x02's\n\x04done\n\x01.\n,\n*\n\x04When\n\x03the\n\x06battle\n\x02's\n\x04lost\n\x03and\n\x03won\n\x01.\nj\n\x07content\x12_\n*\n(\n\x04When\n\x05shall\n\x02we\n\x05three\n\x04meet\n\x05again\n\x01?\n1\n/\n\x02In\n\x07thunder\n\x01,\n\tlightning\n\x01,\n\x02or\n\x02in\n\x04rain\n\x01?"

In [49]:
# Schema for the context features: "author_id" is a scalar, "title" is
# variable-length, and "pub_date" holds exactly three integers (shape=[3]).
context_feature_description = {
    "author_id": tf.io.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
    "title": tf.io.VarLenFeature(dtype=tf.string),
    "pub_date": tf.io.FixedLenFeature(shape=[3], dtype=tf.int64, default_value=[0,0,0])
}

# Both feature lists are parsed as variable-length string features, which come
# back as SparseTensors.
sequence_feature_description = {
    "content": tf.io.VarLenFeature(tf.string),
    "comments": tf.io.VarLenFeature(tf.string)
}

# Returns two dicts: the parsed context features and the parsed feature lists.
parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(serialized_sequence_example,
                                                                    context_feature_description, 
                                                                    sequence_feature_description)

In [50]:
parsed_context

{'title': SparseTensor(indices=tf.Tensor(
 [[0]
  [1]
  [2]
  [3]], shape=(4, 1), dtype=int64), values=tf.Tensor([b'A' b'desert' b'place' b'.'], shape=(4,), dtype=string), dense_shape=tf.Tensor([4], shape=(1,), dtype=int64)),
 'author_id': <tf.Tensor: shape=(), dtype=int64, numpy=123>,
 'pub_date': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([1623,   12,   25], dtype=int64)>}

In [51]:
parsed_context["title"].values

<tf.Tensor: shape=(4,), dtype=string, numpy=array([b'A', b'desert', b'place', b'.'], dtype=object)>

In [52]:
parsed_feature_lists

{'comments': SparseTensor(indices=tf.Tensor(
 [[0 0]
  [0 1]
  [0 2]
  [0 3]
  [0 4]
  [0 5]
  [1 0]
  [1 1]
  [1 2]
  [1 3]
  [1 4]
  [1 5]
  [1 6]
  [1 7]], shape=(14, 2), dtype=int64), values=tf.Tensor(
 [b'When' b'the' b'hurlyburly' b"'s" b'done' b'.' b'When' b'the' b'battle'
  b"'s" b'lost' b'and' b'won' b'.'], shape=(14,), dtype=string), dense_shape=tf.Tensor([2 8], shape=(2,), dtype=int64)),
 'content': SparseTensor(indices=tf.Tensor(
 [[0 0]
  [0 1]
  [0 2]
  [0 3]
  [0 4]
  [0 5]
  [0 6]
  [1 0]
  [1 1]
  [1 2]
  [1 3]
  [1 4]
  [1 5]
  [1 6]
  [1 7]
  [1 8]], shape=(16, 2), dtype=int64), values=tf.Tensor(
 [b'When' b'shall' b'we' b'three' b'meet' b'again' b'?' b'In' b'thunder'
  b',' b'lightning' b',' b'or' b'in' b'rain' b'?'], shape=(16,), dtype=string), dense_shape=tf.Tensor([2 9], shape=(2,), dtype=int64))}

In [53]:
# Convert the sparse "content" feature list to a ragged tensor with one row per sentence.
print(tf.RaggedTensor.from_sparse(parsed_feature_lists["content"]))

<tf.RaggedTensor [[b'When', b'shall', b'we', b'three', b'meet', b'again', b'?'],
 [b'In', b'thunder', b',', b'lightning', b',', b'or', b'in', b'rain', b'?']]>


# Categorical Embedding

In [54]:
# Map each known category string to its position in `vocabs` (int64 indices 0..4).
vocabs = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocabs), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocabs, indices)

In [55]:
# Lookup table with two extra out-of-vocabulary buckets for categories that are
# not in `vocabs` (such as "DESERT" below).
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
# NOTE: rebinds `y`, which previously held the housing targets.
y = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
y_indices = table.lookup(y)

In [56]:
# One 2-dimensional embedding row per known category plus one per OOV bucket.
# The displayed values come from the layer's initial weights (it has not been trained).
embedding = keras.layers.Embedding(len(vocabs) + num_oov_buckets, 2)
embedding(y_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.0087769 ,  0.02261391],
       [-0.03627525,  0.03207256],
       [-0.04719225,  0.03522695],
       [-0.04719225,  0.03522695]], dtype=float32)>

# TF Transform

In [57]:
import tensorflow_transform as tft

In [58]:
def preprocess(inputs):
    """TF Transform preprocessing function for two housing columns.

    NOTE(review): this reuses the name ``preprocess`` already given to the CSV
    line parser defined earlier in this notebook. Once this cell has run, the
    earlier function is shadowed, and ``csv_reader_dataset`` (which looks up
    ``preprocess`` at call time) would pick up this one instead.

    Args:
        inputs: mapping that provides the ``"housing_median_age"`` and
            ``"ocean_proximity"`` entries.

    Returns:
        Dict with ``"standardized_median_age"`` (result of
        ``tft.scale_to_z_score``) and ``"ocean_proximity_id"`` (result of
        ``tft.compute_and_apply_vocabulary``).
    """
    age = inputs["housing_median_age"]
    proximity = inputs["ocean_proximity"]
    return {
        "standardized_median_age": tft.scale_to_z_score(age),
        "ocean_proximity_id": tft.compute_and_apply_vocabulary(proximity),
    }