In [1]:
import tensorflow as tf
import tensorflow.keras as keras

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import os

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


In [16]:
!ls images

In [6]:
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id+"."+fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [7]:
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, 
                                             housing.target.reshape(-1,1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full,
                                                     y_train_full, random_state=42)



In [11]:
housing.data.size

165120

In [12]:
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [14]:
X_train.shape

(11610, 8)

In [25]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")
    
    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [26]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "valid", header, n_parts=10)

In [27]:
train_filepaths[0]

'datasets/housing/my_train_00.csv'

In [28]:
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


In [29]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [30]:
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)
tf.Ten

In [31]:
n_readers = 5
dataset = filepath_dataset.interleave(
          lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers)

Cause: could not parse the source code:

          lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers)

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

          lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers)

This error may be avoided by creating the lambda in a standalone statement.



In [32]:
for line in dataset.take(5):
    print(line.numpy())

b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418'
b'2.4792,24.0,3.4547038327526134,1.1341463414634145,2251.0,3.921602787456446,34.18,-118.38,2.0'
b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67'
b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205'
b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'


In [33]:
n_inputs = 8 #X_train.shape[-1]

@tf.function
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

In [34]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None,
                      shuffle_buffer_size=10_000, n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)

In [35]:
tf.random.set_seed(42)

train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[ 0.5804519  -0.20762321  0.05616303 -0.15191229  0.01343246  0.00604472
   1.2525111  -1.3671792 ]
 [ 5.818099    1.8491895   1.1784915   0.28173092 -1.2496178  -0.3571987
   0.7231292  -1.0023477 ]
 [-0.9253566   0.5834586  -0.7807257  -0.28213993 -0.36530012  0.27389365
  -0.76194876  0.72684526]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.752]
 [1.313]
 [1.535]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[-0.8324941   0.6625668  -0.20741376 -0.18699841 -0.14536144  0.09635526
   0.9807942  -0.67250353]
 [-0.62183803  0.5834586  -0.19862501 -0.3500319  -1.1437552  -0.3363751
   1.107282   -0.8674123 ]
 [ 0.8683102   0.02970133  0.3427381  -0.29872298  0.7124906   0.28026953
  -0.72915536  0.86178064]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[0.919]
 [1.028]
 [2.182]], shape=(3, 1), dtype=float32)



In [36]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [37]:
train_set?

[0;31mType:[0m           PrefetchDataset
[0;31mString form:[0m    <PrefetchDataset shapes: ((None, 8), (None, 1)), types: (tf.float32, tf.float32)>
[0;31mFile:[0m           ~/anaconda3/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py
[0;31mDocstring:[0m      A `Dataset` that asynchronously prefetches its input.
[0;31mInit docstring:[0m
See `Dataset.prefetch()` for details.

Args:
  input_dataset: The input dataset.
  buffer_size: See `Dataset.prefetch()` for details.
  slack_period: (Optional.) An integer. If non-zero, determines the number
    of GetNext calls before injecting slack into the execution. This may
    reduce CPU contention at the start of a step. Note that a tensorflow
    user should not have to set this manually; enable this behavior
    automatically via `tf.data.Options.experimental_slack` instead. Defaults
    to None.


In [40]:
keras.backend.clear_session()

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [41]:
model.compile(loss="mse", optimizer=keras.optimizers.SGD(lr=1e-3))

In [43]:
batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10, 
          validation_data=valid_set,)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f447c645310>

In [44]:
!pwd

/home/ubuntu/Documents/handsON/preprocess


In [47]:
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_int = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_int, num_oov_buckets)


In [49]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [50]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

In [51]:
embedding_dim = 2
embed_init =tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.5749372 , 0.19900596],
       [0.7038239 , 0.05541027],
       [0.5884645 , 0.5430788 ],
       [0.9185413 , 0.37144518],
       [0.84554374, 0.9902985 ],
       [0.27800095, 0.5500362 ],
       [0.43682623, 0.68426645]], dtype=float32)>

In [52]:
embed_init

<tf.Tensor: shape=(7, 2), dtype=float32, numpy=
array([[0.5749372 , 0.19900596],
       [0.7038239 , 0.05541027],
       [0.5884645 , 0.5430788 ],
       [0.9185413 , 0.37144518],
       [0.84554374, 0.9902985 ],
       [0.27800095, 0.5500362 ],
       [0.43682623, 0.68426645]], dtype=float32)>

In [56]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.9185413 , 0.37144518],
       [0.27800095, 0.5500362 ],
       [0.7038239 , 0.05541027],
       [0.7038239 , 0.05541027]], dtype=float32)>

In [55]:
tf.nn.embedding_lookup(embed_init, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.9185413 , 0.37144518],
       [0.27800095, 0.5500362 ],
       [0.7038239 , 0.05541027],
       [0.7038239 , 0.05541027]], dtype=float32)>

In [57]:
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [58]:
regular_inputs = keras.layers.Input(shape=[8])
categories = keras.layers.Input(shape=[], dtype=tf.string)
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
cat_embed = keras.layers.Embedding(input_dim=6, output_dim=2)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
encoded_inputs

<tf.Tensor 'concatenate/Identity:0' shape=(None, 10) dtype=float32>

In [59]:
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs, categories], outputs=[outputs])

In [60]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None,)              0           input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 2)            12          lambda[0][0]                     
______________________________________________________________________________________________

-------------------- CSV ----------------------------

In [4]:
import functools

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [6]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = keras.utils.get_file("eval.csv", TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [8]:
np.set_printoptions(precision=3, suppress=True)

In [9]:
train_file_path

'/home/ubuntu/.keras/datasets/train.csv'

In [14]:
!head {train_file_path}

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


In [15]:
LABEL_COLUMN = "survived"
LABELS = [0,1]

In [16]:
def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
    file_path, batch_size=5, label_name=LABEL_COLUMN,
    na_value="?", num_epochs=1, ignore_errors=True, **kwargs)
    return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [17]:
raw_train_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [30]:
def show_batch(dataset):
    for batch, label in dataset.take(2):
        for key, value in batch.items():
            print("{:20s} {}".format(key, value.numpy()))
        print()

In [31]:
show_batch(raw_train_data)

sex                  [b'male' b'female' b'female' b'male' b'female']
age                  [35. 23. 24. 28. 35.]
n_siblings_spouses   [0 3 0 0 0]
parch                [0 2 3 0 0]
fare                 [ 26.288 263.     19.258   7.896 135.633]
class                [b'First' b'First' b'Third' b'Third' b'First']
deck                 [b'E' b'C' b'unknown' b'unknown' b'C']
embark_town          [b'Southampton' b'Southampton' b'Cherbourg' b'Cherbourg' b'Southampton']
alone                [b'y' b'n' b'n' b'y' b'y']

sex                  [b'male' b'male' b'male' b'male' b'male']
age                  [28. 45. 29. 28. 60.]
n_siblings_spouses   [0 0 0 0 0]
parch                [0 0 0 0 0]
fare                 [ 0.     8.05   9.5    7.229 26.55 ]
class                [b'Second' b'Third' b'Third' b'Third' b'First']
deck                 [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town          [b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Southampton']
alone          

In [33]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path, select_columns = SELECT_COLUMNS, column_defaults = DEFAULTS)

show_batch(temp_dataset)

age                  [ 4. 21. 24. 38. 45.]
n_siblings_spouses   [4. 0. 0. 0. 0.]
parch                [1. 0. 2. 0. 0.]
fare                 [29.125  7.75  16.7   80.     6.975]

age                  [30. 34. 22.  7. 36.]
n_siblings_spouses   [0. 1. 0. 4. 0.]
parch                [0. 1. 0. 1. 2.]
fare                 [106.425  32.5     9.     29.125  71.   ]



In [34]:
example_batch, labels_batch = next(iter(temp_dataset))

In [36]:
def pack(features, label):
    return tf.stack(list(features.values()), axis = -1), label

In [37]:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print()
    print(labels.numpy())

[[43.     1.     1.    26.25 ]
 [56.     0.     0.    26.55 ]
 [32.     0.     0.     7.896]
 [40.5    0.     2.    14.5  ]
 [28.     0.     0.     7.225]]

[0 0 0 0 0]


In [40]:
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names = names
        
    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features["numeric"] = numeric_features
        
        return features, labels

In [41]:
NUMERIC_FEATURES = ["age", "n_siblings_spouses", "parch", "fare"]

packed_train_data = raw_train_data.map(PackNumericFeatures(NUMERIC_FEATURES))
packed_test_data = raw_test_data.map(PackNumericFeatures(NUMERIC_FEATURES))

In [42]:
show_batch(packed_train_data)

sex                  [b'male' b'male' b'male' b'female' b'male']
class                [b'Third' b'Second' b'First' b'Second' b'Third']
deck                 [b'unknown' b'unknown' b'C' b'unknown' b'unknown']
embark_town          [b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Southampton']
alone                [b'y' b'y' b'n' b'n' b'n']
numeric              [[28.     0.     0.     7.896]
 [28.     0.     0.     0.   ]
 [37.     1.     0.    53.1  ]
 [22.     1.     2.    41.579]
 [32.     1.     0.    15.85 ]]

sex                  [b'female' b'male' b'male' b'male' b'male']
class                [b'Second' b'Third' b'First' b'Second' b'Third']
deck                 [b'unknown' b'unknown' b'B' b'unknown' b'unknown']
embark_town          [b'Southampton' b'Cherbourg' b'Cherbourg' b'Cherbourg' b'Southampton']
alone                [b'n' b'y' b'y' b'y' b'n']
numeric              [[33.     0.     2.    26.   ]
 [28.     0.     0.     7.896]
 [32.     0.     0.    30.5  ]
 [23.     

In [43]:
import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [44]:
MEAN = np.array(desc.T["mean"])
STD = np.array(desc.T["std"])

In [45]:
def normalize_numeric_data(data, mean, std):
    return (data-mean)/std

In [46]:
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column("numeric", normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column

NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x7f2a13dd5dd0>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))

In [47]:
example_batch, labels_batch = next(iter(packed_train_data)) 

In [48]:
example_batch['numeric']

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[ 28.5  ,   0.   ,   0.   ,   7.229],
       [ 22.   ,   0.   ,   0.   ,   7.775],
       [ 23.   ,   3.   ,   2.   , 263.   ],
       [ 15.   ,   1.   ,   0.   ,  14.454],
       [ 28.   ,   0.   ,   0.   ,   7.896]], dtype=float32)>

In [49]:
numeric_layer = keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

array([[-0.09 , -0.474, -0.479, -0.497],
       [-0.61 , -0.474, -0.479, -0.487],
       [-0.53 ,  2.132,  2.043,  4.187],
       [-1.169,  0.395, -0.479, -0.365],
       [-0.13 , -0.474, -0.479, -0.485]], dtype=float32)

In [50]:
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone' : ['y', 'n']
}

In [51]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=vocab)
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [56]:
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [54]:
categorical_layer = keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

[1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.]


In [57]:
len(categorical_layer(example_batch).numpy()[0])

20

In [58]:
preprocessing_layer = keras.layers.DenseFeatures(categorical_columns + numeric_columns)

In [59]:
print(preprocessing_layer(example_batch).numpy()[0])

[ 1.     0.     0.     0.     1.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     1.     0.     0.    -0.09  -0.474
 -0.479 -0.497  1.     0.   ]


In [60]:
model = keras.Sequential([
    preprocessing_layer,
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1),
])

In [61]:
model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True),
             optimizer="adam",
             metrics=["accuracy"])


In [62]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data

In [63]:
model.fit(train_data, epochs=20)

Epoch 1/20
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f29f3f24450>

In [64]:
test_loss, test_accuracy = model.evaluate(test_data)
print(f"\n\nTest Loss {test_loss}, Test Accuracy {test_accuracy}")

Consider rewriting this model with the Functional API.


Test Loss 0.46618175506591797, Test Accuracy 0.8522727489471436


In [65]:
predictions = model.predict(test_data)

for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
    prediction = tf.sigmoid(prediction).numpy()
    print("Predicted survival: {:.2%}".format(prediction[0]),
         "| Actual outcome: ",
         ("SURVIVED" if bool(survived) else "DIED"))

Consider rewriting this model with the Functional API.
Predicted survival: 52.34% | Actual outcome:  DIED
Predicted survival: 53.05% | Actual outcome:  DIED
Predicted survival: 25.06% | Actual outcome:  DIED
Predicted survival: 2.02% | Actual outcome:  DIED
Predicted survival: 11.24% | Actual outcome:  SURVIVED


In [66]:
bool(0)

False