# 4주차, 5일차 : 1-3주차 튜토리얼 작성하기
- ### Contents 
    1. Basic classification: Classify images of clothing : https://www.tensorflow.org/tutorials/keras/basic_classification
    2. Text classification with TensorFlow Hub: Movie reviews : https://www.tensorflow.org/tutorials/keras/basic_text_classification
    3. Basic regression: Predict fuel efficiency : https://www.tensorflow.org/tutorials/keras/basic_regression
    4. Build a linear model with Estimators : https://www.tensorflow.org/tutorials/estimator/linear
    5. Overfit And Underfit : https://www.tensorflow.org/tutorials/keras/overfit_and_underfit
    6. Load CSV Data : https://www.tensorflow.org/tutorials/load_data/csv
    7. TFRecord and tf.Example : https://www.tensorflow.org/tutorials/load_data/tfrecord
    8. Save and load models : https://www.tensorflow.org/tutorials/keras/save_and_restore_models
    9. Better performance with the tf.data API: https://www.tensorflow.org/guide/data_performance
    10. Time Series Forecasting: https://www.tensorflow.org/tutorials/structured_data/time_series
    11. Text Classification with an RNN: https://www.tensorflow.org/tutorials/text/text_classification_rnn
    12. Distributed training with Keras: https://www.tensorflow.org/tutorials/distribute/keras



## 1. Basic classification: Classify images of clothing

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [0]:
fashion_mnist = keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

In [0]:
# Human-readable names for the ten Fashion-MNIST classes; the list is indexed
# by the integer label, so position matters.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker',
              'Bag', 'Ankle boot']

In [0]:
# Explore the data
train_images.shape

In [0]:
len(train_labels)

In [0]:
train_labels

In [0]:
test_images.shape

In [0]:
len(test_labels)

In [0]:
# preprocess the data
plt.figure()
plt.imshow(train_images[0])
plt.colorbar()
plt.grid(False)
plt.show()

In [0]:
train_images = train_images / 255.0
test_images  = test_images  / 255.0

In [0]:
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_labels[i]])
plt.show()

In [0]:
# Build the model
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

In [0]:
# Compile the model
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [0]:
# Train the model
model.fit(train_images, train_labels, epochs=10)

In [0]:
# Evaluate accuracy
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)

print('\nTest accuracy:', test_acc)

In [0]:
# Make predictions
predictions = model.predict(test_images)

In [0]:
predictions[0]

In [0]:
np.argmax(predictions[0])

In [0]:
test_labels[0]

In [0]:
def plot_image(i, predictions_array, true_label, img):
    """Show sample ``i`` with a caption comparing predicted and true class.

    Args:
        i: Index of the sample inside ``true_label`` and ``img``.
        predictions_array: Class scores for this single sample.
        true_label: Sequence of true labels, indexed by ``i``.
        img: Sequence of images, indexed by ``i``.

    The caption reads "<predicted> <confidence>% (<true>)" and is drawn in
    blue when the prediction matches the true label, red otherwise.
    """
    actual = true_label[i]
    picture = img[i]

    # Strip grid and tick marks so only the picture is shown.
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    plt.imshow(picture, cmap=plt.cm.binary)

    guess = np.argmax(predictions_array)
    caption_color = 'blue' if guess == actual else 'red'

    caption = "{} {:2.0f}% ({})".format(class_names[guess],
                                        100*np.max(predictions_array),
                                        class_names[actual])
    plt.xlabel(caption, color=caption_color)
    
def plot_value_array(i, predictions_array, true_label):
    """Draw a bar chart of the ten class scores for one sample.

    Args:
        i: Index of the sample inside ``true_label``.
        predictions_array: Ten class scores for this single sample.
        true_label: Sequence of true labels, indexed by ``i``.

    The predicted class bar is coloured red and the true class bar blue.
    """
    actual = true_label[i]
    class_ids = range(10)

    plt.grid(False)
    plt.xticks(class_ids)
    plt.yticks([])
    bars = plt.bar(class_ids, predictions_array, color='#777777')
    plt.ylim([0, 1])

    guess = np.argmax(predictions_array)

    # Order matters: for a correct prediction both indices coincide and the
    # blue colouring applied last wins.
    bars[guess].set_color('red')
    bars[actual].set_color('blue')

In [0]:
i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], test_labels, test_images)
plt.subplot(1,2,2)
plot_value_array(i, predictions[i], test_labels)
plt.show()

In [0]:
i = 12
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], test_labels, test_images)
plt.subplot(1,2,2)
plot_value_array(i, predictions[i], test_labels)
plt.show()

In [0]:
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
    plt.subplot(num_rows, 2*num_cols, 2*i+1)
    plot_image(i, predictions[i], test_labels, test_images)
    plt.subplot(num_rows, 2*num_cols, 2*i+2)
    plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()

In [0]:
# Take a single image from the test set. Index 1 is used explicitly instead of
# the leftover loop variable `i`, so the image matches the label index that is
# passed to plot_value_array when this single prediction is plotted.
img = test_images[1]

print(img.shape)

In [0]:
img = (np.expand_dims(img,0))
img.shape

In [0]:
predictions_single = model.predict(img)

predictions_single

In [0]:
plot_value_array(1, predictions_single[0], test_labels)
_ = plt.xticks(range(10), class_names, rotation=45)

In [0]:
np.argmax(predictions_single[0])

## 2. Text classification with TensorFlow Hub: Movie reviews

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds

print('Version: ', tf.__version__)
print('Eager mode: ', tf.executing_eagerly())
print('Hub version: ', hub.__version__)
# Report whether TensorFlow can see at least one physical GPU device.
print('GPU is', 'available' if tf.config.experimental.list_physical_devices('GPU') else 'NOT AVAILABLE')

In [0]:
# Download the IMDB dataset
# The training split is divided 60/40 into training and validation sets with
# the split-slicing string syntax; this replaces the
# `tfds.Split.TRAIN.subsplit([6, 4])` call, an API that has been removed from
# tensorflow_datasets.
train_data, validation_data, test_data = tfds.load(
    name='imdb_reviews',
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True
)

In [0]:
# Explore the data
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_examples_batch

In [0]:
train_labels_batch

In [0]:
# Build the model
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[],
                          dtype=tf.string, trainable=True)
hub_layer(train_examples_batch[:3])

In [0]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

In [0]:
# Loss function and optimizer
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [0]:
# Train the model
history = model.fit(train_data.shuffle(10000).batch(512),
                   epochs=20,
                   validation_data=validation_data.batch(512),
                   verbose=1)

In [0]:
# Evaluate the model
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
    print('%s: %.3f' % (name, value))

## 3. Basic regression: Predict fuel efficiency

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [0]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

In [0]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [0]:
# The Auto MPG dataset
# Get the data
dataset_path = keras.utils.get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path

In [0]:
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
               'Acceleration', 'Model Year', 'Origin']
raw_dataset = pd.read_csv(dataset_path, names=column_names,
                         na_values='?', comment='\t', sep=' ', skipinitialspace=True)
dataset = raw_dataset.copy()
dataset.tail()

In [0]:
# Clean the data
dataset.isna().sum()

In [0]:
dataset = dataset.dropna()

In [0]:
dataset['Origin'] = dataset['Origin'].map(lambda x : {1:'USA', 2:'Europe', 3:'Japan'}.get(x))

In [0]:
dataset = pd.get_dummies(dataset, prefix='', prefix_sep='')
dataset.tail()

In [0]:
# Split the data into train and test
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset  = dataset.drop(train_dataset.index)

In [0]:
# Inspect the data
sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')

In [0]:
train_stats = train_dataset.describe()
train_stats.pop('MPG')
train_stats = train_stats.transpose()
train_stats

In [0]:
# Split features from labels

In [0]:
train_labels = train_dataset.pop('MPG')
test_labels  = test_dataset.pop('MPG')

In [0]:
# Normalize the data
def norm(x):
    """Standardize ``x`` with the training-set statistics.

    Subtracts the per-column mean and divides by the per-column standard
    deviation, both read from the module-level ``train_stats`` table.
    """
    center = train_stats['mean']
    spread = train_stats['std']
    return (x - center) / spread

normed_train_data = norm(train_dataset)
normed_test_data  = norm(test_dataset)

In [0]:
# The model 
# Build the model
def build_model():
    """Build and compile the regression network.

    The network has two hidden ReLU layers of 64 units and a single linear
    output unit. Its input width equals the number of columns in the
    module-level ``train_dataset``.

    Returns:
        A compiled ``keras.Sequential`` model using MSE loss, an RMSprop
        optimizer (learning rate 0.001) and the MAE / MSE metrics.
    """
    n_features = len(train_dataset.keys())

    net = keras.Sequential()
    net.add(layers.Dense(64, activation='relu', input_shape=[n_features]))
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dense(1))

    net.compile(loss='mse',
                optimizer=tf.keras.optimizers.RMSprop(0.001),
                metrics=['mae', 'mse'])
    return net

In [0]:
model = build_model()

In [0]:
# Inspect the model
model.summary()

In [0]:
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result

In [0]:
# Train the model
EPOCHS = 1000

history = model.fit(
    normed_train_data, train_labels,
    epochs=EPOCHS, validation_split=0.2, verbose=0,
    callbacks=[tfdocs.modeling.EpochDots()]
)

In [0]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [0]:
plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)

In [0]:
plotter.plot({'Basic': history}, metric='mae')
plt.ylim([0, 10])
plt.ylabel('MAE [MPG]')

In [0]:
plotter.plot({'Basic': history}, metric='mse')
plt.ylim([0, 20])
plt.ylabel('MSE [MPG^2]')

In [0]:
model = build_model()

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

early_history = model.fit(normed_train_data, train_labels,
                         epochs=EPOCHS, validation_split=0.2, verbose=0,
                         callbacks=[early_stop, tfdocs.modeling.EpochDots()])

In [0]:
plotter.plot({'Early Stopping': early_history}, metric='mae')
plt.ylim([0, 10])
plt.ylabel('MAE [MPG]')

In [0]:
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)

print('Testing set Mean Abs Error: {:5.2f} MPG'.format(mae))

In [0]:
# Make predictions

In [0]:
test_predictions = model.predict(normed_test_data).flatten()

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

In [0]:
error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel('Prediction Error [MPG]')
_ = plt.ylabel('Count')

## 4. Build a linear model with Estimators

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

In [0]:
# Load the titanic dataset
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf

In [0]:
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')

In [0]:
# Explore the data
dftrain.head()

In [0]:
dftrain.describe()

In [0]:
dftrain.shape[0], dfeval.shape[0]

In [0]:
dftrain.age.hist(bins=20)

In [0]:
dftrain.sex.value_counts().plot(kind='barh')

In [0]:
dftrain['class'].value_counts().plot(kind='barh')

In [0]:
pd.concat([dftrain, y_train], axis=1).groupby('sex').survived.mean().plot(kind='barh').set_xlabel('% survive')

In [0]:
# Feature Engineering for the Model
CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
                      'embark_town', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = dftrain[feature_name].unique()
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, 
                                                                                    vocabulary))
for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

In [0]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    """Create an input function that yields batched (features, label) pairs.

    Args:
        data_df: Feature table; converted to a dict of columns.
        label_df: Labels aligned with ``data_df``.
        num_epochs: How many times the batched dataset is repeated.
        shuffle: Whether to shuffle (buffer of 1000) before batching.
        batch_size: Number of examples per batch.

    Returns:
        A zero-argument callable that builds and returns the ``tf.data``
        dataset each time it is invoked.
    """
    def input_function():
        dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            dataset = dataset.shuffle(1000)
        return dataset.batch(batch_size).repeat(num_epochs)
    return input_function

train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn  = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

In [0]:
ds = make_input_fn(dftrain, y_train, batch_size=10)()
for feature_batch, label_batch in ds.take(1):
    print('Some feature keys:', list(feature_batch.keys()))
    print()
    print('A batch of class:', feature_batch['class'].numpy())
    print()
    print('A batch of Labels:', label_batch.numpy())

In [0]:
age_column = feature_columns[7]
tf.keras.layers.DenseFeatures([age_column])(feature_batch).numpy()

In [0]:
gender_column = feature_columns[0]
tf.keras.layers.DenseFeatures([tf.feature_column.indicator_column(gender_column)])(feature_batch).numpy()

In [0]:
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)

In [0]:
age_x_gender = tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100)

In [0]:
derived_feature_columns = [age_x_gender]
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns+derived_feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)

In [0]:
pred_dicts = list(linear_est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

probs.plot(kind='hist', bins=20, title='predicted probabilities')

In [0]:
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

fpr, tpr, _ = roc_curve(y_eval, probs)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,)
plt.ylim(0,)

## 5. Overfit and underfit

In [0]:
# Setup
from __future__ import absolute_import, division, print_function, unicode_literals

%tensorflow_version 2.x

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import regularizers

print(tf.__version__)

In [0]:
!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [0]:
from IPython import display
from matplotlib import pyplot as plt

import numpy as np

import pathlib
import shutil
import tempfile

In [0]:
logdir = pathlib.Path(tempfile.mkdtemp())/'tensorboard_logs'
shutil.rmtree(logdir, ignore_errors=True)

In [0]:
# The Higgs Dataset
gz = tf.keras.utils.get_file('HIGGS.csv.gz',
                             'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz')

In [0]:
FEATURES = 28

In [0]:
ds = tf.data.experimental.CsvDataset(gz,[float(),]*(FEATURES+1), compression_type='GZIP')

In [0]:
def pack_row(*row):
  """Split a CSV row into (features, label).

  The first column is the label; the remaining columns are stacked along
  axis 1 into a single feature tensor.
  """
  label, *columns = row
  features = tf.stack(columns, 1)
  return features, label

In [0]:
packed_ds = ds.batch(10000).map(pack_row).unbatch()

In [0]:
for features, label in packed_ds.batch(1000).take(1):
  print(features[0])
  plt.hist(features.numpy().flatten(), bins=101)

In [0]:
N_VALIDATION = int(1e3)
N_TRAIN = int(1e4)
BUFFER_SIZE = int(1e4)
BATCH_SIZE = 500
STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

In [0]:
validate_ds = packed_ds.take(N_VALIDATION).cache()
train_ds = packed_ds.skip(N_VALIDATION).take(N_TRAIN).cache()

In [0]:
train_ds

In [0]:
validate_ds = validate_ds.batch(BATCH_SIZE)
train_ds = train_ds.shuffle(BUFFER_SIZE).repeat().batch(BATCH_SIZE)

In [0]:
# Demonstrate overfitting
# Training procedure
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    0.001,
    decay_steps=STEPS_PER_EPOCH*1000,
    decay_rate=1,
    staircase=False
)

def get_optimizer():
  """Return a new Adam optimizer driven by the module-level ``lr_schedule``."""
  adam = tf.keras.optimizers.Adam(lr_schedule)
  return adam

In [0]:
step = np.linspace(0, 100000)
lr = lr_schedule(step)
plt.figure(figsize=(8,6))
plt.plot(step/STEPS_PER_EPOCH, lr)
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch')
_ = plt.ylabel('Learning Rate')

In [0]:
def get_callbacks(name):
  """Build the list of training callbacks for the run called ``name``.

  Returns, in order: an epoch progress printer, early stopping on
  ``val_binary_crossentropy`` (patience 200), and a TensorBoard logger that
  writes to ``logdir/name``.
  """
  progress = tfdocs.modeling.EpochDots()
  stopper = tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=200)
  board = tf.keras.callbacks.TensorBoard(logdir/name)
  return [progress, stopper, board]

In [0]:
def compile_and_fit(model, name, optimizer=None, max_epochs=10000):
  """Compile ``model``, print its summary and train it on ``train_ds``.

  Args:
    model: Keras model to compile and fit.
    name: Run name, forwarded to ``get_callbacks`` (used for the log folder).
    optimizer: Optimizer to use; ``get_optimizer()`` is called when None.
    max_epochs: Upper bound on the number of training epochs.

  Returns:
    The history object returned by ``model.fit``.
  """
  chosen_optimizer = get_optimizer() if optimizer is None else optimizer

  model.compile(optimizer=chosen_optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy', 'binary_crossentropy'])
  model.summary()

  # Validation runs on `validate_ds`; progress output is left to the callbacks.
  return model.fit(
      train_ds,
      steps_per_epoch=STEPS_PER_EPOCH,
      epochs=max_epochs,
      validation_data=validate_ds,
      callbacks=get_callbacks(name),
      verbose=0,
  )

In [0]:
# Tiny model
tiny_model = tf.keras.Sequential([
                                  layers.Dense(16, activation='elu', input_shape=(FEATURES,)),
                                  layers.Dense(1, activation='sigmoid')
])

In [0]:
size_histories = {}

In [0]:
size_histories['Tiny'] = compile_and_fit(tiny_model, 'sizes/Tiny')

In [0]:
plotter = tfdocs.plots.HistoryPlotter(metric = 'binary_crossentropy', smoothing_std=10)
plotter.plot(size_histories)
plt.ylim([0.5, 0.7])

In [0]:
# Small model
# Two hidden ELU layers of 16 units followed by a single sigmoid output unit.
small_model = tf.keras.Sequential([
  # `input_shape` must be specified for .summary() to work.
  layers.Dense(16, activation='elu', input_shape=(FEATURES,)),
  layers.Dense(16, activation='elu'),
  layers.Dense(1, activation='sigmoid')
])

In [0]:
size_histories['Small'] = compile_and_fit(small_model, 'sizes/Small')

In [0]:
# Medium model
medium_model = tf.keras.Sequential([
  layers.Dense(64, activation='elu', input_shape=(FEATURES, )),
  layers.Dense(64, activation='elu'),
  layers.Dense(64, activation='elu'),
  layers.Dense(1,  activation='sigmoid')
])

In [0]:
size_histories['Medium'] = compile_and_fit(medium_model, 'sizes/Medium')

In [0]:
# Large model
large_model = tf.keras.Sequential([
  layers.Dense(512, activation='elu', input_shape=(FEATURES,)),
  layers.Dense(512, activation='elu'),
  layers.Dense(512, activation='elu'),
  layers.Dense(512, activation='elu'),
  layers.Dense(1, activation='sigmoid')
])

In [0]:
size_histories['large'] = compile_and_fit(large_model, 'sizes/large')

In [0]:
# Plot the training and validation losses
plotter.plot(size_histories)
a = plt.xscale('log')
plt.xlim([5, max(plt.xlim())])
plt.ylim([0.5, 0.7])
plt.xlabel('Epochs [Log Scale]')

In [0]:
# View in Tensorboard
 
%tensorboard --logdir {logdir}/sizes

In [0]:
display.IFrame(
    src="https://tensorboard.dev/experiment/vW7jmmF9TmKmy3rbheMQpw/#scalars&_smoothingWeight=0.97",
    width="100%", height="800px"
)

In [0]:
!tensorbard dev upload --logdir {logdir}/sizes

In [0]:
#Strategies to prevent overfitting
shutil.rmtree(logdir/'regularizers/Tiny', ignore_errors=True)
shutil.copytree(logdir/'sizes/Tiny', logdir/'regularizers/Tiny')

In [0]:
regularizer_histories={}
regularizer_histories['Tiny'] = size_histories['Tiny']

In [0]:
# Add weight regularization
l2_model = tf.keras.Sequential([
  layers.Dense(512, activation='elu',
               kernel_regularizer=regularizers.l2(0.001),
               input_shape=(FEATURES, )),
  layers.Dense(512, activation='elu',
               kernel_regularizer=regularizers.l2(0.001)),
  layers.Dense(512, activation='elu',
               kernel_regularizer=regularizers.l2(0.001)),
  layers.Dense(512, activation='elu',
               kernel_regularizer=regularizers.l2(0.001)),
  layers.Dense(1, activation='sigmoid')
])

regularizer_histories['l2'] = compile_and_fit(l2_model, 'regularizers/l2')

In [0]:
plotter.plot(regularizer_histories)
plt.ylim([0.5, 0.7])

In [0]:
result = l2_model(features)
regularization_loss = tf.add_n(l2_model.losses)

In [0]:
# Add dropout
dropout_model = tf.keras.Sequential([
  layers.Dense(512, activation='elu', input_shape=(FEATURES,)),
  layers.Dropout(0.5),
  layers.Dense(512, activation='elu'),
  layers.Dropout(0.5),
  layers.Dense(512, activation='elu'),
  layers.Dropout(0.5),
  layers.Dense(512, activation='elu'),
  layers.Dropout(0.5),
  layers.Dense(1, activation='sigmoid')
])

regularizer_histories['dropout'] = compile_and_fit(dropout_model, 'regularizers/dropout')

In [0]:
plotter.plot(regularizer_histories)
plt.ylim([0.5, 0.7])

In [0]:
# Combined L2 + dropout
combined_model = tf.keras.Sequential([
  layers.Dense(512, kernel_regularizer=regularizers.l2(0.0001),
               activation='elu', input_shape=(FEATURES, )),
  layers.Dropout(0.5),
  layers.Dense(512, kernel_regularizer=regularizers.l2(0.0001),
               activation='elu'),
  layers.Dropout(0.5),
  layers.Dense(512, kernel_regularizer=regularizers.l2(0.0001),
               activation='elu'),
  layers.Dropout(0.5),
  layers.Dense(512, kernel_regularizer=regularizers.l2(0.0001),
               activation='elu'),
  layers.Dropout(0.5),
  layers.Dense(1, activation='sigmoid')
])

regularizer_histories['combined'] = compile_and_fit(combined_model, 'regularizers/combined')

In [0]:
plotter.plot(regularizer_histories)
plt.ylim([0.5, 0.7])

In [0]:
%tensorboard --logdir {logdir}/regularizers

In [0]:
display.IFrame(
    src="https://tensorboard.dev/experiment/fGInKDo8TXes1z7HQku9mw/#scalars&_smoothingWeight=0.97",
    width = "100%",
    height="800px")

## 6. Load CSV data

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

import numpy as np
import tensorflow as tf

In [0]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

In [0]:
np.set_printoptions(precision=3, suppress=True)

In [0]:
# Load data
!head {train_file_path}

In [0]:
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

In [0]:
def get_dataset(file_path, **kwargs):
  """Create a batched CSV dataset for ``file_path``.

  Args:
    file_path: Path of the CSV file to read.
    **kwargs: Extra options forwarded unchanged to
      ``tf.data.experimental.make_csv_dataset``.

  Returns:
    The dataset produced by ``make_csv_dataset``, with ``LABEL_COLUMN`` used
    as the label.
  """
  return tf.data.experimental.make_csv_dataset(
      file_path,
      batch_size=5,  # deliberately small so printed batches stay readable
      label_name=LABEL_COLUMN,
      na_value="?",
      num_epochs=1,
      ignore_errors=True,
      **kwargs)

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [0]:
def show_batch(dataset):
  """Print each feature column of the first batch of ``dataset``.

  Every line shows the column name, left-aligned in a 20-character field,
  followed by the column's values; the labels of the batch are ignored.
  """
  for features, _ in dataset.take(1):
    for column, values in features.items():
      print('{:20s}: {}'.format(column, values.numpy()))

In [0]:
show_batch(raw_train_data)

In [0]:
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch',
               'fare', 'class', 'deck', 'embark_town', 'alone']
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

In [0]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']

temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)

show_batch(temp_dataset)

In [0]:
# Data preprocessing
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path,
                           select_columns=SELECT_COLUMNS,
                           column_defaults=DEFAULTS)

show_batch(temp_dataset)

In [0]:
example_batch , labels_batch = next(iter(temp_dataset))

In [0]:
def pack(features, label):
  """Stack all feature columns along the last axis into a single tensor.

  Returns the stacked tensor together with the unchanged ``label``.
  """
  columns = list(features.values())
  return tf.stack(columns, axis=-1), label


In [0]:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
  print(features.numpy())
  print()
  print(labels.numpy())

In [0]:
show_batch(raw_train_data)

In [0]:
example_batch, labels_batch = next(iter(temp_dataset))

In [0]:
class PackNumericFeatures(object):
  """Callable that merges selected numeric columns into one 'numeric' feature.

  The named columns are removed from the feature dict, cast to float32,
  stacked along the last axis and stored back under the key 'numeric'.
  """

  def __init__(self, names):
    # Names of the columns to pack, in the order they will be stacked.
    self.names = names

  def __call__(self, features, labels):
    columns = []
    for name in self.names:
      # pop() removes the column so it no longer appears on its own.
      columns.append(tf.cast(features.pop(name), tf.float32))
    features['numeric'] = tf.stack(columns, axis=-1)

    return features, labels

In [0]:
NUMERIC_FEATURES = ['age', 'n_siblings_spouses', 'parch', 'fare']

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES)
)

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES)
)

In [0]:
show_batch(packed_train_data)

In [0]:
example_batch, labels_batch = next(iter(packed_train_data))

In [0]:
# Data Normalization
import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

In [0]:
MEAN = np.array(desc.T['mean'])
STD  = np.array(desc.T['std'])

In [0]:
def normalize_numeric_data(data, mean, std):
  """Center ``data`` on ``mean`` and scale it by ``std``."""
  centered = data - mean
  return centered / std

In [0]:
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column('numeric', 
                                                  normalizer_fn=normalizer, 
                                                  shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column

In [0]:
example_batch['numeric']

In [0]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

In [0]:
# Categorical data
# Vocabulary for every categorical column. Each entry must match the strings
# in the CSV exactly; a misspelled value would never match and the category
# would silently be encoded as "unknown".
CATEGORIES = {
    'sex' : ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southampton', 'Queenstown'],
    'alone' : ['y', 'n']
}

In [0]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
  cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
      key=feature, vocabulary_list=vocab
  )
  categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [0]:
categorical_columns

In [0]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

In [0]:
# Combine preprocessing layer
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)

In [0]:
print(preprocessing_layer(example_batch).numpy()[0])

In [0]:
# Build the model
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [0]:
# Train, evaluate, and predict
train_data = packed_train_data.shuffle(500)
test_data  = packed_test_data

In [0]:
model.fit(train_data, epochs=20)

In [0]:
test_loss, test_accuracy = model.evaluate(test_data)

print('\n\nTest Loss: {}, Test Accuracy: {}'.format(test_loss, test_accuracy))

In [0]:
predictions = model.predict(test_data)

# Print the predicted survival probability next to the recorded outcome.
# Labels come from the first batch of a fresh pass over `test_data`.
# NOTE(review): make_csv_dataset shuffles by default, so these labels may not
# line up with the rows behind `predictions` - confirm and, if needed, build
# the test dataset with shuffle=False.
for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
  print('Predicted survival: {:.2%}'.format(prediction[0]),
        '| Actual outcome: ',
        ('SURVIVED' if bool(survived) else 'DIED'))

## 7. TFRecord and tf.Example

In [0]:
# Setup

from __future__ import absolute_import, division, print_function, unicode_literals

%tensorflow_version 2.x

import tensorflow as tf

import numpy as np
import IPython.display as display

In [0]:
def _bytes_feature(value):
  """Wrap a byte string (or an eager string tensor) in a ``tf.train.Feature``."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # Pull the plain value out of the eager tensor before wrapping it.
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a single float value in a ``tf.train.Feature`` (float_list)."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a single integer-like value in a ``tf.train.Feature`` (int64_list)."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

In [0]:
print(_bytes_feature(b'test_string'))
print(_bytes_feature(u'test_bytes'.encode('utf-8')))

print(_float_feature(np.exp(1)))

print(_int64_feature(True))
print(_int64_feature(1))

In [0]:
feature = _float_feature(np.exp(1))

feature.SerializeToString()

In [0]:
# Creating a tf.Example message
n_observations = int(1e4)

feature0 = np.random.choice([False, True], n_observations)

feature1 = np.random.randint(0, 5, n_observations)

strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]

feature3 = np.random.randn(n_observations)

In [0]:
def serialize_example(feature0, feature1, feature2, feature3):
  """Serialize one observation as a ``tf.train.Example`` byte string.

  ``feature0`` and ``feature1`` are stored as int64 features, ``feature2`` as
  a bytes feature and ``feature3`` as a float feature.
  """
  # Map each feature name to its tf.train.Feature wrapper.
  fields = {
      'feature0' : _int64_feature(feature0),
      'feature1' : _int64_feature(feature1),
      'feature2' : _bytes_feature(feature2),
      'feature3' : _float_feature(feature3),
  }

  features_message = tf.train.Features(feature=fields)
  return tf.train.Example(features=features_message).SerializeToString()

In [0]:
example_obsevation = []

serialized_example = serialize_example(False, 4, b'goat', 0.9876)
serialized_example

In [0]:
example_proto = tf.train.Example.FromString(serialized_example)
example_proto

In [0]:
# TFRecord files using tf.data
tf.data.Dataset.from_tensor_slices(feature1)

In [0]:
features_dataset = tf.data.Dataset.from_tensor_slices((feature0, feature1,
                                                       feature2, feature3))
features_dataset

In [0]:
for f0,f1,f2,f3 in features_dataset.take(1):
  print(f0)
  print(f1)
  print(f2)
  print(f3)

In [0]:
# Author's note: what is the difference between this function and
# serialize_example? Does it simply map the Python function to a TensorFlow
# function?
def tf_serialize_example(f0,f1,f2,f3):
  """Run ``serialize_example`` through ``tf.py_function``.

  Returns the serialized example as a scalar ``tf.string`` tensor.
  """
  serialized = tf.py_function(
      func=serialize_example,
      inp=(f0, f1, f2, f3),
      Tout=tf.string)
  # Reshape to an empty shape so the result is a scalar.
  return tf.reshape(serialized, ())

In [0]:
tf_serialize_example(f0,f1,f2,f3)

In [0]:
serialized_features_dataset = features_dataset.map(tf_serialize_example)
serialized_features_dataset

In [0]:
def generator():
  """Yield the serialized form of every row in ``features_dataset``."""
  for row in features_dataset:
    serialized = serialize_example(*row)
    yield serialized

In [0]:
serialized_features_dataset = tf.data.Dataset.from_generator(
    generator, output_types=tf.string, output_shapes=()
)

In [0]:
serialized_features_dataset

In [0]:
filename = 'test.tfrecord'
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(serialized_features_dataset)

In [0]:
# Reading a TFRecord file
filenames = [filename]
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

In [0]:
for raw_record in raw_dataset.take(10):
  print(repr(raw_record))

In [0]:
feature_description = {
    'feature0' : tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature1' : tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature2' : tf.io.FixedLenFeature([], tf.string, default_value=''),
    'feature3' : tf.io.FixedLenFeature([], tf.float32, default_value=0.0)
}

def _parse_function(example_proto):
  """Parse one serialized ``tf.Example`` using ``feature_description``."""
  parsed = tf.io.parse_single_example(example_proto, feature_description)
  return parsed

In [0]:
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset

In [0]:
for parsed_record in parsed_dataset.take(10):
  print(repr(parsed_record))

In [0]:
# TFRecord files in Python
# Writing a TFRecord file
with tf.io.TFRecordWriter(filename) as writer:
  for i in range(n_observations):
    example = serialize_example(feature0[i], feature1[i], feature2[i], feature3[i])
    writer.write(example)

In [0]:
# Reading a TFRecord file
filenames = [filename]
raw_dataset = tf.data.TFRecordDataset(filename)
raw_dataset

In [0]:
for raw_record in raw_dataset.take(1):
  example = tf.train.Example()
  example.ParseFromString(raw_record.numpy())
  print(example)

In [0]:
# Walkthrough: Reading and writing image data
cat_in_snow  = tf.keras.utils.get_file('320px-Felis_catus-cat_on_snow.jpg', 'https://storage.googleapis.com/download.tensorflow.org/example_images/320px-Felis_catus-cat_on_snow.jpg')
williamsburg_bridge = tf.keras.utils.get_file('194px-New_East_River_Bridge_from_Brooklyn_det.4a09796u.jpg','https://storage.googleapis.com/download.tensorflow.org/example_images/194px-New_East_River_Bridge_from_Brooklyn_det.4a09796u.jpg')

In [0]:
display.display(display.Image(filename=cat_in_snow))
# Attribution link; the quotes wrap the href value so the anchor tag is
# well-formed HTML.
display.display(display.HTML('Image cc-by: <a href="https://commons.wikimedia.org/wiki/File:Felis_catus-cat_on_snow.jpg">Von.grzanka</a>'))

In [0]:
display.display(display.Image(filename=williamsburg_bridge))
# Attribution link; the quotes wrap the href value so the anchor tag is
# well-formed HTML.
display.display(display.HTML('<a href="https://commons.wikimedia.org/wiki/File:New_East_River_Bridge_from_Brooklyn_det.4a09796u.jpg">From Wikimedia</a>'))

In [0]:
# Write the TFRecord file
image_labels = {
    cat_in_snow : 0,
    williamsburg_bridge : 1,
}

In [0]:
# Read the raw JPEG bytes; the context manager closes the file handle as soon
# as the read completes instead of leaving it to garbage collection.
with open(cat_in_snow, 'rb') as image_file:
  image_string = image_file.read()

label = image_labels[cat_in_snow]

def image_example(image_string, label):
  """Build a ``tf.train.Example`` describing one JPEG image.

  Args:
    image_string: Raw JPEG-encoded bytes of the image.
    label: Integer label stored with the image.

  Returns:
    A ``tf.train.Example`` holding the decoded image's three shape entries
    (stored as height, width and depth), the label and the raw image bytes.
  """
  # The image is decoded only to learn its shape; the stored payload stays
  # the original encoded bytes.
  shape = tf.image.decode_jpeg(image_string).shape
  height, width, depth = shape[0], shape[1], shape[2]

  fields = {
      'height': _int64_feature(height),
      'width': _int64_feature(width),
      'depth': _int64_feature(depth),
      'label': _int64_feature(label),
      'image_raw': _bytes_feature(image_string)
  }

  features_message = tf.train.Features(feature=fields)
  return tf.train.Example(features=features_message)

for line in str(image_example(image_string, label)).split('\n')[:5]:
  print(line)
print('...')

In [0]:
# Write one tf.Example per image into a single TFRecord file.
record_file = 'images.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
  for filename, label in image_labels.items():
    # Close each image file right after reading it so handles are not leaked
    # across loop iterations.
    with open(filename, 'rb') as image_file:
      image_string = image_file.read()
    tf_example = image_example(image_string, label)
    writer.write(tf_example.SerializeToString())

In [0]:
# Read the TFRecord file
raw_image_dataset = tf.data.TFRecordDataset('images.tfrecords')

image_feature_description = {
    'height': tf.io.FixedLenFeature([], tf.int64),
    'width': tf.io.FixedLenFeature([], tf.int64),
    'depth': tf.io.FixedLenFeature([], tf.int64),
    'label': tf.io.FixedLenFeature([], tf.int64),
    'image_raw': tf.io.FixedLenFeature([], tf.string),
}

def _parse_image_function(example_proto):
  """Parse one serialized image ``tf.Example`` using ``image_feature_description``."""
  parsed = tf.io.parse_single_example(example_proto, image_feature_description)
  return parsed

parsed_image_dataset = raw_image_dataset.map(_parse_image_function)
parsed_image_dataset

In [0]:
for image_features in parsed_image_dataset:
  image_raw = image_features['image_raw'].numpy()
  display.display(display.Image(data=image_raw))