## TensorFlow Model Serving

### Using TF Serving

In [None]:
# Train a simple MNIST classifier and export it as a versioned SavedModel.
from pathlib import Path
import tensorflow as tf

# Load MNIST and hold out the first 5,000 training samples for validation.
mnist = tf.keras.datasets.mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

tf.random.set_seed(42)
tf.keras.backend.clear_session()
# The model accepts raw uint8 images and rescales them by 1 / 255 itself,
# so callers do not need to preprocess the pixels.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8),
    tf.keras.layers.Rescaling(scale=1 / 255),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.SGD(learning_rate=1e-2),
              metrics=["accuracy"])
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

# Save to <model_name>/<model_version>, i.e. my_mnist_model/0001.
model_name = "my_mnist_model"
model_version = "0001"
model_path = Path(model_name) / model_version
model.save(model_path, save_format="tf")

In [None]:
# inspect SavedModel.
# The output lists the available tag-sets. A tag-set identifies a metagraph:
# a computation graph plus its function signatures (input/output names, types, and shapes).
!saved_model_cli show --dir '{model_path}'

In [None]:
# Inspect the 'serve' tag-set listed above.
# The output shows two signature definitions: '__saved_model_init_op' and 'serving_default'.
!saved_model_cli show --dir '{model_path}' --tag_set serve

In [None]:
# Look closely at the basic serving function 'serving_default'.
!saved_model_cli show --dir '{model_path}' --tag_set serve \
                      --signature_def serving_default

In [None]:
# Install TensorFlow Serving: register its APT repository and signing key,
# install the server package, then install the Python serving API via pip.
url = "https://storage.googleapis.com/tensorflow-serving-apt"
src = "stable tensorflow-model-server tensorflow-model-server-universal"
!echo 'deb {url} {src}' > /etc/apt/sources.list.d/tensorflow-serving.list
!curl '{url}/tensorflow-serving.release.pub.gpg' | apt-key add -
!apt update -q && apt-get install -y tensorflow-model-server
%pip install -q -U tensorflow-serving-api==2.11.1

In [None]:
import os

# Export the directory that contains the version folders (the parent of
# model_path) so the shell cell that starts the server can read ${MODEL_DIR}.
os.environ["MODEL_DIR"] = str(model_path.parent.absolute())

In [None]:
# implement server
%%bash --bg
tensorflow_model_server \
    --port=8500 \
    --rest_api_port=8501 \
    --model_name=my_mnist_model \
    --model_base_path="${MODEL_DIR}" >my_server.log 2>&1

In [None]:
# Query TF Serving through the REST API.
# Build the JSON request body from the first three test images.
import json

X_new = X_test[:3]
request_json = json.dumps({
    "signature_name": "serving_default",
    "instances": X_new.tolist()  # convert the array to nested lists for JSON
})

In [None]:
# The request body is plain JSON text.
request_json

In [None]:
# Send the request to TF Serving with an HTTP POST.
import requests

server_url = "http://localhost:8501/v1/models/my_mnist_model:predict"
response = requests.post(server_url, data=request_json)
response.raise_for_status()  # raise on an HTTP error status
response = response.json()  # rebinds `response` to the decoded JSON body

In [None]:
# Turn the returned predictions into an array and show them rounded to
# two decimals.
import numpy as np
y_proba = np.array(response['predictions'])
y_proba.round(2)

In [None]:
# Query TF Serving through the gRPC API.
# Build a PredictRequest protocol buffer and fill in its fields.
from tensorflow_serving.apis.predict_pb2 import PredictRequest

request = PredictRequest()
# Both the model name and the signature name live on `model_spec`.
request.model_spec.name = model_name
request.model_spec.signature_name = 'serving_default'
# The input is keyed by the Keras model's first input name.
input_name = model.input_names[0]
request.inputs[input_name].CopyFrom(tf.make_tensor_proto(X_new))

In [None]:
import grpc
from tensorflow_serving.apis import prediction_service_pb2_grpc

# Open an insecure channel to the server's gRPC port.
channel = grpc.insecure_channel('localhost:8500')

# Create the prediction service stub on top of the channel.
# The generated stub class is named PredictionServiceStub.
predict_service = prediction_service_pb2_grpc.PredictionServiceStub(channel)

# Send the request, with a 10-second timeout.
response = predict_service.Predict(request, timeout=10.0)

In [None]:
# Convert the returned output protocol buffer into an array.
# The output is keyed by the Keras model's first output name.
output_name = model.output_names[0]
outputs_proto = response.outputs[output_name]
y_proba = tf.make_ndarray(outputs_proto)

In [None]:
# Train a new version of the model: two hidden layers of 50 units each,
# with the same input handling, loss and optimizer settings as before.
np.random.seed(42)
tf.random.set_seed(42)
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8),
    tf.keras.layers.Rescaling(scale=1 / 255),
    tf.keras.layers.Dense(50, activation="relu"),
    tf.keras.layers.Dense(50, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.SGD(learning_rate=1e-2),
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10,
                    validation_data=(X_valid, y_valid))

In [None]:
# Save the new model as version 0002 under the same model directory
# (my_mnist_model/0002). Note that this rebinds model_version and model_path.
model_version = "0002"
model_path = Path(model_name) / model_version
model.save(model_path, save_format="tf")

### Vertex AI

In [None]:
# Authorization
from google.colab import auth

auth.authenticate_user()

In [None]:
# Create a GCS bucket to hold the SavedModel.
from google.cloud import storage

# NOTE(review): these look like placeholder values -- replace them with a
# real project ID and bucket name before running.
project_id = 'my_project'
bucket_name = 'my_bucket'
location = 'us-central1'

storage_client = storage.Client(project=project_id)
bucket = storage_client.create_bucket(bucket_name, location=location)

In [None]:
# Upload a local directory tree to a GCS bucket.
def upload_directory(bucket, dirpath):
    """Upload every file found under *dirpath* to *bucket*.

    Blob names are the file paths relative to the parent of *dirpath*, so the
    directory's own name is kept as the top-level prefix inside the bucket.

    Args:
        bucket: Object exposing ``blob(name)``; the returned blob must expose
            ``upload_from_filename(path)``.
        dirpath: Directory to upload, as a ``str`` or ``Path``.
    """
    dirpath = Path(dirpath)
    for filepath in dirpath.glob("**/*"):
        if filepath.is_file():  # directories themselves are not uploaded
            blob = bucket.blob(filepath.relative_to(dirpath.parent).as_posix())
            blob.upload_from_filename(filepath)


# Call at cell level, outside the function body, so the function does not
# recurse into itself.
upload_directory(bucket, "my_mnist_model")

In [None]:
# Alternatively, upload with gsutil; the -m option enables multithreading.
!gsutil -m cp -r my_mnist_model gs://{bucket_name}/

In [None]:
# Register the uploaded SavedModel (version 0001) with Vertex AI.
from google.cloud import aiplatform

# Container image used to serve predictions for this model.
server_image = 'gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-8:latest'

aiplatform.init(project=project_id, location=location)
mnist_model = aiplatform.Model.upload(
    display_name='mnist',
    artifact_uri=f'gs://{bucket_name}/my_mnist_model/0001',
    serving_container_image_uri=server_image,
)

In [None]:
# Create an endpoint and deploy the model to it, with between 1 and 5
# replicas and one GPU accelerator requested.
endpoint = aiplatform.Endpoint.create(display_name='mnist-endpoint')

endpoint.deploy(
    mnist_model,
    min_replica_count=1,
    max_replica_count=5,
    machine_type='n1-standard-4',
    accelerator_type='NVIDIA_TESLA_K80',
    accelerator_count=1
)

In [None]:
# make a prediction
response = endpoint.predict(instances=X_new.tolist())

In [None]:
import numpy as np
np.round(response.predictions, 2)

In [None]:
# remove endpoint
endpoint.undeploy_all()
endpoint.delete()

### Batch prediction on Vertex AI

In [None]:
# Prepare the batch and upload it to GCS.
# Write a JSON Lines file: one JSON-encoded instance per line, built from the
# first 100 test images.
batch_path = Path('my_mnist_batch')
batch_path.mkdir(exist_ok=True)
with open(batch_path / 'my_mnist_batch.jsonl', 'w') as jsonl_file:
    for image in X_test[:100].tolist():
        jsonl_file.write(json.dumps(image))
        jsonl_file.write('\n')

upload_directory(bucket, batch_path)

In [None]:
# Launch a batch prediction job on the uploaded JSON Lines file; results are
# written under gs://<bucket_name>/my_mnist_predictions/.
batch_prediction_job = mnist_model.batch_predict(
    job_display_name="my_batch_prediction_job",
    machine_type="n1-standard-4",
    starting_replica_count=1,
    max_replica_count=5,
    accelerator_type="NVIDIA_TESLA_K80",
    accelerator_count=1,
    gcs_source=[f"gs://{bucket_name}/{batch_path.name}/my_mnist_batch.jsonl"],
    gcs_destination_prefix=f"gs://{bucket_name}/my_mnist_predictions/",
    sync=True
)

In [None]:
# Collect the predicted probabilities from the job's output files.
y_probas = []
for blob in batch_prediction_job.iter_outputs():
    # Only outputs whose name contains 'prediction.results' are parsed.
    if 'prediction.results' in blob.name:
        # Each line is a JSON document; keep its 'prediction' field.
        for line in blob.download_as_text().splitlines():
            y_proba = json.loads(line)['prediction']
            y_probas.append(y_proba)

In [None]:
# Accuracy on the 100 test images sent in the batch: take the most probable
# class for each instance and compare it with the true labels.
y_pred = np.argmax(y_probas, axis=1)
accuracy = np.sum(y_pred == y_test[:100]) / 100

In [None]:
# Clean up: delete the blobs under the model, batch and prediction prefixes,
# then the bucket itself and the batch prediction job.
# NOTE(review): the Vertex AI model object (mnist_model) is not deleted here.
for prefix in ['my_mnist_model/', 'my_mnist_batch/', 'my_mnist_predictions/']:
    blobs = bucket.list_blobs(prefix=prefix)
    for blob in blobs:
        blob.delete()

bucket.delete()
batch_prediction_job.delete()

## Distribute models on mobile or embedded devices

In [None]:
# Convert the SavedModel at model_path to a TFLite model (FlatBuffers format)
# and write the resulting bytes to a .tflite file.
converter = tf.lite.TFLiteConverter.from_saved_model(str(model_path))
tflite_model = converter.convert()
with open("my_converted_savedmodel.tflite", 'wb') as f:
    f.write(tflite_model)

In [None]:
# Post-training quantization: request the converter's default optimizations.
# NOTE(review): this only sets an attribute on the converter; presumably
# converter.convert() must be called again for it to take effect -- confirm.
converter.optimizations = [tf.lite.Optimize.DEFAULT]

## Use GPU to Speed Up

In [None]:
# check tensorflow recognizes GPU
physical_gpus = tf.config.list_physical_devices('GPU')
physical_gpus

In [None]:
# Limit the amount of GPU memory TensorFlow may use on each GPU.
for gpu in physical_gpus:
    tf.config.set_logical_device_configuration(
        gpu,
        [tf.config.LogicalDeviceConfiguration(memory_limit=2048)]
    )

In [None]:
# make tensorflow occupy GPU only if necessary
for gpu in physical_gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# divide GPU into 2+ logical devices
tf.config.set_logical_device_configuration(
    physical_gpus[0],
    [tf.config.LogicalDeviceConfiguration(memory_limit=2048),
     tf.config.LogicalDeviceConfiguration(memory_limit=2048)]
)

In [None]:
# check logical devices
logical_gpus = tf.config.list_logical_devices('GPU')
logical_gpus

### Allocating Computation and Variable to Device

In [None]:
# Create one float variable and one integer variable and look at the device
# each was placed on.
# NOTE: in a notebook cell only the last expression (b.device) is displayed.
a = tf.Variable([1., 2., 3.])
a.device    # check device
b = tf.Variable([1, 2, 3])
b.device

In [None]:
# change device
with tf.device("/cpu:0"):
    c = tf.Variable([1., 2., 3.])
c.device

## Train Model on Multiple Devices

### Large Scale Training Using Distributed Strategy API



In [None]:
# Create a strategy object.
strategy = tf.distribute.MirroredStrategy()

# Build and compile the model inside the distribution context.
# NOTE: `[...]` is a placeholder for the real layers / compile arguments.
with strategy.scope():
    model = tf.keras.Sequential([...])
    model.compile([...])

batch_size = 100        # should be divisible by the number of model replicas
model.fit(X_train, y_train, epochs=10,
          validation_data=(X_valid, y_valid), batch_size=batch_size)

In [None]:
# wrap with context when loading model
with strategy.scope():
    model = tf.keras.models.load_model("my_mirrored_model")

In [None]:
# Pass a list of devices to use only some of the GPUs.
strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1'])

In [None]:
# Use centralized parameters instead.
strategy = tf.distribute.experimental.CentralStorageStrategy()

### TF Cluster

In [None]:
# cluster specification
cluster_spec = {
    "worker": [
        "machine-a.example.com:2222",     # /job:worker/task:0
        "machine-b.example.com:2222"      # /job:worker/task:1
    ],
    "ps": ["machine-a.example.com:2221"]  # /job:ps/task:0
}

In [None]:
# Pass the cluster specification, together with this task's type and index,
# through the TF_CONFIG environment variable as a JSON string.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster_spec,
    'task': {'type': 'worker', 'index': 0}
})

In [None]:
# Training script for multi-worker mirrored training.
import tempfile
import tensorflow as tf

# The strategy drives the distributed training; the resolver exposes this
# task's type and id.
strategy = tf.distribute.MultiWorkerMirroredStrategy()
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(f"Starting task {resolver.task_type} #{resolver.task_id}")

# Load MNIST and hold out the first 5,000 training samples for validation.
mnist = tf.keras.datasets.mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

# Build and compile the model inside the strategy's scope.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Reshape([28, 28, 1], input_shape=[28, 28],
                                dtype=tf.uint8),
        tf.keras.layers.Rescaling(scale=1 / 255),
        tf.keras.layers.Conv2D(filters=64, kernel_size=7, activation="relu",
                               padding="same", input_shape=[28, 28, 1]),
        tf.keras.layers.MaxPooling2D(pool_size=2),
        tf.keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu",
                               padding="same"),
        tf.keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu",
                               padding="same"),
        tf.keras.layers.MaxPooling2D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=64, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(units=10, activation="softmax"),
    ])
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=tf.keras.optimizers.SGD(learning_rate=1e-2),
                  metrics=["accuracy"])

model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10)

# Every task calls model.save(); only task 0 keeps the result.
if resolver.task_id == 0:  # task 0 (the chief) saves to the final location
    model.save("my_mnist_multiworker_model", save_format="tf")
else:
    tmpdir = tempfile.mkdtemp()  # other tasks save to a temporary directory
    model.save(tmpdir, save_format="tf")
    tf.io.gfile.rmtree(tmpdir)  # and remove that directory right away

In [None]:
# Force NCCL as the implementation for collective communication.
strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options = tf.distribute.experimental.CommunicationOptions(
        implementation = tf.distribute.experimental.CollectiveCommunication.NCCL
    )
)

In [None]:
# TPUStrategy when TPU is available
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)

### Large Scale Training in Vertex AI


In [None]:
# Training script for a Vertex AI custom training job
# (presumably saved as my_vertex_ai_training_task.py -- confirm).
import os
from pathlib import Path
import tempfile
import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy()
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

# The chief reads its output directories from environment variables; every
# other task writes to subdirectories of a fresh temporary directory instead.
if resolver.task_type == "chief":
    model_dir = os.getenv("AIP_MODEL_DIR")  # provided by Vertex AI
    tensorboard_log_dir = os.getenv("AIP_TENSORBOARD_LOG_DIR")
    checkpoint_dir = os.getenv("AIP_CHECKPOINT_DIR")
else:
    tmp_dir = Path(tempfile.mkdtemp())
    model_dir = tmp_dir / "model"
    tensorboard_log_dir = tmp_dir / "logs"
    checkpoint_dir = tmp_dir / "ckpt"

# Write TensorBoard logs and model checkpoints during training.
callbacks = [tf.keras.callbacks.TensorBoard(tensorboard_log_dir),
             tf.keras.callbacks.ModelCheckpoint(checkpoint_dir)]

# Load MNIST and hold out the first 5,000 training samples for validation.
mnist = tf.keras.datasets.mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

# Build and compile the model inside the strategy's scope.
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Reshape([28, 28, 1], input_shape=[28, 28],
                                dtype=tf.uint8),
        # Scale the pixel values by 1 / 255 with a Lambda layer.
        tf.keras.layers.Lambda(lambda X: X / 255),
        tf.keras.layers.Conv2D(filters=64, kernel_size=7, activation="relu",
                               padding="same", input_shape=[28, 28, 1]),
        tf.keras.layers.MaxPooling2D(pool_size=2),
        tf.keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu",
                               padding="same"),
        tf.keras.layers.Conv2D(filters=128, kernel_size=3, activation="relu",
                               padding="same"),
        tf.keras.layers.MaxPooling2D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=64, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(units=10, activation="softmax"),
    ])
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=tf.keras.optimizers.SGD(learning_rate=1e-2),
                  metrics=["accuracy"])

model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10,
          callbacks=callbacks)
# Every task saves to its own model_dir chosen above.
model.save(model_dir, save_format="tf")

In [None]:
# Define a Vertex AI custom training job: the training script, the container
# image it runs in, the image used to serve the resulting model, extra pip
# requirements and the staging bucket.
custom_training_job = aiplatform.CustomTrainingJob(
    display_name="my_custom_training_job",
    script_path="my_vertex_ai_training_task.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest",
    model_serving_container_image_uri=server_image,
    requirements=["gcsfs==2022.3.0"],
    staging_bucket=f"gs://{bucket_name}/staging"
)

In [None]:
# Run the job on two replicas (replica_count=2), requesting K80 accelerators
# with accelerator_count=2; the returned object is kept as mnist_model2.
mnist_model2 = custom_training_job.run(
    machine_type="n1-standard-4",
    replica_count=2,
    accelerator_type="NVIDIA_TESLA_K80",
    accelerator_count=2,
)

### Hyperparameter Tuning in Vertex AI


In [None]:
# Read the hyperparameter values from command-line arguments with argparse;
# each argument falls back to its default when it is not supplied.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--n_hidden', type=int, default=2)
parser.add_argument('--n_neurons', type=int, default=256)
parser.add_argument('--learning_rate', type=float, default=1e-2)
parser.add_argument('--optimizer', default='adam')
args = parser.parse_args()

In [None]:
# Use mirrored strategy to perform trials in multi GPU machine.
import tensorflow as tf

def build_model(args):
    """Build and compile an MNIST classifier from parsed hyperparameters.

    The model is created and compiled inside a ``MirroredStrategy`` scope.

    Args:
        args: Namespace providing ``n_hidden`` (number of hidden layers),
            ``n_neurons`` (units per hidden layer), ``optimizer`` (name passed
            to ``tf.keras.optimizers.get``) and ``learning_rate``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    with tf.distribute.MirroredStrategy().scope():
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8))
        for _ in range(args.n_hidden):
            model.add(tf.keras.layers.Dense(args.n_neurons, activation="relu"))
        model.add(tf.keras.layers.Dense(10, activation="softmax"))
        # Resolve the optimizer by name, then override its learning rate.
        opt = tf.keras.optimizers.get(args.optimizer)
        opt.learning_rate = args.learning_rate
        # Accuracy must be tracked: the tuning report reads "val_accuracy"
        # from the training history.
        model.compile(loss='sparse_categorical_crossentropy', optimizer=opt,
                      metrics=['accuracy'])
        return model

# Load MNIST and hold out the first 5,000 training samples for validation.
mnist = tf.keras.datasets.mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

# Output locations and the trial ID are read from environment variables.
# NOTE(review): checkpoint_dir and trial_id are not used in the visible code.
import os
model_dir = os.getenv("AIP_MODEL_DIR")
tensorboard_log_dir = os.getenv("AIP_TENSORBOARD_LOG_DIR")
checkpoint_dir = os.getenv("AIP_CHECKPOINT_DIR")
trial_id = os.getenv("CLOUD_ML_TRIAL_ID")
# Log to TensorBoard and use early stopping with patience=5.
tensorboard_cb = tf.keras.callbacks.TensorBoard(tensorboard_log_dir)
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5)
callbacks = [tensorboard_cb, early_stopping_cb]

# Build the model from the parsed arguments, train it and save it.
model = build_model(args)
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid),
                    epochs=10, callbacks=callbacks)
model.save(model_dir, save_format="tf")

In [None]:
# Report the model's performance to the Vertex AI service with the hypertune
# library so that it can select the next hyperparameter combination.
import hypertune

# NOTE: this rebinds the name `hypertune` from the module to a HyperTune
# instance.
hypertune = hypertune.HyperTune()
hypertune.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag="accuracy",
    # Report the best validation accuracy reached during training.
    metric_value=max(history.history["val_accuracy"]),
    global_step=model.optimizer.iterations.numpy(),
)

In [None]:
# Define the custom job that runs one trial: the trial script, its container
# image, the staging bucket and the accelerators requested for it.
trial_job = aiplatform.CustomJob.from_local_script(
    display_name="my_search_trial_job",
    script_path="my_vertex_ai_trial.py",
    container_uri="gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest",
    staging_bucket=f'gs://{bucket_name}/staging',
    accelerator_type='NVIDIA_TESLA_K80',
    accelerator_count=2,
)

In [None]:
# Tune hyperparameters: run up to 100 trials of trial_job, 20 at a time,
# maximizing the reported 'accuracy' metric.
from google.cloud.aiplatform import hyperparameter_tuning as hpt

hp_job = aiplatform.HyperparameterTuningJob(
    display_name='my_hp_search_job',
    custom_job=trial_job,
    metric_spec={'accuracy': 'maximize'},
    # Search space; the keys use the same names as the trial script's
    # command-line arguments.
    parameter_spec={
        'learning_rate': hpt.DoubleParameterSpec(min=1e-4, max=10, scale='log'),
        'n_neurons':hpt.IntegerParameterSpec(min=1, max=300, scale='linear'),
        'n_hidden': hpt.IntegerParameterSpec(min=1, max=10, scale='linear'),
        'optimizer': hpt.CategoricalParameterSpec(['adam', 'sgd']),
    },
    max_trial_count=100,
    parallel_trial_count=20,
)
hp_job.run()

In [None]:
# Get a result.
def get_final_metric(trial, metric_id):
    """Return the value of the trial's final metric named *metric_id*.

    Scans ``trial.final_measurement.metrics`` in order and returns the
    ``value`` of the first entry whose ``metric_id`` matches; returns ``None``
    when no entry matches.
    """
    matching_values = (
        entry.value
        for entry in trial.final_measurement.metrics
        if entry.metric_id == metric_id
    )
    return next(matching_values, None)

# Pick the trial with the highest final 'accuracy' metric.
trials = hp_job.trials
trial_accuracies = [get_final_metric(trial, 'accuracy') for trial in trials]
best_trial = trials[np.argmax(trial_accuracies)]

In [None]:
max(trial_accuracies)

In [None]:
best_trial.id

In [None]:
best_trial.parameters