In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
    <td>
        <a href="https://console.cloud.google.com/mlengine/notebooks/deploy-notebook?q=download_url%3Dhttps://github.com/GoogleCloudPlatform/tensorflow-gcp-tools/blob/master/examples/cloud_fit.ipynb">
            <img src="https://www.gstatic.com/images/branding/product/1x/google_cloud_48dp.png" alt="AI Platform Notebooks"> Run in AI Platform Notebooks
        </a>
    </td>
    <td>
        <a href="https://colab.research.google.com/github/GoogleCloudPlatform/tensorflow-gcp-tools/blob/master/examples/cloud_fit.ipynb">
            <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
        </a>
    </td>
    <td>
        <a href="https://github.com/GoogleCloudPlatform/tensorflow-gcp-tools/blob/master/examples/cloud_fit.ipynb">
            <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">View on GitHub
        </a>
     </td>
</table>

# Overview

This notebook is a quick introduction to *cloud_fit*, which enables training on Google Cloud AI Platform in the same manner as calling `model.fit()`.
We will start by installing the required libraries, then proceed with two samples showing how to use `numpy.array` and `tf.data.Dataset` inputs with cloud_fit.


### What are the components of cloud_fit()?

Cloud fit has two main components as follows:

**client.py:** serializes the provided data and model along with the typical model.fit() parameters and triggers an AI Platform training job.
``` python
def cloud_fit(model,
              remote_dir: Text,
              region: Text = None,
              project_id: Text = None,
              image_uri: Text = None,
              distribution_strategy: Text = DEFAULT_DISTRIBUTION_STRATEGY,
              job_spec: Dict[str, Any] = None,
              **fit_kwargs) -> Text:
  """Facilitates remote execution of in memory Models and Datasets on AI Platform.

  Args:
    model: A compiled Keras Model.
    remote_dir: Google Cloud Storage path for temporary assets and AI Platform
      training output. Will overwrite value in job_spec.
    region: Target region for running the AI Platform Training job.
    project_id: Project id where the training should be deployed to.
    image_uri: base image used to use for AI Platform Training
    distribution_strategy: Specifies the distribution strategy for remote
      execution when a jobspec is provided. Accepted values are strategy names
      as specified by 'tf.distribute.<strategy>.__name__'.
    job_spec: AI Platform training job_spec, will take precedence over all other
      provided values except for remote_dir. If none is provided a default
      cluster spec and distribution strategy will be used.
    **fit_kwargs: Args to pass to model.fit() including training and eval data.
      Only keyword arguments are supported. Callback functions will be
      serialized as is.

  Returns:
    AI Platform job ID

  Raises:
    RuntimeError: If executing in graph mode, eager execution is required for
    cloud_fit.
    NotImplementedError: Tensorflow v1.x is not supported.
  """
```

**remote.py:** A job that takes a remote_dir as a parameter, loads the model and data from this location, and executes the training with the stored parameters.
```python
def run(remote_dir: Text, distribution_strategy_text: Text):
  """deserializes Model and Dataset and runs them.

  Args:
    remote_dir: Temporary cloud storage folder that contains model and Dataset
      graph. This folder is also used for job output.
    distribution_strategy_text: Specifies the distribution strategy for remote
      execution when a jobspec is provided. Accepted values are strategy names
      as specified by 'tf.distribute.<strategy>.__name__'.
  """
```

### Costs

This tutorial uses billable components of Google Cloud:

* AI Platform Training
* Cloud Storage

Learn about [AI Platform Training
pricing](https://cloud.google.com/ai-platform/training/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project.](https://console.cloud.google.com/cloud-resource-manager) When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the AI Platform APIs](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com)

4. If running locally on your own machine, you will need to install the [Google Cloud SDK](https://cloud.google.com/sdk).

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

### Authenticate your Google Cloud account

**If you are using [AI Platform Notebooks](https://cloud.google.com/ai-platform/notebooks/docs/)**, your environment is already
authenticated. Skip these steps.

In [None]:
import sys

# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your Google Cloud account. This provides access
# to your Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

if 'google.colab' in sys.modules:
    from google.colab import auth as google_auth
    google_auth.authenticate_user()

# If you are running this tutorial in a notebook locally, replace the string
# below with the path to your service account key and run this cell to
# authenticate your Google Cloud account.
else:
    %env GOOGLE_APPLICATION_CREDENTIALS your_path_to_credentials.json

# Log in to your account on Google Cloud
! gcloud auth application-default login --quiet
! gcloud auth login --quiet

## Clone and build tensorflow_enterprise_addons

To use the latest version of tensorflow_enterprise_addons, we will clone and build the repo. The resulting whl file is used both on the client side and in the construction of a Docker image for remote execution.


In [None]:
!git clone https://github.com/GoogleCloudPlatform/tensorflow-gcp-tools.git

In [None]:
!cd tensorflow-gcp-tools/python && python3 setup.py -q bdist_wheel

In [None]:
!pip install -U tensorflow-gcp-tools/python/dist/tensorflow_enterprise_addons-*.whl --quiet

#### Restart the Kernel

We will automatically restart your kernel so the notebook has access to the packages you installed.

In [None]:
# Shut the kernel down with restart enabled so the packages installed above
# become importable in the following cells.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

### Import libraries and define constants

In [None]:
import os
import uuid
import numpy as np
import tensorflow as tf
from tensorflow_enterprise_addons.cloud_fit import client

# Setup and imports
REMOTE_DIR = '[gcs-bucket-for-temporary-files]' #@param {type:"string"}
REGION = 'us-central1' #@param {type:"string"}
PROJECT_ID = '[your-project-id]' #@param {type:"string"}
! gcloud config set project $PROJECT_ID
IMAGE_URI = 'gcr.io/{PROJECT_ID}/[name-for-docker-image]:latest' #@param {type:"string"}

### Create a Dockerfile with tensorflow_enterprise_addons
In the next step we create a base Dockerfile with the latest wheel file to use for remote training. You may use any base image; however, DLVM base images come pre-installed with most of the needed packages.


In [None]:
%%file Dockerfile

# Using DLVM base image
FROM gcr.io/deeplearning-platform-release/tf2-cpu
WORKDIR /root

# Path configuration (key=value form; the space-separated ENV form is legacy)
ENV PATH=$PATH:/root/tools/google-cloud-sdk/bin

# Make sure gsutil will use the default service account.
# printf always expands "\n", whereas echo's handling of it depends on the shell.
RUN printf '[GoogleCompute]\nservice_account = default\n' > /etc/boto.cfg

# Copy and install tensorflow_enterprise_addons wheel file.
# COPY is sufficient for a plain local file; ADD's extra behaviours (archive
# extraction, URL fetching) are not needed here.
COPY tensorflow-gcp-tools/python/dist/tensorflow_enterprise_addons-*.whl /tmp/
RUN pip3 install --upgrade /tmp/tensorflow_enterprise_addons-*.whl --quiet

# Sets up the entry point to invoke cloud_fit.
ENTRYPOINT ["python3","-m","tensorflow_enterprise_addons.cloud_fit.remote"]

In [None]:
!docker build -t {IMAGE_URI} -f Dockerfile . -q && docker push {IMAGE_URI}

## Tutorial 1 - Functional model
In this sample we will demonstrate using numpy.array as input data by creating a basic model and submitting it for remote training.

### Define model building function

In [None]:
"""Simple model to compute y = wx + 1, with w trainable."""
# Single float feature per example.
model_input = tf.keras.layers.Input(shape=(1,), dtype=tf.float32)

# Trainable, bias-free scaling layer: one weight w, initialised to 0.5 and
# L2-regularised.
scale_layer = tf.keras.layers.Dense(
    units=1,
    use_bias=False,
    kernel_initializer=tf.keras.initializers.Constant([[0.5]]),
    kernel_regularizer=tf.keras.regularizers.l2(0.01))

# Frozen layer with kernel 1.0 and bias 1.0, i.e. it adds 1 to its input.
offset_layer = tf.keras.layers.Dense(
    units=1,
    trainable=False,
    kernel_initializer=tf.keras.initializers.Constant([[1.0]]),
    bias_initializer=tf.keras.initializers.Constant([1.0]))

model_output = offset_layer(scale_layer(model_input))
simple_model = tf.keras.Model(inputs=model_input, outputs=model_output)

# Plain SGD on mean squared error, executed eagerly.
simple_model.compile(
    optimizer=tf.keras.optimizers.SGD(0.002),
    loss="mean_squared_error",
    run_eagerly=True)

### Prepare Data

In [None]:
# Creating sample data
# Three single-feature examples repeated ten times (30 rows in total).
x = [[feature] for feature in (9., 10., 11.)] * 10
# Target for each row is half of its feature plus 6.
y = [[row[0] / 2. + 6] for row in x]

### Run the model locally for validation

In [None]:
# Sanity-check the model locally: one epoch with the whole sample set as a
# single batch, i.e. one training step.
simple_model.fit(
    x=np.array(x),
    y=np.array(y),
    batch_size=len(x),
    epochs=1)

### Submit model and dataset for remote training

In [None]:
# Each run gets its own uniquely named sub folder under REMOTE_DIR for assets
# and model training output.
simple_run_id = str(uuid.uuid4())
SIMPLE_REMOTE_DIR = os.path.join(REMOTE_DIR, simple_run_id)
print('your remote folder is %s' % SIMPLE_REMOTE_DIR)

In [None]:
# Using default configuration with two workers dividing the dataset between the two.
# steps_per_epoch uses integer division: `len(x) / 2` yields a float in
# Python 3, whereas Keras documents steps_per_epoch as an integer.
simple_model_job_id = client.cloud_fit(
    model=simple_model,
    remote_dir=SIMPLE_REMOTE_DIR,
    region=REGION,
    image_uri=IMAGE_URI,
    x=np.array(x),
    y=np.array(y),
    epochs=100,
    steps_per_epoch=len(x) // 2,
    verbose=2)

In [None]:
!gcloud ai-platform jobs describe projects/{PROJECT_ID}/jobs/{simple_model_job_id}

### Retrieve the trained model
Once the training is complete, you can access the trained model in the `output` sub folder of your remote folder.

In [None]:
# The training job's output lives in the 'output' sub folder of the run's
# remote folder; load the trained model from there.
simple_model_output_dir = os.path.join(SIMPLE_REMOTE_DIR, 'output')
trained_simple_model = tf.keras.models.load_model(simple_model_output_dir)

In [None]:
# Test that the saved model loads and works properly.
# The data is converted to NumPy arrays, matching how it was passed to fit()
# and cloud_fit(); Keras documents a plain Python list as "one array per model
# input", so arrays keep the intent unambiguous.
trained_simple_model.evaluate(np.array(x), np.array(y))

## Tutorial 2 - Sequential Models and Datasets
In this sample we will demonstrate using datasets by creating a basic model and submitting it for remote training.

### Define model building function

In [None]:
# Build the classifier layer by layer: flatten the 28x28 image, one hidden
# ReLU layer, then 10 raw (logit) outputs.
fashion_mnist_model = tf.keras.Sequential()
fashion_mnist_model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
fashion_mnist_model.add(tf.keras.layers.Dense(128, activation='relu'))
fashion_mnist_model.add(tf.keras.layers.Dense(10))

# The final layer emits logits, hence from_logits=True on the loss.
fashion_mnist_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

### Prepare Data

In [None]:
# Load the (train, test) splits; `test` is kept for the evaluation cell.
train, test = tf.keras.datasets.fashion_mnist.load_data()
# Divide the training images by 255 and keep the labels unchanged.
images, labels = train[0] / 255, train[1]
# Wrap the training split in a tf.data pipeline with batches of 32.
dataset = tf.data.Dataset.from_tensor_slices((images, labels)).batch(32)

### Run the model locally for validation

In [None]:
# Optional but recommended before calling cloud_fit(): confirm the model
# trains locally by running a single epoch over the dataset.
fashion_mnist_model.fit(x=dataset, epochs=1)

### Submit model and dataset for remote training

In [None]:
# Each run gets its own uniquely named sub folder under REMOTE_DIR for assets
# and model training output.
fashion_run_id = str(uuid.uuid4())
FASHION_REMOTE_DIR = os.path.join(REMOTE_DIR, fashion_run_id)
print('your remote folder is %s' % FASHION_REMOTE_DIR)

In [None]:
# Submit the model and the tf.data dataset for remote training; the returned
# job id is used below to query the job.
fashion_mnist_model_job_id = client.cloud_fit(
    model=fashion_mnist_model,
    remote_dir=FASHION_REMOTE_DIR,
    region=REGION,
    image_uri=IMAGE_URI,
    x=dataset,
    epochs=10,
    steps_per_epoch=15,
    verbose=2)

In [None]:
!gcloud ai-platform jobs describe projects/{PROJECT_ID}/jobs/{fashion_mnist_model_job_id}

### Retrieve the trained model
Once the training is complete, you can access the trained model in the `output` sub folder of your remote folder.

In [None]:
# The training job's output lives in the 'output' sub folder of the run's
# remote folder; load the trained model from there.
fashion_model_output_dir = os.path.join(FASHION_REMOTE_DIR, 'output')
trained_fashion_mnist_model = tf.keras.models.load_model(fashion_model_output_dir)

In [None]:
# Check that the saved model loads and works: prepare the held-out split the
# same way as the training data (divide by 255, batches of 32) and evaluate.
test_images, test_labels = test[0] / 255, test[1]
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(32)

trained_fashion_mnist_model.evaluate(test_dataset)