In [None]:
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# XManager codelab notebook

This notebook will take you through running an XManager experiment on Google Cloud Platform (GCP).

A stand-alone Jupyter Notebook can be created via GCP's [Vertex AI Notebooks](https://console.cloud.google.com/vertex-ai/notebooks/list/instances).

JupyterLab can be installed on your machine following [Jupyter's installation](https://jupyter.org/install).

## Install any prerequisites

1. Create a [GCP project](https://console.cloud.google.com/) if one does not already exist.

2. Install [Docker](https://docs.docker.com/engine/install/) if it is not already installed.

## Download and install XManager

In [None]:
!git clone https://github.com/google-research/raksha.git ~/xmanager
!pip install ~/xmanager

## Set default GCP values

The following gets the GCP project.

In [None]:
from google import auth

# auth.default() yields a (credentials, project) pair; resolve it once and
# unpack both values instead of running the lookup a second time.
credentials, project = auth.default()
print('GCP Project:', project)

Use `gcloud auth application-default login` if the above command results in an error or the project is incorrect.

XManager requires a Google Cloud Storage Bucket. [Create one](https://cloud.google.com/storage/docs/creating-buckets) if one does not already exist and enter it in the box below.

In [1]:
from IPython.display import display
import ipywidgets
import os

def bucket_changed(change):
    """Stores the new value of the bucket text box in the environment.

    Args:
        change: Change notification passed by the widget; its `new` attribute
            holds the updated text.
    """
    bucket_name = change.new
    os.environ['GOOGLE_CLOUD_BUCKET_NAME'] = bucket_name

# Text box for the bucket name. bucket_changed is registered as an observer of
# 'value', so every edit is copied into os.environ.
GOOGLE_CLOUD_BUCKET_NAME = ipywidgets.Text(
    description='GOOGLE_CLOUD_BUCKET_NAME:',
    layout=ipywidgets.Layout(width='50%'),
    style=dict(description_width='initial'),
)
GOOGLE_CLOUD_BUCKET_NAME.observe(bucket_changed, names='value')

# Render the text box as this cell's output.
display(GOOGLE_CLOUD_BUCKET_NAME)

Text(value='', description='GOOGLE_CLOUD_BUCKET_NAME:', layout=Layout(width='50%'), style=DescriptionStyle(des…

In [None]:
from xmanager import xm
# Accessing absl flags before they have been parsed raises
# absl.flags.UnparsedFlagAccessError. Normally XManager flags are set via the
# command line with `xmanager train.py -- --key=value`; a notebook has no such
# command line, so the flags are parsed explicitly here with default values.
from absl import flags
# Parse an argv holding only an (empty) program name, i.e. no flag overrides.
flags.FLAGS([''])
# NOTE(review): presumably needed when driving XManager from a notebook —
# confirm against the definition of the `xm_wrap_late_bindings` flag.
flags.FLAGS.xm_wrap_late_bindings = True

## Launching an experiment

This code block has everything needed to launch a first experiment.

In [None]:
import asyncio
import itertools
import os

from xmanager import xm
from xmanager import xm_local

async with xm_local.create_experiment(experiment_title='cifar10') as experiment:
    [executable] = experiment.package([
        xm.python_container(
            executor_spec=xm_local.Caip.Spec(),
            path=os.path.expanduser('~/xmanager/examples/cifar10_torch'),
            entrypoint=xm.ModuleName('cifar10'),
        )
    ])

    batch_sizes = [64, 128]
    learning_rates = [0.01, 0.001]
    trials = list(
        dict([('batch_size', bs), ('learning_rate', lr)])
        for (bs, lr) in itertools.product(batch_sizes, learning_rates)
    )
    for hyperparameters in trials:
        experiment.add(xm.Job(
            executable=executable,
            executor=xm_local.Caip(requirements=xm.JobRequirements(T4=1)),
            args=hyperparameters,
        ))

## Breaking down an experiment

The above experiment can be broken down into smaller steps. Let's break down the above code block by defining `launch_experiment` that has the following steps:

1. Creating the experiment.
2. Defining the executable specification.
3. Define the execution environment.
4. Defining the hyperparameters.
5. Launching the jobs.

These 5 helper functions are undefined right now, but we will define them in the next sections, each accompanied by an explanation of what is happening.

In [None]:
async def launch_experiment():
    """Runs the five experiment-launching steps in order.

    The helpers called here (`create_experiment`, `create_spec`,
    `execution_environment`, `define_hyperparameters`, `launch_jobs`) are
    looked up when this coroutine runs, so they only need to be defined
    before it is awaited.
    """
    async with create_experiment() as experiment:                   # (1) Creating the experiment.
        # The packaging helper in this notebook is named `create_spec`.
        executable = create_spec(experiment)                        # (2) Defining the executable.
        executor = execution_environment()                          # (3) Define the execution environment.
        hparams = define_hyperparameters()                          # (4) Defining the hyperparameters.
        launch_jobs(experiment, executable, executor, hparams)      # (5) Launching the jobs.

### Creating the experiment

Give the experiment a name. The `create_experiment` method will also create a unique integer id for the experiment and save this experiment to a database.

In [None]:
def create_experiment():
    """Creates the experiment for this codelab, titled 'cifar10'.

    Returns:
        The result of `xm_local.create_experiment`.
    """
    title = 'cifar10'
    return xm_local.create_experiment(experiment_title=title)

### Defining the executable

Define the job that will run in the experiment. A `PythonContainer` is an example of an executable specification. This executable specification tells XManager to package everything inside the `PythonContainer.path` as a container and use `PythonContainer.entrypoint` as the main module. Because we cloned XManager to `~/xmanager` in an earlier step, we can use one of the examples, `~/xmanager/examples/cifar10_torch`, as the path.

We also need to declare where the executable should be staged. This step will upload the executable specification to the correct storage option that is best suited for the execution environment. For example, if the execution environment is Vertex AI, the executable must be stored in Google Container Registry. The `Caip.Spec()` specification will upload the specification to Google Container Registry, where it will be accessible by Vertex AI.

In [None]:
def create_spec(experiment):
    """Packages the cifar10_torch example and returns the resulting executable.

    Args:
        experiment: Object whose `package` method receives the list of
            executable specifications.

    Returns:
        The single executable returned by `experiment.package`.
    """
    example_dir = os.path.expanduser('~/xmanager/examples/cifar10_torch')
    container_spec = xm.python_container(
        executor_spec=xm_local.Caip.Spec(),
        path=example_dir,
        entrypoint=xm.ModuleName('cifar10'),
    )
    # One spec goes in, so exactly one executable is expected back; the
    # one-element unpacking raises if the count differs.
    (executable,) = experiment.package([container_spec])
    return executable

### Define the execution environment

Declare where the job will run and what compute requirements are necessary to run one job. To run on Vertex AI, we must use the `xm_local.Caip` executor. Each job should use 1 NVIDIA T4 GPU, so we must pass an `xm.JobRequirements` to the executor.

In [None]:
def execution_environment():
    """Returns the executor describing where and with what resources a job runs.

    Returns:
        An `xm_local.Caip` executor whose requirements ask for one T4.
    """
    # Pass the requirements by keyword, as the one-shot launch cell does, so
    # the call does not depend on the executor's positional field order.
    return xm_local.Caip(requirements=xm.JobRequirements(T4=1))

### Define the hyperparameters

Define the hyperparameter trials for this experiment. A grid search is easy to define using `itertools`.

In [None]:
def define_hyperparameters():
    """Builds the grid of hyperparameter trials.

    Returns:
        A list with one dict per (batch_size, learning_rate) combination,
        with the batch size varying slowest.
    """
    batch_sizes = [64, 128]
    learning_rates = [0.01, 0.001]
    grid = itertools.product(batch_sizes, learning_rates)
    return [
        {'batch_size': batch_size, 'learning_rate': learning_rate}
        for batch_size, learning_rate in grid
    ]

from pprint import pprint
# Print the generated trials so the grid can be inspected before anything is launched.
pprint(define_hyperparameters())

### Launch the jobs

Finally, we can add experiment units to our experiment. To add a single job to the experiment, create an `xm.Job` object that combines the executable, the compute requirements, and the custom hyperparameter arguments, and add the job to the experiment.

In [None]:
def launch_single_job(experiment, executable, executor, batch_size, learning_rate):
    """Adds one job with the given hyperparameters to the experiment.

    Args:
        experiment: Object whose `add` method receives the job.
        executable: Executable the job runs.
        executor: Executor the job runs on.
        batch_size: Value passed to the job as the `batch_size` argument.
        learning_rate: Value passed to the job as the `learning_rate` argument.
    """
    job_args = {'batch_size': batch_size, 'learning_rate': learning_rate}
    job = xm.Job(executable=executable, executor=executor, args=job_args)
    experiment.add(job)

To do a grid search, loop over all the hyperparameters, passing a different hyperparameter configuration to the `args` parameter of each job. Add each job to the experiment.

In [None]:
def launch_jobs(experiment, executable, executor, trials):
    """Adds one job per hyperparameter trial to the experiment.

    Args:
        experiment: Object whose `add` method receives each job.
        executable: Executable shared by all jobs.
        executor: Executor shared by all jobs.
        trials: Iterable of argument mappings, one per job.
    """
    for trial_args in trials:
        job = xm.Job(executable=executable, executor=executor, args=trial_args)
        experiment.add(job)

Finally, we call the parent method we defined at the beginning of this section.

In [None]:
# Run the whole pipeline. A bare top-level `await` is accepted in a notebook
# cell (IPython autoawait); it is not valid in a plain Python script.
await launch_experiment()