In [7]:
# Notebook header cell: renders the "Run in AI Platform Notebook" / "Run in Google Colab" /
# "View source on GitHub" / "Download notebook" buttons, followed by the title and intro message.

# `import urllib` alone does not guarantee that the `urllib.parse` submodule is loaded;
# import the submodule explicitly so that `urllib.parse.urlencode` works on a fresh kernel.
import urllib.parse
from IPython.display import Markdown as md

_nb_loc = "04_detect_segment/04ab_retinanet_arthropods_train.ipynb" # change to reflect your notebook
_nb_title = "Object Detection with RetinaNet on Arthropods dataset / training" # change to reflect your notebook
_nb_message = "This notebook is set up to run on TPU or GPU. It has been executed on a TPUv3 but it works fine on TPUv2 (Colaboratory). Training on TPU requires a private writable GCS bucket. See the GCS bucket section below. This example uses the RetinaNet implementation from Tensorflow model Garden." # change to reflect your notebook

# Button icons, in the same order as the entries of _links below.
_icons = [
    "https://raw.githubusercontent.com/GoogleCloudPlatform/practical-ml-vision-book/master/logo-cloud.png",
    "https://www.tensorflow.org/images/colab_logo_32px.png",
    "https://www.tensorflow.org/images/GitHub-Mark-32px.png",
    "https://www.tensorflow.org/images/download_logo_32px.png",
]

# Button targets: AI Platform deploy link (title and download URL are passed as a
# URL-encoded query string), Colab, GitHub source view, raw notebook download.
_links = [
    "https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?" + urllib.parse.urlencode({"name": _nb_title, "download_url": "https://github.com/GoogleCloudPlatform/practical-ml-vision-book/raw/master/"+_nb_loc}),
    "https://colab.research.google.com/github/GoogleCloudPlatform/practical-ml-vision-book/blob/master/{0}".format(_nb_loc),
    "https://github.com/GoogleCloudPlatform/practical-ml-vision-book/blob/master/{0}".format(_nb_loc),
    "https://raw.githubusercontent.com/GoogleCloudPlatform/practical-ml-vision-book/master/{0}".format(_nb_loc),
]

# Placeholders {0}-{3} are the links, {4}-{7} the icons, {8} the title and {9} the message.
# The Markdown object is the last expression of the cell so that it gets rendered.
md("""<table class="tfo-notebook-buttons" align="left"><td><a target="_blank" href="{0}"><img src="{4}"/>Run in AI Platform Notebook</a></td><td><a target="_blank" href="{1}"><img src="{5}" />Run in Google Colab</a></td><td><a target="_blank" href="{2}"><img src="{6}" />View source on GitHub</a></td><td><a href="{3}"><img src="{7}" />Download notebook</a></td></table><br/><br/><h1>{8}</h1>{9}""".format(*_links, *_icons, _nb_title, _nb_message))

<table class="tfo-notebook-buttons" align="left"><td><a target="_blank" href="https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?name=Object+Detection+with+RetinaNet+on+Arthropods+dataset+%2F+training&download_url=https%3A%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fpractical-ml-vision-book%2Fraw%2Fmaster%2F04_detect_segment%2F04ab_retinanet_arthropods_train.ipynb"><img src="https://raw.githubusercontent.com/GoogleCloudPlatform/practical-ml-vision-book/master/logo-cloud.png"/>Run in AI Platform Notebook</a></td><td><a target="_blank" href="https://colab.research.google.com/github/GoogleCloudPlatform/practical-ml-vision-book/blob/master/04_detect_segment/04ab_retinanet_arthropods_train.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a></td><td><a target="_blank" href="https://github.com/GoogleCloudPlatform/practical-ml-vision-book/blob/master/04_detect_segment/04ab_retinanet_arthropods_train.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a></td><td><a href="https://raw.githubusercontent.com/GoogleCloudPlatform/practical-ml-vision-book/master/04_detect_segment/04ab_retinanet_arthropods_train.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a></td></table><br/><br/><h1>Object Detection with RetinaNet on Arthropods dataset / training</h1>This notebook is set up to run on TPU or GPU. It has been executed on a TPUv3 but it works fine on TPUv2 (Colaboratory). Training on TPU requires a private writable GCS bucket. See the GCS bucket section below. This example uses the RetinaNet implementation from Tensorflow model Garden.

In [1]:
!pip install --quiet tf-models-official

In [9]:
# Standard library and numerical / TensorFlow imports used throughout the notebook.
import time, re, os
import tensorflow as tf
import numpy as np
import pprint as pp
AUTO = tf.data.AUTOTUNE  # shorthand for tf.data's autotune constant
print("Tensorflow version", tf.__version__)

# Tensorflow Model Garden imports
# NOTE(review): the top-level package is bound as `model_garden`; later cells access
# sub-modules through attribute access (e.g. `model_garden.core.exp_factory`), which
# presumably relies on the imports below having loaded those sub-modules - confirm.
import official as model_garden
from official.vision.beta.configs import retinanet as retinanet_cfg
from official.vision.beta.configs import backbones as backbones_cfg
from official.vision.beta.serving import export_saved_model_lib
from official.core import train_lib

# TODO
# load the backbone checkpoint from the official location as soon as it is published
# save the model configuration to the saved_model folder as per best practices

Tensorflow version 2.5.0


# GCS bucket
This bucket will receive:
 - Tensorboard summaries that allow you to follow the training
 - checkpoints
 - the saved model after training


In [10]:
# Use your own GCS bucket here. GCS is required if training on TPU.
# On GPU, a local folder will work.
MODEL_ARTIFACT_BUCKET = 'gs://ml1-demo-martin/arthropod_jobs/'

# Every run writes to its own sub-folder named after the current Unix timestamp.
# os.path.join (instead of plain string concatenation) keeps the path correct
# whether or not MODEL_ARTIFACT_BUCKET ends with a trailing slash.
MODEL_DIR = os.path.join(MODEL_ARTIFACT_BUCKET, str(int(time.time())))

# If you are running on Colaboratory, you must authenticate
# for Colab to have write access to the bucket.

IS_COLAB_BACKEND = 'COLAB_GPU' in os.environ  # this is always set on Colab, the value is 0 or 1 depending on GPU presence
if IS_COLAB_BACKEND:
    # Imported here because the google.colab package is only needed on the Colab backend.
    from google.colab import auth
    auth.authenticate_user()

# TPU / GPU detection 

In [None]:
# Select the distribution strategy used for training:
#  - TPUStrategy when a TPU can be resolved and connected to,
#  - MirroredStrategy otherwise (fallback taken when the TPU lookup raises ValueError).
# `strategy` is assigned in both branches, so no default value is needed beforehand.
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError: # detect GPUs or multi-GPU machines
    strategy = tf.distribute.MirroredStrategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Configuration

In [13]:
# --- Input locations: TFRecord file patterns and the pretrained backbone checkpoint ---
TRAIN_DATA_PATH_PATTERN = 'gs://practical-ml-vision-book/arthropod_detection_tfr/size_w1024px/*.train.tfrec'
VALID_DATA_PATH_PATTERN = 'gs://practical-ml-vision-book/arthropod_detection_tfr/size_w1024px/*.test.tfrec'
SPINET_MOBILE_CHECKPOINT = 'gs://practical-ml-vision-book/arthropod_detection_tfr/spinenet_mobile_checkpoint/'

# --- Training schedule ---
# Global batch size: 32 per replica of the distribution strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 32

EPOCHS = 80

# --- Labels ---
# RAW_CLASSES lists every label name; the three underscore-prefixed entries
# are filtered out below to obtain the classes the detector is trained on.
RAW_CLASSES = [
    'Lepidoptera',  # butterflies and moths
    'Hymenoptera',  # wasps, bees and ants
    'Hemiptera',    # true bugs (cicadas, aphids, shield bugs, ...)
    'Odonata',      # dragonflies
    'Diptera',      # flies
    'Araneae',      # spiders
    'Coleoptera',   # beetles
    '_truncated',
    '_blurred',
    '_occluded',
]
CLASSES = [name for name in RAW_CLASSES
           if name not in ('_truncated', '_blurred', '_occluded')]

# NOT IN DATASET
# Orthoptera = grasshoppers

print("Model dir:", MODEL_DIR)

Model dir: gs://ml1-demo-martin/arthropod_jobs/1625841317


# Load data files
The dataset is already prepared in TFRecord format.<br/>
The script that prepared the data is in "04aa_retinanet_arthropods_dataprep.ipynb"<br/>
To parse the TFRecord files by hand and visualize their contents, see the code in "04ac_retinanet_arthropods_predict.ipynb"

In [14]:
def count_data_items(filenames):
    """Return the total number of data items in a list of .tfrec files.

    The number of items in each file is encoded in its name as the digits
    between a hyphen and the following dot, i.e. flowers00-230.tfrec = 230
    data items.

    Args:
        filenames: iterable of file paths (local or gs://) following that
            naming convention.

    Returns:
        The sum of the per-file item counts as a Python int (0 for an empty list).

    Raises:
        ValueError: if a file name does not contain a "-<digits>." item count.
    """
    # Compile the pattern once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        # Only look at the file name itself so that hyphens and dots in
        # bucket or directory names cannot be mistaken for the item count.
        match = count_pattern.search(os.path.basename(filename))
        if match is None:
            raise ValueError(
                f"Cannot read the number of data items from file name: {filename}")
        total += int(match.group(1))
    return total

# List the training and validation TFRecord files and derive the step counts.
TRAIN_FILENAMES = tf.io.gfile.glob(TRAIN_DATA_PATH_PATTERN)
VALID_FILENAMES = tf.io.gfile.glob(VALID_DATA_PATH_PATTERN)

# Fail fast with a clear message if a pattern matches nothing, instead of
# configuring a training run with zero steps further down.
if not TRAIN_FILENAMES:
    raise ValueError(f"No training files found for pattern: {TRAIN_DATA_PATH_PATTERN}")
if not VALID_FILENAMES:
    raise ValueError(f"No validation files found for pattern: {VALID_DATA_PATH_PATTERN}")

# Image counts are read from the file names (see count_data_items).
NB_TRAIN_IMAGES = count_data_items(TRAIN_FILENAMES)
NB_VALID_IMAGES = count_data_items(VALID_FILENAMES)

# Integer division: a trailing partial batch is not counted as a step.
STEPS_PER_EPOCH = NB_TRAIN_IMAGES // BATCH_SIZE
VALID_STEPS = NB_VALID_IMAGES // BATCH_SIZE

if STEPS_PER_EPOCH == 0 or VALID_STEPS == 0:
    raise ValueError(
        f"Global batch size {BATCH_SIZE} is larger than the dataset "
        f"({NB_TRAIN_IMAGES} training images, {NB_VALID_IMAGES} validation images).")

print("Training dataset:")
print(f"    {len(TRAIN_FILENAMES)} TFRecord files.")
print(f"    {NB_TRAIN_IMAGES} images")
print("    Steps per epoch:", STEPS_PER_EPOCH)
print()
print("Validation dataset:")
print(f"    {len(VALID_FILENAMES)} TFRecord files.")
print(f"    {NB_VALID_IMAGES} images")
print("    Validation steps:", VALID_STEPS)
print()
print("Global batch size:", BATCH_SIZE)

Training dataset:
    24 TFRecord files.
    11544 images
    Steps per epoch: 45

Validation dataset:
    8 TFRecord files.
    3832 images
    Validation steps: 14

Global batch size: 256


# Model configuration

In [15]:
# Builds the full experiment configuration (`params`) for RetinaNet training:
# model settings, input pipelines, trainer schedule and optimizer.

# [height, width] - presumably in that order; both are equal here. Also used for the export.
IMAGE_SIZE = [384, 384]

# default parameters can be overridden in two ways:
# 1) params.override({'task': {'model': {'backbone': backbone_cfg.as_dict()}}})
# 2) params.task.model.backbone = backbone_cfg
# params.override checks that the dictionary keys exist
# the second option will silently add new keys

# Start from the registered 'retinanet' experiment and customize it below.
params = model_garden.core.exp_factory.get_exp_config('retinanet')

params.task.model.num_classes = len(CLASSES)+1 # class 0 is reserved for backgrounds
params.task.model.input_size = [*IMAGE_SIZE, 3] # this automatically configures the input reader to random crop training images
# Initialize only the backbone from the pretrained SpineNet-Mobile checkpoint.
params.task.init_checkpoint = SPINET_MOBILE_CHECKPOINT
params.task.init_checkpoint_modules = 'backbone'
params.task.model.backbone = backbones_cfg.Backbone(type='spinenet_mobile', spinenet_mobile=backbones_cfg.SpineNetMobile())

# Training input: random horizontal flips and random scaling between 0.7x and 2.0x.
train_data_cfg=retinanet_cfg.DataConfig(
    input_path=TRAIN_DATA_PATH_PATTERN,
    is_training=True,
    global_batch_size=BATCH_SIZE,
    parser=retinanet_cfg.Parser(aug_rand_hflip=True, aug_scale_min=0.7, aug_scale_max=2.0))

# Validation input: default parser settings.
valid_data_cfg=retinanet_cfg.DataConfig(
    input_path=VALID_DATA_PATH_PATTERN,
    is_training=False,
    global_batch_size=BATCH_SIZE)

params.override({'task': {'train_data': train_data_cfg.as_dict(), 'validation_data': valid_data_cfg.as_dict()}})

# Trainer schedule, expressed in steps: summaries are written every epoch,
# validation and checkpointing happen every 8 epochs.
trainer_cfg=model_garden.core.config_definitions.TrainerConfig(
    train_steps=EPOCHS * STEPS_PER_EPOCH,
    validation_steps=VALID_STEPS,
    validation_interval=8*STEPS_PER_EPOCH,
    steps_per_loop=STEPS_PER_EPOCH,
    summary_interval=STEPS_PER_EPOCH,
    checkpoint_interval=8*STEPS_PER_EPOCH)

# SGD with momentum and a stepwise learning rate schedule: the rate is halved
# at epochs 15, 30, 45, 60 and 75 (5 boundaries, 6 values).
# The trailing commented-out numbers are an alternative set of values.
optim_cfg = model_garden.modeling.optimization.OptimizationConfig({
    'optimizer': {
                  'type': 'sgd',
                  'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'stepwise',
                      'stepwise': {'boundaries': [15 * STEPS_PER_EPOCH,
                                                  30 * STEPS_PER_EPOCH,
                                                  45 * STEPS_PER_EPOCH,
                                                  60 * STEPS_PER_EPOCH,
                                                  75 * STEPS_PER_EPOCH],
                                   'values': [0.016, #0.01,
                                              0.008, #0.005,
                                              0.004, #0.0025,
                                              0.002, #0.001,
                                              0.001, #0.0005,
                                              0.0005]} #0.00025]}
                     },
    # learning rate warmup is currently disabled:
    #'warmup': {'type': 'linear','linear': {'warmup_steps': 5*STEPS_PER_EPOCH, 'warmup_learning_rate': 0.00001}}
})

# Attach the optimizer settings to the trainer config, then the trainer config to the experiment.
trainer_cfg.override({'optimizer_config': optim_cfg})
params.override({'trainer': trainer_cfg})

# Print the complete resulting configuration for inspection.
pp.pprint(params.as_dict())

{'runtime': {'all_reduce_alg': None,
             'batchnorm_spatial_persistent': False,
             'dataset_num_private_threads': None,
             'default_shard_dim': -1,
             'distribution_strategy': 'mirrored',
             'enable_xla': False,
             'gpu_thread_mode': None,
             'loss_scale': None,
             'mixed_precision_dtype': None,
             'num_cores_per_replica': 1,
             'num_gpus': 0,
             'num_packs': 1,
             'per_gpu_thread_count': 0,
             'run_eagerly': False,
             'task_index': -1,
             'tpu': None,
             'tpu_enable_xla_dynamic_padder': None,
             'worker_hosts': None},
 'task': {'annotation_file': None,
          'init_checkpoint': 'gs://practical-ml-vision-book/arthropod_detection_tfr/spinenet_mobile_checkpoint/',
          'init_checkpoint_modules': 'backbone',
          'losses': {'box_loss_weight': 50,
                     'focal_loss_alpha': 0.25,
                 

# Create the model

In [22]:
# Create the training task from the task section of the experiment configuration.
task = model_garden.core.task_factory.get_task(params.task, logging_dir=MODEL_DIR)

# NOTE(review): the commented-out snippets below refer to the package as `official`,
# but this notebook binds it as `model_garden` (`import official as model_garden`);
# adjust the name before uncommenting them.

# this works too:
#task = official.vision.beta.tasks.retinanet.RetinaNetTask(params.task)

# this returns a RetinaNetModel
#task.build_model()
# note: none of the expected model functionalities work: model.fit(), model.predict(), model.save()

# this returns the training dataset
#train_dataset = task.build_inputs(train_data_cfg)
# note: the dataset already includes FPN level and anchor pairing and is therefore not very readable

# this returns the validation dataset
#valid_dataset = task.build_inputs(valid_data_cfg)
# note: the dataset already includes FPN level and anchor pairing and is therefore not very readable

# this code allows you to see if the TFRecord fields are read correctly
#ds = tf.data.TFRecordDataset(tf.io.gfile.glob(TRAIN_DATA_PATH_PATTERN))
#dec = official.vision.beta.dataloaders.tf_example_decoder.TfExampleDecoder()
#ds = ds.map(dec.decode)

# training and validation data parsing happens in:
# official.vision.beta.dataloaders.retinanet_input.Parser._parse_train_data
# official.vision.beta.dataloaders.retinanet_input.Parser._parse_eval_data
# official.vision.beta.dataloaders.Parser.parse() # dispatches between _parse_train_data and _parse_eval_data

# Train the model
Training takes approximately 30min on a TPUv3-8, 40min on a TPUv2-8 on Colab

In [12]:
# Train and periodically evaluate the model. Checkpoints and summaries are
# written to MODEL_DIR, which is printed first so the run can be located.
print(MODEL_DIR)
# run_experiment returns a pair. The second element (evaluation logs according to
# the Model Garden documentation - confirm) is bound to a regular name instead of
# `_`, because `_` is IPython's "last output" variable in a notebook session.
model, eval_logs = train_lib.run_experiment(
    distribution_strategy=strategy,
    task=task,
    mode="train_and_eval", # 'train', 'eval', 'train_and_eval' or 'continuous_eval'
    params=params,
    model_dir=MODEL_DIR)

gs://ml1-demo-martin/arthropod_jobs/job1625829421
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


restoring or initializing model...
initialized model.
train | step:      0 | training until step 360...
train | step:     45 | steps/sec:    0.1 | output: 
    {'box_loss': 0.009544796,
     'cls_loss': 0.9081224,
     'learning_rate': 0.016,
     'model_loss': 1.3853621,
     'total_loss': 1.3853621,
     'training_loss': 1.3853621}
saved checkpoint to gs://ml1-demo-martin/arthropod_jobs/job1625829421/ckpt-45.
train | step:     90 | steps/sec:    1.6 | output: 
    {'box_loss': 0.0053183795,
     'cls_loss': 0.63108325,
     'learning_rate': 0.016,
     'model_loss': 0.8970024,
     'total_loss': 0.8970024,
     'training_loss': 0.8970024}
train | step:    135 | steps/sec:    2.3 | output: 
    {'box_loss': 0.004413114,
     'cls_loss': 0.5682768,
     'learning_rate': 0.016,
     'model_loss': 0.78893256,
     'total_loss': 0.78893256,
     'training_loss': 0.78893256}
train | step:    180 | steps/sec:    2.4 | output: 
    {'box_loss': 0.0039812424,
     'cls_loss': 0.5247026,
     

# Export the model
To test the exported model, please use the notebook "04ac_retinanet_arthropods_predict.ipynb"

In [13]:
# Export the trained model for inference. The checkpoint is looked up in
# MODEL_DIR and the results are written to sub-folders of MODEL_DIR:
# 'saved_chkpt' for the checkpoint and 'saved_model' for the SavedModel.
export_saved_model_lib.export_inference_graph(
    input_type='image_tensor',
    batch_size=4,                   # batch size the exported model is built for
    input_image_size=IMAGE_SIZE,    # same image size as used for training
    params=params,
    checkpoint_path=MODEL_DIR,
    export_dir=MODEL_DIR,
    export_checkpoint_subdir='saved_chkpt',
    export_saved_model_subdir='saved_model',
)










FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.


INFO:tensorflow:Assets written to: gs://ml1-demo-martin/arthropod_jobs/job1625829421/saved_model/assets


INFO:tensorflow:Assets written to: gs://ml1-demo-martin/arthropod_jobs/job1625829421/saved_model/assets


## License
Copyright 2021 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.