# Overview

This colab demonstrates an example of using ViP-DeepLab to output sequence-level
depth-aware video panoptic segmentation predictions. It loads an exported ViP-DeepLab model trained for Cityscapes-DVPS and visualizes the outputs for a sequence of the dataset.

[1] Marius Cordts, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, and Bernt Schiele. The cityscapes dataset for semantic urban scene understanding. CVPR, 2016.

[2] Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon. Video panoptic segmentation. CVPR, 2020.

[3] Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic Segmentation. CVPR, 2021.

# Inputs

In [1]:
# MODEL_URL The URL of the exported ViP-DeepLab model archive (.tar.gz).
MODEL_URL = 'https://storage.googleapis.com/gresearch/tf-deeplab/saved_model/resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model.tar.gz' #@param {type:"string"}

# SEQUENCE_PATTERN The file name (glob) pattern for the input sequence.
SEQUENCE_PATTERN = './data/*.png' #@param {type:"string"}

# LABEL_DIVISOR The label divisor for the dataset. Panoptic IDs are split as
# semantic class = id // LABEL_DIVISOR and instance ID = id % LABEL_DIVISOR.
LABEL_DIVISOR = 1000 #@param {type:"integer"}

# ViP-DeepLab Sequence Inference Class

In [2]:
#@title Import Python Libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import copy
import collections
import typing
import tempfile
import urllib
import os

2023-04-30 13:54:03.896897: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-30 13:54:04.333151: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-30 13:54:06.085003: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/ros/foxy/opt/yaml_cpp_vendor/lib:/opt/ros/foxy/opt/rviz_ogre_vendor/lib:/opt/ros/f

In [3]:
#@title Define ViP-DeepLab Sequence Inference Class

class ViPDeepLab:
  """Sequence inference model for ViP-DeepLab.

  Frame-level ViP-DeepLab takes two consecutive frames as inputs and generates
  temporally consistent depth-aware video panoptic predictions. Sequence-level
  ViP-DeepLab takes a sequence of images as input and propagates the instance
  IDs between all 2-frame predictions made by frame-level ViP-DeepLab.

  Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
  ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
  Segmentation. CVPR, 2021.
  """

  def __init__(self, model_path: str, label_divisor: int):
    """Initializes a ViP-DeepLab model.

    Args:
      model_path: A string specifying the path to the exported ViP-DeepLab.
      label_divisor: An integer specifying the dataset label divisor.
    """
    self._model = tf.saved_model.load(model_path)
    self._label_divisor = label_divisor
    # Instance IDs above this threshold in the next-frame prediction are
    # treated as new instances by `infer`.
    self._overlap_offset = label_divisor // 2
    # Multiplier used by `infer` to pack a (last_panoptic, panoptic) ID pair
    # into a single int64 value.
    self._combine_offset = 2 ** 32
    self.reset()

  def reset(self):
    """Resets the sequence predictions."""
    # Largest instance ID seen so far in the current sequence.
    self._max_instance_id = 0
    # One depth prediction per processed frame pair.
    self._depth_preds = []
    # Stitched panoptic predictions accumulated by `infer`.
    self._stitched_panoptic = []
    # Stitched next-frame prediction of the previously processed pair.
    self._last_panoptic = None

  def _infer(self, input_array, next_input_array):
    """Inference for two consecutive input frames.

    Args:
      input_array: The current frame.
      next_input_array: The frame that follows `input_array`.

    Returns:
      A tuple (depth, panoptic, next_panoptic) of numpy arrays taken from the
      model outputs 'depth_pred', 'panoptic_pred' and 'next_panoptic_pred'.
    """
    # NOTE(review): the three prints below look like leftover debugging output;
    # they fire once per frame pair -- confirm whether they should stay.
    print("input_array.shape", input_array.shape)
    print("next_input_array.shape", next_input_array.shape)
    # The model consumes both frames stacked along the last (channel) axis.
    input_array = np.concatenate((input_array, next_input_array), axis=-1)
    print(input_array.shape)
    output = self._model(input_array)
    depth = output['depth_pred'].numpy()
    panoptic = output['panoptic_pred'].numpy()
    next_panoptic = output['next_panoptic_pred'].numpy()
    return depth, panoptic, next_panoptic

  def infer(self, inputs: typing.List[tf.Tensor]):
    """Inference for a sequence of input frames.

    Clears any previous results, runs frame-level inference on every pair of
    consecutive frames and stitches the panoptic predictions so that instance
    IDs are carried along the sequence. The results are stored on the instance
    and can be retrieved with `results()`.

    Args:
      inputs: A list of tf.Tensor storing the input frames. With fewer than two
        frames no pair exists and no prediction is made.
    """
    self.reset()
    for input_idx in range(len(inputs) - 1):
      depth, panoptic, next_panoptic = self._infer(inputs[input_idx],
                                                   inputs[input_idx + 1])
      self._depth_preds.append(copy.deepcopy(depth))
      # Propagate instance ID from last_panoptic to next_panoptic based on ID
      # matching between panoptic and last_panoptic. panoptic and last_panoptic
      # stores panoptic predictions for the same frame but from different runs.
      # The mask is computed before any relabeling so that it reflects the raw
      # model output: pixels whose instance ID exceeds the overlap offset.
      next_new_mask = next_panoptic % self._label_divisor > self._overlap_offset
      if self._last_panoptic is not None:
        # Encode every per-pixel (last_panoptic ID, panoptic ID) pair as one
        # int64 so that np.unique can count co-occurrences.
        intersection = (
            self._last_panoptic.astype(np.int64) * self._combine_offset +
            panoptic.astype(np.int64))
        intersection_ids, intersection_counts = np.unique(
            intersection, return_counts=True)
        # Visit the ID pairs in ascending order of their pixel count.
        intersection_ids = intersection_ids[np.argsort(intersection_counts)]
        for intersection_id in intersection_ids:
          last_panoptic_id = intersection_id // self._combine_offset
          panoptic_id = intersection_id % self._combine_offset
          # NOTE(review): the relabeling is done in place, so pixels rewritten
          # by an earlier pair are compared against later pairs using their new
          # value -- confirm this is the intended matching behavior.
          next_panoptic[next_panoptic == panoptic_id] = last_panoptic_id
      # Adjust the IDs for the new instances in next_panoptic.
      self._max_instance_id = max(self._max_instance_id,
                                  np.max(panoptic % self._label_divisor))
      next_panoptic_cls = next_panoptic // self._label_divisor
      next_panoptic_ins = next_panoptic % self._label_divisor
      # Shift the new instances past every instance ID used so far.
      next_panoptic_ins[next_new_mask] = (
          next_panoptic_ins[next_new_mask] - self._overlap_offset
          + self._max_instance_id)
      next_panoptic = (
          next_panoptic_cls * self._label_divisor + next_panoptic_ins)
      # For the first pair, the prediction of the first frame is stored as well.
      if not self._stitched_panoptic:
        self._stitched_panoptic.append(copy.deepcopy(panoptic))
      self._stitched_panoptic.append(copy.deepcopy(next_panoptic))
      self._max_instance_id = max(self._max_instance_id,
                                  np.max(next_panoptic % self._label_divisor))
      self._last_panoptic = copy.deepcopy(next_panoptic)

  def results(self):
    """Returns the sequence inference results.

    Returns:
      A tuple (depth_preds, stitched_panoptic) holding the two lists filled by
      the latest call to `infer`.
    """
    return self._depth_preds, self._stitched_panoptic

# A Sequence Example on Cityscapes-DVPS

In [5]:
#@title Download pre-trained checkpoint.
model_name = 'resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model'
model_dir = tempfile.mkdtemp()
download_path = os.path.join(model_dir, 'model.tar.gz')
urllib.request.urlretrieve(MODEL_URL, download_path)
!tar -xzvf {download_path} -C {model_dir}
model_path = os.path.join(model_dir, model_name, 'exports')

resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/saved_model.pb
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports/
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports/assets/
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports/saved_model.pb
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports/variables/
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports/variables/variables.data-00000-of-00001
resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports/variables/variables.index


In [6]:
print(model_path)
#@title Run Inference on Examples from Cityscapes-DVPS
vip_deeplab = ViPDeepLab(model_path=model_path, label_divisor=LABEL_DIVISOR)
# Only the first three files (in sorted order) matching the pattern are used.
filenames = sorted(tf.io.gfile.glob(SEQUENCE_PATTERN))[0:3]
inputs = []
for filename in filenames:
    print(filename)
    RGBA = tf.image.decode_png(tf.io.read_file(filename))
    # Keep the first three channels only (presumably to drop an alpha channel;
    # verify against the input images).
    RGB = RGBA[:, :, :3]
    inputs.append(RGB)
# `infer` works on consecutive frame pairs, so the last frame is duplicated to
# give it a pair of its own. Raises IndexError if no file matched the pattern.
inputs.append(inputs[-1])
vip_deeplab.infer(inputs)
depth_preds, stitched_panoptic = vip_deeplab.results()

/tmp/tmp6nhaj7yv/resnet50_beta_os32_vip_deeplab_cityscapes_dvps_train_saved_model/exports


2023-04-30 13:54:31.832876: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-30 13:54:32.435446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/ros/foxy/opt/yaml_cpp_vendor/lib:/opt/ros/foxy/opt/rviz_ogre_vendor/lib:/opt/ros/foxy/lib/x86_64-linux-gnu:/opt/ros/foxy/lib:/usr/local/cuda/lib64
2023-04-30 13:54:32.435506: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skip





































































./data/LA_1.png
./data/LA_2.png
input_array.shape (1080, 1920, 3)
next_input_array.shape (1080, 1920, 3)
(1080, 1920, 6)
input_array.shape (1080, 1920, 3)
next_input_array.shape (1080, 1920, 3)
(1080, 1920, 6)


In [None]:
#@title Visualization Utilities

# Immutable record bundling the dataset properties needed for visualization.
DatasetInfo = collections.namedtuple(
    'DatasetInfo',
    ['num_classes', 'label_divisor', 'thing_list', 'colormap', 'class_names'])


def _cityscapes_label_colormap():
  """Creates a label colormap used in CITYSCAPES segmentation benchmark.

  See more about CITYSCAPES dataset at https://www.cityscapes-dataset.com/
  M. Cordts, et al. "The Cityscapes Dataset for Semantic Urban Scene Understanding." CVPR. 2016.

  Returns:
    A 2-D numpy array with each row being mapped RGB color (in uint8 range).
  """
  colormap = np.zeros((256, 3), dtype=np.uint8)
  colormap[0] = [128, 64, 128]
  colormap[1] = [244, 35, 232]
  colormap[2] = [70, 70, 70]
  colormap[3] = [102, 102, 156]
  colormap[4] = [190, 153, 153]
  colormap[5] = [153, 153, 153]
  colormap[6] = [250, 170, 30]
  colormap[7] = [220, 220, 0]
  colormap[8] = [107, 142, 35]
  colormap[9] = [152, 251, 152]
  colormap[10] = [70, 130, 180]
  colormap[11] = [220, 20, 60]
  colormap[12] = [255, 0, 0]
  colormap[13] = [0, 0, 142]
  colormap[14] = [0, 0, 70]
  colormap[15] = [0, 60, 100]
  colormap[16] = [0, 80, 100]
  colormap[17] = [0, 0, 230]
  colormap[18] = [119, 11, 32]
  return colormap


def _cityscapes_class_names():
  return ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
          'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
          'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
          'bicycle')


def cityscapes_dataset_information():
  """Assembles the DatasetInfo record describing Cityscapes.

  Returns:
    A DatasetInfo with 19 classes, a label divisor of 1000, class IDs 11..18
    as the `thing` classes, and the Cityscapes colormap and class names.
  """
  label_colors = _cityscapes_label_colormap()
  names = _cityscapes_class_names()
  thing_ids = tuple(range(11, 19))
  return DatasetInfo(
      num_classes=19,
      label_divisor=1000,
      thing_list=thing_ids,
      colormap=label_colors,
      class_names=names)


def perturb_color(color, noise, used_colors, max_trials=50, random_state=None):
  """Perturbs the color with some noise.

  Each trial adds an independent random integer offset in [-noise, noise] to
  every channel and clips the result to [0, 255]. If `used_colors` is not None,
  the first perturbed color that is not yet in it is recorded there and
  returned. If `used_colors` is None, no uniqueness check is done and the first
  perturbed color is returned.

  Args:
    color: A numpy array with three elements [R, G, B].
    noise: Integer, specifying the amount of perturbing noise (in uint8 range).
    used_colors: A set of color tuples used to keep track of used colors, or
      None to skip the uniqueness check. Updated in place when a new color is
      found.
    max_trials: An integer, maximum trials to generate random color.
    random_state: An optional np.random.RandomState. If passed, will be used to
      generate random numbers.

  Returns:
    A perturbed color that has not appeared in used_colors. When no unused
    color is found within `max_trials` trials, a message is printed and the
    last candidate (a duplicate) is returned; with a non-positive `max_trials`
    that candidate is the clipped, unperturbed input color.
  """
  if random_state is None:
    random_state = np.random

  # Fallback candidate so that a non-positive `max_trials` still yields a valid
  # color instead of failing on an unbound local variable.
  random_color = np.clip(color, 0, 255)

  for _ in range(max_trials):
    random_color = color + random_state.randint(
        low=-noise, high=noise + 1, size=3)
    random_color = np.clip(random_color, 0, 255)

    if used_colors is None:
      # Nothing to de-duplicate against: accept the first perturbed color.
      return random_color

    if tuple(random_color) not in used_colors:
      used_colors.add(tuple(random_color))
      return random_color

  print('Max trial reached and duplicate color will be used. Please consider '
        'increase noise in `perturb_color()`.')
  return random_color


def color_panoptic_map(panoptic_prediction,
                       dataset_info,
                       perturb_noise,
                       used_colors,
                       color_mapping):
  """Turns a panoptic prediction into an RGB visualization.

  `Stuff` classes are painted with their colormap entry. Every instance of a
  `thing` class gets a perturbed variant of its class color; a panoptic ID that
  is already present in `color_mapping` reuses the color stored there.

  Args:
    panoptic_prediction: A 2D numpy array, panoptic prediction from deeplab
      model.
    dataset_info: A DatasetInfo object, dataset associated to the model.
    perturb_noise: Integer, the amount of noise (in uint8 range) added to each
      instance of the same semantic class.
    used_colors: A mapping from semantic ID to the set of colors already used
      for that class (e.g. a `collections.defaultdict(set)`). Updated in place.
    color_mapping: A dict mapping existing panoptic IDs to their colors.
      Updated in place with newly assigned colors.

  Returns:
    colored_panoptic_map: A 3D numpy array with last dimension of 3, colored
      panoptic prediction map.

  Raises:
    ValueError: If `panoptic_prediction` is not 2-D.
  """
  if panoptic_prediction.ndim != 2:
    raise ValueError('Expect 2-D panoptic prediction. Got {}'.format(
        panoptic_prediction.shape))

  divisor = dataset_info.label_divisor
  palette = dataset_info.colormap
  class_ids = panoptic_prediction // divisor
  instance_ids = panoptic_prediction % divisor
  canvas = np.zeros(panoptic_prediction.shape + (3,), dtype=np.uint8)

  # Use a fixed seed to reproduce the same visualization.
  rng = np.random.RandomState(0)

  for class_id in np.unique(class_ids):
    class_mask = class_ids == class_id

    if class_id not in dataset_info.thing_list:
      # `stuff` class: paint with the predefined semantic color.
      canvas[class_mask] = palette[class_id]
      used_colors[class_id].add(tuple(palette[class_id]))
      continue

    # `thing` class: each instance gets its own slightly perturbed version of
    # the predefined semantic color.
    for instance_id in np.unique(instance_ids[class_mask]):
      pixels = class_mask & (instance_ids == instance_id)
      panoptic_id = class_id * divisor + instance_id
      if panoptic_id in color_mapping:
        canvas[pixels] = color_mapping[panoptic_id]
        continue
      new_color = perturb_color(
          palette[class_id],
          perturb_noise,
          used_colors[class_id],
          random_state=rng)
      canvas[pixels] = new_color
      color_mapping[panoptic_id] = new_color
  return canvas


In [None]:
#@title Visualize the Predictions
# Both containers are shared across all frames so that a panoptic ID keeps the
# color it was given the first time it appeared.
used_colors = collections.defaultdict(set)
color_mapping = dict()
# One figure per input file; the duplicated frame appended to `inputs` for
# inference is not shown.
for i in range(len(filenames)):
  fig, ax = plt.subplots(1, 3, figsize=(18, 6))
  ax[0].title.set_text('Input Image')
  ax[0].imshow(np.squeeze(inputs[i]))
  ax[1].title.set_text('Depth')
  ax[1].imshow(np.squeeze(depth_preds[i]))
  panoptic = stitched_panoptic[i]
  ax[2].title.set_text('Video Panoptic Segmentation')
  # 60 is the per-channel noise used to tell instances of one class apart.
  panoptic_map = color_panoptic_map(
      np.squeeze(panoptic), cityscapes_dataset_information(), 60, used_colors,
      color_mapping)
  ax[2].imshow(panoptic_map)