In [1]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU


def restart_kernel():
  """
  Hard-reset the runtime.

  Runs `kill -9 -1`, i.e. sends SIGKILL to every process the current
  user is allowed to signal, which takes the kernel down with them.
  All variables and the whole workspace are lost; any downloaded files
  or cloned repos are cleared off as well.
  """
  kill_everything_cmd = 'kill -9 -1'
  os.system(kill_everything_cmd)
  
def check_available_memory():
  """
  Prints a summary of both the general and GPU RAM status.

  The first line reports the free system RAM and the resident size of
  the current process. The second line reports the memory figures of the
  first GPU returned by GPUtil; when GPUtil reports no GPU (one is not
  guaranteed on Colab) a short notice is printed instead of failing with
  an IndexError.
  """
  gpus = GPU.getGPUs()

  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " +
        humanize.naturalsize(psutil.virtual_memory().available),
        " | Proc size: " + humanize.naturalsize(process.memory_info().rss))

  if not gpus:
    print("GPU RAM Free: no GPU detected")
    return

  # Only the first GPU is reported (Colab provides a single GPU).
  gpu = gpus[0]
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | "
        "Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed,
                                 gpu.memoryUtil*100, gpu.memoryTotal))
  
def refresh_repo():
  """
  Refreshes the local copy of the Mask R-CNN fork.

  - If a `Mask_RCNN` directory exists in the current working directory,
    pulls `origin master` inside it.
  - Otherwise, if the current directory is already inside a git work
    tree (e.g. this is re-run after changing into the repo), pulls
    `origin master` in place.
  - Otherwise clones the fork into the current directory.
  """
  repo_dir = 'Mask_RCNN'
  if os.path.isdir(repo_dir):
    # `git -C` runs the pull inside the repo. A separate
    # os.system('cd ...') would only change directory in a throwaway
    # subshell and leave this process where it was.
    os.system('git -C {} pull origin master'.format(repo_dir))
  elif os.system('git rev-parse --is-inside-work-tree > /dev/null 2>&1') == 0:
    os.system('git pull origin master')
  else:
    os.system('git clone https://github.com/JayadeepSasikumar/Mask_RCNN.git')

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Running setup.py bdist_wheel for gputil ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


## Cloning into (or taking a pull of) the Mask R-CNN fork

In [0]:
# Clone the Mask R-CNN fork, or pull from origin if it is already present.
refresh_repo()

In [3]:
cd /content/Mask_RCNN/samples/davis

/content/Mask_RCNN/samples/davis


In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client from the
# application-default credentials. `drive` is the module-level client
# that upload_file() and download_file() rely on.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

def upload_file(file_path, target_file_name):
  """
  Push a local file to Google Drive through the global `drive` client.

  Inputs -
    file_path - str, location of the local file whose contents are
      uploaded.
    target_file_name - str, the title given to the file on
      Google Drive.

  Returns -
    uploaded_file_id - str, the id Google Drive assigned to the
      uploaded file.
  """
  drive_file = drive.CreateFile({'title': target_file_name})
  drive_file.SetContentFile(file_path)
  drive_file.Upload()

  # The id is only read after Upload() has been called.
  uploaded_file_id = drive_file.get('id')
  message = 'Uploaded file with ID {}'.format(uploaded_file_id)
  print(message)
  return uploaded_file_id


def download_file(file_path, file_id):
  """
  Fetch a Google Drive file through the global `drive` client and
  write it to local storage.

  Inputs -
    file_path - str, the local path at which the downloaded
      contents are saved.
    file_id - str, the id Google Drive assigned to the file.
  """
  drive.CreateFile({'id': file_id}).GetContentFile(file_path)

# Part 1 - Training the model

## 1. Loading the required imports and setting the constants

In [0]:
import os
import sys
import random
import math
import re
import time
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt

# Root directory of the project
ROOT_DIR = os.path.abspath("../../")

# Import Mask RCNN
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.elu_model
import mrcnn.model

from mrcnn import visualize
from mrcnn.model import log

%matplotlib inline 

# Directory to save logs and trained model
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Root of the DAVIS data. The dataset classes read their image/mask
# listing files (e.g. train.txt) from here and prefix every listed path
# with this directory.
DATA_DIR = os.path.join(ROOT_DIR, "davis_data")

# relu or elu, modellib would be changed based on this setting
ACTIVATION_UNIT = 'elu'

## 2. Choosing the proper model file according to the ACTIVATION_UNIT setting

In [0]:
# Bind `modellib` to the model implementation matching ACTIVATION_UNIT.
if ACTIVATION_UNIT == 'relu':
  modellib = mrcnn.model
  # Local path to trained weights file
  COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
  # Download COCO trained weights from Releases if needed
  if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)
elif ACTIVATION_UNIT == 'elu':
  # The ELU variant gets its initial weights from Google Drive in the
  # model-creation cell, so COCO_MODEL_PATH is not defined here.
  modellib = mrcnn.elu_model
else:
  # Fail early: silently falling back to the ELU model would only
  # surface later as a NameError on COCO_MODEL_PATH.
  raise ValueError(
      "ACTIVATION_UNIT must be 'relu' or 'elu', got {!r}".format(
          ACTIVATION_UNIT))

## 3. Overriding the default configurations for the DAVIS 2016 dataset.

In [0]:
class DAVISConfig(Config):
    """Configuration for training on the DAVIS 2016 dataset.
    Derives from the base Config class and overrides values specific
    to the DAVIS 2016 dataset.
    """
    # Give the configuration a recognizable name
    NAME = "davis"

    # Batch size is 1 (GPU_COUNT * IMAGES_PER_GPU).
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1  # because of the limited memory

    # Number of classes (including background)
    NUM_CLASSES = 1 + 1  # background + object

    # Limits of the small side and the large side of the input image;
    # together they determine the image shape. Both are 832 px here.
    IMAGE_MIN_DIM = 832
    IMAGE_MAX_DIM = 832
    IMAGE_RESIZE_MODE = "square"  # This is the default option as well

    # Anchor side lengths in pixels (four scales).
    # NOTE(review): upstream Mask R-CNN ships five anchor scales by
    # default - confirm this fork works with four.
    RPN_ANCHOR_SCALES = (32, 64, 128, 256)  # anchor side in pixels

    # Training ROIs per image. Kept low because each frame has few
    # objects; 32 is noted as an alternative value.
    TRAIN_ROIS_PER_IMAGE = 5  #32

    # Need not be the same size as the training dataset
    STEPS_PER_EPOCH = 500

    # Use larger numbers for higher confidence, smaller ones for faster
    # epochs.
    VALIDATION_STEPS = 356
    
# Configuration instance shared by the training cells.
config = DAVISConfig()

## 4. Extending the Dataset class to serve the DAVIS 2016 dataset.

Three methods need to be implemented for any custom dataset to work with Mask R-CNN -  
1. image_reference  
2. load_images  
3. load_mask

In [0]:
class DAVISDataset(utils.Dataset):
    """Encapsulates the DAVIS dataset.

    Every image is registered under the source "davis" with a single
    foreground class named "object".
    """

    def image_reference(self, image_id):
        """Return the image path for DAVIS entries.

        Entries from any other source are delegated to the base class.
        """
        info = self.image_info[image_id]
        if info["source"] == "davis":
            # load_images() never stores a "davis" key; the image path is
            # the identifying datum recorded for each entry.
            return info["path"]
        return super().image_reference(image_id)

    def load_images(self, mode='train'):
        """
        Registers the images listed in `<mode>.txt` under DATA_DIR.

        Each line is expected to hold an image path and a mask path
        separated by whitespace. Both are appended verbatim to DATA_DIR,
        so they must be written such that plain concatenation gives a
        valid path. Lines with fewer than two fields are skipped.

        Inputs -
            mode - str, name of the listing file without its extension
                (e.g. 'train', 'val' or 'test').
        """
        self.add_class('davis', 1, 'object')
        image_paths_file_path = os.path.join(DATA_DIR, mode + '.txt')
        with open(image_paths_file_path, 'r') as image_paths_file:
            for i, line in enumerate(image_paths_file):
                fields = line.split()
                if len(fields) < 2:
                    continue
                image_path = DATA_DIR + fields[0]
                mask_path = DATA_DIR + fields[1]
                pic_name = image_path.split('/')[-1]
                # The image id is the line index, so skipped lines leave
                # gaps in the ids.
                self.add_image("davis", image_id=i, path=image_path,
                               pic_name=pic_name, pic_class='object',
                               mask_path=mask_path)

    def load_mask(self, image_id):
        """Load the instance mask for the given image.

        The mask file is read with OpenCV and its first channel is kept
        as the single instance mask; non-zero pixels become True.

        Returns:
            masks: A bool array of shape [height, width, 1].
            class_ids: a 1D int32 array holding the class ID of the mask.

        Raises:
            IOError: if the mask file cannot be read.
        """
        info = self.image_info[image_id]
        mask_path = info['mask_path']
        pic_class = info['pic_class']
        mask = cv2.imread(mask_path)
        if mask is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError("Could not read mask image: {}".format(mask_path))
        mask = mask[:, :, 0:1]
        class_ids = np.array([self.class_names.index(pic_class)])
        # Builtin bool: the np.bool alias was removed from recent NumPy.
        return mask.astype(bool), class_ids.astype(np.int32)

## 5. Creating the training, validation and test datasets and preparing them.

In [0]:
def _make_davis_dataset(split):
  """Create a DAVISDataset for one listing file, load it and prepare it."""
  dataset = DAVISDataset()
  dataset.load_images(mode=split)
  dataset.prepare()
  return dataset


# One prepared dataset per split, built in the order train, val, test.
dataset_train = _make_davis_dataset('train')
dataset_val = _make_davis_dataset('val')
dataset_test = _make_davis_dataset('test')

## 6. Create a model and load the pre-trained weights for the backbone.

In [0]:
# Create model in training mode
model = modellib.MaskRCNN(mode="training", config=config,
                          model_dir=MODEL_DIR)

if ACTIVATION_UNIT == 'elu':
  model_google_drive_id = 'google_drive_id_of_the_elu_model'
  model_path = 'path/where/to/save/model'
  download_file(model_path, model_google_drive_id)
  model.load_weights(model_path, by_name=True,
                   exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", 
                            "mrcnn_bbox", "mrcnn_mask"])
else:
  model.load_weights(COCO_MODEL_PATH, by_name=True,
                       exclude=["mrcnn_class_logits", "mrcnn_bbox_fc", 
                                "mrcnn_bbox", "mrcnn_mask"])

## 7. Training the model on the DAVIS 2016 dataset

Training is done in 2 stages. The numbers of epochs below are exactly as suggested in coco.py and can be changed as needed.

**Stage 1** involves training the network head.  
**Stage 2** fine-tunes the entire network.  
  
After each epoch, the trained model is saved in MODEL_DIR. At the end of each training phase, the checkpoint from the epoch at which the validation error levels off (the "elbow") can be chosen as the model to go ahead with.

In [0]:
# Stage 1 - train with layers='heads' at the configured learning rate.
model.train(
    dataset_train,
    dataset_val,
    learning_rate=config.LEARNING_RATE,
    epochs=10,
    layers='heads',
)

In [0]:
# Path of the Stage 1 checkpoint selected as described above.
best_model_path = 'path/to/the/best/model'

# Rebuild a training-mode model and restore the selected checkpoint into it.
model = modellib.MaskRCNN(model_dir=MODEL_DIR,
                          config=config,
                          mode="training")
model.load_weights(best_model_path, by_name=True)

In [0]:
# Stage 2 - training the entire model at a third of the configured
# learning rate.
# `model` is the training-mode model rebuilt from the chosen Stage 1
# checkpoint in the previous cell; no `best_model` name is defined.
# NOTE(review): in upstream Mask R-CNN, `epochs` is the cumulative epoch
# to train up to rather than a per-call count - confirm for this fork and
# raise it if the loaded checkpoint already reached epoch 10.
model.train(dataset_train, dataset_val,
            learning_rate=config.LEARNING_RATE / 3,
            epochs=10,
            layers="all")

In [0]:
# Choose the model with the least validation loss in the second training phase.
# Its weights are loaded into the inference-mode base model.
trained_model_path = 'path/to/model/with/least/validation/loss'

## 8. Create a base model for inference

1. Create a new config for inference.  
2. Create a new model in inference mode and load it up with the weights of the model chosen as the best one after the second phase of training.
3. This model would act as the base model on which one-shot fine-tuning will be applied.

In [0]:
class InferenceConfig(DAVISConfig):
    """DAVIS settings for inference: one GPU, one image per GPU, and a
    minimum detection confidence of 0.8."""
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    DETECTION_MIN_CONFIDENCE = 0.8


inference_config = InferenceConfig()

# Base inference-mode model, restored from the checkpoint chosen after
# the second training phase.
trained_model = modellib.MaskRCNN(
    mode="inference",
    config=inference_config,
    model_dir=MODEL_DIR,
)
trained_model.load_weights(trained_model_path, by_name=True)

# Part 2 - One-shot fine-tuning

## 9. Create a new Config

1. An epoch would contain only one image - the idea is to fine-tune the entire model with the first frame of a video sequence, and its ground-truth.

In [0]:
# Name of the video sequence currently being handled; updated by the
# prediction loop as it moves from one sequence to the next.
sequence_name = ""


class SpecificConfig(DAVISConfig):
    """Configuration for one-shot fine-tuning on a single DAVIS 2016
    sequence - SEQUENCE SPECIFIC VERSION.

    Inherits the GPU, class, image-size, anchor and ROI settings from
    DAVISConfig (they were previously repeated here value for value) and
    overrides only what differs for fine-tuning on one frame.
    """
    # Give the configuration a recognizable name
    NAME = "specific"

    # An epoch is a single step: the model is fine-tuned on one image.
    STEPS_PER_EPOCH = 1

    # A single validation step, since the epoch is a single step.
    VALIDATION_STEPS = 1


sp_config = SpecificConfig()

## 10. Extending the Dataset class to serve a single video sequence from DAVIS 2016 dataset.

1. A global variable `sequence_name` records which of the test sequences is being predicted for at any moment during execution.  
2. When the mode is 'train', only the first frame (`00000`) of the video sequence is returned as the training dataset.  
3. When the mode is 'test', all the remaining frames of the sequence (every frame except `00000`) are returned.  
4. For any other mode (e.g. 'val'), only the second frame (`00001`) of the sequence is returned.  


In [0]:
sequence_name = ''

class SpecificDataset(utils.Dataset):
    """Encapsulates a single DAVIS sequence taken from `test.txt`.

    The sequence is selected through the module-level `sequence_name`;
    an empty `sequence_name` selects every sequence. Depending on the
    mode, the dataset holds the first frame only ('train'), every frame
    except the first ('test'), or the second frame only (any other mode,
    e.g. 'val').
    """

    def image_reference(self, image_id):
        """Return the image path for DAVIS entries.

        Entries from any other source are delegated to the base class.
        """
        info = self.image_info[image_id]
        if info["source"] == "davis":
            # load_images() never stores a "davis" key; the image path is
            # the identifying datum recorded for each entry.
            return info["path"]
        return super().image_reference(image_id)

    @staticmethod
    def _frame_selected(listed_path, mode):
        """Tell whether a listed image path belongs in the dataset for `mode`.

        The sequence is the parent folder of the file and the frame is the
        file name without its extension. Both are compared for equality,
        so e.g. 'dog' does not also select a 'dog-agility' folder.
        """
        parts = listed_path.split('/')
        folder = parts[-2] if len(parts) > 1 else ''
        frame = os.path.splitext(parts[-1])[0]
        if sequence_name and folder != sequence_name:
            return False
        if mode == 'train':
            return frame == '00000'
        if mode == 'test':
            return frame != '00000'
        return frame == '00001'

    def load_images(self, mode='train'):
        """
        Registers the frames of the current sequence listed in `test.txt`
        under DATA_DIR.

        Each line is expected to hold an image path and a mask path
        separated by whitespace. Both are appended verbatim to DATA_DIR.
        Lines with fewer than two fields are skipped.

        Inputs -
            mode - str, 'train' for frame 00000 only, 'test' for every
                frame except 00000, anything else for frame 00001 only.
        """
        self.add_class('davis', 1, 'object')
        image_paths_file_path = os.path.join(DATA_DIR, 'test.txt')
        with open(image_paths_file_path, 'r') as image_paths_file:
            for i, line in enumerate(image_paths_file):
                fields = line.split()
                if len(fields) < 2:
                    continue
                if not self._frame_selected(fields[0], mode):
                    continue
                image_path = DATA_DIR + fields[0]
                mask_path = DATA_DIR + fields[1]
                pic_name = image_path.split('/')[-1]
                self.add_image("davis", image_id=i, path=image_path,
                               pic_name=pic_name, pic_class='object',
                               mask_path=mask_path)

    def load_mask(self, image_id):
        """Load the instance mask for the given image.

        The mask file is read with OpenCV and its first channel is kept
        as the single instance mask; non-zero pixels become True.

        Returns:
            masks: A bool array of shape [height, width, 1].
            class_ids: a 1D int32 array holding the class ID of the mask.

        Raises:
            IOError: if the mask file cannot be read.
        """
        info = self.image_info[image_id]
        mask_path = info['mask_path']
        pic_class = info['pic_class']
        mask = cv2.imread(mask_path)
        if mask is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError("Could not read mask image: {}".format(mask_path))
        mask = mask[:, :, 0:1]
        class_ids = np.array([self.class_names.index(pic_class)])
        # Builtin bool: the np.bool alias was removed from recent NumPy.
        return mask.astype(bool), class_ids.astype(np.int32)

In [0]:
def get_sp_datasets():
  """
  Creates and prepares the training and validation datasets for the
  video sequence that is currently being predicted for. The sequence
  is understood by the SpecificDataset class from the global
  variable maintained, sequence_name.

  No test dataset is built: it was never returned, so building it only
  cost an extra pass over the listing file on every call.

  Returns -
    (sp_dataset_train, sp_dataset_val) - the prepared SpecificDataset
      objects loaded in 'train' and 'val' mode respectively.
  """
  # Training dataset
  sp_dataset_train = SpecificDataset()
  sp_dataset_train.load_images(mode='train')
  sp_dataset_train.prepare()

  # Validation dataset
  sp_dataset_val = SpecificDataset()
  sp_dataset_val.load_images(mode='val')
  sp_dataset_val.prepare()

  return sp_dataset_train, sp_dataset_val

In [0]:
import glob
import os

def get_specific_model():
  """
  Returns the model fine-tuned on the first frame of the
  current sequence being predicted for.

  A fresh training-mode model is built with sp_config and initialised
  from trained_model_path, fine-tuned for one epoch on the datasets
  returned by get_sp_datasets(), and the newest weights file under
  MODEL_DIR is then loaded into a new inference-mode model.

  Returns -
    sp_inf_model - a Mask R-CNN model fine-tuned on the
      first frame of the current sequence being predicted
      for.

  Raises -
    IOError - if no weights file is found under MODEL_DIR after training.
  """
  # A shallow copy of the inference-mode `trained_model` would share all
  # of its underlying objects, and it is not a training-mode model.
  # NOTE(review): upstream Mask R-CNN's train() requires a model created
  # in training mode - confirm for this fork.
  specific_model = modellib.MaskRCNN(mode="training", config=sp_config,
                                     model_dir=MODEL_DIR)
  specific_model.load_weights(trained_model_path, by_name=True)
  # NOTE(review): upstream load_weights() restores the epoch counter from
  # the checkpoint file name and train() treats `epochs` as a cumulative
  # target, which would turn `epochs=1` into a no-op. Resetting the
  # counter makes the call below train exactly one epoch either way.
  specific_model.epoch = 0

  sp_dataset_train, sp_dataset_val = get_sp_datasets()
  specific_model.train(sp_dataset_train, sp_dataset_val,
                       learning_rate=sp_config.LEARNING_RATE / 3,
                       epochs=1,
                       layers="all")

  # Only weights files are candidates; other files written into the log
  # directories could otherwise be picked as the newest entry.
  checkpoint_paths = glob.glob(os.path.join(MODEL_DIR, '*', '*.h5'))
  if not checkpoint_paths:
    raise IOError("No .h5 checkpoint found under {}".format(MODEL_DIR))
  latest_model_path = max(checkpoint_paths, key=os.path.getctime)
  print (latest_model_path)

  sp_inf_model = modellib.MaskRCNN(mode="inference",
                                   config=inference_config,
                                   model_dir=MODEL_DIR)
  sp_inf_model.load_weights(latest_model_path, by_name=True)
  return sp_inf_model

## 11. Use fine-tuned models for prediction

1. Iterate through the test frames, and get the fine-tuned model for the specific sequence.  
2. Predict for the frame using the respective model.  
3. Store the results in `test_results_dict`. This would be used for evaluation.  
  
PS - the RAM constraints could cause problems with storing up all the different models. If faced with such a problem, the prediction could be done in batches, the `test_results_dict` for each batch pickled and uploaded to Google Drive. These pickled dicts can be later combined during evaluation.

In [0]:
from copy import copy
test_sequence_names = ['car-roundabout', 'soapbox', 'goat', 'blackswan',
                       'cows', 'kite-surf', 'dance-twirl', 'breakdance',
                       'horsejump-high', 'paragliding-launch',
                       'scooter-black', 'camel', 'libby', 'parkour',
                       'drift-straight', 'drift-chicane', 'motocross-jump',
                       'dog', 'car-shadow', 'bmx-trees']

# Sequence the current `specific_model` was fine-tuned for.
sequence_name = ""
# Maps each test image id to the detection result for that frame.
test_results_dict = {}


# dataset_train and dataset_val can also be predicted for the same way and
# saved and pickled and uploaded to Google Drive if needed.
for image_id in dataset_test.image_ids:
    # The sequence is the folder that holds the frame's mask.
    mask_path = dataset_test.image_info[image_id]['mask_path']
    folder_name = mask_path.split('/')[-2]
    if sequence_name != folder_name:
      # Publish the new sequence BEFORE building its model:
      # get_specific_model() selects its training frame through the
      # global `sequence_name`, so updating it afterwards would fine-tune
      # each model on the previous sequence.
      # A new model is built on every change of folder, so this relies on
      # the frames of a sequence being contiguous in the test listing.
      sequence_name = folder_name
      specific_model = get_specific_model()
    # Load image and ground truth data
    image, image_meta, gt_class_id, gt_bbox, gt_mask =\
        modellib.load_image_gt(dataset_test, inference_config,
                               image_id, use_mini_mask=False)
    # Run object detection
    results = specific_model.detect([image], verbose=0)
    test_results_dict[image_id] = results[0]
    print ("\r{}".format(image_id), end="")

## 12. Pickle `test_results_dict` and upload it to Google Drive

In [0]:
import pickle

# Serialise the per-image detection results to a local file ...
test_results_path = 'path/to/save/test/results'
with open(test_results_path, 'wb') as results_file:
  pickle.dump(test_results_dict, results_file)

# ... then push that file to Google Drive under a fixed title.
upload_file(test_results_path, 'test_results_dict.pickle')