# All imports

In [1]:
# import some common libraries
import os, json, cv2, random, shutil, time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from tqdm import tqdm

%matplotlib inline

In [2]:
# setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor, DefaultTrainer
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# constants that control the experiment
TRAIN_SPLIT = 0.7
RNG_SEED = 117

# locations of the dataset directory and the gold standard annotations in it
datasets_dir = os.path.join('..', 'datasets')
json_file = os.path.join(datasets_dir, 'gold_standard_complete.json')

# the gold standard annotations; the functions below look them up by image filename
with open(json_file) as f:
    data_json = json.load(f)

# Load in our own dataset

In [4]:
# load the functions to create the train and test split
# (split_data, called below, is not defined in this notebook, so it has to come from this run)
%run Data-split-functions.ipynb

# create the train and test split:
#   - 1st argument: TRAIN_SPLIT (presumably the fraction of the data used for training; verify in Data-split-functions)
#   - 2nd argument: force_new_split, False so that this phase is skipped if the split has been made before
#   - 3rd argument: extended, False to skip the 'no_annotation' pages
split_data(TRAIN_SPLIT, False, False)

In [5]:
def _shape_to_annotation(shape):
    '''
    Convert the shape attributes of one annotated region to a detectron2 annotation
    @param  dict         The 'shape_attributes' of a region: 'rect' shapes need x, y, width and height,
                         'polygon' shapes need all_points_x and all_points_y
    @return dict|None    The annotation (bbox, bbox_mode, segmentation, category_id), or None for
                         unknown shape types and for polygons with fewer than three points
    '''

    # handle rectangle polygons
    if shape['name'] == 'rect':
        x, y = shape['x'], shape['y']
        w, h = shape['width'], shape['height']

        # the four corners in the flat [x1, y1, x2, y2, ..., xn, yn]
        # layout that the segmentation property expects
        segment = [x, y, x + w, y, x + w, y + h, x, y + h]

        # create a bounding box for this segment
        bbox = [x, y, w, h]
        bbox_mode = BoxMode.XYWH_ABS

    # handle generic polygons
    elif shape['name'] == 'polygon':
        px = shape["all_points_x"]
        py = shape["all_points_y"]

        # a polygon needs at least three points; fewer can't be turned
        # into a mask (and min/max below would fail on an empty list)
        if min(len(px), len(py)) < 3: return None

        # flat [x1, y1, x2, y2, ..., xn, yn] list; the 0.5 offset is kept from the
        # original implementation (presumably to address the pixel centres)
        segment = [coordinate + 0.5 for point in zip(px, py) for coordinate in point]

        # create the bounding box for this segment
        bbox = [min(px), min(py), max(px), max(py)]
        bbox_mode = BoxMode.XYXY_ABS

    # skip unknown polygon types
    else: return None

    # the segment specification; all regions belong to the single class 0
    return {
        "bbox": bbox,
        "bbox_mode": bbox_mode,
        "segmentation": [segment],
        "category_id": 0,
    }


def get_redacted_dicts(img_dir):
    '''
    Get the annotations of the images in the provided directory in the format DatasetCatalog expects.
    Files that can't be read as an image and files without an entry in the global data_json are skipped.
    @param  string    The path of the directory that holds the images
    @return list      One record per annotated image. Every record has the following properties:
                          - file_name:string     The path to the image (img_dir joined with the filename)
                          - image_id:int         The position of the file in the sorted directory listing
                          - height:int           The height of the image
                          - width:int            The width of the image
                          - annotations:list     One dict per supported region of the image, with:
                              - bbox_mode:BoxMode    BoxMode.XYWH_ABS for rectangles, BoxMode.XYXY_ABS for polygons
                              - bbox:list            The values for the bounding box depending on the bbox_mode
                              - segmentation:list    A list holding the flat [x1, y1, ..., xn, yn] outline of the region
                              - category_id:int      The id of the class the instance belongs to (always 0)
    '''

    # the records of all images in the directory
    dataset_dicts = []

    # os.listdir returns the files in arbitrary order; sorting keeps the
    # image ids and the order of the records the same between runs
    filenames = sorted(os.listdir(img_dir))

    # iterate over all files/images in the provided directory; tqdm wraps
    # the list itself so that it knows the total and can show the progress
    for idx, filename in enumerate(tqdm(filenames)):

        # skip this file if we don't have annotations for it; checking this
        # first avoids decoding images that would be thrown away anyway
        if filename not in data_json: continue

        # get the image
        img_path = os.path.join(img_dir, filename)
        image = cv2.imread(img_path)

        # skip this image if we can't load the file
        if image is None: continue

        # get the height and width of the image
        height, width = image.shape[:2]

        # convert every supported region of this file to an annotation
        annotations = []
        for region in data_json[filename]['regions']:
            annotation = _shape_to_annotation(region['shape_attributes'])
            if annotation is not None: annotations.append(annotation)

        # add the record to the dataset
        dataset_dicts.append({
            "file_name": img_path,
            "image_id": idx,
            "height": height,
            "width": width,
            "annotations": annotations,
        })
    return dataset_dicts

In [6]:
# create the dataset catalog and metadata for the train data; the guard keeps this
# cell re-runnable, because registering a name that already exists raises an error
if 'redacted_train' not in DatasetCatalog.list():
    DatasetCatalog.register('redacted_train', lambda: get_redacted_dicts(os.path.join(datasets_dir, 'train')))
metadata = MetadataCatalog.get('redacted_train').set(thing_classes=["redacted"])

# this property maps the category ids used in the dataset to contiguous ids
# (int -> int); our only category already has id 0, so it maps onto itself
metadata.set(thing_dataset_id_to_contiguous_id = {0: 0})
metadata.set(stuff_dataset_id_to_contiguous_id = {})
redacted_metadata = MetadataCatalog.get('redacted_train')

# Create the config for the model
This is a reference to the default config with all available options: https://detectron2.readthedocs.io/en/latest/modules/config.html#yaml-config-references

In [7]:
# initial config for the detectron2 model: start from the defaults and
# overlay the model zoo config of the Mask R-CNN X-101-32x8d-FPN (3x) model
cfg = get_cfg() # the default config
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml")) # the config of the coco dataset
cfg.SEED = RNG_SEED  # fixed seed (RNG_SEED from the globals cell)

# data config
cfg.DATASETS.TRAIN = ("redacted_train",) # the dataset registered in the cell above
cfg.DATASETS.TEST = () # no test dataset is configured here; the evaluation is done by our own functions further down
cfg.DATALOADER.NUM_WORKERS = 2 # the number of data loading workers; this will alter the speed of the training, my machine could only handle 2
cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS = False # we also want the model to train on documents without redactions
cfg.INPUT.RANDOM_FLIP = "none" # we don't add any random flips as data augmentation

# model config
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml")  # initial weights from model zoo
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 256   # The "RoIHead batch size". The default is 512, but a smaller size is faster
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class* ('redacted')
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5 # inference score threshold (set again before the predictor is created)
# *NOTE: this setting is the number of classes itself; a few popular unofficial tutorials incorrectly use num_classes+1 here.

# create the output dir (the evaluation cells load model_final.pth from it)
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

In [8]:
# solver config: these are the hyperparameters that we need to tune with a grid search
cfg.SOLVER.IMS_PER_BATCH = 2  # This is the real "batch size" commonly known to deep learning people
cfg.SOLVER.BASE_LR = 0.001  # the learning rate; the training log shows it ramping up to this value over roughly the first 1000 iterations
cfg.SOLVER.MAX_ITER = 5000    # the number of training iterations
cfg.SOLVER.STEPS = []        # do not decay learning rate

# Train the model with our data

In [9]:
# train the model with the config built in the cells above
trainer = DefaultTrainer(cfg) 
# resume=False: don't continue from an earlier run; the log below shows
# the initial weights being loaded from the model zoo url in cfg.MODEL.WEIGHTS
trainer.resume_or_load(resume=False)
# this is the long-running step (the log below reports an eta of about 1.5 hours at the start)
trainer.train()

[32m[01/23 08:36:25 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

664it [00:42, 15.70it/s]

[32m[01/23 08:37:08 d2.data.build]: [0mDistribution of instances among all 1 categories:
[36m|  category  | #instances   |
|:----------:|:-------------|
|  redacted  | 7765         |
|            |              |[0m
[32m[01/23 08:37:08 d2.data.dataset_mapper]: [0m[DatasetMapper] Augmentations used in training: [ResizeShortestEdge(short_edge_length=(640, 672, 704, 736, 768, 800), max_size=1333, sample_style='choice')]
[32m[01/23 08:37:08 d2.data.build]: [0mUsing training sampler TrainingSampler
[32m[01/23 08:37:08 d2.data.common]: [0mSerializing the dataset using: <class 'detectron2.data.common._TorchSerializedList'>
[32m[01/23 08:37:08 d2.data.common]: [0mSerializing 664 elements to byte tensors and concatenating them all ...
[32m[01/23 08:37:08 d2.data.common]: [0mSerialized dataset takes 0.61 MiB
[32m[01/23 08:37:08 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn


Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (2, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (2,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (4, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (4,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.mask_head.predictor.weight' to the model due to incompatible shapes: (80, 256, 1, 1) in the checkpoint but (1, 256, 1, 1) i

[32m[01/23 08:37:09 d2.engine.train_loop]: [0mStarting training from iteration 0


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[32m[01/23 08:37:36 d2.utils.events]: [0m eta: 1:31:41  iter: 19  total_loss: 5.029  loss_cls: 0.5129  loss_box_reg: 0.275  loss_mask: 0.6933  loss_rpn_cls: 3.111  loss_rpn_loc: 0.2997    time: 1.0790  last_time: 1.1500  data_time: 0.1956  last_data_time: 0.0014   lr: 1.9981e-05  max_mem: 5533M
[32m[01/23 08:38:01 d2.utils.events]: [0m eta: 1:26:09  iter: 39  total_loss: 1.916  loss_cls: 0.4652  loss_box_reg: 0.3814  loss_mask: 0.6051  loss_rpn_cls: 0.2667  loss_rpn_loc: 0.1898    time: 1.0434  last_time: 0.9599  data_time: 0.0014  last_data_time: 0.0014   lr: 3.9961e-05  max_mem: 5533M
[32m[01/23 08:38:21 d2.utils.events]: [0m eta: 1:25:19  iter: 59  total_loss: 1.78  loss_cls: 0.362  loss_box_reg: 0.3742  loss_mask: 0.4648  loss_rpn_cls: 0.1986  loss_rpn_loc: 0.1823    time: 1.0271  last_time: 0.8165  data_time: 0.0013  last_data_time: 0.0014   lr: 5.9941e-05  max_mem: 5533M
[32m[01/23 08:38:42 d2.utils.events]: [0m eta: 1:24:06  iter: 79  total_loss: 1.707  loss_cls: 0.4017 

[32m[01/23 08:47:08 d2.utils.events]: [0m eta: 1:15:58  iter: 579  total_loss: 0.6715  loss_cls: 0.1313  loss_box_reg: 0.2485  loss_mask: 0.1526  loss_rpn_cls: 0.01876  loss_rpn_loc: 0.1308    time: 1.0143  last_time: 1.0925  data_time: 0.0013  last_data_time: 0.0011   lr: 0.00057942  max_mem: 5732M
[32m[01/23 08:47:28 d2.utils.events]: [0m eta: 1:15:23  iter: 599  total_loss: 0.6235  loss_cls: 0.1098  loss_box_reg: 0.2335  loss_mask: 0.1498  loss_rpn_cls: 0.01172  loss_rpn_loc: 0.09375    time: 1.0132  last_time: 0.9529  data_time: 0.0013  last_data_time: 0.0012   lr: 0.0005994  max_mem: 5732M
[32m[01/23 08:47:49 d2.utils.events]: [0m eta: 1:15:18  iter: 619  total_loss: 0.6943  loss_cls: 0.136  loss_box_reg: 0.2749  loss_mask: 0.1446  loss_rpn_cls: 0.01926  loss_rpn_loc: 0.1332    time: 1.0148  last_time: 1.1614  data_time: 0.0013  last_data_time: 0.0014   lr: 0.00061938  max_mem: 5732M
[32m[01/23 08:48:09 d2.utils.events]: [0m eta: 1:14:51  iter: 639  total_loss: 0.5827  los

[32m[01/23 08:56:42 d2.utils.events]: [0m eta: 1:06:45  iter: 1139  total_loss: 0.4899  loss_cls: 0.0889  loss_box_reg: 0.1828  loss_mask: 0.1384  loss_rpn_cls: 0.008418  loss_rpn_loc: 0.07131    time: 1.0195  last_time: 1.1685  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5938M
[32m[01/23 08:57:03 d2.utils.events]: [0m eta: 1:06:21  iter: 1159  total_loss: 0.5495  loss_cls: 0.09103  loss_box_reg: 0.1931  loss_mask: 0.1351  loss_rpn_cls: 0.0164  loss_rpn_loc: 0.1192    time: 1.0196  last_time: 0.9917  data_time: 0.0013  last_data_time: 0.0013   lr: 0.001  max_mem: 5938M
[32m[01/23 08:57:23 d2.utils.events]: [0m eta: 1:06:01  iter: 1179  total_loss: 0.6067  loss_cls: 0.08955  loss_box_reg: 0.2411  loss_mask: 0.1319  loss_rpn_cls: 0.01433  loss_rpn_loc: 0.1109    time: 1.0194  last_time: 1.0354  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5938M
[32m[01/23 08:57:43 d2.utils.events]: [0m eta: 1:05:38  iter: 1199  total_loss: 0.5156  loss_cls: 

[32m[01/23 09:06:15 d2.utils.events]: [0m eta: 0:57:26  iter: 1699  total_loss: 0.4145  loss_cls: 0.06122  loss_box_reg: 0.1662  loss_mask: 0.1206  loss_rpn_cls: 0.004448  loss_rpn_loc: 0.06602    time: 1.0206  last_time: 1.1446  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5940M
[32m[01/23 09:06:36 d2.utils.events]: [0m eta: 0:57:07  iter: 1719  total_loss: 0.5221  loss_cls: 0.07126  loss_box_reg: 0.1986  loss_mask: 0.1255  loss_rpn_cls: 0.007276  loss_rpn_loc: 0.09874    time: 1.0206  last_time: 0.9232  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5940M
[32m[01/23 09:06:56 d2.utils.events]: [0m eta: 0:56:41  iter: 1739  total_loss: 0.4802  loss_cls: 0.06875  loss_box_reg: 0.1991  loss_mask: 0.1201  loss_rpn_cls: 0.007559  loss_rpn_loc: 0.07586    time: 1.0206  last_time: 1.0062  data_time: 0.0013  last_data_time: 0.0013   lr: 0.001  max_mem: 5940M
[32m[01/23 09:07:17 d2.utils.events]: [0m eta: 0:56:26  iter: 1759  total_loss: 0.4746  loss

[32m[01/23 09:15:48 d2.utils.events]: [0m eta: 0:46:41  iter: 2259  total_loss: 0.5648  loss_cls: 0.07555  loss_box_reg: 0.2236  loss_mask: 0.1407  loss_rpn_cls: 0.007402  loss_rpn_loc: 0.09689    time: 1.0212  last_time: 0.9170  data_time: 0.0013  last_data_time: 0.0013   lr: 0.001  max_mem: 5940M
[32m[01/23 09:16:09 d2.utils.events]: [0m eta: 0:46:31  iter: 2279  total_loss: 0.4679  loss_cls: 0.06964  loss_box_reg: 0.1861  loss_mask: 0.1221  loss_rpn_cls: 0.003093  loss_rpn_loc: 0.07204    time: 1.0213  last_time: 0.9904  data_time: 0.0013  last_data_time: 0.0011   lr: 0.001  max_mem: 5940M
[32m[01/23 09:16:30 d2.utils.events]: [0m eta: 0:46:11  iter: 2299  total_loss: 0.4644  loss_cls: 0.06712  loss_box_reg: 0.1936  loss_mask: 0.1235  loss_rpn_cls: 0.005984  loss_rpn_loc: 0.08812    time: 1.0215  last_time: 0.9398  data_time: 0.0014  last_data_time: 0.0010   lr: 0.001  max_mem: 5940M
[32m[01/23 09:16:51 d2.utils.events]: [0m eta: 0:45:48  iter: 2319  total_loss: 0.3873  loss

[32m[01/23 09:25:26 d2.utils.events]: [0m eta: 0:37:47  iter: 2819  total_loss: 0.3745  loss_cls: 0.04876  loss_box_reg: 0.1495  loss_mask: 0.1219  loss_rpn_cls: 0.007273  loss_rpn_loc: 0.0525    time: 1.0232  last_time: 1.0679  data_time: 0.0014  last_data_time: 0.0014   lr: 0.001  max_mem: 5940M
[32m[01/23 09:25:48 d2.utils.events]: [0m eta: 0:37:35  iter: 2839  total_loss: 0.401  loss_cls: 0.05025  loss_box_reg: 0.1684  loss_mask: 0.1352  loss_rpn_cls: 0.002841  loss_rpn_loc: 0.06498    time: 1.0235  last_time: 0.8539  data_time: 0.0013  last_data_time: 0.0014   lr: 0.001  max_mem: 5940M
[32m[01/23 09:26:08 d2.utils.events]: [0m eta: 0:37:12  iter: 2859  total_loss: 0.3659  loss_cls: 0.047  loss_box_reg: 0.1507  loss_mask: 0.09837  loss_rpn_cls: 0.005247  loss_rpn_loc: 0.06415    time: 1.0234  last_time: 1.0427  data_time: 0.0013  last_data_time: 0.0013   lr: 0.001  max_mem: 5940M
[32m[01/23 09:26:28 d2.utils.events]: [0m eta: 0:36:49  iter: 2879  total_loss: 0.552  loss_cls

[32m[01/23 09:35:03 d2.utils.events]: [0m eta: 0:28:28  iter: 3379  total_loss: 0.4904  loss_cls: 0.05862  loss_box_reg: 0.1973  loss_mask: 0.1214  loss_rpn_cls: 0.009245  loss_rpn_loc: 0.08389    time: 1.0244  last_time: 1.0760  data_time: 0.0014  last_data_time: 0.0010   lr: 0.001  max_mem: 5940M
[32m[01/23 09:35:23 d2.utils.events]: [0m eta: 0:28:06  iter: 3399  total_loss: 0.4052  loss_cls: 0.04848  loss_box_reg: 0.1546  loss_mask: 0.1278  loss_rpn_cls: 0.005024  loss_rpn_loc: 0.08823    time: 1.0243  last_time: 1.0672  data_time: 0.0013  last_data_time: 0.0013   lr: 0.001  max_mem: 5940M
[32m[01/23 09:35:44 d2.utils.events]: [0m eta: 0:27:44  iter: 3419  total_loss: 0.3604  loss_cls: 0.04163  loss_box_reg: 0.1318  loss_mask: 0.1177  loss_rpn_cls: 0.003892  loss_rpn_loc: 0.05425    time: 1.0242  last_time: 1.0643  data_time: 0.0013  last_data_time: 0.0020   lr: 0.001  max_mem: 5940M
[32m[01/23 09:36:04 d2.utils.events]: [0m eta: 0:27:24  iter: 3439  total_loss: 0.3991  loss

[32m[01/23 09:44:39 d2.utils.events]: [0m eta: 0:18:30  iter: 3939  total_loss: 0.4286  loss_cls: 0.05536  loss_box_reg: 0.1777  loss_mask: 0.1104  loss_rpn_cls: 0.005602  loss_rpn_loc: 0.06712    time: 1.0249  last_time: 1.0390  data_time: 0.0013  last_data_time: 0.0013   lr: 0.001  max_mem: 5940M
[32m[01/23 09:44:59 d2.utils.events]: [0m eta: 0:18:09  iter: 3959  total_loss: 0.4676  loss_cls: 0.06006  loss_box_reg: 0.2047  loss_mask: 0.1161  loss_rpn_cls: 0.004457  loss_rpn_loc: 0.08473    time: 1.0247  last_time: 1.0433  data_time: 0.0013  last_data_time: 0.0014   lr: 0.001  max_mem: 5940M
[32m[01/23 09:45:19 d2.utils.events]: [0m eta: 0:17:48  iter: 3979  total_loss: 0.3458  loss_cls: 0.043  loss_box_reg: 0.158  loss_mask: 0.09556  loss_rpn_cls: 0.003496  loss_rpn_loc: 0.03558    time: 1.0246  last_time: 1.0480  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5940M
[32m[01/23 09:45:39 d2.utils.events]: [0m eta: 0:17:27  iter: 3999  total_loss: 0.3849  loss_c

[32m[01/23 09:54:02 d2.utils.events]: [0m eta: 0:08:38  iter: 4499  total_loss: 0.3924  loss_cls: 0.04337  loss_box_reg: 0.1719  loss_mask: 0.1089  loss_rpn_cls: 0.004182  loss_rpn_loc: 0.06705    time: 1.0223  last_time: 0.9983  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5940M
[32m[01/23 09:54:21 d2.utils.events]: [0m eta: 0:08:16  iter: 4519  total_loss: 0.4597  loss_cls: 0.05299  loss_box_reg: 0.1891  loss_mask: 0.1245  loss_rpn_cls: 0.004594  loss_rpn_loc: 0.0963    time: 1.0222  last_time: 0.9320  data_time: 0.0013  last_data_time: 0.0010   lr: 0.001  max_mem: 5940M
[32m[01/23 09:54:41 d2.utils.events]: [0m eta: 0:07:56  iter: 4539  total_loss: 0.4192  loss_cls: 0.04896  loss_box_reg: 0.1587  loss_mask: 0.1123  loss_rpn_cls: 0.004075  loss_rpn_loc: 0.05351    time: 1.0221  last_time: 1.1362  data_time: 0.0013  last_data_time: 0.0012   lr: 0.001  max_mem: 5940M
[32m[01/23 09:55:02 d2.utils.events]: [0m eta: 0:07:35  iter: 4559  total_loss: 0.4652  loss_

# Evaluation
I personally restart the kernel here to free the memory, and then rerun every cell above except the training cell (the one that calls `trainer.train()`) before running the evaluation cells below.

In [9]:
# global variables and constants for the evaluation
# NOTE(review): IOU_THRESHOLD is not read by any of the evaluation functions below;
# they use their own iou_t default and a hard-coded 0.5 inside get_PQ_score
IOU_THRESHOLD = 0.5
# per-file metadata; the evaluation reads the 'File' and 'type' columns to look up the label of a page
data_csv = pd.read_csv(os.path.join(datasets_dir, 'data_complete.csv'))

In [10]:
# Inference should use the config with parameters that are used in training
# cfg now already contains everything we've set previously. We changed it a little bit for inference:
# NOTE: after a kernel restart the cfg cells above have to be rerun first, otherwise cfg does not exist
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5   # the confidence threshold for predictions (the same value as in the model config cell)
predictor = DefaultPredictor(cfg)

[32m[01/25 13:24:22 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from ./output\model_final.pth ...


In [11]:
def create_gold_standard_masks(input_image_path):
    '''
    Create a separate mask for every annotated region of an image
    @param  string      The path to the image
    @return list|None   One boolean np.array per annotated region, with the height
                        and width of the image. Only the pixels inside the golden
                        standard region have the value True, the rest is False.
                        None is returned when the image can't be read or when
                        there is no golden standard for it.
    '''

    # load the image, we only need it for its dimensions
    input_image = cv2.imread(input_image_path)

    # cv2.imread returns None instead of raising for unreadable files
    if input_image is None: return None

    # get the golden standard for the image; .get is used so that
    # an unknown file ends up in the check below instead of a KeyError
    gold_standard_image = data_json.get(os.path.basename(input_image_path))

    # make sure that we have the golden standard for the image
    if not gold_standard_image: return None

    # the height and width of the image
    mask_shape = input_image.shape[:2]

    # array holding all masks
    masks = []

    # construct the polygon arrays and add them to the masks
    for region in gold_standard_image['regions']:
        polygon = region['shape_attributes']

        if polygon['name'] == 'rect':
            # image coordinates: the y axis points down, so (x, y) is the top left corner
            left, top = polygon['x'], polygon['y']
            right, bottom = left + polygon['width'], top + polygon['height']
            gold_standard_polygon_xy = [[left, top], [right, top], [right, bottom], [left, bottom]]

        elif 'all_points_x' in polygon and 'all_points_y' in polygon:
            # If not a rectangle we have a more complex shape and we just add all points to it
            gold_standard_polygon_xy = [[x, y] for x, y in zip(polygon['all_points_x'], polygon['all_points_y'])]

        # shapes without corner points (this used to raise a KeyError) can't be drawn
        else: continue

        # create the initial single channel mask (all zeros) and fill the polygon with ones
        mask = np.zeros(mask_shape, dtype = "uint8")
        cv2.fillPoly(mask, [np.array(gold_standard_polygon_xy, np.int32)], 1)

        # save the mask with boolean values
        masks.append(mask.astype(bool))

    # return the masks
    return masks

In [12]:
def metric_calculation(dataframe):
    '''
    The metric calculations as done in https://github.com/irlabamsterdam/TPDLTextRedaction/blob/main/notebooks/Experiments.ipynb
    @param  pd.DataFrame    The dataframe for one class with the following columns { IOU, TP, FN, FP }
                            where the IOU is the sum of IOU scores and the others a total count.
    @return dict            The metric scores for this class, rounded to two decimals:
                                - SQ: segmentation quality, the average IOU of the true positives
                                - RQ: recognition quality, TP / (TP + 0.5*FN + 0.5*FP)
                                - PQ: panoptic quality, SQ * RQ
                                - P:  precision, TP / (TP + FP)
                                - R:  recall, TP / (TP + FN)
                            A metric with a denominator of zero is reported as 0.
    '''

    # the totals over all rows, calculated once
    iou = dataframe['IOU'].sum()
    tp = dataframe['TP'].sum()
    fn = dataframe['FN'].sum()
    fp = dataframe['FP'].sum()

    def _ratio(numerator, denominator):
        # a class without any (predicted) regions has a denominator of zero,
        # report 0 for it instead of dividing by zero
        return numerator / denominator if denominator > 0 else 0

    SQ = _ratio(iou, tp)
    RQ = _ratio(tp, tp + 0.5*fn + 0.5*fp)
    PQ = SQ*RQ
    P = _ratio(tp, tp + fp)
    R = _ratio(tp, tp + fn)

    return { 'PQ': round(PQ, 2), 'SQ': round(SQ, 2), 'RQ': round(RQ, 2), 'P': round(P, 2), 'R': round(R, 2) }

In [13]:
# Function to remove the overlap between predicted masks
# this should also speed up the calculation of the overall PQ score
def remove_box_overlap(predicted_masks, scores, score_t: float=0.5, iou_t: float= 0.5):
    '''
    Remove the overlap between predicted masks. The masks are handled from the
    highest to the lowest confidence score: the best mask is kept as it is, every
    next mask is reduced to the pixels that are not covered by the masks kept so far.
    @param  list    The predicted masks (torch tensors on the cpu or numpy arrays)
    @param  list    The confidence scores of the predicted masks
    @param  float   The prediction confidence score threshold, masks need
                    a score above this threshold to be considered at all
    @param  float   The fraction of a mask that has to remain after removing
                    the overlap in order to keep the mask
    @return np.array|None    The boolean masks that are kept, stacked along the
                             first axis, or None if no mask exceeds the score threshold
    '''

    def _to_numpy(mask):
        # predictions arrive as torch tensors, but plain arrays are accepted as well
        return mask.numpy() if hasattr(mask, 'numpy') else np.asarray(mask)

    # Filter out masks with a low confidence score
    kept = [i for i in range(len(predicted_masks)) if scores[i] > score_t]

    # without any confident mask there is nothing to return
    if not kept: return None

    masks = [_to_numpy(predicted_masks[i]).astype(bool) for i in kept]
    confidences = [float(scores[i]) for i in kept]

    # argsort sorts from worst to best score so we have to reverse the order
    order = np.argsort(confidences)[::-1]

    # By definition we always include the best mask in the output
    output_masks = [masks[order[0]]]
    mask_overlap = np.copy(masks[order[0]])

    # evaluate all other masks
    for index in order[1:]:
        mask = masks[index]

        # get the mask size, an empty mask has nothing to add
        # (and would cause a division by zero below)
        mask_size = mask.sum()
        if mask_size == 0: continue

        # the part of the mask that is not covered by previously kept masks
        only_mask = np.logical_and(mask, np.logical_not(mask_overlap))

        # only keep masks of which more than the given
        # fraction remains after removing the overlap
        if (only_mask.sum() / mask_size) > iou_t:
            output_masks.append(only_mask)
            mask_overlap = np.logical_or(mask_overlap, only_mask)

    # there is always at least one mask at this point
    return np.stack(output_masks)

In [14]:
def get_PQ_score(ground_truth, prediction, iou_t: float=0.5):
    '''
    Get the values that are needed to calculate the PQ-score
    @param  np.array    The ground truth masks, stacked along the first axis
                        (None or an empty array if there are none)
    @param  np.array    The predicted masks, stacked along the first axis
                        (None or an empty array if there are none)
    @param  float       The intersection-over-union that a ground truth and predicted
                        mask have to exceed to count as a true positive. With the
                        default of 0.5 a mask can match at most one other mask, as
                        long as the masks within each set do not overlap each other.
    @return dict        The values needed for the PQ-score (TP, FP, FN, IOU)
    '''

    # make sure that both sets of masks are always
    # a numpy array with a first axis, also when there are no masks
    if ground_truth is None or np.ndim(ground_truth) == 0: ground_truth = np.array([])
    if prediction is None or np.ndim(prediction) == 0: prediction = np.array([])

    # get the initial values
    TP = []
    IOU = []

    # the number of ground truth and prediction masks
    gt_count = ground_truth.shape[0]
    pred_count = prediction.shape[0]

    # iterate over the ground truth and prediction
    # masks to calculate the iou between all combinations
    for i in range(gt_count):
        ground_truth_mask = ground_truth[i].astype(bool)
        for j in range(pred_count):
            predicted_mask = prediction[j].astype(bool)

            # two empty masks have no union and can never match
            union = np.logical_or(ground_truth_mask, predicted_mask).sum()
            if union == 0: continue

            # calculate the iou between this ground truth
            # and predicted mask
            intersection = np.logical_and(ground_truth_mask, predicted_mask).sum()
            iou = intersection / union

            # add this combination of masks as a true positive if
            # the iou exceeds the threshold and keep track of the
            # iou score for the segmentation quality metric
            if iou > iou_t:
                TP.append((i, j))
                IOU.append(iou)

    # every unused predicted mask is a false positive
    FP = set(range(pred_count)) - {j for _, j in TP}

    # every unused ground truth mask is a false negative
    FN = set(range(gt_count)) - {i for i, _ in TP}

    # return the values needed for the panoptic quality metric
    return {'TP': len(TP), 'FP': len(FP), 'FN': len(FN), 'IOU': sum(IOU)}

In [15]:
def extract_panoptic_evaluation_values(image_path, score_t: float=0.5, iou_t: float= 0.5):
    '''
    Collect the values for the panoptic quality evaluation of a single image
    @param  string  The path to the image
    @param  float   The prediction confidence score threshold
    @param  float   The threshold that is passed on to remove_box_overlap
                    when filtering the predicted masks
    @return dict    The result of get_PQ_score extended with the label of the image:
                        - IOU:float     The sum of the intersection over union values of all true positive regions
                        - TP:int        The number of true positives
                        - FP:int        The number of false positives
                        - FN:int        The number of false negatives
                        - Label         The value of the 'type' column of data_csv for this file
    '''

    # the image itself and the masks of its golden standard regions
    image = cv2.imread(image_path)
    gold_masks = create_gold_standard_masks(image_path)

    # look up the label of the file, .item() expects exactly one row for it
    _, filename = os.path.split(image_path)
    file_rows = data_csv[data_csv['File'] == filename]
    label = file_rows.type.item()

    # run the model, the output format is documented at
    # https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    instances = predictor(image)['instances']
    predicted_masks = instances.pred_masks.cpu()
    confidences = instances.scores.cpu()

    # drop the low confidence masks and the overlap between the others
    non_overlapping_masks = remove_box_overlap(predicted_masks, confidences, score_t, iou_t)

    # compare the golden standard with the predictions and attach the label
    evaluation = get_PQ_score(np.array(gold_masks), non_overlapping_masks)
    evaluation['Label'] = label
    return evaluation

In [16]:
def evaluate_dataframe(dataset: list, score_t: float=0.5, iou_t: float= 0.5):
    '''
    Collect the panoptic quality values of every sample in a dataset
    @param  list    A dataset in the DatasetCatalog format
    @param  float   The prediction confidence score threshold
    @param  float   The threshold that is passed on to extract_panoptic_evaluation_values
    @return pd.DataFrame    One row per sample, indexed by its file_name, with
                            the values returned by extract_panoptic_evaluation_values
    '''

    # the evaluation values of every sample, keyed by the file_name of the sample
    values_per_file = {
        record['file_name']: extract_panoptic_evaluation_values(record['file_name'], score_t, iou_t)
        for record in tqdm(dataset)
    }

    # the dict holds one column per file, transpose
    # it to get one row per file instead
    return pd.DataFrame(values_per_file).T

In [17]:
# get the dataset dicts of the test images (the 'test' directory inside datasets_dir)
test_dir = os.path.join(datasets_dir, 'test')
test_dicts = get_redacted_dicts(test_dir)

284it [00:19, 14.78it/s]


In [None]:
# get the dataframe with the evaluation scores; the arguments are the score
# threshold and the overlap threshold (both 0.5)
# NOTE: this is slow, the progress output below shows about 13 seconds per image
test_df = evaluate_dataframe(test_dicts, 0.5, 0.5)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  8%|▊         | 23/284 [04:46<58:25, 13.43s/it]  

In [None]:
# save the dataframe; the results directory is created first
# because to_csv does not create missing directories itself
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)
test_df.to_csv(os.path.join(results_dir, 'maskrcnn_results.csv'))

In [None]:
# the panoptic quality metrics of every label separately ...
results = {
    label: metric_calculation(label_df)
    for label, label_df in test_df.groupby('Label')
}

# ... followed by the metrics over all test pages together
results['total'] = metric_calculation(test_df)

# show the metrics in a pandas dataframe, one row per label
pd.DataFrame.from_dict(results).T[['PQ', 'SQ', 'RQ', 'P', 'R']]

# Time the model

In [9]:
# the test dir (the same directory as used for the evaluation above)
test_dir = os.path.join(datasets_dir, 'test')

# load the trained model
# !NOTE!: Make sure that the cfg is loaded in a previous cell (the cfg cells above have to be rerun after a kernel restart)
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5   # the confidence threshold for predictions (the same value as in the model config cell)
predictor = DefaultPredictor(cfg)

[32m[01/23 11:36:41 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from ./output\model_final.pth ...


In [10]:
def time_algorithm(input_image_path):
    '''
    Time the image loading and model prediction
    @param  string    The path to the image
    @return dict      The times in seconds of the individual parts and the total time:
                          - loading:float       The time needed to read the image
                          - predicting:float    The time needed for the prediction
                          - total:float         The sum of the two
    '''

    # time.perf_counter is used instead of time.time: it is the clock with the
    # highest available resolution and is not affected by system clock updates

    # time the image loading
    load_start = time.perf_counter()
    input_image = cv2.imread(input_image_path)
    load_end = time.perf_counter()

    # time the prediction, we are not interested in the result
    # NOTE(review): if the predictor runs on a gpu, the call might return before all
    # gpu work is done; confirm whether a synchronisation is needed for exact timings
    predict_start = time.perf_counter()
    predictor(input_image)
    predict_end = time.perf_counter()

    # add the separate time differences
    times = {
        'loading': load_end-load_start,
        'predicting': predict_end-predict_start
    }

    # add the total time (sum of the individual parts)
    times['total'] = sum(times.values())

    # return the times
    return times

In [11]:
# time the model for all test images, this gives one dict with times per image
timings = [
    time_algorithm(os.path.join(test_dir, filename))
    for filename in tqdm(os.listdir(test_dir))
]

# split the times per part, so that they can be averaged in the next cell
load_times = [timing['loading'] for timing in timings]
predicting_times = [timing['predicting'] for timing in timings]
total_times = [timing['total'] for timing in timings]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 284/284 [01:21<00:00,  3.50it/s]


In [12]:
# print the average times over all timed test images, in seconds with three decimals
print("Average loading time is %.3f seconds" % np.mean(load_times))
print("Average predicting time is %.3f seconds" % np.mean(predicting_times))
print("Average total time is %.3f seconds" % np.mean(total_times))

Average loading time is 0.064 seconds
Average predicting time is 0.220 seconds
Average total time is 0.284 seconds
