# Prerequisites



## Dependencies

In [None]:
# Dark magic happens here: installing dependencies for GDAL 3.0.4
# build process via APT and install GDAL itself via PyPI.
# The steps are chained with `&&`, so the first failing step stops the sequence:
#   1. add the ubuntugis-unstable PPA;
#   2. apt-install python3-gdal 3.0.4 and immediately purge it with --autoremove
#      (NOTE(review): presumably only wanted for what it leaves behind for the
#      PyPI build - confirm);
#   3. pip-install the GDAL Python bindings, same version;
#   4. apt-install the gdal-bin package, same version.
# `time` reports how long the whole sequence took.
!time (add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable && \
 apt install python3-gdal=3.0.4+dfsg-1~bionic0 && \
 apt purge --autoremove python3-gdal && \
 pip install gdal==3.0.4 && \
 apt install gdal-bin=3.0.4+dfsg-1~bionic0)

# Sanity check: the bindings import and report their version
from osgeo import gdal; print(f"GDAL version {(gdal.__version__)}")

## Google Drive

In [None]:
from os import path as osp

from google.colab import drive


# Mount point used for Google Drive
PATH_DRIVE = osp.join('/', 'content', 'drive')

# Mount only when the mount point does not exist yet
if osp.exists(PATH_DRIVE):
    print("Google Drive has been already mounted!")
else:
    print("Mounting Google Drive...")
    drive.mount(PATH_DRIVE)

## Paths

In [None]:
from os import path as osp


# Arbitrary subpath in Google Drive (if any)
PATH_STORAGE = osp.join('ods', 'soc')

# When the drive cell has been run, the data folders are placed under
# <drive>/MyDrive/<PATH_STORAGE>; otherwise the prefix is empty and they are
# placed directly under /content
PREFIX_DRIVE = (osp.join(osp.basename(PATH_DRIVE), 'MyDrive', PATH_STORAGE)
                if 'PATH_DRIVE' in locals() else '')

# Folders that never use the drive prefix
PATH_TEMP = osp.join('/', 'content', 'temp')
PATH_RESOURCES = osp.join('/', 'content', 'resources')

# Folders that follow the drive prefix
PATH_INPUT, PATH_OUTPUT, PATH_MODELS, PATH_DATASET = (
    osp.join('/', 'content', PREFIX_DRIVE, folder)
    for folder in ('input', 'output', 'models', 'dataset')
)

# FILE_SHAPEFILE = osp.join(PATH_RESOURCES, 'clustering', 'cutline',
#                           'Start_Ice_Map_UTMz40WGS84f_r.shp')

# One path per line, for a quick visual check
print(PATH_STORAGE, PATH_TEMP, PATH_INPUT, PATH_OUTPUT, PATH_MODELS,
      PATH_DATASET, PATH_RESOURCES, sep='\n')

## Dataset

Exclude bad images

In [None]:
# File names that are removed from the dataset before the train/validation
# split is made (all four lists are merged when the split is built).
# NOTE(review): the notebook does not record why the exclusions are kept in
# four separate groups (a-d) - presumably each group shares a common defect;
# confirm and document the reason per group.
items_exclude_a = [
    'S1B_EW_GRDM_1SDH_20200203T031613_20200203T031631_020099_0260A6_D03B.tiff',
    'S1B_EW_GRDM_1SDH_20200215T031613_20200215T031630_020274_026647_9E25.tiff',
    'S1B_EW_GRDM_1SDH_20200227T031613_20200227T031630_020449_026BE9_3282.tiff',
    'S1B_EW_GRDM_1SDH_20200310T031613_20200310T031630_020624_027178_1A36.tiff',
    'S1B_EW_GRDM_1SDH_20200322T031613_20200322T031631_020799_027702_664C.tiff',
    'S1B_EW_GRDM_1SDH_20200521T031615_20200521T031633_021674_029249_923C.tiff',
]

items_exclude_b = [
    'S1A_EW_GRDM_1SDH_20191117T031700_20191117T031800_029945_036ADD_32F2.tiff',
    'S1A_EW_GRDM_1SDH_20191129T031659_20191129T031759_030120_0370EF_D07E.tiff',
    'S1A_EW_GRDM_1SDH_20200104T031658_20200104T031758_030645_038306_DDA1.tiff',
    'S1A_EW_GRDM_1SDH_20200328T031656_20200328T031756_031870_03ADB5_D992.tiff',
    'S1A_EW_GRDM_1SDH_20200421T031657_20200421T031757_032220_03BA08_1F43.tiff',
]

items_exclude_c = [
    'S1A_EW_GRDM_1SDH_20191211T031659_20191211T031759_030295_0376F4_BE3E.tiff',
    'S1A_EW_GRDM_1SDH_20191223T031658_20191223T031758_030470_037CFD_AB38.tiff',
    'S1A_EW_GRDM_1SDH_20200221T031656_20200221T031756_031345_039B6D_927B.tiff',
    'S1A_EW_GRDM_1SDH_20200304T031656_20200304T031756_031520_03A17D_08EB.tiff',
    'S1A_EW_GRDM_1SDH_20200316T031656_20200316T031756_031695_03A78C_D3A3.tiff',
    'S1A_EW_GRDM_1SDH_20200409T031657_20200409T031757_032045_03B3E6_7A01.tiff',
    'S1A_EW_GRDM_1SDH_20200503T031658_20200503T031758_032395_03C031_950B.tiff',
]

items_exclude_d = [
    'S1A_EW_GRDM_1SDH_20191107T030034_20191107T030132_029799_0365BB_F7CF.tiff',
    'S1B_EW_GRDM_1SDH_20200601T023525_20200601T023601_021834_02971A_B08C.tiff'
]

Make splits from filtered data

In [None]:
import os


# Every mask file available for the 2-class task (unfiltered)
items = os.listdir(osp.join(PATH_DATASET, 'masks', '2-class'))

# Merge the exclusion lists once into a set instead of re-concatenating the
# four lists for every candidate item, then keep the rest in sorted order
items_excluded = set(items_exclude_a + items_exclude_b + items_exclude_c +
                     items_exclude_d)
items_all = [item for item in sorted(items) if item not in items_excluded]

# Number of interleaved folds: fold i takes items i, i + NUM_FOLDS, ...
NUM_FOLDS = 5

# Index of the fold used for validation (0 .. NUM_FOLDS - 1),
# or None to use the simple hold-out split instead
DATA_SPLIT = 0

if DATA_SPLIT is None:
    # Every `fraction`-th item (except the very first one) goes to validation.
    # NOTE(review): `fraction` was referenced here without ever being defined,
    # so this branch raised NameError; NUM_FOLDS is used as the default to
    # stay close to the fold-based split - confirm the intended value.
    fraction = NUM_FOLDS
    stride = int(round(fraction))
    items_train = tuple(item for i, item in enumerate(items_all)
                        if not i or i % stride)
    items_valid = tuple(item for i, item in enumerate(items_all)
                        if i and not i % stride)
else:
    # Build all folds up front; validation is the i-th interleaved slice and
    # training is everything else
    items_split = [{
        'train': sorted(set(items_all) - set(items_all[i::NUM_FOLDS])),
        'valid': sorted(items_all[i::NUM_FOLDS])
    } for i in range(NUM_FOLDS)]
    items_train = items_split[DATA_SPLIT]['train']
    items_valid = items_split[DATA_SPLIT]['valid']

Check training / validation ratio

In [None]:
# Sizes of the two splits, printed as "<train> / <valid>"
print(f"{len(items_train)} / {len(items_valid)}")

# Annotations

At the moment CVAT cannot convert a task with a larger number of classes into a task with a smaller number of classes (the source classes do not map one-to-one onto the target classes). The following workaround is therefore used:

* make a copy of the task, so the original (the source, multiclass in our case) task has the same number and order of images as the target (binary in our case) task;
* make a dump of the source annotations;
* process the source annotations, adapting them to the target task;
* upload the processed annotations to the target task.

Several operations are performed further for the target task:

1. Multiclass to binary conversion (types of ice into water/ice)
2. Replace original polygons with polygons from vectorized masks



## Path to annotations

Select sample annotations dump

In [None]:
# Name of the CVAT dump of the source task: '<task>-<timestamp>-<format>.zip'
name_task_source = ('task_sea_ice_noland_multiclass_30'
                    '-2021_10_12_16_22_22'
                    '-coco 1.0.zip')
# The dump is looked up in the 'annotations' folder of the input directory
path_annotations = osp.join(PATH_INPUT, 'annotations', name_task_source)

# Show whether the archive exists, followed by its full path
print(osp.exists(path_annotations), path_annotations)

## Parse sample annotations dump from CVAT

Sample annotations dump is in COCO dataset format for the target task

In [None]:
import json

from zipfile import ZipFile

# The dump is a zip archive; the annotations are read from its
# 'annotations/instances_default.json' member and parsed as JSON
with ZipFile(path_annotations, 'r') as archive:
    with archive.open('annotations/instances_default.json') as file_json:
        annotations = json.load(file_json)

annotations.keys()

Check original categories (classes)

In [None]:
# Category list as loaded from the dump
annotations['categories']

Copy and rename the source categories (include one extra class for the target)

In [None]:
from copy import deepcopy

# Deep-copy the first three source categories. A plain list.copy() is shallow:
# the copied list would still hold the very same dicts as
# annotations['categories'], so the renames below would silently alter the
# source categories as well (and any backup taken from them afterwards).
categories_binary = deepcopy(annotations['categories'][:3])

# Only the names change; every other field of the copied entries is kept
categories_binary[1]['name'] = 'ice'
categories_binary[2]['name'] = 'nodata'

categories_binary

The first and the last COCO image entries

In [None]:
# Slices (not indexes) are used, so an empty image list simply yields []
[*annotations['images'][:1], *annotations['images'][-1:]]

Check annotation fields taking one of the annotation items

In [None]:
# Field names of the last annotation item
annotations['annotations'][-1].keys()

The last annotation item polygon size (points)

In [None]:
# Length of the first segmentation entry of the last annotation item.
# NOTE(review): COCO polygons are normally flat [x0, y0, x1, y1, ...] lists, in
# which case this is twice the number of points - confirm.
len(annotations['annotations'][-1]['segmentation'][0])

## Convert classes

Convert category indexes from multiclass source to binary target:

| id | source | id | target |
| - | - | - | - |
| 1 | water | 1 | water |
| 2 | nilas | 2 | ice |
| 3 | young | 2 | ice |
| 4 | first-year | 2 | ice |
| 5 | fast | 2 | ice |
| - | - | 3 | nodata |


In [None]:
# Clamp category ids to at most 2: every id above 2 becomes 2 ('ice' in the
# target table), lower ids are left as they are. The items are changed in place.
for annotation in annotations['annotations']:
    if annotation['category_id'] > 2:
        annotation['category_id'] = 2

Check unique target categories

In [None]:
# Distinct category ids present after the conversion
{annotation['category_id'] for annotation in annotations['annotations']}

Backup original categories

In [None]:
# Keep a reference to the current category list before the 'categories' key is
# rebound. This is an alias, not a copy: it holds the same list object (and the
# same dicts) that annotations['categories'] refers to at this point.
categories_multiclass = annotations['categories']

Replace categories

In [None]:
# Rebind the 'categories' key to the binary category list
annotations['categories'] = categories_binary

Check categories applied

In [None]:
# Category list now stored in the annotations
annotations['categories']

In [None]:
import cv2 as cv
import numpy as np
import pandas as pd

from itertools import repeat


def get_contours(mask, threshold=1e4, step=1):
    """Vectorize a label mask into a filtered list of contours.

    Every integer value ``0 .. mask.max()`` is treated as a separate channel.
    Each channel is binarized (``mask == value``) and traced with
    ``cv.findContours`` (``RETR_TREE``, ``CHAIN_APPROX_TC89_KCOS``).

    Args:
        mask: 2-D integer array of labels (H x W).
        threshold: contours are kept only if their rounded area is strictly
            greater than this value.
        step: keep every ``step``-th point of each contour (1 keeps all).
            Contours left with 2 points or fewer are dropped.

    Returns:
        A list of dicts with the keys ``'channel'`` (label value), ``'area'``
        (rounded area of the contour before point reduction) and ``'contour'``
        (point array as returned by OpenCV, reduced by ``step``). Items are
        ordered by ascending area, then by number of points.
    """
    # Cast to a Python int so the +1 cannot overflow a narrow unsigned dtype
    # (e.g. a uint8 mask that contains the value 255)
    channels = int(mask.max()) + 1

    # One record per contour: (channel, area, number of points, contour)
    records = []
    for i in range(channels):
        # Binary image of the current label (one-hot slice), built lazily to
        # avoid holding the whole H x W x C stack in memory
        channel = (mask == i).astype('uint8')
        # findContours returns (contours, hierarchy); the hierarchy is unused
        contours, _ = cv.findContours(channel, cv.RETR_TREE,
                                      cv.CHAIN_APPROX_TC89_KCOS)
        for contour in contours:
            # The area is measured on the full contour, before point reduction
            area = round(cv.contourArea(contour))
            # Reduce points by step parameter (no reduce: step = 1)
            reduced = contour[::step, ...].copy()
            records.append((i, area, len(reduced), reduced))

    if not records:
        return []

    df_contours = pd.DataFrame(records,
                               columns=['channel', 'area', 'points', 'contour'])
    # Contours sharing the same (area, points) pair are treated as duplicates
    # coming from different channels; the one found first (lowest channel) wins
    df_contours = df_contours.groupby(['area', 'points'],
                                      as_index=False).first()
    # Filter out very small polygons and degenerate ones (2 points or fewer)
    df_contours = df_contours[(df_contours['area'] > threshold) &
                              (df_contours['points'] > 2)]
    return [{'channel': channel,
             'area': area,
             'contour': contour} for channel, area, contour
            in zip(df_contours['channel'].to_list(),
                   df_contours['area'].to_list(),
                   df_contours['contour'].to_list())]

In [None]:
# Shallow copy: a new top-level dict whose values are still shared with
# `annotations`
annotations_target = dict(annotations)

annotations_target['annotations'][0].keys()

In [None]:
# len([annotation['id'] for annotation in annotations_target['annotations']])

In [None]:
annotations_converted = []
index_converted = 1
# Channel reported by get_contours (mask value) -> target category id
map_category = {
    0: 3,
    1: 1,
    2: 2
}

# Look-up table built once: image file base name -> image id in the target
# annotations (the first entry wins if a base name occurs more than once)
ids_image = {}
for entry in annotations_target['images']:
    ids_image.setdefault(entry['file_name'].split('/')[-1], entry['id'])

for n, item in enumerate(items):
    # Fail early, and with a clear message, if the mask has no image entry
    if item not in ids_image:
        raise KeyError(f"No image entry named {item!r} in the target annotations")
    id_image = ids_image[item]

    path_item = osp.join(PATH_DATASET, 'masks', '2-class', item)
    # Vectorize raster mask; cv.imread returns None instead of raising when
    # the file cannot be read
    image = cv.imread(path_item, cv.IMREAD_LOAD_GDAL)
    if image is None:
        raise IOError(f"Cannot read mask: {path_item}")
    contours = get_contours(image, threshold=5e3, step=3)
    print(f"{n:3d}: id = {id_image:3d}, item = {item},",
          f"contours = {len(contours):3d}")

    # Assemble annotations structure
    for contour in contours:
        # Flat coordinate list [x0, y0, x1, y1, ...]
        points = contour['contour'].reshape(-1).tolist()
        points_horizontal = points[0::2]
        points_vertical = points[1::2]
        left, top = min(points_horizontal), min(points_vertical)
        # COCO bounding boxes are [x, y, width, height], not two corner points
        bbox = [left, top,
                max(points_horizontal) - left, max(points_vertical) - top]
        annotations_converted.append({
            'area': contour['area'],
            'attributes': {'occluded': False},
            'bbox': bbox,
            'category_id': map_category[contour['channel']],
            'image_id': id_image,
            'id': index_converted,  # running 1-based id over all images
            'iscrowd': 0,
            'segmentation': [points],
        })
        index_converted += 1

Check converted annotations

In [None]:
# Field names of the first converted annotation
annotations_converted[0].keys()

In [None]:
# Total number of converted annotations
len(annotations_converted)

Replace the original annotations with the converted ones

In [None]:
# Rebind the 'annotations' key of the target structure to the converted list
annotations_target['annotations'] = annotations_converted

Save processed annotations for the target task

In [None]:
from datetime import datetime, timezone
from zipfile import ZIP_DEFLATED

FORMAT_DATE = '%Y_%m_%d_%H_%M_%S'

# The dump name is split on '-' and its second-to-last part (the timestamp of
# the source dump) is replaced with the current UTC time.
# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow(); the
# formatted string is the same because FORMAT_DATE has no time-zone field.
parts_name = name_task_source.split('-')
parts_name[-2] = datetime.now(timezone.utc).strftime(FORMAT_DATE)
name_task_target = '-'.join(parts_name)
print(name_task_target)

# Same location as the source dump, but under the output directory
path_annotations_target = path_annotations.replace(name_task_source,
                                                   name_task_target)
path_annotations_target = path_annotations_target.replace(PATH_INPUT,
                                                          PATH_OUTPUT)
print(path_annotations_target)

os.makedirs(osp.dirname(path_annotations_target), exist_ok=True)

# ZipFile stores members uncompressed (ZIP_STORED) by default, in which case
# `compresslevel` has no effect; ZIP_DEFLATED makes level 9 actually apply
with ZipFile(path_annotations_target, 'w', compression=ZIP_DEFLATED) as archive:
    archive.writestr('annotations/instances_default.json',
                     json.dumps(annotations_target), compresslevel=9)

In [None]:
# Human-readable size on disk of the archive written above
!du -h "{path_annotations_target}"