# Generate train, val and test dataset

- Dataset
    - MIDV500
        - Contains 50 different identity document types, with 10 different combinations of condition and device for each document type
        - Condition
            - Table
            - Keyboard
            - Hand
            - Partial
            - Clutter
        - Device
            - iPhone 5
            - Samsung S3
    - MIDV2019
        - Contains 50 different identity document types and provides 4 extra combinations of condition and device.
        - Condition
            - Distorted
            - Low lighting
        - Device
            - iPhone XS Max
            - Samsung S10
- How to split into train / val / test dataset
    - To ensure that the model generalizes well and that the train, val and test datasets share a consistent distribution, the dataset is split using the following strategy:
        1. Split **midv500** into **midv500-train**, **midv500-val** and **midv500-test** with equal frequency of each combination of condition, device and document type.
        2. Apply the same procedure as in step 1 to the **midv2019** dataset.
        3. Merge **midv500-train** and **midv2019-train** into **midv-train**, and do the same for the val and test datasets.
    - Note: the ratio between train, val and test is 60%, 20% and 20%, respectively.

# Import dependencies

In [1]:
import json
import random

import numpy as np

random.seed(1234)

# Function definitions

In [2]:
# Note: the order of ratio is: (train, val, test)
def get_split_index(annotation: dict, ratio: list):
    """Assign every image position to the train, val or test subset.

    Images are first bucketed by a key built from their ``file_name``: the
    first two characters of the first ``/``-separated component joined with
    the third component (presumably encoding document type, condition and
    device -- confirm against the dataset layout).  Every bucket is shuffled
    with the module-level ``random`` generator and then cut according to
    ``ratio``, so the result depends on the current random state.

    Args:
        annotation: Mapping with an ``"images"`` list.  Each entry needs a
            ``"file_name"`` made of at least three ``/``-separated parts.
        ratio: Three fractions ordered as (train, val, test).

    Returns:
        A dict with the keys ``"train"``, ``"val"`` and ``"test"``; each value
        is a list of positions into ``annotation["images"]``.
    """
    # Bucket image positions by their combination key, in first-seen order.
    groups = {}
    for position, record in enumerate(annotation["images"]):
        parts = record["file_name"].split("/")
        groups.setdefault(parts[0][:2] + parts[2], []).append(position)

    train_ids, val_ids, test_ids = [], [], []
    for members in groups.values():
        random.shuffle(members)

        # Sizes are truncated towards zero.  Only the train and val sizes are
        # used as cut points; the test subset receives whatever is left over.
        n_train, n_val, _ = (np.array(ratio) * len(members)).astype(int)
        val_end = n_train + n_val

        train_ids.extend(members[:n_train])
        val_ids.extend(members[n_train:val_end])
        test_ids.extend(members[val_end:])

    return {"train": train_ids, "val": val_ids, "test": test_ids}


def split_annotation(annotation: dict, index_map: dict):
    """Cut one annotation dict into one annotation dict per subset.

    Args:
        annotation: Mapping with ``"images"``, ``"annotations"`` and
            ``"categories"`` entries.
        index_map: Maps a subset name to a list of positions.

    Returns:
        A dict keyed like ``index_map``.  Each value holds the selected
        ``"images"`` and ``"annotations"`` as lists plus the original
        ``"categories"`` object (shared, not copied).

    NOTE(review): the same positions are used to pick from both ``"images"``
    and ``"annotations"``, so this only pairs records correctly if the two
    lists are aligned one-to-one -- confirm for the input files.
    """
    # Arrays are used only for list-of-positions ("fancy") indexing.
    image_pool = np.array(annotation["images"])
    anno_pool = np.array(annotation["annotations"])
    shared_categories = annotation["categories"]

    def pick(positions):
        return {
            "images": list(image_pool[positions]),
            "annotations": list(anno_pool[positions]),
            "categories": shared_categories,
        }

    return {subset: pick(positions) for subset, positions in index_map.items()}


def merge_annotation(anno_A, anno_B):
    """Concatenate two COCO-style annotation dicts without id clashes.

    Every record taken from ``anno_B`` gets its ``"id"`` (and, for
    annotations, its ``"image_id"``) moved past the ids already used in
    ``anno_A``.  Neither input is modified: the records of ``anno_B`` are
    copied before their ids are changed, so the function can safely be called
    more than once with the same arguments.

    Args:
        anno_A: Mapping with ``"images"`` and ``"annotations"`` lists.  Any
            other entries (e.g. ``"categories"``) are carried over as they are.
        anno_B: Mapping with ``"images"`` and ``"annotations"`` lists whose
            records are appended after those of ``anno_A``.

    Returns:
        A new dict with the keys of ``anno_A``.  Its ``"images"`` and
        ``"annotations"`` lists contain the records of ``anno_A`` (the same
        objects) followed by re-numbered copies of the records of ``anno_B``.
    """
    # Copy list values so that appending below never grows anno_A's own lists;
    # non-list values are passed through untouched.
    merge_anno = {
        key: value[:] if isinstance(value, list) else value
        for key, value in anno_A.items()
    }

    # A single offset is applied to image ids and annotation ids alike, so it
    # has to clear both id ranges of anno_A.  ``default=-1`` gives an offset
    # of 0 when anno_A holds no records at all.
    used_ids = [
        record["id"]
        for key in ("images", "annotations")
        for record in merge_anno[key]
    ]
    shift = max(used_ids, default=-1) + 1

    for record in anno_B["annotations"]:
        moved = dict(record)  # shallow copy keeps anno_B untouched
        moved["image_id"] += shift
        moved["id"] += shift
        merge_anno["annotations"].append(moved)

    for record in anno_B["images"]:
        moved = dict(record)
        moved["id"] += shift
        merge_anno["images"].append(moved)

    return merge_anno

# Load annotations

In [3]:
# Read both annotation files.  The encoding is given explicitly because JSON
# text is UTF-8, whereas open() would otherwise fall back to the
# platform-dependent locale encoding.
with open(
    "/data/card-segmentation/midv/midv500_coco.json", "r", encoding="utf-8"
) as f:
    midv500_anno = json.load(f)

with open(
    "/data/card-segmentation/midv/midv2019_coco.json", "r", encoding="utf-8"
) as f:
    midv2019_anno = json.load(f)

# Split midv500 and midv2019 into train, val and test dataset

In [4]:
# Fractions for train, val and test, in that order.
sampling_rate = [0.6, 0.2, 0.2]

# midv500 is split first and midv2019 second; the order matters because both
# calls shuffle with the same seeded random generator.
midv500_anno_split = split_annotation(
    midv500_anno, get_split_index(midv500_anno, sampling_rate)
)

index_map = get_split_index(midv2019_anno, sampling_rate)
midv2019_anno_split = split_annotation(midv2019_anno, index_map)

# Merge annotations

In [5]:
dataset_types = ["train", "val", "test"]

# Pair up the subsets of the two datasets by name and merge each pair.
merge_annotations = {
    subset: merge_annotation(midv500_anno_split[subset], midv2019_anno_split[subset])
    for subset in dataset_types
}

# Verify the merge result by sample size

In [6]:
# Heading text (including the blank-line separators) paired with the per-subset
# annotation dicts whose image counts are reported under it.
report_sections = [
    ("midv500", midv500_anno_split),
    ("\n\nmidv2019", midv2019_anno_split),
    ("\n\nmerge_result", merge_annotations),
]

for heading, splits in report_sections:
    print(heading)
    for subset, content in splits.items():
        print("k: {}, len: {}".format(subset, len(content["images"])))

midv500
k: train, len: 8840
k: val, len: 2939
k: test, len: 2969


midv2019
k: train, len: 3488
k: val, len: 1154
k: test, len: 1189


merge_result
k: train, len: 12328
k: val, len: 4093
k: test, len: 4158


# Save the result

In [7]:
output_path = "/data/card-segmentation/midv/"

# Write one JSON file per entry of merge_annotations, named after its key.
for subset, payload in merge_annotations.items():
    destination = f"{output_path}midv_{subset}.json"
    with open(destination, "w") as outfile:
        json.dump(payload, outfile)