In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exploratory analysis of the datasets

![](https://user-images.githubusercontent.com/45664104/86419986-e932eb00-bccc-11ea-98e3-e1dcba512804.png)

## Basis

In [16]:
# Relative path to the dataset root; the cells below list this folder and
# treat every sub-directory as one person's dataset.
DS_PATH = "../datasets/supervisely"

In [28]:
# Clone the repo which contains essential functions
!git clone https://github.com/JinhangZhu/supervisely-to-darknet.git
import sys
sys.path.append('./supervisely-to-darknet')

Cloning into 'supervisely-to-darknet'...


In [18]:
# Libraries
from tqdm import tqdm
import os
from convert import get_classes
import json
import glob

In [19]:
# Delete all separate classes.names within each person dataset
for p in tqdm(os.listdir(DS_PATH), desc='Person'):
    filepath = os.path.join(DS_PATH, p)
    if os.path.isdir(filepath):
        # Build "<person>/classes.names" with os.path.join. Appending the
        # literal './classes.names' to the folder path yields
        # "<person>./classes.names", which is a different (non-existent)
        # location on POSIX systems, so nothing would be removed there.
        names_file = os.path.join(filepath, 'classes.names')
        if os.path.isfile(names_file):
            os.remove(names_file)

Person: 100%|██████████| 21/21 [00:00<00:00, 21472.55it/s]


In [24]:
# Map every person folder name to the classes returned for its meta.json
class_names = {}

for p in os.listdir(DS_PATH):
    filepath = os.path.join(DS_PATH, p)
    if not os.path.isdir(filepath):
        continue    # plain files next to the person folders are ignored

    meta_path = filepath + os.sep + 'meta.json'
    if not os.path.isfile(meta_path):
        print('There is no meta.json file for person ', p)
        continue

    # get_classes also returns a names path; only the class list is kept
    classes, names_path = get_classes(meta_path, write=False)
    class_names[p] = classes

# Persist the per-person class names as JSON in the working directory
class_names_path = './ds_class_names.json'
print('\nSave labels for person/subset/images in {}.'.format(class_names_path))
with open(class_names_path, 'w') as fp:
    json.dump(class_names, fp, indent=4)

# One line per person: folder name followed by its classes
for p, clss in class_names.items():
    print(p, ' : ', clss)


Save labels for person/subset/images in ./ds_class_names.json.
P01 :['hand', '1', 'left_hand', 'right_hand']
P02 :['hand']
P03 :['hand']
P04 :['hand']
P05 :['hand']
P06 :['hand']
P07 :['hand']
P08 :['hand']
P10 :['hand']
P12 :['hand']
P13 :['hand']
P14 :['hand']
P15 :['hand']
P16 :['hand']
P17 :['hand']
P19 :['hand']
P20 :['hand']
P21 :['hand']
P22 :['hand']
P23 :['hand']


可以看出来，整体的标注信息表明，这20个人的数据集里，只有第一个人的数据包含四种标签，其余所有人的数据都是只有`hand`的标签，也就是有手和无手的区别。
下一步看：

- 各个标签的数目/该数据集的**有**标注总数/照片总数

> 但是`1`这个标签实在很奇怪，需要继续查看。至于左右手的标签，我随便打开了几个标注文件，都没有发现左右手的标注，可能它们只占很小一部分，也需要单独找出来看看：哪些照片带有这些小比例的标注？这个比例是多少？总数又是多少？

In [14]:
# # Read json of class names
# with open(class_names_path, 'r') as fp:
#     class_names = json.load(fp)

## Details and Visualisation

In [22]:
def get_subset_labels(sub_p_path):
    """Get the labels from a subset of a person.

    Every ``img/*.jpg`` file of the subset is matched to its annotation by
    file name: ``ann/<image file name>.json`` is tried first, then
    ``ann/<image name without extension>.json``. Pairing by name (instead of
    by position in two sorted listings) keeps labels attached to the right
    image even when some image or annotation file is missing. Images without
    a matching annotation file are skipped.

    Argument:
        sub_p_path: the path of the subset. e.g. '../datasets/supervisely\\P23\\P23_01'

    Returns:
        sub_p_labels: a dictionary whose keys are image names (file name
            without extension), values are a list of tuples.
            Each tuple is a bounding box:
            (class_name, b_x_center, b_y_center, b_width, b_height)
            with all four numbers divided by the image width/height taken
            from the annotation and rounded to 6 decimals. Images whose
            annotation has no objects map to an empty list.
    """
    sub_p_labels = {}

    ann_dir = os.path.join(sub_p_path, 'ann')
    img_paths = sorted(glob.glob(os.path.join(sub_p_path, 'img', '*.jpg')))

    for img_path in img_paths:
        img_file = os.path.basename(img_path)
        img_name = os.path.splitext(img_file)[0]

        # Locate the annotation that belongs to this image by name
        candidates = (
            os.path.join(ann_dir, img_file + '.json'),
            os.path.join(ann_dir, img_name + '.json'),
        )
        ann_path = next((c for c in candidates if os.path.isfile(c)), None)
        if ann_path is None:
            continue    # no annotation for this image

        sub_p_labels[img_name] = []

        # Binary mode lets json detect the text encoding itself instead of
        # relying on the platform's default locale encoding
        with open(ann_path, 'rb') as ann_f:
            ann_data = json.load(ann_f)

        # Image size
        image_size = ann_data['size']   # dict: {'height': , 'width': }
        img_width = image_size['width']
        img_height = image_size['height']

        # Objects bounding boxes
        for bbox in ann_data['objects']:
            class_name = bbox['classTitle']
            exterior = bbox['points']['exterior']   # list of [x, y] points

            # Use min/max over the points so the box is the same regardless
            # of the order in which the corners are stored; for the usual
            # [[left, top], [right, bottom]] pair this is just those corners.
            xs = [pt[0] for pt in exterior]
            ys = [pt[1] for pt in exterior]
            left, right = min(xs), max(xs)
            top, bottom = min(ys), max(ys)

            # Normalisation
            b_x_center = (left + right) / 2 / img_width
            b_y_center = (top + bottom) / 2 / img_height
            b_width = (right - left) / img_width
            b_height = (bottom - top) / img_height

            # Save bbox label as a tuple for the image
            sub_p_labels[img_name].append(
                (
                    class_name,
                    round(b_x_center, 6),
                    round(b_y_center, 6),
                    round(b_width, 6),
                    round(b_height, 6)
                )
            )

    return sub_p_labels

In [25]:
# Labels of all datasets of all person
# format:
# {
#   'person':{
#               'subset':{
#                           'image': [(*bbox attributes)]
#                        }
#            }
# }
labels = {}

# Collect all annotations into a single json
for p in tqdm(os.listdir(DS_PATH), desc='Person'):
    p_path = os.path.join(DS_PATH, p)
    if not os.path.isdir(p_path):
        continue    # only directories are person datasets

    labels[p] = {}   # 'person'

    for sub_p in os.listdir(p_path):
        sub_p_path = os.path.join(p_path, sub_p)
        if not os.path.isdir(sub_p_path):
            continue

        labels[p][sub_p] = get_subset_labels(sub_p_path)    # 'subset'
        # tqdm.write prints the message without breaking the progress bar;
        # a plain print() gets interleaved with the bar's own output.
        tqdm.write(
            "{} image-annotation pairs in subset {} of person {}".format(len(labels[p][sub_p]), sub_p, p)
        )

# Save the nested person/subset/image labels as JSON in the working directory
labels_path = 'labels.json'
print('\nSave labels for person/subset/images in {}.'.format(labels_path))
with open(labels_path, 'w') as fp:
    json.dump(labels, fp, indent=4)

Person:   0%|          | 0/21 [00:00<?, ?it/s]1004 image-annotation pairs in subset P01_02 of person P01
238 image-annotation pairs in subset P01_03 of person P01
984 image-annotation pairs in subset P01_06 of person P01
Person:  10%|▉         | 2/21 [00:00<00:03,  5.91it/s]2070 image-annotation pairs in subset P02_06 of person P02
134 image-annotation pairs in subset P02_08 of person P02
109 image-annotation pairs in subset P02_11 of person P02
Person:  14%|█▍        | 3/21 [00:00<00:03,  4.68it/s]219 image-annotation pairs in subset P03_06 of person P03
215 image-annotation pairs in subset P03_07 of person P03
106 image-annotation pairs in subset P03_11 of person P03
49 image-annotation pairs in subset P03_18 of person P03
195 image-annotation pairs in subset P03_27 of person P03
Person:  19%|█▉        | 4/21 [00:00<00:03,  4.81it/s]394 image-annotation pairs in subset P04_21 of person P04
777 image-annotation pairs in subset P05_04 of person P05
312 image-annotation pairs in subset 

找出少数类标注所在的图片、这些图片的数目，以及图片总数。

In [26]:
def find_annotation(labels, annotation):
    """Tell whether any bounding box in `labels` carries the class `annotation`.

    Arguments:
        labels: nested dictionary {person: {subset: {image name: [bbox, ...]}}}
            where each bbox is a sequence whose first element is the class name.
        annotation: the class name to look for.

    Returns:
        True if at least one bounding box of any image has `annotation` as its
        class name, False otherwise (including when `labels` is empty).
    """
    # any() stops at the first matching box instead of scanning every subset
    return any(
        bbox[0] == annotation
        for p_labels in labels.values()
        for sub_p_labels in p_labels.values()
        for bboxes in sub_p_labels.values()
        for bbox in bboxes
    )

In [27]:
# Union of every class name listed for any person in class_names
all_annotations = {name for clss in class_names.values() for name in clss}

# Report, per class name, whether at least one labelled image uses it
for annotation in all_annotations:
    found = find_annotation(labels, annotation)
    template = "Image exists with annotation: '{}'" if found else "No image with annotation: '{}'"
    print(template.format(annotation))

No image with annotation: 'left_hand'
No image with annotation: 'right_hand'
No image with annotation: '1'
Image exists with annotation: 'hand'


现在问题很清晰了，第一个人的数据集根本没有我开始认为的少数标签：`1`, `left_hand`, `right_hand`，猜想是最开始设置了这些但是实际标注只标注了`hand`的标签。

所以得到一个结论：**这一次下载的所有的数据集都只有`hand`的标签**。可以把第一个人的`meta.json`的无用标签删掉了。

下一步，**数据清洗`hand`数据集，生成anchors，完成训练**。