In [None]:
import os
import json
import pandas as pd

# Paths of the bounding-box annotations and of the split definition stored in splits/1.json.
boxes_path = os.path.join('..', 'data', 'BeesDataset', 'boxes.json')
splits_path = os.path.join('..', 'data', 'BeesDataset', 'splits', '1.json')
# JSON text is UTF-8 by specification; pin the encoding so that decoding does
# not depend on the platform's default locale encoding.
with open(boxes_path, 'r', encoding='utf-8') as input_file:
    boxes = json.load(input_file)
with open(splits_path, 'r', encoding='utf-8') as input_file:
    splits = json.load(input_file)

def get_split(img_name, splits):
    """Return the name of the first split that contains *img_name*.

    Parameters
    ----------
    img_name : str
        Image identifier to look up.
    splits : dict
        Mapping from split name to a container of image identifiers.

    Returns
    -------
    The first key of *splits* (in iteration order) whose value contains
    *img_name*, or ``None`` when no split contains it.
    """
    matching = (name for name in splits if img_name in splits[name])
    return next(matching, None)

# Flatten the per-image annotations into one row per bounding box.
object_rows = []
for image in boxes:
    # Both values depend only on the image, so they are computed once per
    # image instead of once (or twice) per box.
    name_parts = image['image_name'].split()
    image_split = get_split(image['image_name'], splits)
    for box in image['boxes']:
        object_rows.append([
            name_parts[0],  # first whitespace-separated token -> 'movie_name'
            name_parts[1],  # second whitespace-separated token -> 'image_name'
            box['xmin'], box['ymin'],
            box['width'], box['height'],
            box['width'] * box['height'],  # bounding-box area
            image_split,  # None when the image is listed in no split
            ])
objects = pd.DataFrame(object_rows, columns=['movie_name', 'image_name', 'xmin', 'ymin', 'width', 'height', 'area', 'split_1'])
objects.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_donut(labels:list, data:list, title:str, startangle:float, save_path:str=None, save_dpi:float=300):
    """Draw a donut chart with one boxed, line-connected annotation per wedge.

    Parameters
    ----------
    labels : list
        Caption of each wedge, in the same order as *data*.
    data : list
        Wedge sizes. Each value is also printed in its annotation with two
        decimals followed by '%'.
    title : str
        Axes title.
    startangle : float
        Forwarded to ``Axes.pie`` as ``startangle``.
    save_path : str, optional
        When not None, the figure is saved to this path before being shown.
    save_dpi : float
        Resolution forwarded to ``savefig``.
    """
    fig, ax = plt.subplots(subplot_kw={'aspect': 'equal'})
    wedges, _ = ax.pie(data, wedgeprops={'width': 0.5}, startangle=startangle)

    label_box = {'boxstyle': 'square,pad=0.3', 'fc': 'w', 'ec': 'k', 'lw': 0.72}
    alignment_by_side = {-1: 'right', 1: 'left'}

    for idx, wedge in enumerate(wedges):
        # Bisector angle of the wedge (degrees) and the matching point on the unit circle.
        mid_angle = wedge.theta1 + (wedge.theta2 - wedge.theta1) / 2.
        mid_angle_rad = np.deg2rad(mid_angle)
        x_pos = np.cos(mid_angle_rad)
        y_pos = np.sin(mid_angle_rad)
        # -1 for wedges on the left half, +1 for wedges on the right half.
        x_side = np.sign(x_pos)
        text_alignment = alignment_by_side[int(x_side)]
        connector = {
            'arrowstyle': '-',
            'connectionstyle': 'angle,angleA=0,angleB={}'.format(mid_angle),
        }
        ax.annotate(
            '{}: {:.2f}%'.format(labels[idx], data[idx]),
            xy=(x_pos, y_pos),
            xytext=(1.2 * x_side, 1.2 * y_pos),
            horizontalalignment=text_alignment,
            arrowprops=connector,
            bbox=label_box,
            zorder=0,
            va='center',
        )

    ax.set_title(title)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=save_dpi)
    plt.show()

labels = ['trening', 'walidacja', 'test']
# Percentage of objects in each subset, relative to all objects that have a
# subset assigned (non-null 'split_1').
assigned_total = objects['split_1'].count()
data = [
    (objects['split_1'] == split_name).sum() / assigned_total * 100
    for split_name in ['train', 'validate', 'test']
]
plot_donut(labels, data, 'Procentowy podział obiektów na podzbiory', 200)

In [None]:
# https://matplotlib.org/stable/gallery/lines_bars_and_markers/horizontal_barchart_distribution.html
def plot_barh_chart(results, category_names, title):
    """Draw one stacked horizontal bar per entry of *results*.

    Parameters
    ----------
    results : dict
        Maps each bar label to a list of segment values. All lists are
        assumed to have the same number of entries, matching the length of
        *category_names*.
    category_names : list of str
        Legend label of each segment.
    title : str
        Axes title.
    """
    bar_labels = list(results)
    values = np.array(list(results.values()))
    # Right edge of every segment; the left edge is obtained by subtracting its width.
    segment_ends = np.cumsum(values, axis=1)
    palette = plt.get_cmap('winter')(np.linspace(0.15, 0.85, values.shape[1]))

    fig, ax = plt.subplots(figsize=(6, 12))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    ax.set_xlim(0, values.sum(axis=1).max())

    for col, (category, rgba) in enumerate(zip(category_names, palette)):
        segment_widths = values[:, col]
        segment_starts = segment_ends[:, col] - segment_widths
        bars = ax.barh(bar_labels, segment_widths, left=segment_starts, height=0.8,
                       label=category, color=rgba)

        # Choose the caption colour from the product of the bar's RGB channels.
        red, green, blue, _alpha = rgba
        if red * green * blue < 0.5:
            text_color = 'white'
        else:
            text_color = 'darkgrey'
        # Segments of zero (or negative) width get an empty caption.
        captions = [f'{w:.1f}%' if w > 0 else '' for w in segment_widths]
        ax.bar_label(bars, labels=captions, label_type='center', color=text_color)

    ax.legend(ncol=len(category_names), loc='upper center', fontsize='small')
    ax.set_title(title)
    plt.tight_layout()
    plt.show()

category_names = ['trening', 'walidacja', 'test']
# For every movie: percentage of its objects that belong to each subset.
data = {}
for movie in objects['movie_name'].unique():
    in_movie = objects['movie_name'] == movie
    movie_total = in_movie.sum()
    data[movie] = [
        (in_movie & (objects['split_1'] == split_name)).sum() / movie_total * 100
        for split_name in ['train', 'validate', 'test']
    ]
plot_barh_chart(data, category_names, 'Udział procentowy obiektów w podzbiorach dla każdego filmu.')

In [None]:
# TODO: verify the area thresholds and whether they need rescaling.
# Candidate factor left for reference: scale = 1920/640 * 1080/480
scale = 1
# Consecutive boundaries of the size classes. pd.Interval is right-closed by
# default, so each class excludes its lower bound and includes its upper bound.
_area_bounds = [0, scale*32**2, scale*96**2, scale*1e5**2]
area_rngs = {
    size_label: pd.Interval(lower, upper)
    for size_label, lower, upper in zip(
        ['small', 'medium', 'large'], _area_bounds[:-1], _area_bounds[1:]
    )
}

def get_rng_label(area, area_rngs=area_rngs):
    """Return the key of the first interval in *area_rngs* that contains *area*.

    Returns ``None`` when *area* lies outside every interval.
    """
    return next((name for name in area_rngs if area in area_rngs[name]), None)

# Tag every object with the size class its area falls into (None if it fits no class).
objects['area_label'] = objects['area'].map(get_rng_label)

def get_area_percentage(objects:pd.DataFrame, area_rngs=area_rngs):
    """Compute the percentage of labelled rows of *objects* in each size class.

    Parameters
    ----------
    objects : pd.DataFrame
        Frame with an 'area_label' column.
    area_rngs : mapping
        Its keys are the size labels to report; the result follows their
        iteration order.

    Returns
    -------
    dict
        ``{label: percentage}`` with one entry per key of *area_rngs*. A label
        that does not occur in *objects* is reported as 0 instead of raising
        ``KeyError``; when no row carries a label, every entry is 0.0.
    """
    # value_counts() lists only labels that actually occur and skips missing values.
    label_counts = objects['area_label'].value_counts()
    labelled_total = label_counts.sum()
    if labelled_total == 0:
        # Nothing to normalise by; avoid dividing zero by zero.
        return {rng: 0.0 for rng in area_rngs}
    return {rng: label_counts.get(rng, 0) / labelled_total * 100 for rng in area_rngs}

# Display names of the size classes used in chart labels and titles.
translate = dict(small='małe', medium='średnie', large='duże')
area_percentage = get_area_percentage(objects)
plot_donut(
    [translate[label] for label in area_rngs],
    list(area_percentage.values()),
    'Udział obiektów w zbiorze ze względu na rozmiar',
    220,
)

In [None]:
# One donut chart of the size-class shares for every key of `splits`.
# The title fragments and wedge captions do not depend on the split, so they are built once.
title_dict = {'train': 'treningowym', 'validate': 'walidacyjnym', 'test': 'testowym'}
labels = [translate[rng_label] for rng_label in area_rngs]
for split in splits:
    split_objects = objects[objects['split_1'] == split]
    area_percent = get_area_percentage(split_objects)
    data = [area_percent[rng] for rng in area_rngs]
    chart_title = 'Udział obiektów w podzbiorze {} ze względu na rozmiar'.format(title_dict[split])
    plot_donut(labels, data, chart_title, -100)

In [None]:
import matplotlib.patches as patches
from PIL import Image

# Number of objects of each size class in every image.
objs_sizes_count = objects.groupby(by=['movie_name', 'image_name', 'area_label']).size()
# For every size class, the (movie_name, image_name) pair holding the most objects of that class.
imgs = {rng: objs_sizes_count.loc[:, :, rng].idxmax() for rng in area_rngs}
for rng, img in imgs.items():
    image_path = os.path.join('..', 'data', 'BeesDataset', 'images', ' '.join(img))
    # The context manager closes the image file once its pixels have been handed to matplotlib.
    with Image.open(image_path) as im:
        fig, ax = plt.subplots(figsize=(15, 15))
        ax.imshow(im)
    # Outline every annotated object of the selected image.
    image_objects = objects.loc[(objects['movie_name'] == img[0]) & (objects['image_name'] == img[1])]
    for bbox in image_objects.itertuples(index=False):
        ax.add_patch(patches.Rectangle((bbox.xmin, bbox.ymin), bbox.width, bbox.height,
                                       linewidth=1, edgecolor='r', facecolor='none'))
    plt.title(translate[rng])
    plt.show()

In [None]:
# Distribution of the width/height ratio of all bounding boxes.
bbox_ratios = objects['width'] / objects['height']
hist = bbox_ratios.hist()
hist.set_xlabel('Ratio (width/height)')
hist.set_title('Bbox ratio histogram')
hist.set_ylabel('Number of appearances')

In [None]:
# Width/height ratio of every distinct image referenced by `objects`.
img_ratios = []
# File names are "<movie_name> <image_name>"; build them with vectorised string
# concatenation instead of a row-wise apply.
for filename in (objects['movie_name'] + ' ' + objects['image_name']).unique():
    # Only the size is read here, so close the file explicitly via the
    # context manager instead of leaving the handle open.
    with Image.open(os.path.join('..', 'data', 'BeesDataset', 'images', filename)) as im:
        width, height = im.size
    img_ratios.append(width/height)

In [None]:
# Histogram of the aspect ratios collected in `img_ratios`.
fig, axs = plt.subplots(nrows=1, tight_layout=True)
axs.hist(img_ratios)
axs.set(
    title='Image ratio histogram',
    xlabel='Ratio (width/height)',
    ylabel='Number of appearances',
)
axs.grid()
plt.show()