In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

## Imports

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pylab
from pycocotools.coco import COCO

# Default figure size (width, height in inches) for every plot in this notebook.
plt.rcParams["figure.figsize"] = (8.0, 10.0)


## Configure which dataset to load:

In [None]:
dataType = ""

# Path to the COCO-format annotation file to analyze.
# The default is the original author's local copy; set the GEOSCREENS_ANN_FILE
# environment variable to analyze another file without editing the notebook.
DEFAULT_ANN_FILE = "/home/gbiamby/proj/geoscreens/datasets/geoscreens_011/geoscreens_011.json"
annFile = Path(os.environ.get("GEOSCREENS_ANN_FILE", DEFAULT_ANN_FILE))

print("annFile: ", annFile)


## Load dataset

In [None]:
# Load the annotation file configured above into a pycocotools COCO object;
# later cells read `coco.dataset` and `coco.imgToAnns` from it.
coco = COCO(annFile)


## Output some dataset stats:

In [None]:
# Raw image and annotation records from the loaded dataset.
imgs, anns = coco.dataset["images"], coco.dataset["annotations"]
# Keys of coco.imgToAnns (per the label printed below: the images that have annotations).
imgs_with_anns = coco.imgToAnns.keys()

# Headline counts for the dataset.
print("Total images: ", len(imgs))
print("Total images with annotations: ", len(imgs_with_anns))
print("Total annotations: ", len(anns))

## Annotations per image:

In [None]:
# Number of annotations for each image listed in coco.imgToAnns.
# (The loop variable is deliberately NOT called `anns`, to avoid confusion with
# the notebook-level `anns` list of all annotations.)
ann_counts_per_image = [len(image_anns) for image_anns in coco.imgToAnns.values()]
max_anns_per_image = int(np.max(ann_counts_per_image))
print(
    f"Annotations per image (ignoring images w/o any anns): "
    f"Avg.: {np.average(ann_counts_per_image)}, "
    f"Min: {np.min(ann_counts_per_image)}, "
    f"Max: {max_anns_per_image}"
)

# Plot the distribution, using at most 25 bins.
fig_size, dpi = (20, 15), 80
plt.figure(num=None, figsize=fig_size, dpi=dpi, facecolor="w", edgecolor="k")
plt.hist(ann_counts_per_image, bins=min(max_anns_per_image, 25))
plt.title("Annotations per Image Distribution " + annFile.name)
plt.xlabel("# Annotations per Image", fontsize=12)
plt.ylabel("# Images", fontsize=12)
plt.show()

# To cover full annotations for `K_coverage`% of images, we need to use opt.K=:
K_coverage = 99.9
# Use K_coverage itself as the percentile so changing it actually changes the result.
K_percentile = np.percentile(a=ann_counts_per_image, q=[K_coverage])
# Round UP: the interpolated percentile can be fractional, and truncating it
# could yield a K that covers slightly fewer than K_coverage% of the images.
print(
    f"To cover full annotations for {K_coverage}% of images, we need to use opt.K={int(np.ceil(K_percentile[0]))}"
)


## Plot Histogram of bbox areas (pixels^2):



In [None]:
def plot_bbox_area_histogram(areas, step_size=100, max_percentile=97.5):
    """Print summary statistics for bbox areas and plot their histogram.

    Args:
        areas: Sequence of bounding-box areas (pixels^2).
        step_size: Width of each histogram bin, in pixels^2.
        max_percentile: Percentile of `areas` at which the histogram's x-range
            is cut off, so that a few very large boxes do not flatten the plot.

    Returns:
        None. Prints min/max/average area and shows a matplotlib figure.
        If `areas` is empty, prints a notice and returns without plotting.
    """
    if len(areas) == 0:
        # np.min / np.percentile raise on empty input; there is nothing to plot.
        print("No bbox areas to plot.")
        return

    print(f"min area: {np.min(areas)}, max area: {np.max(areas)}, avg. area: {np.average(areas)}")

    # Configure histogram bins: fixed-width bins from 0 up to the cutoff percentile.
    # Clamp the cutoff to at least 1 so there are always two bin edges (one bin).
    upper_area = max(int(np.percentile(areas, max_percentile)), 1)
    bins = list(range(0, upper_area + step_size, step_size))

    # Plot (the title uses the notebook-level `annFile`):
    fig_size, dpi = (14, 6), 80
    plt.figure(num=None, figsize=fig_size, dpi=dpi, facecolor="w", edgecolor="k")
    plt.hist(areas, bins=bins)
    plt.title("Object Size Distribution " + annFile.name)
    plt.xlabel("Area (pixels^2)", fontsize=12)
    plt.ylabel("# Annotations", fontsize=12)
    plt.show()

In [None]:
# Area of every annotation's box: bbox[2] * bbox[3] (the same two entries the
# later cells treat as the box dimensions).
areas = [box[2] * box[3] for box in (ann["bbox"] for ann in anns)]
plot_bbox_area_histogram(areas)

In [None]:
# List every very small box area (<= 10 pixels^2), which includes any zero-area boxes.
# Note carried over from an earlier run: only 6 zero-area boxes, but many small ones.
print(sorted(a for a in areas if a <= 10))

# Zoom in on the small end of the distribution (areas up to 5000 pixels^2).
small_areas = sorted(a for a in areas if a <= 5000)

# Histogram bins: fixed-width steps from 0 up to the 97.5th percentile of the small areas.
step_size = 5
bins = list(
    range(
        0,
        int(np.percentile(small_areas, 97.5)) + step_size,
        step_size,
    )
)

# Plot:
fig_size, dpi, rotation = (14, 6), 80, 90
plt.figure(num=None, figsize=fig_size, dpi=dpi, facecolor="w", edgecolor="k")
plt.hist(small_areas, bins=bins)
plt.title("Object Size Distribution " + annFile.name)
plt.xlabel("Area (pixels^2)", fontsize=12)
plt.ylabel("# Annotations", fontsize=12)
plt.show()

## Calculate new area (pixels^2) cutoffs for small / medium / large objects

In [None]:
print("Total annotations: ", len(areas))

# Tercile boundaries of the area distribution: they split the annotations into
# three roughly equal-sized groups.
area_percentiles = np.percentile(a=areas, q=[33.33333, 66.66667])
print("area percentiles: ", area_percentiles)
print(f"Using these cutoffs ({area_percentiles}) we get this many annotations in each group:")

# Count the annotations that fall into each size bucket.
small_max, medium_max = area_percentiles
print("Small:  ", sum(1 for a in areas if 0.0 <= a <= small_max))
print("Medium: ", sum(1 for a in areas if small_max < a <= medium_max))
print("Large:  ", sum(1 for a in areas if medium_max < a))


## Inspect object center distribution

I.e., where in the images do the bbox centers fall?

In [None]:
# Fallback image size (the xview chip size). It is only used for images whose
# record has no "width"/"height"; otherwise each box is normalized by the
# dimensions of its own image, so datasets with other image sizes work too.
img_size = (512.0, 512.0)
bbox_dims = [(ann["bbox"][2], ann["bbox"][3]) for ann in anns]
# Box centers in pixels: (x + (x + w)) / 2 and (y + (y + h)) / 2, truncated to int.
bbox_centers = [
    (int((ann["bbox"][0] + ann["bbox"][2] + ann["bbox"][0]) / 2.0),
    int((ann["bbox"][1] + ann["bbox"][3] + ann["bbox"][1]) / 2.0))
    for ann in anns
]


def _image_size(image_id):
    """Return (width, height) of an image as floats, falling back to `img_size`."""
    img_info = coco.imgs.get(image_id, {})
    return (
        float(img_info.get("width") or img_size[0]),
        float(img_info.get("height") or img_size[1]),
    )


# Centers as fractions of the owning image's width / height.
bbox_centers_normalized = [
    (center[0] / dims[0], center[1] / dims[1])
    for center, dims in zip(bbox_centers, (_image_size(ann["image_id"]) for ann in anns))
]
centers_x, centers_y = map(list, zip(*bbox_centers_normalized))


import matplotlib.colors as mcolors

fig_size, dpi = (14, 14), 80
plt.figure(num=None, figsize=fig_size, dpi=dpi, facecolor='w', edgecolor='k')
plt.title('Bounding Box Center Locations')
plt.hist2d(centers_x, centers_y, bins=10)
# Image y-coordinates grow downward; flip the axis so the heatmap is oriented
# like the images themselves.
plt.gca().invert_yaxis()
plt.xlabel('Center x (fraction of image width)', fontsize=12)
plt.ylabel('Center y (fraction of image height)', fontsize=12)
plt.colorbar()
plt.show()

In [None]:
# Distribution of the normalized box-center x coordinates.
# (Previously this figure was titled "X" but plotted centers_y, and vice versa.)
fig_size, dpi = (14, 14), 80
plt.figure(num=None, figsize=fig_size, dpi=dpi, facecolor='w', edgecolor='k')
plt.title('Bounding Box Center Locations - X')
plt.hist(centers_x)
plt.xlabel('Center x (normalized)', fontsize=12)
plt.ylabel('# Annotations', fontsize=12)
plt.show()


# Distribution of the normalized box-center y coordinates.
fig_size, dpi = (14, 14), 80
plt.figure(num=None, figsize=fig_size, dpi=dpi, facecolor='w', edgecolor='k')
plt.title('Bounding Box Center Locations - Y')
plt.hist(centers_y)
plt.xlabel('Center y (normalized)', fontsize=12)
plt.ylabel('# Annotations', fontsize=12)
plt.show()

In [None]:
# Sanity check: number of box centers (one per annotation in `anns`).
len(centers_x)
