Used for obtaining prior anchor boxes based on the training dataset.
The input JSON format is the one produced by the preprocess_labels.py file.

Reference NB:
https://github.com/FairyOnIce/ObjectDetectionYolo

In [None]:
import json

# Location of the training annotations in JSON form.
path = "/home/jupyter/VOCdevkit/VOC2012/pascal_voc_2dbbox_train.json"

# Open through a context manager so the file handle is closed as soon as the
# annotations have been parsed (the bare open() call left it dangling).
with open(path, encoding="utf-8") as label_file:
    train_data = json.load(label_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
# Count how many labelled boxes each category has across the training data,
# then show the counts as a horizontal bar chart (smallest count first).
class_freq = {}

for i in train_data:
    for j in i["labels"]:
        category = j["category"]
        # Default to 0 and add 1 so the first occurrence is counted as well;
        # initialising a new category to 0 under-counted every class by one.
        class_freq[category] = class_freq.get(category, 0) + 1

# Materialise the dict views as lists so pandas receives plain sequences.
df = pd.DataFrame(
    {
        "category": list(class_freq.keys()),
        "count": list(class_freq.values()),
    }
)
df = df.sort_values(by="count")

sns.barplot(y="category", x="count", data=df)
plt.show()

In [None]:
# Collect the height and width of every labelled box, each divided by the
# height / width of the image it belongs to, then scatter-plot the pairs.
h = []
w = []
for record in train_data:
    img_height = record['size']['height']
    img_width = record['size']['width']
    for label in record["labels"]:
        corners = label["box2d"]
        box_height = corners["y2"] - corners["y1"]
        box_width = corners["x2"] - corners["x1"]
        h.append(box_height / img_height)
        w.append(box_width / img_width)

hw_df = pd.DataFrame({"h": h, "w": w})

sns.scatterplot(y="w", x="h", data=hw_df)
plt.show()

In [None]:
def iou(box, clusters):
    """Intersection over Union between one box and every row of ``clusters``.

    Only widths and heights are used: the overlap is taken as
    min(width) * min(height), so box positions play no role.

    :param box:      np.array of shape (2,) containing w and h
    :param clusters: np.array of shape (N cluster, 2)
    :return: np.array of shape (N cluster,) with one IoU value per row
    """
    box_w, box_h = box[0], box[1]
    cluster_w = clusters[:, 0]
    cluster_h = clusters[:, 1]

    overlap = np.minimum(cluster_w, box_w) * np.minimum(cluster_h, box_h)
    union = box_w * box_h + cluster_w * cluster_h - overlap

    return overlap / union

In [None]:
def kmeans(boxes, k, dist=np.median,seed=1):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.

    The distance between a box and a cluster center is ``1 - iou``.

    :param boxes: numpy array of shape (r, 2), where r is the number of rows
    :param k: number of clusters
    :param dist: reduction used to recompute a cluster center from its members;
                 called as ``dist(members, axis=0)`` (e.g. np.median or np.mean)
    :param seed: seed passed to np.random.seed before picking the initial centers
    :return: tuple ``(clusters, nearest_clusters, distances)`` where
             clusters is an array of shape (k, 2) with the cluster centers,
             nearest_clusters is an array of shape (r,) with each row's cluster index,
             distances is an array of shape (r, k) holding 1 - IoU to every center
    """
    # Work in floating point so that mean/median centers are not truncated
    # when the caller passes integer box sizes.
    boxes = np.asarray(boxes, dtype=float)
    rows = boxes.shape[0]

    distances = np.empty((rows, k))  # N row x N cluster
    # -1 is never a valid cluster index, so the first pass can never look
    # "converged". Starting from zeros made the loop exit before any center
    # update whenever the first assignment was all cluster 0 (always for k=1).
    last_clusters = np.full((rows,), -1)

    np.random.seed(seed)

    # initialize the cluster centers to be k distinct rows of boxes
    # (fancy indexing returns a copy, so boxes itself is never modified)
    clusters = boxes[np.random.choice(rows, k, replace=False)]

    while True:
        # Step 1: allocate each item to the closest cluster center
        for icluster in range(k):
            distances[:, icluster] = 1 - iou(clusters[icluster], boxes)

        nearest_clusters = np.argmin(distances, axis=1)

        # Stop once the assignment no longer changes between iterations
        if (last_clusters == nearest_clusters).all():
            break

        # Step 2: recompute each cluster center from the cases assigned to it
        for cluster in range(k):
            members = boxes[nearest_clusters == cluster]
            # Keep the previous center for a cluster that lost all its members;
            # reducing an empty array would produce a NaN center.
            if len(members):
                clusters[cluster] = dist(members, axis=0)

        last_clusters = nearest_clusters

    return clusters,nearest_clusters,distances



In [None]:
# Run k-means for k = 2 .. kmax-1 and keep, for every k, the cluster centers,
# the per-box assignment, the full distance matrix and the mean distance of
# each box to its own cluster center.
kmax = 11
dist = np.mean
results = {}
wh = np.asarray(list(zip(w, h)))
for k in range(2, kmax):
    clusters, nearest_clusters, distances = kmeans(wh, k, seed=2, dist=dist)
    # Pick, for every row, the distance to the cluster it was assigned to.
    row_index = np.arange(distances.shape[0])
    WithinClusterMeanDist = np.mean(distances[row_index, nearest_clusters])
    result = {
        "clusters": clusters,
        "nearest_clusters": nearest_clusters,
        "distances": distances,
        "WithinClusterMeanDist": WithinClusterMeanDist,
    }
    # Distances are 1 - IoU, so 1 - mean distance is reported as the mean IoU.
    print("{:2.0f} clusters: mean IoU = {:5.4f}".format(k, 1 - result["WithinClusterMeanDist"]))
    results[k] = result

### Visualization of k-means results 

In [None]:
def plot_cluster_result(plt,clusters,nearest_clusters,WithinClusterSumDist,wh):
    """Plot the rows of ``wh`` coloured by cluster and label each cluster center.

    :param plt: plotting interface the drawing calls are issued on
                (called with matplotlib.pyplot in this notebook)
    :param clusters: array indexed as clusters[i, 0], clusters[i, 1]; gives the
                     position of the "c<i>" text for cluster i
    :param nearest_clusters: cluster index of every row of ``wh``
    :param WithinClusterSumDist: value printed in the legend title as "Mean IoU"
    :param wh: array whose column 0 goes on the x axis ("width") and
               column 1 on the y axis ("height")

    Colours are looked up in the module-level ``current_palette`` by cluster index.
    """
    for cluster_id in np.unique(nearest_clusters):
        members = nearest_clusters == cluster_id
        colour = current_palette[cluster_id]
        plt.rc('font', size=8)

        # Points belonging to this cluster, with the member count in the legend
        legend_entry = "cluster = {}, N = {:6.0f}".format(cluster_id, np.sum(members))
        plt.plot(wh[members, 0], wh[members, 1], "p",
                 color=colour, alpha=0.5, label=legend_entry)

        # Mark the cluster center with its index
        center_x = clusters[cluster_id, 0]
        center_y = clusters[cluster_id, 1]
        plt.text(center_x, center_y, "c{}".format(cluster_id),
                 fontsize=20, color="red")

        plt.title("Clusters")
        plt.xlabel("width")
        plt.ylabel("height")

    legend_title = "Mean IoU = {:5.4f}".format(WithinClusterSumDist)
    plt.legend(title=legend_title)
    
import seaborn as sns
# Colour list indexed by cluster number inside plot_cluster_result
current_palette = list(sns.xkcd_rgb.values())

# One subplot per value of k, laid out on a two-column grid.
figsize = (15,35)
n_cols = 2
n_plots = len(range(2, kmax))
# add_subplot needs an integer row count; kmax/2 is a float (5.5 for kmax=11),
# so use an integer ceiling division that always leaves room for every plot.
n_rows = (n_plots + n_cols - 1) // n_cols

fig = plt.figure(figsize=figsize)
for count, k in enumerate(range(2, kmax), start=1):
    result               = results[k]
    clusters             = result["clusters"]
    nearest_clusters     = result["nearest_clusters"]
    WithinClusterSumDist = result["WithinClusterMeanDist"]

    # Make the next grid cell the current axes; plot_cluster_result draws on it
    fig.add_subplot(n_rows, n_cols, count)
    # Stored value is a mean of 1 - IoU, so pass 1 - value as the mean IoU
    plot_cluster_result(plt, clusters, nearest_clusters, 1 - WithinClusterSumDist, wh)
plt.show()

In [None]:
# Mean IoU (1 - mean within-cluster distance) as a function of the number of
# clusters, used to choose how many anchor boxes to keep.
plt.figure(figsize=(6,6))
k_values = list(range(2, kmax))
plt.plot(k_values,
         [1 - results[k]["WithinClusterMeanDist"] for k in k_values], "o-")
# Show the reduction's name; formatting the function object itself put its
# repr ("<function mean at 0x...>") into the title.
plt.title("within cluster mean of {}".format(getattr(dist, "__name__", dist)))
plt.ylabel("mean IOU")
plt.xlabel("N clusters (= N anchor boxes)")
plt.show()


In [None]:
# Number of anchor boxes to keep; selects the k-means run with that many clusters
Nanchor_box = 5
chosen_run = results[Nanchor_box]
# Cluster centers of that run, one row per anchor: Width, Height
anchors = chosen_run["clusters"]
anchors

In [None]:
import cv2
## image shape H, W, C

# Draw every anchor box centred on a blank canvas to compare their shapes.
img_h, img_w = 100,100

image = np.zeros([img_h,img_w,3], dtype = np.uint8)
window_name = 'Image'
color = (255, 0, 0)
thickness = 1

center_x, center_y = img_w / 2, img_h / 2

for an in anchors:
    # anchors hold (width, height); scale them to canvas pixels
    box_w, box_h = int(an[0]*img_w) , int(an[1]*img_h)
    # cv2.rectangle takes its corner points as (x, y): the x coordinate comes
    # from the width and y from the height. The previous (y, x) order drew
    # every box with its width and height swapped.
    start_point = (int(center_x - box_w/2) , int(center_y - box_h/2))
    end_point = (int(center_x + box_w/2) , int(center_y + box_h/2))
    image = cv2.rectangle(image, start_point, end_point, color, thickness)

plt.imshow(image)
plt.show()
