In [1]:
import json
import pandas as pd
import cv2
import numpy as np
import torch
from torchvision import datasets, models, transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

In [2]:
path = "data/images/"

In [3]:
json_data = json.load(open("data/pascal_train2012.json"))

In [4]:
cats = json_data["categories"]
id_cat = []

for c in cats:
    id_cat.append([c["id"], c["name"]])
    
df_cats = pd.DataFrame(id_cat, columns=["category_id", "name"])

In [5]:
id_cat = {key: value-1 for (value, key) in id_cat}

In [6]:
df_filename = pd.DataFrame(json_data["images"])
df_filename.columns = ["file_name", "height", "image_id", "width"]

df_bbox = pd.DataFrame(json_data["annotations"])

In [7]:
df = df_filename.merge(df_bbox, on="image_id")
df = df[df["ignore"] == 0]
df = df.drop(["area", "ignore", "iscrowd", "segmentation", "image_id"], axis=1)
df = df.merge(df_cats, on="category_id")

In [8]:
df.head()

Unnamed: 0,file_name,height,width,bbox,category_id,id,name
0,2008_000008.jpg,442,500,"[52, 86, 419, 334]",13,1,horse
1,2008_000141.jpg,333,500,"[93, 8, 407, 325]",13,69,horse
2,2008_000142.jpg,333,500,"[108, 74, 386, 259]",13,71,horse
3,2008_000371.jpg,333,500,"[188, 128, 186, 148]",13,229,horse
4,2008_000428.jpg,253,400,"[0, 42, 378, 195]",13,276,horse


In [9]:
grouped_data = []

grouped = df.groupby("file_name")
for name, group in grouped:
    val = [name, group["width"].values[0], group["height"].values[0], list(group["name"].values),
           list(group["bbox"].values), list(group["category_id"].values - 1)]
    grouped_data.append(val)
    
print(grouped_data[0])

['2008_000008.jpg', 500, 442, ['horse', 'person'], [[52, 86, 419, 334], [157, 43, 132, 124]], [12, 14]]


In [10]:
target_size = 224

for g in grouped_data:
    x_scale = target_size / g[1]
    y_scale = target_size / g[2]

    old_boxes = g[4]
    new_boxes = []

    for i in range(len(old_boxes)):
        (x, y, d_x, d_y) = old_boxes[i]

        x = int(round(x * x_scale))
        y = int(round(y * y_scale))
        d_x = int(round(d_x * x_scale))
        d_y = int(round(d_y * y_scale))
    
        new_boxes.append([x, y, d_x, d_y])
        
    g[4] = new_boxes
    
    # removing width and height
    del g[2]
    del g[1]

In [11]:
example = grouped_data[2]
example

['2008_000019.jpg',
 ['dog', 'dog', 'dog'],
 [[64, 1, 109, 161], [77, 54, 72, 141], [168, 0, 56, 92]],
 [11, 11, 11]]

In [12]:
def draw_boxes(boxes, image):
    for i in range(len(boxes)):
        cv2.rectangle(image, (boxes[i][0], boxes[i][1]),
                      (boxes[i][0] + boxes[i][2], boxes[i][1] + boxes[i][3]), (255, 0, 0), 1)
    
    return image

img_str = path + example[0]

img = cv2.imread(img_str, cv2.IMREAD_UNCHANGED)
img = cv2.resize(img, (target_size, target_size))

img = draw_boxes(example[2], img)
#cv2.imshow("img", img)
#cv2.waitKey(0)
#cv2.destroyAllWindows()

In [13]:
B = 2
C = 20
S = 7
grid_size = target_size / S

y_list = []
cc = 0

for data in grouped_data:

    # OLD!!! For one grid element: [pc, bx, by, w, h, c1, ..., cN]
    # NEW!!! For one grid element: [pc, c1, ..., cN, b1x, b1y, w1, h1, b2x, b2y, w2, h2]
    
    y = np.zeros((1, S, S, 5*B + C))

    boxes = data[2]
    classes = data[3]

    for i in range(len(boxes)):
        box = boxes[i]

        center_width = box[0] + box[2]/2
        center_height = box[1] + box[3]/2

        width_index = int(np.floor((center_width / target_size) * S))
        height_index = int(np.floor((center_height / target_size) * S))

        class_index = classes[i]

        # Normalize width and height by the enitre image size
        w = box[2] / target_size
        h = box[3] / target_size

        # Normalize bx and by according to the current grid position
        bx = (center_width - ((width_index / S) * target_size)) / grid_size
        by = (center_height - ((height_index / S) * target_size)) / grid_size

        
        y[:, width_index, height_index, class_index] = 1.0
        
        # Hardcoded for B=2
        if y[:, width_index, height_index, 20] == 0.0:
            y[:, width_index, height_index, 20] = 1.0
            y[:, width_index, height_index, 21] = bx
            y[:, width_index, height_index, 22] = by
            y[:, width_index, height_index, 23] = w
            y[:, width_index, height_index, 24] = h
        else:
            y[:, width_index, height_index, 25] = 1.0
            y[:, width_index, height_index, 26] = bx
            y[:, width_index, height_index, 27] = by
            y[:, width_index, height_index, 28] = w
            y[:, width_index, height_index, 29] = h

    y_list.append(y)
y = np.concatenate(y_list, axis=0)

In [14]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Removing adaptive avg pooling and fc
        pretrained_model = list(models.resnet34(pretrained='imagenet').children())[:-2]
        self.pretrained_model = nn.Sequential(*pretrained_model)
        
        self.fc_1 = nn.Linear(512*7*7, 4096)
        self.fc_out = nn.Linear(4096, S*S*(5*B + C))
                              
    def forward(self, x):
        x = self.pretrained_model(x)
        x = x.view(x.size(0), -1)
        x = self.fc_1(x)
        x = F.relu(x)
        x = self.fc_out(x)
        x = x.view(x.size(0), S, S, (5*B + C))
        x = F.relu(x)
        return x
    
    def change_freezing(self, mode=False):
        for param in self.pretrained_model.parameters():
            param.requires_grad = mode

In [15]:
device = "cuda"
model = Model()
model = model.to(device)
model.change_freezing(False)

In [16]:
class PascalData(Dataset):
    def __init__(self, data_list, y, lambda_indicator, target_size=target_size, path=path):
        self.target_size = target_size
        self.path = path
        
        self.lambda_indicator = torch.from_numpy(lambda_indicator).float().to(device)
        
        self.y = torch.from_numpy(y).float().to(device)
        self.file_list = [i[0] for i in data_list]      
        
        self.mean = np.array([0.485, 0.456, 0.406]).reshape((1,1,3))
        self.std = np.array([0.229, 0.224, 0.225]).reshape((1,1,3))
    
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, idx):
        img_str = self.path + self.file_list[idx]

        img = cv2.imread(img_str, cv2.IMREAD_UNCHANGED)
        img = cv2.resize(img, (self.target_size, self.target_size))
        img = img / 255.0
        img = (img - self.mean) / self.std
        img = img.transpose((2, 0, 1))
        img = torch.from_numpy(img).float().to(device)
        return (img, self.y[idx,:], self.lambda_indicator[idx, :, :, :])

In [18]:
lambda_coord = 5
lambda_noobj = 0.5

lambda_indicator = (y > 0.0).astype(float)

lambda_indicator[:, :, :, 0:20][(lambda_indicator[:, :, :, 20] != 0.0) | 
                                (lambda_indicator[:, :, :, 25] != 0.0)] = 1.0

lambda_indicator[:, :, :, 20][lambda_indicator[:, :, :, 20] == 0.0] = lambda_noobj
lambda_indicator[:, :, :, 21] = lambda_indicator[:, :, :, 21]*lambda_coord
lambda_indicator[:, :, :, 22] = lambda_indicator[:, :, :, 22]*lambda_coord
lambda_indicator[:, :, :, 23] = lambda_indicator[:, :, :, 23]*lambda_coord
lambda_indicator[:, :, :, 24] = lambda_indicator[:, :, :, 24]*lambda_coord

lambda_indicator[:, :, :, 25][lambda_indicator[:, :, :, 20] == 0.5] = lambda_noobj
lambda_indicator[:, :, :, 26] = lambda_indicator[:, :, :, 26]*lambda_coord
lambda_indicator[:, :, :, 27] = lambda_indicator[:, :, :, 27]*lambda_coord
lambda_indicator[:, :, :, 28] = lambda_indicator[:, :, :, 28]*lambda_coord
lambda_indicator[:, :, :, 29] = lambda_indicator[:, :, :, 29]*lambda_coord

In [21]:
batch_size = 64
dataset = PascalData(grouped_data, y, lambda_indicator)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [22]:
def loss(y, y_hat, indicator):
    y[:,:,:,23] = torch.sqrt(y[:,:,:,23])
    y[:,:,:,24] = torch.sqrt(y[:,:,:,24])
    y[:,:,:,28] = torch.sqrt(y[:,:,:,28])
    y[:,:,:,29] = torch.sqrt(y[:,:,:,29])
    
    y_hat[:,:,:,23] = torch.sqrt(y_hat[:,:,:,23])
    y_hat[:,:,:,24] = torch.sqrt(y_hat[:,:,:,24])
    y_hat[:,:,:,28] = torch.sqrt(y_hat[:,:,:,28])
    y_hat[:,:,:,29] = torch.sqrt(y_hat[:,:,:,29])
    
    return torch.sum(indicator * ((y - y_hat)*(y - y_hat)))

In [23]:
seed = 42
n_epochs = 15
lr = 1e-5

In [25]:
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

In [26]:
torch.manual_seed(seed)
for epoch in range(0, n_epochs):
    model.train()
    train_loss = 0
    for _, (x_i, y_i, indicator_i) in enumerate(train_loader):
        model.zero_grad()
        y_hat = model(x_i)
        batch_loss = loss(y_i, y_hat, indicator_i)

        batch_loss.backward()
        optimizer.step()

        train_loss += batch_loss / batch_size
    train_loss = np.round((train_loss/len(train_loader)).detach().cpu().numpy(), 6)
    print(train_loss)

10.535625
7.15091
5.919106
5.037757
4.533224
4.137744
3.830325
3.613034
3.536706
3.341004
3.252924
3.127773
3.060486
3.032904
3.07592


In [144]:
index = 14
(x_i, _, _) = dataset[index]

In [145]:
y_hat = model(x_i.reshape(1, 3, 224, 224))
y_hat = y_hat.detach().cpu().numpy()

In [147]:
output_bb = []
class_list = []

pc_threshold = 0.4

for i in range(S):
    for j in range(S):
        for b in range(B):
            pc = y_hat[:, i, j, 20 + 5*b]

            if pc < pc_threshold:
                continue
        
            box_width = y_hat[:, i, j, 21 + 5*b]*target_size
            box_height = y_hat[:, i, j, 22 + 5*b]*target_size

            bx = i*grid_size + y_hat[:, i, j, 23 + 5*b]*grid_size - box_width/2
            by = j*grid_size + y_hat[:, i, j, 24 + 5*b]*grid_size - box_height/2            
            
            output_bb.append([pc[0], bx[0], by[0], box_width[0], box_height[0]])
            
        classes_predict = np.argwhere(y_hat[:, i, j, 0:20][0,:] > 0.5)
        
        if len(classes_predict) > 0:
            class_list.append(classes_predict[0])
        
#output_bb = sorted(output_bb, key = lambda x: x[0], reverse=True)

In [149]:
output_bb

[]

In [150]:
class_list

[array([14], dtype=int64),
 array([8], dtype=int64),
 array([14], dtype=int64),
 array([14], dtype=int64),
 array([14], dtype=int64),
 array([8], dtype=int64)]

In [141]:
[bb[1:] for bb in output_bb]

[[77.78871, 82.65389, 80.22976, 76.03548],
 [97.875946, 105.96737, 155.80215, 125.462296]]

In [143]:
img_file = dataset.file_list[index]
img_str = path + img_file

target_size = 224
img = cv2.imread(img_str, cv2.IMREAD_UNCHANGED)
img = cv2.resize(img, (target_size, target_size))

img = draw_boxes([bb[1:] for bb in output_bb], img)
cv2.imshow("img", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [49]:
id_cat

{'aeroplane': 0,
 'bicycle': 1,
 'bird': 2,
 'boat': 3,
 'bottle': 4,
 'bus': 5,
 'car': 6,
 'cat': 7,
 'chair': 8,
 'cow': 9,
 'diningtable': 10,
 'dog': 11,
 'horse': 12,
 'motorbike': 13,
 'person': 14,
 'pottedplant': 15,
 'sheep': 16,
 'sofa': 17,
 'train': 18,
 'tvmonitor': 19}

In [None]:
predicted_bb = []
predicted_bb.append(output_bb[0])
_ = output_bb.pop(0)

while len(output_bb) != 0:
    predicted_bb.append(output_bb[0])
    _ = output_bb.pop(0)
    
    for accepted_bb in predicted_bb:
        filter(lambda x: bb_intersection_over_union(accepted_bb[2:], x[2:]) < 0.5, output_bb)

In [None]:
example = grouped_data[4]
example

In [None]:
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    boxA[2] = boxA[2] + boxA[0]
    boxA[3] = boxA[3] + boxA[1]
    
    boxB[2] = boxB[2] + boxB[0]
    boxB[3] = boxB[3] + boxB[1]
    
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
 
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
 
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
 
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
 
    # return the intersection over union value
    return iou

In [None]:
a = example[2][0]
b = example[2][1]

In [None]:
bb_intersection_over_union(a, b)

In [None]:
example

In [None]:
img_str = path + example[0]

target_size = 224
img = cv2.imread(img_str, cv2.IMREAD_UNCHANGED)
img = cv2.resize(img, (target_size, target_size))

img = draw_boxes(example[2], img)
cv2.imshow("img", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
predicted_bb

In [None]:
iou = 0.4
iou < 0.5

In [None]:
def get_iou(a, b, epsilon=1e-5):
    """ Given two boxes `a` and `b` defined as a list of four numbers:
            [x1,y1,x2,y2]
        where:
            x1,y1 represent the upper left corner
            x2,y2 represent the lower right corner
        It returns the Intersect of Union score for these two boxes.

    Args:
        a:          (list of 4 numbers) [x1,y1,x2,y2]
        b:          (list of 4 numbers) [x1,y1,x2,y2]
        epsilon:    (float) Small value to prevent division by zero

    Returns:
        (float) The Intersect of Union score.
    """
    
    a[2] = a[2] + a[0]
    a[3] = a[3] + a[1]
    
    b[2] = b[2] + b[0]
    b[3] = b[3] + b[1]

    
    # COORDINATES OF THE INTERSECTION BOX
    x1 = max(a[0], b[0])
    y1 = max(a[1], b[1])
    x2 = min(a[2], b[2])
    y2 = min(a[3], b[3])

    # AREA OF OVERLAP - Area where the boxes intersect
    width = (x2 - x1)
    height = (y2 - y1)
    # handle case where there is NO overlap
    if (width<0) or (height <0):
        return 0.0
    area_overlap = width * height

    # COMBINED AREA
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    area_combined = area_a + area_b - area_overlap

    # RATIO OF AREA OF OVERLAP OVER COMBINED AREA
    iou = area_overlap / (area_combined+epsilon)
    print(iou)
    return iou

In [None]:
img_file = dataset.file_list[0]
img_str = path + img_file

target_size = 224
img = cv2.imread(img_str, cv2.IMREAD_UNCHANGED)
img = cv2.resize(img, (target_size, target_size))

img = draw_boxes(predicted_bb[0:2][2:], img)
cv2.imshow("img", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
def draw_boxes(boxes, image):
    for i in range(len(boxes)):
        cv2.rectangle(image, (boxes[i][0], boxes[i][1]),
                      (boxes[i][0] + boxes[i][2], boxes[i][1] + boxes[i][3]), (255, 0, 0), 1)
    
    return image

def draw_grid(img, line_color=(0, 255, 0), thickness=1, type_=cv2.LINE_AA, pxstep=50):
    '''(ndarray, 3-tuple, int, int) -> void
    draw gridlines on img
    line_color:
        BGR representation of colour
    thickness:
        line thickness
    type:
        8, 4 or cv2.LINE_AA
    pxstep:
        grid line frequency in pixels
    '''
    
    x = pxstep
    y = pxstep
    while x < img.shape[1]:
        cv2.line(img, (x, 0), (x, img.shape[0]), color=line_color, lineType=type_, thickness=thickness)
        x += pxstep

    while y < img.shape[0]:
        cv2.line(img, (0, y), (img.shape[1], y), color=line_color, lineType=type_, thickness=thickness)
        y += pxstep   
        
    return img
    
#example = grouped_data[0]
img_str = path + example[0]

target_size = 224
img = cv2.imread(img_str, cv2.IMREAD_UNCHANGED)
img = cv2.resize(img, (target_size, target_size))

img = draw_boxes(example[2], img)
img = draw_grid(img, line_color=(0, 255, 0), thickness=1, type_=cv2.LINE_AA, pxstep=int(target_size / S))
img = cv2.circle(img, (int(center_width), int(center_height)), 5, (255, 255, 255), -1)

#cv2.imshow("img", img)
#cv2.waitKey(0)
#cv2.destroyAllWindows()