In [9]:
%matplotlib notebook
#Ziyan Zhu
import numpy as np 
import torch
from torch import nn
import torch.utils.data as td
import torchvision as tv
from PIL import Image
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import nntools as nt
import time
from VOCdataset import VOCDataset

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)
print(torch.__version__)

cuda
1.1.0


In [3]:
# Root directory of the Pascal VOC 2012 data; it is handed to VOCDataset when
# the train and val sets are built.
# NOTE(review): hard-coded absolute path - presumably specific to the machine
# this notebook was run on; adjust it when running elsewhere.
dataset_root_dir = '/datasets/ee285f-public/PascalVOC2012'

In [11]:
# Class names indexed by the integer label id attached to each box
# (index 0 is 'background').
# NOTE(review): 'aeroplane' sits at index 20 instead of its alphabetical
# position - presumably this mirrors the id mapping used by VOCDataset;
# confirm before reordering.
labels = ['background', 'bicycle', 'bird', 'boat', 'bottle',
          'bus', 'car', 'cat', 'chair', 'cow',
          'diningtable', 'dog', 'horse', 'motorbike', 'person',
          'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor', 'aeroplane']

# One RGB colour per class, used for box outlines and caption backgrounds.
# The colours come from a dedicated, seeded generator so that every kernel
# restart produces the same class -> colour mapping and the global NumPy
# random state is left untouched.
_color_rng = np.random.RandomState(0)
color = {name: _color_rng.random_sample(3) for name in labels}
    
def myimshow(image, boxes = None, ax = plt):
    """Display a channels-first image tensor and overlay labelled boxes.

    Args:
        image: torch tensor whose first axis is the channel axis; values are
            clipped to [0, 1] for display. The tensor itself is not modified.
        boxes: optional dict with
            'boxes'  - rows of (x1, y1, x2, y2) corner coordinates, and
            'labels' - one integer index into the module-level ``labels``
                       list per box.
            Both entries may be torch tensors (any device) or array-likes.
            ``None`` or an empty container draws the image only. The dict is
            never modified.
        ax: the ``matplotlib.pyplot`` module (default) or an ``Axes`` to draw on.

    Returns:
        The ``AxesImage`` returned by ``imshow``.
    """
    pixels = image.detach().to('cpu').numpy()
    # Move the channel axis last for imshow. np.clip returns a new array, so
    # the caller's tensor (which shares memory with the NumPy view when it
    # already lives on the CPU) is not overwritten.
    pixels = np.clip(np.moveaxis(pixels, 0, -1), 0, 1)
    h = ax.imshow(pixels)

    if boxes:
        coords = boxes['boxes']
        class_ids = boxes['labels']
        # isinstance is required here: comparing type(...) with the string
        # 'torch.Tensor' is always False and would skip the conversion.
        if isinstance(coords, torch.Tensor):
            coords = coords.detach().to('cpu').numpy()
        if isinstance(class_ids, torch.Tensor):
            class_ids = class_ids.detach().to('cpu').numpy()
        coords = np.asarray(coords).reshape(-1, 4)
        class_ids = np.asarray(class_ids).astype(int).reshape(-1)

        # pyplot exposes the current Axes through gca(); an Axes is used directly.
        axes = ax.gca() if hasattr(ax, 'gca') else ax
        for (x1, y1, x2, y2), class_id in zip(coords, class_ids):
            name = labels[class_id]
            axes.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                             edgecolor=color[name],
                                             linewidth=1.5, facecolor='none'))
            # The caption is anchored at the box's (x2, y1) corner.
            ax.annotate(name, xy=(x2, y1), va="center", ha="center", fontsize=5,
                        bbox=dict(boxstyle="Square", fc=color[name]))
    ax.axis('off')

    return h

def myimshow_score(image, obj, ax = plt):
    """Display a channels-first image tensor with scored detection boxes.

    Args:
        image: torch tensor whose first axis is the channel axis; values are
            clipped to [0, 1] for display. The tensor itself is not modified.
        obj: dict of torch tensors with
            'boxes'  - (x1, y1, x2, y2) corner coordinates, one row per box
                       (a single un-batched box of four values is accepted),
            'labels' - integer indices into the module-level ``labels`` list,
            'scores' - one score per box.
            Empty tensors draw the image only.
        ax: the ``matplotlib.pyplot`` module (default) or an ``Axes`` to draw on.

    Returns:
        The ``AxesImage`` returned by ``imshow``.
    """
    pixels = image.detach().to('cpu').numpy()
    # Move the channel axis last for imshow. np.clip returns a new array, so
    # the caller's tensor (which shares memory with the NumPy view when it
    # already lives on the CPU) is not overwritten.
    pixels = np.clip(np.moveaxis(pixels, 0, -1), 0, 1)

    # Normalise every field to "one entry per detection" so that zero, one
    # (possibly un-batched) and many detections all go through the same loop.
    names = np.atleast_1d(obj['labels'].detach().to('cpu').numpy().astype(int))
    boxes = obj['boxes'].detach().to('cpu').numpy().reshape(-1, 4)
    scores = np.atleast_1d(obj['scores'].detach().to('cpu').numpy())

    h = ax.imshow(pixels)
    # pyplot exposes the current Axes through gca(); an Axes is used directly.
    axes = ax.gca() if hasattr(ax, 'gca') else ax

    for (x1, y1, x2, y2), class_id, score in zip(boxes, names, scores):
        name = labels[class_id]
        axes.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                         edgecolor=color[name],
                                         linewidth = 1.5, facecolor='none'))
        # The caption "<class>:<score>" is anchored at the box's (x2, y1) corner.
        ax.annotate(name + ':' + str(score), xy=(x2, y1), va="center", ha="center",
                    fontsize=5, bbox=dict(boxstyle="Square", fc=color[name]))
    ax.axis('off')

    return h

class ClassificationStatsManager(nt.StatsManager):
    """Stats manager that reports only the loss tracked by ``nt.StatsManager``.

    Construction, ``init`` and ``accumulate`` forward unchanged to the parent
    class; ``summarize`` wraps the parent's summary in a dict under 'loss'.
    """

    def __init__(self):
        super().__init__()

    def init(self):
        """Reset the statistics by delegating to the parent implementation."""
        super().init()

    def accumulate(self, loss, x, y, d):
        """Forward one batch (loss, input, prediction, target) to the parent."""
        super().accumulate(loss, x, y, d)

    def summarize(self):
        """Return the parent's summary as ``{'loss': <summary>}``."""
        return {'loss': super().summarize()}

In [5]:
# Build both dataset splits from the same root directory.
# NOTE(review): the first call relies on VOCDataset's default mode - presumably
# the training split; confirm in VOCdataset.
train_set = VOCDataset(dataset_root_dir)
# The validation split is requested explicitly via mode="val".
val_set = VOCDataset(dataset_root_dir,mode="val")

# Load the model

In [6]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# RPN anchors: 5 sizes x 3 aspect ratios = 15 anchors per spatial location.
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),),
)

# Region-of-interest cropping: which feature map to crop from, and the size of
# each crop after rescaling.
roi_pooler = tv.ops.MultiScaleRoIAlign(
    featmap_names=[0],
    output_size=7,
    sampling_ratio=2,
)

# Feature extractor: the convolutional part of an ImageNet-pretrained VGG16-BN.
backbone = tv.models.vgg16_bn(pretrained=True).features
# FasterRCNN reads the number of output channels from the backbone itself;
# for vgg16 that number is 512.
backbone.out_channels = 512

# Assemble the detector from the three pieces above.
model_vgg16 = FasterRCNN(
    backbone,
    num_classes=21,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

In [7]:
# Stage 1: freeze the pretrained backbone so that only the remaining
# parameters of the detector are handed to the optimizer.
for param in model_vgg16.backbone.parameters():
    param.requires_grad = False

# Create the experiment.
lr = 1e-3
model_vgg16 = model_vgg16.to(device)
params = list(filter(lambda p: p.requires_grad, model_vgg16.parameters()))
optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9, weight_decay=0.0005)
# Optional learning-rate schedule, currently disabled:
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=3,gamma=0.1)
stats_manager = ClassificationStatsManager()
exp1 = nt.Experiment(
    model_vgg16, train_set, val_set, optimizer, stats_manager,
    output_dir="faster_rcnn_vgg16_0",
    perform_validation_during_training=True,
)

# Stage 2: fine-tune the whole model, starting from the network held by exp1.
new_model = exp1.net
# Re-enable gradients for the backbone that stage 1 froze.
for param in new_model.backbone.parameters():
    param.requires_grad = True

# Create the experiment (with a smaller learning rate than stage 1).
lr = 5e-4
new_model = new_model.to(device)
params = list(filter(lambda p: p.requires_grad, new_model.parameters()))
optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9, weight_decay=0.0005)
# Optional learning-rate schedule, currently disabled:
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=3,gamma=0.1)
stats_manager = ClassificationStatsManager()
exp2 = nt.Experiment(
    new_model, train_set, val_set, optimizer, stats_manager,
    output_dir="faster_rcnn_vgg16",
    perform_validation_during_training=True,
)

# Testing the model

In [16]:
# Output some results (by index of images)
model = exp2.net
model.eval()
with torch.no_grad():
    images, targets = val_set[1105]
    images = images.to(device)
    targets['boxes'] = targets['boxes'].to(device)
    targets['labels'] = targets['labels'].to(device)

    # CUDA kernels are launched asynchronously: synchronise on both sides of
    # the forward pass so the measured interval covers the actual GPU work.
    if device == 'cuda':
        torch.cuda.synchronize()
    tic = time.time()
    prediction = model([images])
    if device == 'cuda':
        torch.cuda.synchronize()
    toc = time.time()
    print('time used: ',toc - tic ,'s')
    print(prediction)
    print("---------------------------------")
    print(targets)
    print("---------------------------------")

    # Accumulators for the detections that survive post-processing; created
    # with their final rank/dtype so they are well-formed even when empty.
    res = {'boxes': torch.empty((0, 4), device=device),
           'scores': torch.empty(0, device=device),
           'labels': torch.empty(0, dtype=torch.int32, device=device)}
    pred = prediction[0]
    if pred['boxes'].dim() > 1:
        # NOTE(review): exp1.myfilter presumably drops detections scoring below
        # `threshold`, and exp1.nms presumably returns the indices kept by
        # non-maximum suppression - confirm in nntools. Both helpers are taken
        # from exp1 although the model comes from exp2.
        pred['boxes'], pred['scores'], pred['labels'] = exp1.myfilter(
            pred['boxes'], pred['scores'], pred['labels'], threshold = 0.1)

        # Collect the classes AFTER score filtering so NMS is only run for
        # classes that still have at least one candidate box; sorting makes
        # the order of the result deterministic.
        objs = sorted(set(pred['labels'].tolist()))
        for k in objs:
            obj_idx = pred['labels'] == k

            idx = exp1.nms(pred['boxes'][obj_idx], pred['scores'][obj_idx], threshold=0.3)

            res['boxes'] = torch.cat((res['boxes'], pred['boxes'][obj_idx][idx]), 0)
            res['scores'] = torch.cat((res['scores'], pred['scores'][obj_idx][idx]), 0)
            res['labels'] = torch.cat((res['labels'], pred['labels'][obj_idx][idx].int()), 0)
    print(res)
myimshow_score(images, res)

time used:  0.13159441947937012 s
[{'boxes': tensor([[393.2878,  62.1224, 542.8098, 353.9370],
        [218.0862,  32.1699, 631.2411, 453.3212],
        [461.4948,  14.4064, 617.7599, 443.4039],
        [364.4221,   0.0000, 662.7646, 555.0192],
        [ 94.1312,   0.0000, 695.9973, 606.6591],
        [203.9007, 208.6664, 664.9078, 399.1727],
        [514.5536, 203.9819, 569.5567, 298.3762],
        [364.6696, 154.8501, 564.8584, 319.1427],
        [482.7719, 128.1137, 533.7560, 307.9071],
        [353.6871, 148.0914, 597.4772, 402.6593],
        [481.7880, 220.1838, 578.6641, 289.1514],
        [ 79.4171,  37.8950, 752.4004, 466.7374],
        [531.0724, 222.7763, 554.7749, 286.4097],
        [ 54.0608,  30.9783, 669.2627, 707.6179]], device='cuda:0'), 'labels': tensor([14, 19, 19, 19, 19, 19, 14, 14, 14, 19, 14,  5, 14, 18],
       device='cuda:0'), 'scores': tensor([0.9743, 0.9481, 0.5854, 0.5665, 0.4133, 0.3061, 0.2659, 0.2518, 0.1476,
        0.1375, 0.1016, 0.0877, 0.0730, 0.0525

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x7fa7853a8438>

In [13]:
# Output some results (by index of images)
model = exp2.net
model.eval()
with torch.no_grad():
    images, targets = val_set[2000]
    images = images.to(device)
    targets['boxes'] = targets['boxes'].to(device)
    targets['labels'] = targets['labels'].to(device)

    # CUDA kernels are launched asynchronously: synchronise on both sides of
    # the forward pass so the measured interval covers the actual GPU work.
    if device == 'cuda':
        torch.cuda.synchronize()
    tic = time.time()
    prediction = model([images])
    if device == 'cuda':
        torch.cuda.synchronize()
    toc = time.time()
    print('time used: ',toc - tic ,'s')

    print(prediction)
    print("---------------------------------")
    print(targets)
    print("---------------------------------")

    # Accumulators for the detections that survive post-processing; created
    # with their final rank/dtype so they are well-formed even when empty.
    res = {'boxes': torch.empty((0, 4), device=device),
           'scores': torch.empty(0, device=device),
           'labels': torch.empty(0, dtype=torch.int32, device=device)}
    pred = prediction[0]
    if pred['boxes'].dim() > 1:
        # NOTE(review): exp1.myfilter presumably drops detections scoring below
        # `threshold`, and exp1.nms presumably returns the indices kept by
        # non-maximum suppression - confirm in nntools. Both helpers are taken
        # from exp1 although the model comes from exp2.
        pred['boxes'], pred['scores'], pred['labels'] = exp1.myfilter(
            pred['boxes'], pred['scores'], pred['labels'], threshold = 0.1)

        # Collect the classes AFTER score filtering so NMS is only run for
        # classes that still have at least one candidate box; sorting makes
        # the order of the result deterministic.
        objs = sorted(set(pred['labels'].tolist()))
        for k in objs:
            obj_idx = pred['labels'] == k

            idx = exp1.nms(pred['boxes'][obj_idx], pred['scores'][obj_idx], threshold=0.3)

            res['boxes'] = torch.cat((res['boxes'], pred['boxes'][obj_idx][idx]), 0)
            res['scores'] = torch.cat((res['scores'], pred['scores'][obj_idx][idx]), 0)
            res['labels'] = torch.cat((res['labels'], pred['labels'][obj_idx][idx].int()), 0)
    print(res)
myimshow_score(images, res)

time used:  0.14017295837402344 s
[{'boxes': tensor([[331.1691, 255.7982, 615.1109, 800.0000],
        [178.5005, 247.2293, 527.6614, 793.9796],
        [181.2190, 231.1863, 357.2228, 787.1511],
        [322.2395, 289.0977, 477.3708, 800.0000],
        [438.5099, 187.1372, 782.7277, 800.0000],
        [ 90.4128, 255.5053, 474.9197, 553.1225]], device='cuda:0'), 'labels': tensor([14, 14, 14, 14, 14, 14], device='cuda:0'), 'scores': tensor([0.9425, 0.7484, 0.3512, 0.2180, 0.0941, 0.0640], device='cuda:0')}]
---------------------------------
{'boxes': tensor([[  1.7857, 192.8571, 110.7143, 295.2381],
        [657.1429, 211.9048, 800.0000, 426.1905],
        [ 87.5000, 226.1905, 367.8571, 759.5238],
        [321.4286, 266.6667, 455.3571, 800.0000],
        [410.7143, 283.3333, 621.4286, 800.0000],
        [  1.7857, 285.7143, 800.0000, 800.0000]], device='cuda:0'), 'labels': tensor([ 3,  3, 14, 14, 14,  3], device='cuda:0', dtype=torch.int32)}
---------------------------------
{'boxes': te

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x7fa7854e34a8>

In [14]:
# Output some results (by image name)
model = exp2.net
model.eval()
with torch.no_grad():
    # NOTE(review): the second argument presumably selects the image by its
    # name and takes precedence over the index - confirm in VOCdataset.
    images, targets = val_set.__getitem__(2000,'2008_001719')
    images = images.to(device)
    targets['boxes'] = targets['boxes'].to(device)
    targets['labels'] = targets['labels'].to(device)

    # CUDA kernels are launched asynchronously: synchronise on both sides of
    # the forward pass so the measured interval covers the actual GPU work.
    if device == 'cuda':
        torch.cuda.synchronize()
    tic = time.time()
    prediction = model([images])
    if device == 'cuda':
        torch.cuda.synchronize()
    toc = time.time()
    print('time used: ',toc - tic ,'s')

    print(prediction)
    print("---------------------------------")
    print(targets)
    print("---------------------------------")

    # Accumulators for the detections that survive post-processing; created
    # with their final rank/dtype so they are well-formed even when empty.
    res = {'boxes': torch.empty((0, 4), device=device),
           'scores': torch.empty(0, device=device),
           'labels': torch.empty(0, dtype=torch.int32, device=device)}
    pred = prediction[0]
    if pred['boxes'].dim() > 1:
        # NOTE(review): exp1.myfilter presumably drops detections scoring below
        # `threshold`, and exp1.nms presumably returns the indices kept by
        # non-maximum suppression - confirm in nntools. Both helpers are taken
        # from exp1 although the model comes from exp2.
        pred['boxes'], pred['scores'], pred['labels'] = exp1.myfilter(
            pred['boxes'], pred['scores'], pred['labels'], threshold = 0.1)

        # Collect the classes AFTER score filtering so NMS is only run for
        # classes that still have at least one candidate box; sorting makes
        # the order of the result deterministic.
        objs = sorted(set(pred['labels'].tolist()))
        for k in objs:
            obj_idx = pred['labels'] == k

            idx = exp1.nms(pred['boxes'][obj_idx], pred['scores'][obj_idx], threshold=0.3)

            res['boxes'] = torch.cat((res['boxes'], pred['boxes'][obj_idx][idx]), 0)
            res['scores'] = torch.cat((res['scores'], pred['scores'][obj_idx][idx]), 0)
            res['labels'] = torch.cat((res['labels'], pred['labels'][obj_idx][idx].int()), 0)
    print(res)
myimshow_score(images, res)

time used:  0.12358760833740234 s
[{'boxes': tensor([[ 34.8046, 281.6299, 651.5958, 600.8788],
        [ 84.7808, 308.8127, 550.3843, 447.1779],
        [  4.8293, 107.8691, 656.2583, 745.3041],
        [  1.2839, 234.3580, 668.8951, 564.2599],
        [  4.0827, 306.3251, 640.1992, 682.8289],
        [ 23.5820, 126.9041, 626.5299, 782.7540],
        [  0.0000, 228.2991, 671.7604, 557.8882],
        [  0.0000,  74.3875, 644.4249, 776.7409]], device='cuda:0'), 'labels': tensor([ 6,  6,  6,  3, 18,  8, 20, 20], device='cuda:0'), 'scores': tensor([0.4681, 0.1411, 0.1394, 0.0664, 0.0613, 0.0593, 0.0538, 0.0508],
       device='cuda:0')}]
---------------------------------
{'boxes': tensor([[ 78.4000,  69.7095, 601.6000, 730.2905]], device='cuda:0'), 'labels': tensor([20], device='cuda:0', dtype=torch.int32)}
---------------------------------
{'boxes': tensor([[ 34.8046, 281.6299, 651.5958, 600.8788]], device='cuda:0'), 'scores': tensor([0.4681], device='cuda:0'), 'labels': tensor([6], devic

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x7fa7854273c8>