In [1]:
%matplotlib notebook

import os
import numpy as np 
import torch
from torch import nn
from torch.nn import functional as F 
import torch.utils.data as td
import torchvision as tv
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import socket
import getpass
import pandas as pd
import xml.etree.ElementTree as ET
import nntools as nt

In [2]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# Record the installed PyTorch version alongside the results for reproducibility.
print(torch.__version__)

1.1.0


In [4]:
# Root of the Pascal VOC 2012 dataset. The cluster path stays the default, but
# it can be overridden through the VOC_ROOT_DIR environment variable so the
# notebook also runs on machines where the data lives elsewhere.
dataset_root_dir = os.environ.get('VOC_ROOT_DIR', '/datasets/ee285f-public/PascalVOC2012')

In [8]:
class VOCDataset(td.Dataset):
    """Pascal VOC dataset yielding resized images with rescaled bounding boxes.

    Arguments:
        root_dir (str): dataset root; ``JPEGImages``, ``Annotations`` and
            ``ImageSets/Main`` are looked up below it.
        mode (str): name of the split; image ids are read from
            ``ImageSets/Main/<mode>.txt``.
        image_size (tuple): ``(height, width)`` handed to
            ``tv.transforms.Resize``; box coordinates are scaled to match.
    """
    def __init__(self, root_dir, mode='train', image_size=(800, 800)): 
        super(VOCDataset, self).__init__()
        self.mode = mode
        self.image_size = image_size
        
        self.images_dir = os.path.join(root_dir, 'JPEGImages')
        self.images_idx = self._read_index(
            os.path.join(root_dir, 'ImageSets/Main', "%s.txt" % mode))
        self.ann_dir    = os.path.join(root_dir, 'Annotations')
        
        self.labels = ['aeroplane',  'bicycle', 'bird',  'boat',      'bottle', 
                       'bus',        'car',      'cat',  'chair',     'cow',
                       'diningtable','dog',    'horse',  'motorbike', 'person',
                       'pottedplant','sheep',  'sofa',   'train',   'tvmonitor']

    @staticmethod
    def _read_index(path):
        # Load the split file into a DataFrame, skipping malformed lines.
        # Newer pandas spells the option ``on_bad_lines='skip'``; older
        # releases only know ``error_bad_lines=False`` and raise TypeError on
        # the new keyword, so fall back to the legacy spelling in that case.
        options = dict(sep=' ', header=None)
        try:
            return pd.read_csv(path, on_bad_lines='skip', **options)
        except TypeError:
            return pd.read_csv(path, error_bad_lines=False, **options)

    def __len__(self):
        # Return the size of the dataset
        return len(self.images_idx)

    def __repr__(self):
        # Return the data is training set or testing set, and its image size
        return "VOCDataset(mode={}, image_size={})". \
            format(self.mode, self.image_size)

    def __getitem__(self, idx):
        """Return ``(image_tensor, annotation_dict, label_counts)`` for row ``idx``."""
        img_name = self.images_idx.iloc[idx][0]
        img_path = os.path.join(self.images_dir, img_name + ".jpg")
        # Close the file handle as soon as the pixels are decoded.
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')
        objs, seen_labels = self.get_obj(img_name)
        transform = tv.transforms.Compose([ 
            # resize the image
            tv.transforms.Resize(self.image_size), 
            # convert a PIL Image to tensor in range [0,1]
            # (no further normalization is applied)
            tv.transforms.ToTensor(), 
        ])
        img = transform(img)
        return img, objs, seen_labels
    
    def get_obj(self, img_name):
        """Parse ``<img_name>.xml`` and return ``(annotation_dict, label_counts)``.

        ``annotation_dict`` has the keys ``'object'`` (list of dicts with
        ``name`` and, when a ``bndbox`` is present, ``xmin``/``ymin``/``xmax``/
        ``ymax`` scaled to ``self.image_size``), ``'width'`` and ``'height'``
        (original image size). ``label_counts`` maps each name to the number
        of times it occurs.

        Raises:
            ValueError: if the annotation lacks the image width or height.
        """
        img = {'object':[]}
        seen_labels = {}

        tree = ET.parse(os.path.join(self.ann_dir, img_name+".xml"))

        # First pass: read the original image size so the scale factors exist
        # regardless of where <size> sits relative to the objects.
        for elem in tree.iter():
            if 'width' in elem.tag:
                img['width'] = int(elem.text)
            if 'height' in elem.tag:
                img['height'] = int(elem.text)
        if 'width' not in img or 'height' not in img:
            raise ValueError("annotation %s.xml has no image size" % img_name)

        # Resize interprets image_size as (height, width): x coordinates scale
        # with the target width, y coordinates with the target height.
        frac_x = self.image_size[1] / img['width']
        frac_y = self.image_size[0] / img['height']
        scale = {'xmin': frac_x, 'ymin': frac_y, 'xmax': frac_x, 'ymax': frac_y}

        # Second pass: collect the annotated boxes.
        # NOTE(review): <part> elements are collected as objects too; their
        # names may fall outside self.labels -- confirm this is intended.
        for elem in tree.iter():
            if 'object' in elem.tag or 'part' in elem.tag:
                obj = {}
                for attr in list(elem):
                    if 'name' in attr.tag:
                        obj['name'] = attr.text
                        # The dict is registered here and filled in place
                        # when its bndbox child is reached.
                        img['object'].append(obj)
                        seen_labels[obj['name']] = seen_labels.get(obj['name'], 0) + 1
                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            for key, factor in scale.items():
                                if key in dim.tag:
                                    obj[key] = float(dim.text) * factor

        return img, seen_labels
    

In [9]:
def myimshow(image, boxes = None, ax = plt):
    """Show a (C, H, W) image tensor and overlay labelled bounding boxes.

    Arguments:
        image (torch.Tensor): image in channel-first layout. Values may be in
            [0, 1] or in [-1, 1]; the latter is detected by the presence of
            negative values and mapped back to [0, 1] before display.
        boxes (list of dict, optional): each dict needs the keys ``name``,
            ``xmin``, ``ymin``, ``xmax`` and ``ymax``. Defaults to no boxes.
        ax: ``matplotlib.pyplot`` (default) or an ``Axes`` to draw on.

    Returns:
        The object returned by ``ax.imshow``.
    """
    if boxes is None:
        boxes = []

    # detach() lets tensors that still track gradients be converted to numpy.
    image = image.detach().to('cpu').numpy()
    # Channel-first (C, H, W) -> channel-last (H, W, C) as imshow expects.
    image = np.transpose(image, (1, 2, 0))
    # Only undo a [-1, 1] normalization when it was actually applied;
    # rescaling a [0, 1] image would wash it out.
    if image.min() < 0:
        image = (image + 1) / 2
    image = np.clip(image, 0, 1)
    h = ax.imshow(image) 
    
    labels = ['aeroplane',  'bicycle', 'bird',  'boat',      'bottle', 
                       'bus',        'car',      'cat',  'chair',     'cow',
                       'diningtable','dog',    'horse',  'motorbike', 'person',
                       'pottedplant','sheep',  'sofa',   'train',   'tvmonitor']
    # One random colour per class, drawn afresh on every call.
    color = {label: np.random.random(3) for label in labels}

    # pyplot (and Figure) expose gca(); a plain Axes is already the target.
    axes = ax.gca() if hasattr(ax, 'gca') else ax

    for box in boxes:
        name = box["name"]
        # Names outside the known classes get their own colour instead of
        # raising a KeyError.
        if name not in color:
            color[name] = np.random.random(3)
        width = box['xmax'] - box['xmin']
        height = box['ymax'] - box['ymin']
        axes.add_patch(patches.Rectangle((box['xmin'], box['ymin']), width, height,
                                         edgecolor=color[name], facecolor='none'))
        ax.annotate(name, xy=(box['xmax'], box['ymin']), va="center", ha="center",
                    fontsize=5, bbox=dict(boxstyle="Square", fc=color[name]))
    ax.axis('off') 
   
    return h

In [10]:
# Build the training and validation datasets from the same dataset root,
# training split first.
train_set, val_set = (
    VOCDataset(dataset_root_dir, mode=split) for split in ('train', 'val')
)

In [52]:
# Fetch one training sample: the image tensor, its annotation dict and the
# per-label counts returned by VOCDataset.__getitem__.
x, x_objs, x_labels = train_set[1150]

In [32]:
# Display the fetched sample with the boxes listed under the annotation's 'object' key.
myimshow(x, boxes = x_objs["object"])

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x7f2594fc3710>

In [62]:
# Inspect the annotation dict of the fetched sample (boxes plus original image size).
x_objs

{'object': [{'name': 'horse',
   'xmin': 20,
   'ymin': 92,
   'xmax': 144,
   'ymax': 308},
  {'name': 'horse', 'xmin': 131, 'ymin': 102, 'xmax': 228, 'ymax': 298},
  {'name': 'horse', 'xmin': 202, 'ymin': 99, 'xmax': 286, 'ymax': 307},
  {'name': 'horse', 'xmin': 289, 'ymin': 103, 'xmax': 374, 'ymax': 292},
  {'name': 'horse', 'xmin': 373, 'ymin': 88, 'xmax': 449, 'ymax': 286},
  {'name': 'horse', 'xmin': 430, 'ymin': 117, 'xmax': 500, 'ymax': 307},
  {'name': 'person', 'xmin': 9, 'ymin': 66, 'xmax': 106, 'ymax': 201},
  {'name': 'person', 'xmin': 120, 'ymin': 74, 'xmax': 202, 'ymax': 200},
  {'name': 'person', 'xmin': 191, 'ymin': 71, 'xmax': 279, 'ymax': 181},
  {'name': 'person', 'xmin': 269, 'ymin': 86, 'xmax': 334, 'ymax': 177},
  {'name': 'person', 'xmin': 351, 'ymin': 65, 'xmax': 423, 'ymax': 184},
  {'name': 'person', 'xmin': 411, 'ymin': 68, 'xmax': 500, 'ymax': 214}],
 'width': 500,
 'height': 322}

In [20]:
class NNClassifier(nt.NeuralNetwork):
    """An ``nt.NeuralNetwork`` whose training criterion is the cross-entropy loss."""

    def __init__(self):
        super().__init__()
        # Loss module, created once and reused by criterion().
        self.cross_entropy = nn.CrossEntropyLoss()

    def criterion(self, y, d):
        """Return the cross-entropy loss between the outputs ``y`` and the targets ``d``."""
        loss = self.cross_entropy(y, d)
        return loss

In [24]:
class Fasterrcnn_resnet50_Transfer(NNClassifier):
    """Pretrained torchvision Faster R-CNN (ResNet-50 FPN) with new box-predictor heads.

    Arguments:
        num_classes (int): number of object classes; one extra output is
            added to each head (``num_classes + 1``).
        fine_tuning (bool): value assigned to ``requires_grad`` of every
            pretrained parameter. The replaced head layers are created
            afterwards and therefore always remain trainable.
    """
    def __init__(self, num_classes, fine_tuning=False):
        super(Fasterrcnn_resnet50_Transfer, self).__init__()
        self.model = tv.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        # Freeze (or unfreeze) the parameters of *this* instance's network.
        # The previous code iterated over a global `model`, which raises
        # NameError on a fresh kernel or touches an unrelated, stale object.
        for param in self.model.parameters():
            param.requires_grad = fine_tuning

        # Replace both heads of the box predictor with freshly initialized
        # layers sized for the new class count.
        predictor = self.model.roi_heads.box_predictor

        cls_score_num_ftrs = predictor.cls_score.in_features
        predictor.cls_score = nn.Linear(cls_score_num_ftrs, num_classes+1)

        bbox_pred_num_ftrs = predictor.bbox_pred.in_features
        predictor.bbox_pred = nn.Linear(bbox_pred_num_ftrs, 4*(num_classes+1))

    def forward(self, x, targets=None):
        """Run the wrapped detector on ``x`` and return its output unchanged.

        ``targets`` is forwarded to the torchvision model; it defaults to
        ``None`` so existing ``forward(x)`` calls behave as before.
        """
        return self.model(x, targets)

In [25]:
# Instantiate the detector for 20 object classes; fine_tuning keeps its default.
model = Fasterrcnn_resnet50_Transfer(num_classes=20)

In [26]:
# Display the module tree of the model.
model

Fasterrcnn_resnet50_Transfer(
  (cross_entropy): CrossEntropyLoss()
  (model): FasterRCNN(
    (transform): GeneralizedRCNNTransform()
    (backbone): BackboneWithFPN(
      (body): IntermediateLayerGetter(
        (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (bn1): FrozenBatchNorm2d()
        (relu): ReLU(inplace)
        (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        (layer1): Sequential(
          (0): Bottleneck(
            (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn1): FrozenBatchNorm2d()
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (bn2): FrozenBatchNorm2d()
            (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn3): FrozenBatchNorm2d()
            (relu): ReLU(inplace)
            (downsample): Sequential(
              (0): Conv2d(64

In [27]:
def detection_collate(batch):
    """Regroup a list of samples field-wise without stacking them.

    Each dataset sample is ``(image, annotation_dict, label_counts)``. The
    annotation dicts hold a different number of boxes and the count dicts have
    different keys per image, which the default collate function cannot merge.
    Returns a tuple ``(images, annotation_dicts, label_counts)`` of tuples.
    """
    return tuple(zip(*batch))


train_loader = td.DataLoader(train_set, batch_size=16, shuffle=True, pin_memory=True,
                             collate_fn=detection_collate)
val_loader = td.DataLoader(val_set, batch_size=1, shuffle=True, pin_memory=True,
                           collate_fn=detection_collate)

In [28]:
# Switch the model to evaluation mode; the call's return value is displayed as the cell output.
model.eval()

Fasterrcnn_resnet50_Transfer(
  (cross_entropy): CrossEntropyLoss()
  (model): FasterRCNN(
    (transform): GeneralizedRCNNTransform()
    (backbone): BackboneWithFPN(
      (body): IntermediateLayerGetter(
        (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (bn1): FrozenBatchNorm2d()
        (relu): ReLU(inplace)
        (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        (layer1): Sequential(
          (0): Bottleneck(
            (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn1): FrozenBatchNorm2d()
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (bn2): FrozenBatchNorm2d()
            (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn3): FrozenBatchNorm2d()
            (relu): ReLU(inplace)
            (downsample): Sequential(
              (0): Conv2d(64

In [33]:
# Two random 3-channel images of different heights (300 and 500) and width 400,
# used as dummy inputs for the detector.
x = [torch.rand(3, height, 400) for height in (300, 500)]

In [36]:
# Inference only: skip autograd bookkeeping to save time and memory, and call
# the module itself (rather than .forward) so that registered hooks run.
with torch.no_grad():
    predictions = model(x)

KeyboardInterrupt: 

In [None]:
# Look up the 'labels' entry of the first prediction
# (presumably one entry per input image -- confirm against the model output).
predictions[0]['labels']