In [1]:
import os
import numpy as np
import torch
from PIL import Image

In [2]:
class PENNFUDAN(object):
    """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

    ``root`` must contain a ``PNGImages`` directory (the pictures) and a
    ``PedMasks`` directory (one instance mask per picture). Images and masks
    are paired by their position in the sorted directory listings.

    Args:
        root: Path of the dataset directory.
        transforms: Optional callable invoked as ``transforms(img, target)``
            and returning the transformed ``(img, target)``; ``None`` to
            return the data untouched.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Sort both listings so that image i and mask i line up by position.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its detection/segmentation target.

        Returns:
            ``(img, target)`` where ``target`` is a dict with the keys
            ``boxes`` (float32, N x 4 as ``[xmin, ymin, xmax, ymax]``),
            ``labels`` (int64, all ones), ``masks`` (uint8, N x H x W),
            ``image_id``, ``area`` (box height * box width) and ``iscrowd``
            (int64, all zeros).
        """
        img_path = os.path.join(self.root, 'PNGImages', self.imgs[idx])
        mask_path = os.path.join(self.root, 'PedMasks', self.masks[idx])

        img = Image.open(img_path).convert("RGB")
        # The mask is deliberately NOT converted to RGB: every pixel value is
        # an instance id, with 0 used for the background.
        mask = np.array(Image.open(mask_path))

        # Keep only real instance ids. Filtering on the value (instead of
        # dropping the first unique id) stays correct when a mask happens to
        # contain no background pixel at all.
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]
        num_objs = len(obj_ids)

        # Broadcast into one boolean mask per instance: (num_objs, H, W).
        masks = mask == obj_ids[:, None, None]

        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])

        # reshape keeps the tensor 2-D (0 x 4) when the mask has no instances,
        # so the column indexing below never fails.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.ones((num_objs, ), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        # Box area = height * width. Both differences must be parenthesised;
        # otherwise the product binds tighter than the subtraction.
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of images found in ``PNGImages``."""
        return len(self.imgs)

### 1 - Finetuning from a pretrained model

In [3]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Faster R-CNN with a ResNet-50 FPN backbone, loaded with pretrained weights.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Number of output classes for the replacement box predictor
# (presumably background + pedestrian - confirm against the dataset labels).
num_classes = 2
#print(model.roi_heads)

In [4]:
# Input width of the existing classification layer; the new head must accept
# the same number of features.
in_feature = model.roi_heads.box_predictor.cls_score.in_features
# Swap in a fresh head sized for `num_classes`. The attribute is
# `box_predictor` (singular): assigning to any other name merely registers an
# extra, unused submodule and leaves the pretrained head in place.
model.roi_heads.box_predictor = FastRCNNPredictor(in_feature, num_classes)

### 2 - Modifying the model to add a different backbone

Rebuilding the Faster R-CNN model on top of a MobileNetV2 backbone.

In [5]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

In [6]:
# Keep only the `.features` part of a pretrained MobileNetV2 to serve as the
# detection backbone.
backbone = torchvision.models.mobilenet_v2(pretrained=True).features

In [7]:
# Record the channel count of the backbone output on the module itself.
# NOTE(review): torchvision's FasterRCNN is documented to read
# `backbone.out_channels`; 1280 is assumed to match MobileNetV2's final
# feature map - confirm if the backbone is changed.
backbone.out_channels =1280

In [8]:
# Anchor configuration for the RPN: a single group of five sizes and a single
# group of three aspect ratios (one inner tuple each, i.e. one feature map).
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                 aspect_ratios=((0.5, 1.0, 2.0),))

In [9]:
# RoI pooling over the backbone's single feature map. torchvision wraps a
# plain-tensor backbone output in a dict under the *string* key '0', so the
# feature-map name must be the string '0'; an int 0 would match no feature map.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                               output_size=7,
                                               sampling_ratio=2)
# Assemble Faster R-CNN from the custom backbone, anchor generator and
# RoI pooler; 2 output classes.
fasterrcnn_model = FasterRCNN(backbone,
                  num_classes=2,
                  rpn_anchor_generator=anchor_generator,
                  box_roi_pool=roi_pooler)

### PENNFUDAN Dataset

In [10]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

In [11]:
def get_model_instance_segmentation(num_classes):
    """Build a pretrained Mask R-CNN whose heads predict ``num_classes`` classes.

    Loads ``maskrcnn_resnet50_fpn`` with pretrained weights, then replaces both
    the box predictor and the mask predictor with new heads sized for
    ``num_classes``. The input sizes of the replaced heads are printed.

    Args:
        num_classes: Number of classes the new heads should output.

    Returns:
        The modified model.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = model.roi_heads

    # Box head: the new predictor takes the same input width as the old one.
    box_in_features = heads.box_predictor.cls_score.in_features
    print("get_model_instance_segmentation - in_features: ",box_in_features)
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: same input channels as before, 256 hidden channels.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    print(mask_in_channels)
    mask_hidden_channels = 256
    heads.mask_predictor = MaskRCNNPredictor(mask_in_channels,
                                             mask_hidden_channels,
                                             num_classes)

    return model

In [12]:
import transforms as T
import utils
def get_transform(train):
    """Compose the preprocessing pipeline for a dataset split.

    Every split gets ``T.ToTensor()``; when ``train`` is truthy a
    ``T.RandomHorizontalFlip(0.5)`` is appended after it.

    Args:
        train: Whether to include the training-only random flip.

    Returns:
        A ``T.Compose`` wrapping the selected transforms.
    """
    steps = [T.ToTensor()]
    if train:
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)

In [13]:
from engine import train_one_epoch, evaluate
import utils

In [14]:
def main():
    """Train and evaluate the instance-segmentation model on PennFudanPed.

    Steps: pick the device (CUDA when available), build train/test datasets
    from the ``PennFudanPed`` directory, shuffle the indices and hold out the
    last 50 for testing, then run 10 epochs of SGD, stepping a StepLR
    scheduler (factor 0.1 every 3 epochs) and evaluating after each epoch.

    Returns:
        The string ``"That's it !"`` once training has finished.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    num_classes = 2

    # Two views of the same files: only the training view gets augmentation.
    train_full = PENNFUDAN('PennFudanPed', get_transform(train=True))
    test_full = PENNFUDAN('PennFudanPed', get_transform(train=False))

    # One shared permutation keeps the train and test subsets disjoint.
    shuffled = torch.randperm(len(train_full)).tolist()
    dataset = torch.utils.data.Subset(train_full, shuffled[:-50])
    dataset_test = torch.utils.data.Subset(test_full, shuffled[-50:])

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=2,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        collate_fn=utils.collate_fn,
    )

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Optimise only the parameters that require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        trainable, lr=0.005, momentum=0.9, weight_decay=0.0005
    )
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=3, gamma=0.1
    )

    num_epochs = 10
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        lr_scheduler.step()
        evaluate(model, data_loader_test, device=device)

    return "That's it !"

In [15]:
# Run the full training/evaluation loop; the returned string is shown as the
# cell's output.
main()

cuda
get_model_instance_segmentation - in_features:  1024
256


	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple)


Epoch: [0]  [ 0/60]  eta: 0:00:34  lr: 0.000090  loss: 2.9290 (2.9290)  loss_classifier: 0.8120 (0.8120)  loss_box_reg: 0.2290 (0.2290)  loss_mask: 1.8441 (1.8441)  loss_objectness: 0.0335 (0.0335)  loss_rpn_box_reg: 0.0104 (0.0104)  time: 0.5746  data: 0.1427  max mem: 2550
Epoch: [0]  [10/60]  eta: 0:00:18  lr: 0.000936  loss: 1.3332 (1.7261)  loss_classifier: 0.4867 (0.5150)  loss_box_reg: 0.1805 (0.1890)  loss_mask: 0.7630 (0.9858)  loss_objectness: 0.0245 (0.0280)  loss_rpn_box_reg: 0.0091 (0.0083)  time: 0.3601  data: 0.0181  max mem: 2861
Epoch: [0]  [20/60]  eta: 0:00:14  lr: 0.001783  loss: 0.9235 (1.2428)  loss_classifier: 0.2774 (0.3740)  loss_box_reg: 0.1712 (0.1804)  loss_mask: 0.3625 (0.6543)  loss_objectness: 0.0168 (0.0230)  loss_rpn_box_reg: 0.0100 (0.0112)  time: 0.3453  data: 0.0045  max mem: 3065
Epoch: [0]  [30/60]  eta: 0:00:10  lr: 0.002629  loss: 0.4942 (1.0005)  loss_classifier: 0.1020 (0.2842)  loss_box_reg: 0.1578 (0.1755)  loss_mask: 0.2282 (0.5106)  loss_ob

Epoch: [2]  [ 0/60]  eta: 0:00:36  lr: 0.005000  loss: 0.2890 (0.2890)  loss_classifier: 0.0292 (0.0292)  loss_box_reg: 0.0331 (0.0331)  loss_mask: 0.2035 (0.2035)  loss_objectness: 0.0010 (0.0010)  loss_rpn_box_reg: 0.0223 (0.0223)  time: 0.6006  data: 0.1739  max mem: 3341
Epoch: [2]  [10/60]  eta: 0:00:18  lr: 0.005000  loss: 0.1898 (0.1901)  loss_classifier: 0.0292 (0.0295)  loss_box_reg: 0.0129 (0.0140)  loss_mask: 0.1325 (0.1359)  loss_objectness: 0.0010 (0.0014)  loss_rpn_box_reg: 0.0097 (0.0094)  time: 0.3685  data: 0.0180  max mem: 3341
Epoch: [2]  [20/60]  eta: 0:00:14  lr: 0.005000  loss: 0.1898 (0.2040)  loss_classifier: 0.0300 (0.0322)  loss_box_reg: 0.0129 (0.0189)  loss_mask: 0.1318 (0.1389)  loss_objectness: 0.0008 (0.0017)  loss_rpn_box_reg: 0.0097 (0.0122)  time: 0.3604  data: 0.0028  max mem: 3341
Epoch: [2]  [30/60]  eta: 0:00:10  lr: 0.005000  loss: 0.2049 (0.2003)  loss_classifier: 0.0305 (0.0308)  loss_box_reg: 0.0136 (0.0189)  loss_mask: 0.1272 (0.1379)  loss_ob

Epoch: [4]  [ 0/60]  eta: 0:00:26  lr: 0.000500  loss: 0.1615 (0.1615)  loss_classifier: 0.0177 (0.0177)  loss_box_reg: 0.0077 (0.0077)  loss_mask: 0.1300 (0.1300)  loss_objectness: 0.0005 (0.0005)  loss_rpn_box_reg: 0.0056 (0.0056)  time: 0.4379  data: 0.1291  max mem: 3341
Epoch: [4]  [10/60]  eta: 0:00:19  lr: 0.000500  loss: 0.1695 (0.1795)  loss_classifier: 0.0273 (0.0306)  loss_box_reg: 0.0111 (0.0152)  loss_mask: 0.1269 (0.1228)  loss_objectness: 0.0010 (0.0013)  loss_rpn_box_reg: 0.0097 (0.0096)  time: 0.3944  data: 0.0146  max mem: 3341
Epoch: [4]  [20/60]  eta: 0:00:15  lr: 0.000500  loss: 0.1486 (0.1645)  loss_classifier: 0.0250 (0.0259)  loss_box_reg: 0.0103 (0.0121)  loss_mask: 0.1029 (0.1175)  loss_objectness: 0.0003 (0.0008)  loss_rpn_box_reg: 0.0066 (0.0082)  time: 0.3790  data: 0.0032  max mem: 3341
Epoch: [4]  [30/60]  eta: 0:00:10  lr: 0.000500  loss: 0.1276 (0.1571)  loss_classifier: 0.0220 (0.0241)  loss_box_reg: 0.0044 (0.0109)  loss_mask: 0.1014 (0.1141)  loss_ob

Epoch: [6]  [ 0/60]  eta: 0:00:29  lr: 0.000050  loss: 0.2120 (0.2120)  loss_classifier: 0.0454 (0.0454)  loss_box_reg: 0.0190 (0.0190)  loss_mask: 0.1333 (0.1333)  loss_objectness: 0.0040 (0.0040)  loss_rpn_box_reg: 0.0104 (0.0104)  time: 0.4867  data: 0.1669  max mem: 3341
Epoch: [6]  [10/60]  eta: 0:00:18  lr: 0.000050  loss: 0.1386 (0.1559)  loss_classifier: 0.0219 (0.0244)  loss_box_reg: 0.0071 (0.0104)  loss_mask: 0.1104 (0.1133)  loss_objectness: 0.0002 (0.0008)  loss_rpn_box_reg: 0.0072 (0.0069)  time: 0.3796  data: 0.0172  max mem: 3341
Epoch: [6]  [20/60]  eta: 0:00:15  lr: 0.000050  loss: 0.1343 (0.1552)  loss_classifier: 0.0210 (0.0237)  loss_box_reg: 0.0057 (0.0100)  loss_mask: 0.1104 (0.1139)  loss_objectness: 0.0002 (0.0009)  loss_rpn_box_reg: 0.0052 (0.0068)  time: 0.3719  data: 0.0029  max mem: 3341
Epoch: [6]  [30/60]  eta: 0:00:11  lr: 0.000050  loss: 0.1535 (0.1572)  loss_classifier: 0.0232 (0.0250)  loss_box_reg: 0.0087 (0.0104)  loss_mask: 0.1138 (0.1137)  loss_ob

Epoch: [8]  [ 0/60]  eta: 0:00:32  lr: 0.000050  loss: 0.1370 (0.1370)  loss_classifier: 0.0209 (0.0209)  loss_box_reg: 0.0065 (0.0065)  loss_mask: 0.0989 (0.0989)  loss_objectness: 0.0042 (0.0042)  loss_rpn_box_reg: 0.0065 (0.0065)  time: 0.5414  data: 0.1724  max mem: 3410
Epoch: [8]  [10/60]  eta: 0:00:19  lr: 0.000050  loss: 0.1415 (0.1527)  loss_classifier: 0.0209 (0.0230)  loss_box_reg: 0.0088 (0.0097)  loss_mask: 0.1050 (0.1117)  loss_objectness: 0.0004 (0.0012)  loss_rpn_box_reg: 0.0073 (0.0071)  time: 0.3826  data: 0.0175  max mem: 3410
Epoch: [8]  [20/60]  eta: 0:00:14  lr: 0.000050  loss: 0.1518 (0.1513)  loss_classifier: 0.0202 (0.0238)  loss_box_reg: 0.0088 (0.0088)  loss_mask: 0.1050 (0.1109)  loss_objectness: 0.0003 (0.0011)  loss_rpn_box_reg: 0.0068 (0.0068)  time: 0.3649  data: 0.0027  max mem: 3410
Epoch: [8]  [30/60]  eta: 0:00:11  lr: 0.000050  loss: 0.1421 (0.1510)  loss_classifier: 0.0188 (0.0229)  loss_box_reg: 0.0067 (0.0089)  loss_mask: 0.1056 (0.1120)  loss_ob

"That's it !"