In [None]:
import glob
import pandas as pd
import os
import numpy as np
import torch
from PIL import Image
import torchvision
import cv2
import matplotlib.pyplot as plt

# import utils
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# Device selection is hard-coded to the CPU. The commented-out line below is the
# auto-detecting alternative that picks CUDA whenever it is available:
# device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device="cpu"

In [None]:
# !unzip /content/PennFudanPed.zip -d data/

In [None]:
# Root folder of the extracted Penn-Fudan archive; the other cells should derive
# their paths from this single value instead of repeating the literal.
root_path='/content/data/PennFudanPed'
# Sanity check: glob returns the directory path itself when it exists, [] otherwise.
glob.glob(os.path.join(root_path, 'PNGImages'))

['/content/data/PennFudanPed/PNGImages']

In [None]:
import os
import numpy as np
import torch
from PIL import Image


class PennFudanDataset(torch.utils.data.Dataset):
    """Map-style dataset over a Penn-Fudan style folder layout.

    Images are read from ``<root>/PNGImages`` and instance masks from
    ``<root>/PedMasks``. Both directory listings are sorted and paired by
    position, so the two folders must contain matching files in the same order.

    Each item is ``(img, target)`` where ``target`` is a dict with:

    * ``boxes``    - float32 tensor ``(N, 4)`` as ``[xmin, ymin, xmax, ymax]``
    * ``labels``   - int64 tensor ``(N,)``, all ones (a single foreground class)
    * ``masks``    - uint8 tensor ``(N, H, W)``, one binary mask per instance
    * ``image_id`` - int64 tensor ``(1,)`` holding the dataset index
    * ``area``     - float32 tensor ``(N,)``, box width times box height
    * ``iscrowd``  - int64 tensor ``(N,)``, all zeros

    Args:
        root: Directory containing the ``PNGImages`` and ``PedMasks`` folders.
        transforms: Optional callable applied to the image only. Any falsy
            value (``None``, ``False``) disables it.
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # Sorting both listings keeps image i aligned with mask i.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    @staticmethod
    def _boxes_from_masks(masks):
        """Return an ``(N, 4)`` float32 array of tight boxes around N binary masks."""
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([cols.min(), rows.min(), cols.max(), rows.max()])
        # reshape keeps the (N, 4) layout even when there are no instances at all.
        return np.asarray(boxes, dtype=np.float32).reshape(-1, 4)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(img, target)`` as described on the class."""
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])

        # Pixel data is materialised inside the ``with`` blocks, so the
        # underlying files can be closed as soon as they end.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")
        with Image.open(mask_path) as mask_file:
            # The mask is deliberately not converted to RGB: each pixel value
            # is an instance id, with 0 meaning background.
            mask = np.array(mask_file)

        # Drop the background id explicitly rather than "the first id", so an
        # image fully covered by instances does not lose its lowest-numbered one.
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]
        num_objs = len(obj_ids)

        # Broadcast compare: (N, 1, 1) ids against (H, W) mask -> (N, H, W) booleans.
        masks = mask == obj_ids[:, None, None]

        boxes = torch.as_tensor(self._boxes_from_masks(masks), dtype=torch.float32)
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Number of samples, i.e. the number of files in ``PNGImages``."""
        return len(self.imgs)

In [None]:
from torchvision import transforms as T

def get_transform(train=False):
    """Build the image preprocessing pipeline.

    The pipeline consists of a single ``T.ToTensor()`` step wrapped in
    ``T.Compose``. ``train`` is accepted but not consulted, so the same
    pipeline is returned for both values.
    """
    pipeline = T.Compose([T.ToTensor()])
    return pipeline

def my_collate(batch):
    """Regroup a batch of ``(data, target)`` samples into ``[data_list, target_list]``.

    Nothing is stacked: every sample keeps its own entry, so images of
    different sizes and targets with different instance counts can share a batch.
    """
    def column(position):
        # Collect element `position` of every sample, preserving batch order.
        return [sample[position] for sample in batch]

    return [column(0), column(1)]
  

In [None]:
# Faster R-CNN (ResNet-50 + FPN backbone) initialised from pretrained weights.
# NOTE(review): FastRCNNPredictor is imported at the top of the notebook, but the
# box-predictor head is not replaced in the cells visible here - confirm that
# training on the stock head is intended.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Reuse root_path (defined in the dataset-path cell) instead of repeating the literal path.
dataset = PennFudanDataset(root_path, get_transform())
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=my_collate,  # keeps images/targets as lists, as the detection model expects
)

# Smoke test: one forward pass with targets; the captured output below shows
# that this returns a dict of loss terms.
images, targets = next(iter(data_loader))
output = model(images, targets)
print(output)

	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple)


{'loss_classifier': tensor(0.1365, grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.0024, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0063, grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0015, grad_fn=<DivBackward0>)}


In [None]:
# Use the notebook-wide `device` (set in the first cell) rather than a second
# hard-coded "cpu", so changing it there is enough to move training to a GPU.
model.to(device)
# Detection models only return losses in train mode; set it explicitly so the
# cell still works when re-run after an evaluation cell called model.eval().
model.train()

num_epoch = 1
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

for epoch in range(num_epoch):
    for images, targets in data_loader:
        # Inputs must live on the same device as the model.
        images = [image.to(device) for image in images]
        targets = [{key: value.to(device) for key, value in target.items()} for target in targets]

        optimizer.zero_grad()
        batch_loss = model(images, targets)
        # Total loss is the plain sum of the individual loss terms.
        loss = sum(batch_loss.values())
        print(batch_loss, loss)
        loss.backward()
        optimizer.step()



{'loss_classifier': tensor(0.1909, grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.0120, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0441, grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0031, grad_fn=<DivBackward0>)} tensor(0.2501, grad_fn=<AddBackward0>)
{'loss_classifier': tensor(0.1254, grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.0063, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0161, grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0070, grad_fn=<DivBackward0>)} tensor(0.1547, grad_fn=<AddBackward0>)
{'loss_classifier': tensor(0.1076, grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.0091, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0089, grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0065, grad_fn=<DivBackward0>)} tensor(0.1321, grad_fn=<AddBackward0>)
{'loss_classifier': tensor(0.0659, grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.0092, gra