In [1]:
import os
import torchvision
import torch
from PIL import Image, ImageDraw
import numpy as np
import torchvision.transforms as T
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET
from torchvision import transforms

In [2]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
def get_instance_segmentation_model(num_classes):
    """Build a Faster R-CNN (ResNet-50 FPN) detector with a fresh box head.

    Despite the name, the network constructed here is torchvision's
    ``fasterrcnn_resnet50_fpn``; only its box predictor is swapped out.

    Args:
        num_classes: number of outputs of the new ``FastRCNNPredictor``.

    Returns:
        The model created with ``pretrained=True`` whose
        ``roi_heads.box_predictor`` has been replaced.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # The replacement head must accept the same feature width as the old one.
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features, num_classes)
    return detector

In [3]:
# Use the first CUDA GPU when one is present; otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# 4 predictor outputs; the dataset uses labels 1..3 (apple, banana, orange),
# presumably leaving index 0 for the background class.
model = get_instance_segmentation_model(4)
model = model.to(device)

In [4]:
class CustomDataset(Dataset):
    """Detection dataset read from a flat folder of images and XML annotations.

    Every image is paired with the ``.xml`` file that shares its file stem.
    Files without a matching partner are ignored.

    Attributes:
        img_dir: directory that was scanned.
        transform: optional callable applied to the PIL image.
        names: sorted directory listing.
        img_names: full paths of the images, one per sample.
        ann_names: full paths of the annotations, aligned with ``img_names``.
    """

    # Class name -> integer label written into ``target['labels']``.
    LABELS = {'apple': 1, 'banana': 2, 'orange': 3}

    def __init__(self, img_dir=r'D:\Datasets\Apple obj-det\train', transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # os.listdir order is arbitrary; sort so sample indices are stable.
        self.names = sorted(os.listdir(img_dir))

        # Map each annotation's stem to its real file name.
        annotations = {}
        for name in self.names:
            stem, ext = os.path.splitext(name)
            if ext.lower() == '.xml':
                annotations[stem] = name

        # Pair by stem rather than by even/odd position in the listing, so a
        # stray or missing file cannot shift every later pair.
        self.img_names = []
        self.ann_names = []
        for name in self.names:
            stem, ext = os.path.splitext(name)
            if ext.lower() != '.xml' and stem in annotations:
                self.img_names.append(os.path.join(img_dir, name))
                self.ann_names.append(os.path.join(img_dir, annotations[stem]))

    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``target`` holds ``boxes`` (float32, shape ``(N, 4)`` as
        xmin, ymin, xmax, ymax), ``labels`` (int64), ``image_id`` (int64),
        ``area`` (int64) and ``iscrowd`` (uint8 zeros), where ``N`` is the
        number of annotated boxes.

        Raises:
            KeyError: if an object name is not in ``LABELS``.
        """
        # Close the file handle promptly; convert() returns a loaded copy.
        with Image.open(self.img_names[idx]) as handle:
            img = handle.convert("RGB")

        root = ET.parse(self.ann_names[idx]).getroot()
        boxes = []
        labels = []
        for obj in root.findall("object"):
            class_name = obj.findtext("name")
            if class_name is None:
                # Fall back to the first child, as the positional lookup did.
                class_name = obj[0].text
            label = self.LABELS[class_name]
            for bndbox in obj.findall("bndbox"):
                # Read corners by tag name so element order does not matter.
                boxes.append([int(float(bndbox.findtext(tag)))
                              for tag in ("xmin", "ymin", "xmax", "ymax")])
                # Append the label with its box to keep the two lists aligned.
                labels.append(label)

        areas = [(box[2] - box[0]) * (box[3] - box[1]) for box in boxes]

        target = {}
        # reshape keeps the (N, 4) layout even when the image has no objects.
        target['boxes'] = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        target['labels'] = torch.as_tensor(labels, dtype=torch.int64)
        target['image_id'] = torch.as_tensor([idx], dtype=torch.int64)
        target['area'] = torch.as_tensor(areas, dtype=torch.int64)
        # One flag per box, not one per image in the dataset.
        target['iscrowd'] = torch.zeros((len(boxes),), dtype=torch.uint8)

        if self.transform is not None:
            img = self.transform(img)
        return img, target

    def __len__(self):
        """Number of image/annotation pairs found in ``img_dir``."""
        return len(self.img_names)

In [5]:
def get_transform(train):
    """Return the preprocessing pipeline: a Compose holding only ``ToTensor``.

    Args:
        train: accepted but not consulted; the same pipeline is returned
            whatever its value.
    """
    return T.Compose([T.ToTensor()])

In [6]:
# Training split: default directory, training transform.
cus = CustomDataset(transform=get_transform(True))
# Evaluation split: request the non-training transform so that any augmentation
# later added behind the `train` flag is not applied to test images.
test = CustomDataset(img_dir=r'D:\Datasets\Apple obj-det\test', transform=get_transform(False))

In [7]:
# Switch the working directory to the folder holding the `utils` and `engine`
# helper modules (presumably torchvision's detection reference scripts, going
# by the path) so the two imports below resolve. The order of these lines matters.
%cd C:\Users\janor\JupyterProjects\Tutorials\pytorch tutorial\vision\references\detection
import utils
from engine import train_one_epoch, evaluate
# Training loader: one image per batch, reshuffled each epoch, loaded in the
# main process (num_workers=0); batching is delegated to utils.collate_fn.
data_loader = torch.utils.data.DataLoader(
    cus, batch_size=1, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)
# Evaluation loader: same settings, but in fixed order.
test_loader = torch.utils.data.DataLoader(
    test, batch_size=1, shuffle=False, num_workers=0,
    collate_fn=utils.collate_fn)
# Return to the project directory now that the helpers are imported.
%cd C:\Users\janor\JupyterProjects\Projects\custom object detection

C:\Users\janor\JupyterProjects\Tutorials\pytorch tutorial\vision\references\detection
C:\Users\janor\JupyterProjects\Projects\custom object detection


In [8]:
# Hand the optimizer only those parameters that still require gradients.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)
# The scheduler is stepped once per epoch below, so the learning rate is
# multiplied by 0.1 after every 3 epochs.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)
for epoch in range(10):
    # One pass over the training loader, logging every 10 iterations.
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    lr_scheduler.step()
    # Score the current weights on the test loader after each epoch.
    evaluate(model, test_loader, device=device)

	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple)


Epoch: [0]  [  0/240]  eta: 0:11:20  lr: 0.000026  loss: 1.6304 (1.6304)  loss_classifier: 1.5285 (1.5285)  loss_box_reg: 0.0940 (0.0940)  loss_objectness: 0.0043 (0.0043)  loss_rpn_box_reg: 0.0036 (0.0036)  time: 2.8338  data: 0.5682  max mem: 1306
Epoch: [0]  [ 10/240]  eta: 0:03:45  lr: 0.000235  loss: 1.6092 (1.3399)  loss_classifier: 1.3551 (1.1324)  loss_box_reg: 0.1105 (0.1904)  loss_objectness: 0.0025 (0.0085)  loss_rpn_box_reg: 0.0042 (0.0086)  time: 0.9803  data: 0.1900  max mem: 1651
Epoch: [0]  [ 20/240]  eta: 0:03:31  lr: 0.000444  loss: 0.7426 (0.9939)  loss_classifier: 0.6275 (0.7763)  loss_box_reg: 0.1226 (0.1979)  loss_objectness: 0.0011 (0.0065)  loss_rpn_box_reg: 0.0056 (0.0132)  time: 0.8687  data: 0.2076  max mem: 1651
Epoch: [0]  [ 30/240]  eta: 0:03:08  lr: 0.000653  loss: 0.2936 (0.7657)  loss_classifier: 0.1921 (0.5821)  loss_box_reg: 0.1114 (0.1658)  loss_objectness: 0.0011 (0.0061)  loss_rpn_box_reg: 0.0061 (0.0118)  time: 0.8521  data: 0.2111  max mem: 1651


  "Palette images with Transparency expressed in bytes should be "


Epoch: [0]  [140/240]  eta: 0:01:15  lr: 0.002952  loss: 0.1089 (0.3902)  loss_classifier: 0.0398 (0.2305)  loss_box_reg: 0.0470 (0.1407)  loss_objectness: 0.0005 (0.0051)  loss_rpn_box_reg: 0.0100 (0.0139)  time: 0.6910  data: 0.0636  max mem: 1683
Epoch: [0]  [150/240]  eta: 0:01:07  lr: 0.003161  loss: 0.0947 (0.3743)  loss_classifier: 0.0387 (0.2196)  loss_box_reg: 0.0361 (0.1360)  loss_objectness: 0.0008 (0.0049)  loss_rpn_box_reg: 0.0086 (0.0138)  time: 0.6669  data: 0.0268  max mem: 1683
Epoch: [0]  [160/240]  eta: 0:00:59  lr: 0.003370  loss: 0.0818 (0.3569)  loss_classifier: 0.0372 (0.2091)  loss_box_reg: 0.0216 (0.1295)  loss_objectness: 0.0008 (0.0049)  loss_rpn_box_reg: 0.0069 (0.0134)  time: 0.6781  data: 0.0342  max mem: 1683
Epoch: [0]  [170/240]  eta: 0:00:52  lr: 0.003579  loss: 0.0944 (0.3464)  loss_classifier: 0.0523 (0.2015)  loss_box_reg: 0.0424 (0.1267)  loss_objectness: 0.0009 (0.0048)  loss_rpn_box_reg: 0.0066 (0.0133)  time: 0.6751  data: 0.0366  max mem: 1683


In [8]:
# Load the saved detector and put it in evaluation mode in one step.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a trusted source.
mod = torch.load('fruit_det_mod.pth').eval()

In [62]:
import cv2
def get_prediction(img, threshold=0.0):
    """Run the loaded detector ``mod`` on one image and keep confident hits.

    Args:
        img: image tensor; it is moved to the global ``device`` and passed to
            the model as a single-element list.
        threshold: detections are kept up to and including the last one whose
            score is strictly greater than this value.

    Returns:
        ``(pred_boxes, pred_class)`` where ``pred_boxes`` is a list of
        ``[(x1, y1), (x2, y2)]`` corner pairs and ``pred_class`` the matching
        class names. Both lists are empty when no score beats ``threshold``.
    """
    lab_leg = {1: 'apple', 2: 'banana', 3: 'orange'}
    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        pred = mod([img.to(device)])[0]

    # Class names of every detection.
    pred_class = [lab_leg[label] for label in pred['labels'].cpu().tolist()]
    # Bounding boxes as (top-left, bottom-right) corner pairs.
    pred_boxes = [[(box[0], box[1]), (box[2], box[3])]
                  for box in pred['boxes'].cpu().detach().numpy()]
    pred_score = pred['scores'].cpu().detach().numpy()

    # Position of the last detection above the threshold. enumerate() avoids
    # list.index(), which returns the first match and so cuts early on ties.
    above = [position for position, score in enumerate(pred_score) if score > threshold]
    keep = above[-1] + 1 if above else 0  # nothing confident -> empty result
    return pred_boxes[:keep], pred_class[:keep]

In [68]:
def detect_img(im, threshold=0.5, rect_th=1, text_size=1, text_th=1):
    """Detect objects in ``im`` and draw their boxes and class names.

    Args:
        im: image tensor indexed as (channel, row, column); it is scaled by
            255 before being cast to bytes, so values are presumably in [0, 1].
        threshold: score threshold forwarded to ``get_prediction``.
        rect_th: rectangle line thickness.
        text_size: font scale of the class label.
        text_th: text line thickness.

    Returns:
        A single-channel float32 array (values on a 0-255 scale) with the
        annotations drawn on it.
    """
    boxes, pred_cls = get_prediction(im, threshold)  # Get predictions
    # .cpu() so tensors living on the GPU can be converted to numpy as well.
    pixels = im.detach().cpu().mul(255).permute(1, 2, 0).byte().numpy()
    # Convert to a single-channel (grayscale) float image, not RGB.
    # NOTE(review): the data is in RGB order but COLOR_BGR2GRAY is applied, so
    # the red/blue weights are swapped; kept as-is to preserve the output.
    img = cv2.cvtColor(np.ascontiguousarray(pixels, dtype=np.float32), cv2.COLOR_BGR2GRAY)
    for (top_left, bottom_right), label in zip(boxes, pred_cls):
        # OpenCV drawing functions need integer pixel coordinates; the
        # predicted corners arrive as numpy floats.
        pt1 = (int(top_left[0]), int(top_left[1]))
        pt2 = (int(bottom_right[0]), int(bottom_right[1]))
        # NOTE(review): on a single-channel image presumably only the first
        # colour component (0) is used, so the annotations render black.
        cv2.rectangle(img, pt1, pt2, color=(0, 255, 0), thickness=rect_th)  # Draw the box
        cv2.putText(img, label, pt1, cv2.FONT_HERSHEY_SIMPLEX, text_size, (0, 255, 0), thickness=text_th)  # Write the class name
    return img