In [4]:
import os
import xml.etree.ElementTree as ET
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from glob import glob
from torchvision import models
import torch
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
%matplotlib inline

In [5]:
labels = ['background', 'orange', 'apple', 'banana']
label2targets = {l: t for t, l in enumerate(labels)}
targets2label = {t: l for l, t in label2targets.items()}
num_classes = len(targets2label)

In [6]:
def set_seed(seed):
    """
    Use this to set ALL the random seeds to a fixed value and take out any randomness from cuda kernels
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.benchmark = False  ##uses the inbuilt cudnn auto-tuner to find the fastest convolution algorithms. -
    torch.backends.cudnn.enabled   = False

    return True

device = 'cpu'
if torch.cuda.device_count() > 0 and torch.cuda.is_available():
    print("Cuda installed! Running on GPU!")
    device = 'cuda'
else:
    print("No GPU available!")

No GPU available!


In [7]:
def parse_xml(xml_file, scale=1):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    bounding_boxes = []
    labels = []

    for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        xmin = int(bbox.find('xmin').text) / scale
        ymin = int(bbox.find('ymin').text) / scale
        xmax = int(bbox.find('xmax').text) / scale
        ymax = int(bbox.find('ymax').text) / scale
        bounding_boxes.append((xmin, ymin, xmax, ymax))
        labels.append(obj.find('name').text)

    return bounding_boxes, labels

def load_dataset(folder):
    output_data = {}

    for filename in os.listdir(folder):
        if filename.endswith('.xml'):
            xml_path = os.path.join(folder, filename)
            image_name = os.path.splitext(filename)[0] + '.jpg'
            image_path = os.path.join(folder, image_name)
            img = Image.open(image_path)
            if os.path.exists(image_path):
                bounding_boxes, name = parse_xml(xml_path)
                output_data[image_name] = {'image': img, 'bounding_boxes': bounding_boxes, 'label': name}
            else:
                print(f"Image file {image_name} not found for XML file {filename}")

    df = pd.DataFrame(output_data)
    df = df.T
    return df

In [8]:
class FruitDataSet(Dataset):
    def __init__(self, root_dir, transforms=None, resize_params=(224, 224)):
        self.root_dir = root_dir
        self.transforms = transforms
        self.img_paths = sorted(glob(self.root_dir + '/*.jpg'))
        self.xml_paths = sorted(glob(self.root_dir + '/*.xml'))
        self.resize_params = resize_params
    
    def parse_xml(self, xml_file, dims):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        bounding_boxes = []
        labels = []

        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            xmin = (int(bbox.find('xmin').text) / dims[1]) * self.resize_params[1]
            ymin = (int(bbox.find('ymin').text) / dims[0]) * self.resize_params[0]
            xmax = (int(bbox.find('xmax').text) / dims[1]) * self.resize_params[1]
            ymax = (int(bbox.find('ymax').text) / dims[0]) * self.resize_params[0]
            bounding_boxes.append((xmin, ymin, xmax, ymax))
            labels.append(obj.find('name').text)

        return bounding_boxes, labels
        
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        xml_path = self.xml_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
        dims = img.shape[:2]
        img = cv2.resize(img, self.resize_params, cv2.INTER_AREA)
        img /= 255.0
        
    
        bounding_boxes, labels = self.parse_xml(xml_path, dims)
        bounding_boxes = torch.tensor(bounding_boxes).float()
        area = torch.as_tensor((bounding_boxes[:, 3] - bounding_boxes[:, 1]) * (bounding_boxes[:, 2] - bounding_boxes[:, 0]))
          # suppose all instances are not crowd
        iscrowd = torch.zeros(( bounding_boxes.shape[0],), dtype=torch.int64)
        target = {}
        target['labels'] = torch.tensor([label2targets[label] for label in labels]).long()
        target['boxes'] = bounding_boxes
        target['area'] = area
        #target['iscrowd'] = iscrowd
        target["image_id"] = torch.tensor([idx])
        
        if self.transforms:
            sample = {
                "image": img,
                "bboxes": target["boxes"],
                'labels': labels
            }
            
        return torch.tensor(img).permute(2, 0 ,1), target
        
        
        

In [9]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [10]:
train_ds = FruitDataSet(root_dir='train')
test_ds = FruitDataSet(root_dir='test')
#print('length of dataset = ', len(train_ds), '\n')
img, target = train_ds[78]
#print(img.shape, "\n", target)
train_dl = DataLoader(train_ds, batch_size=4, shuffle=True, collate_fn =  collate_fn)
test_dl = DataLoader(test_ds, batch_size=4, shuffle=False,  collate_fn =  collate_fn)

In [11]:
images, targets = next(iter(train_dl))
images = list(image.to(device) for image in images)
print(targets)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]


({'labels': tensor([2, 2]), 'boxes': tensor([[109.2599,  31.0435, 199.3394, 206.3478],
        [ 27.0581,  10.9565, 115.7676, 164.3478]]), 'area': tensor([15791.3291, 13607.2617]), 'image_id': tensor([67])}, {'labels': tensor([2]), 'boxes': tensor([[ 18.6667,  31.3600, 205.3333, 221.7600]]), 'area': tensor([35541.3320]), 'image_id': tensor([20])}, {'labels': tensor([1, 3]), 'boxes': tensor([[ 39.4800,  45.8653, 139.7200, 151.7992],
        [ 21.5600,  39.3554, 188.7200, 174.5839]]), 'area': tensor([10618.8203, 22604.8008]), 'image_id': tensor([166])}, {'labels': tensor([1]), 'boxes': tensor([[  6.2720,   4.7158, 124.5440, 165.0526]]), 'area': tensor([18963.3574]), 'image_id': tensor([235])})


In [12]:
def plot_img_bbox(img, target):
    # plot the image and bboxes
    # Bounding boxes are defined as follows: x-min y-min width height
    fig, a = plt.subplots(1,1)
    fig.set_size_inches(6,6)
    a.imshow(img)
    for box in (target['bounding_boxes']):
        x, y, width, height  = box[0], box[1], box[2]-box[0], box[3]-box[1]
        rect = patches.Rectangle((x, y),
                                 width, height,
                                 linewidth = 2,
                                 edgecolor = 'r',
                                 facecolor = 'none')

        # Draw the bounding box on top of the image
        a.add_patch(rect)
    plt.show()
    
# plotting the image with bboxes. Feel free to change the index
#img, target = test_ds[20]
#plot_img_bbox(img, target)

In [119]:
set_seed(42)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)


Using cache found in /home/crasious/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-4-2 Python-3.7.10 torch-1.13.1+cu117 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [13]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [14]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 4)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "


In [15]:
model(images, targets)

{'loss_classifier': tensor(1.6202, grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.2383, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.0059, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0049, grad_fn=<DivBackward0>)}

In [None]:
itr = 1

for epoch in range(10):
    print(f"epoch: {epoch}")
    for images, targets in train_dl:
        images = list(image.to(device) for image in images)
        #print(targets)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
       
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        loss_value = losses.item()

        #loss_hist.send(loss_value)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        print(f"Iteration #{itr} loss: {loss_value}")
        itr += 1

Iteration #1 loss: 1.9877402782440186
Iteration #2 loss: 0.7592624425888062
Iteration #3 loss: 0.8154124617576599
Iteration #4 loss: 0.6115449666976929
Iteration #5 loss: 1.065006971359253
Iteration #6 loss: 0.714165449142456
Iteration #7 loss: 0.6699426174163818
Iteration #8 loss: 0.33624786138534546
Iteration #9 loss: 0.4830794930458069
Iteration #10 loss: 0.28032469749450684
Iteration #11 loss: 0.29673856496810913
Iteration #12 loss: 0.26447951793670654
Iteration #13 loss: 0.47230643033981323
Iteration #14 loss: 0.44779759645462036
Iteration #15 loss: 0.3303823471069336
Iteration #16 loss: 0.36142227053642273
Iteration #17 loss: 0.2955268621444702
Iteration #18 loss: 0.5276088118553162
Iteration #19 loss: 0.5749486684799194
Iteration #20 loss: 0.3386731743812561
Iteration #21 loss: 0.2983100414276123
Iteration #22 loss: 0.39120984077453613
Iteration #23 loss: 0.18498551845550537
Iteration #24 loss: 0.4596691131591797
Iteration #25 loss: 0.35394197702407837
Iteration #26 loss: 0.1445

In [None]:
def resize_image(image):
    # Open image
    
    # Check image dimensions
    width, height = image.size
    print(width, height)
    # If image dimensions are already (224, 224), return the original image
    if width == 224 and height == 224:
        return image
    
    # Resize image
    resized_image = image.resize((224, 224))
    
    return resized_image