In [143]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob
import xml.etree.ElementTree as ET
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.utils.checkpoint
from torch.utils.data import Dataset
import torchvision
from torchvision.transforms import transforms
# import torchvision.transforms as transforms

In [163]:
# VOCdataset create

class VOCDataset(Dataset):
    CLASSES_NAME = (
        "__background__",
        "aeroplane",
        "bicyle",
        "bird",
        "boat",
        "bottle",
        "bus",
        "car",
        "cat",
        "chair",
        "cow",
        "diningtable",
        "dog",
        "horse",
        "motorbike",
        "person",
        "pottedplant",
        "sheep",
        "sofa",
        "train",
        "tvmonitor"
    )

    #     data loading
    def __init__(self, root_dir, resize_size=[800, 1024], split='train', use_difficult=False, transforms=None):
        self.root = root_dir
        self.use_difficult = use_difficult
        self.imgset = split
        self.mean = [0.485, 0.456, 0.406]  # VOC數據集中所有圖像矩陣的平均值和方差，為後續圖像歸一化做準備
        self.std = [0.229, 0.224, 0.225]
        self.resize_size = resize_size
#         transform = transforms.Compose([
#             transforms.ToTensor(),
#             transforms.Normalize(self.mean, self.std),
#             transforms.Resize(self.resize_size)
#         ])
        self.transforms = transforms

        self._annopath = os.path.join(self.root, "Annotations", "%s.xml")
        self._imgpath = os.path.join(self.root, "data_object detection", "%s.jpg")
        self._imgsetpath = os.path.join(self.root, "ImageSets", "%s.txt")

        # 讀取train.txt中內容
        with open(self._imgsetpath % self.imgset) as f:
            self.img_ids = f.readlines()
        self.img_ids = [x.strip() for x in self.img_ids]  # strip()去除首尾空格

        self.name2id = dict(zip(VOCDataset.CLASSES_NAME, range(len(VOCDataset.CLASSES_NAME))))
#         self.resize_size = resize_size
        
        print("INFO=====>voc dataset init finished  ! !")

    # 獲得長度
    def __len__(self):
        return len(self.img_ids)

    def _read_img_rgb(self, path):
        return cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)

    def __getitem__(self, index):

        img_id = self.img_ids[index]
        img = self._read_img_rgb(self._imgpath % img_id)

        anno = ET.parse(self._annopath % img_id).getroot()  # 讀取xml文檔的根目錄
        boxes = []
        classes = []

        for obj in anno.iter("object"):
            difficult = int(obj.find("difficult").text) == 1
            if not self.use_difficult and difficult:
                continue
            _box = obj.find("bndbox")
            box = [
                _box.find("xmin").text,
                _box.find("ymin").text,
                _box.find("xmax").text,
                _box.find("ymax").text,
            ]
            TO_REMOVE = 1  # 由于像素是網格存储，坐標2實質表示第一个像素格，所以-1
            box = tuple(
                map(lambda x: x - TO_REMOVE, list(map(float, box)))
            )
            boxes.append(box)

            name = obj.find("name").text.lower().strip()
            classes.append(self.name2id[name])  # 將類別映射回去

        boxes = np.array(boxes, dtype=np.int64)

        # 将img,box和classes转成tensor
#         img = transforms(img)  # transforms 自動將圖像歸一化，
        img = transforms.ToTensor()(img)
        boxes = torch.from_numpy(boxes)
        classes = torch.LongTensor(classes)
#         img_id = torch.tesnsor([index])
#         area = torch.tesnsor([])
#         iscrowd = torch.tesnsor([])
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
#         target["area"] = area
#         target["iscrowd"] = iscrowd
#         target["img_id"] = img_id
        
        
#         if self.transforms is not None:
#             img, target = self.transforms(img, target)

        return img, target

In [164]:
# VOC dataset&dataLoader
# from VOC_2007 import VOCDataset
root_dir = "C:/Users/User/Desktop/Deep-Learning/Assignment_4_Object_Detection_and_Semantic_Segmentation/VOC_2007/"

# train_VOCdataset = VOCDataset(root_dir,split="train",transforms=transforms)
# val_VOCdataset = VOCDataset(root_dir,split="val",transforms=transforms)
# test_VOCdataset = VOCDataset(root_dir,split="test",transforms=transs84forms)

train_VOCdataset = VOCDataset(root_dir,split="train")
val_VOCdataset = VOCDataset(root_dir,split="val")
test_VOCdataset = VOCDataset(root_dir,split="test")

# VOC_DataLoader
batch_size=1
train_VOCdataloader = DataLoader(train_VOCdataset, batch_size, num_workers = 0)
val_VOCdataloader = DataLoader(val_VOCdataset, batch_size, num_workers = 0)
test_VOCdataloader = DataLoader(test_VOCdataset, batch_size, num_workers = 0)

INFO=====>voc dataset init finished  ! !
INFO=====>voc dataset init finished  ! !
INFO=====>voc dataset init finished  ! !


In [165]:
img, target = train_VOCdataset[1]
target

{'boxes': tensor([[ 12, 310,  83, 361],
         [361, 329, 499, 388],
         [234, 327, 333, 374],
         [174, 326, 251, 363],
         [138, 319, 188, 358],
         [107, 324, 149, 352],
         [ 83, 322, 120, 349]]),
 'labels': tensor([7, 7, 7, 7, 7, 7, 7])}

In [71]:
img.shape

torch.Size([3, 500, 335])

In [141]:
img

tensor([[[0.5137, 0.5451, 0.4941,  ..., 0.4275, 0.4588, 0.4431],
         [0.5922, 0.4980, 0.5020,  ..., 0.4196, 0.4275, 0.4549],
         [0.5451, 0.4941, 0.5059,  ..., 0.4353, 0.4196, 0.4627],
         ...,
         [0.5137, 0.5529, 0.5843,  ..., 0.5333, 0.4706, 0.4588],
         [0.5216, 0.5843, 0.6510,  ..., 0.6431, 0.5490, 0.5176],
         [0.4941, 0.5882, 0.6863,  ..., 0.6549, 0.5804, 0.5686]],

        [[0.5804, 0.6118, 0.5608,  ..., 0.5059, 0.5373, 0.5137],
         [0.6588, 0.5647, 0.5686,  ..., 0.4980, 0.5059, 0.5255],
         [0.6118, 0.5608, 0.5725,  ..., 0.5137, 0.4980, 0.5333],
         ...,
         [0.4000, 0.4235, 0.4157,  ..., 0.3843, 0.3294, 0.3176],
         [0.3961, 0.4471, 0.4745,  ..., 0.4863, 0.4000, 0.3647],
         [0.3725, 0.4549, 0.5020,  ..., 0.4902, 0.4235, 0.4157]],

        [[0.6118, 0.6431, 0.5922,  ..., 0.5333, 0.5647, 0.5529],
         [0.6902, 0.5961, 0.6000,  ..., 0.5255, 0.5333, 0.5647],
         [0.6431, 0.5922, 0.6039,  ..., 0.5490, 0.5333, 0.

In [53]:
# ADE20K dataset&dataloader


### Define model

In [146]:
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor


model = maskrcnn_resnet50_fpn(weights="DEFAULT")

# 固定backbone Net参数
# for param in model.parameters():
#     param.requires_grad = False

# 構建自定義的detection/segmentation head
num_classes_detection = 21  
num_classes_segmentation = 151  

# 替換分類器和mask head
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes_detection)

in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
hidden_layer = 512
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, num_classes_segmentation)

In [132]:
# model Architecture
model.train()

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in

In [166]:
# Difine optimizer
lr = .0001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

num_epochs = 10

for epoch in range(num_epochs):
#     for (images_voc2007, targets_voc2007), (images_ade20k, targets_ade20k) in zip(voc2007_data_loader, ade20k_data_loader):
    for images, targets in train_VOCdataloader: 
    # 合并批次数据
#         images = torch.cat([images_voc2007, images_ade20k])
#         targets_detection = targets_voc2007 + targets_ade20k
#         targets_segmentation = targets_ade20k

    # 前向传播
#         targets = int(targets)
        outputs = model(images, targets)

        # 計算loss
        loss_detection = nn.SmoothL1Loss()  # 根據object detection的輸出計算loss
        loss = loss_detection(outputs, targets)
#         loss_segmentation =   # 根據語意分割的輸出計算loss
#         total_loss = loss_detection + loss_segmentation

        # backpropagation
        optimizer.zero_grad() # 初始化傳播的梯度
        loss.backward() # 計算梯度
#         total_loss.backward() # 計算梯度
        optimizer.step() # 更新weight


TypeError: string indices must be integers

In [123]:
# 官網範例

from engine import train_one_epoch, evaluate
from torchvision.models.detection import utils
import torch

batch_size = 1

# def train():
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has two classes only - background and person
num_classes = 21
# use our dataset and defined transformations
#     dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
#     dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

# split the dataset in train and test set
train_VOCdataset = VOCDataset(root_dir,split="train")
test_VOCdataset = VOCDataset(root_dir,split="test")

# define training and validation data loaders
train_VOCdataloader = DataLoader(train_VOCdataset, batch_size)
test_VOCdataloader = DataLoader(test_VOCdataset, batch_size)

# get the model using our helper function
model = object_detection_semantic_segmentation(num_classes_detection, num_classes_segmentation)

# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr=0.0001, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.1)

# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, train_VOCdataloader, device, epoch, print_freq=1)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, test_VOCdataloader, device=device)

    print("That's it!")

INFO=====>voc dataset init finished  ! !
INFO=====>voc dataset init finished  ! !


TypeError: object_detection_semantic_segmentation() takes 1 positional argument but 2 were given