# 微调基于torchvision 0.3的目标检测模型
  
微调在 Penn-Fudan 数据库中对行人检测和分割的已预先训练的 Mask R-CNN 模型。  
Penn-Fudan 包含170个图像和345个行人实例，将用它来说明如何在 torchvision 中使用新功能，以便在自定义数据集上训练实例分割模型

## 1. 新的自定义数据
  
数据集应该从标准的类torch.utils.data.Dataset继承而来，并实现_len和_getitem_


###  1.1 为数据集编写类

In [1]:
import os
import numpy as np
import torch
from PIL import Image

In [2]:
class PennFudanDataset(object):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        
        mask = Image.open(mask_path)
        mask = np.array(mask)
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[1:]
        
        # 将颜色编码的mask分成一组
        masks = mask = obj_ids[:, None, None]
        
        # 获取每个mask的边界框坐标
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            x1 = np.min(pos[1])
            x2 = np.max(pos[1])
            y1 = np.min(pos[0])
            y2 = np.max(pos[0])
            boxes.append([x1, y1, x2, y2])
        
        boxes = torch.as_tensor(boxes, dtype = torch.float32)
        labels = torch.ones((num_objs,), dtype = torch.int64)
        masks = torch.as_tensor(masks, dtype = torch.uint8)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        
        # 假设所有实例都不是人群 
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        
        return img, target
    
    def __len__(self):
        return len(self.imgs)

## 2. 定义模型

* 1没用
* 2没用

* 1 微调已经预训练的模型：从一个在COCO上已预先训练过的模型开始，为自己的特定类进行微调。

In [4]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained = True)

# 将分类器替换为具有用户定义的 num_classes的新分类器
num_classes = 2

# 获取分类器的输入参数的数量 
in_features = model.roi_heads.box_predictor.cls_score.in_features
# 用新的头部替换预先训练好的头部 
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to C:\Users\朱子仪/.cache\torch\checkpoints\fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


HBox(children=(FloatProgress(value=0.0, max=167502836.0), HTML(value='')))




* 2 修改模型以添加不同的主干

In [5]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

backbone = torchvision.models.mobilenet_v2(pretrained = True).features

# FasterRCNN需要知道骨干网中的输出通道数量。
# 对于mobilenet_v2，它是1280，所以我们需要在这里添加
backbone.out_channels = 1280

# RPN在每个空间位置生成5 x 3个锚点
# 具有5种不同的大小和3种不同的宽高比。
# 每个特征映射可能具有不同的大小和宽高比
anchor_generator = AnchorGenerator(sizes = ((32,64,128,256,512),),
                                  aspect_ratios = ((0.5, 1.0, 2.0),))

# 定义用于执行ROI(region of interest)裁剪的特征映射，以及重新缩放后裁剪的大小。
# 如果主干返回Tensor，则featmap_names应为[0]。
# 更一般地，主干应该返回OrderedDict[Tensor] 
# 并且在featmap_names中，可以选择要使用的功能映射。 
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names = [0],
                                               output_size = 7,
                                               sampling_ratio = 2)

# 组合
model = FasterRCNN(backbone,
                   num_classes=2,
                   rpn_anchor_generator = anchor_generator,
                   box_roi_pool = roi_pooler)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to C:\Users\朱子仪/.cache\torch\checkpoints\mobilenet_v2-b0353104.pth


HBox(children=(FloatProgress(value=0.0, max=14212972.0), HTML(value='')))




### 2.1 PennFudan 数据集的实例分割模型

In [3]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor 
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instance_segmentation(num_classes):
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained = True)
    
    # 获取分类器的输入参数的数量 
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # 用新的头部替换预先训练好的头部 
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    # 获取掩膜分类器的输入特征数
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # 用新的掩膜预测器替换掩膜预测器
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 
                                                       hidden_layer,
                                                       num_classes)
    
    return model


## 3. 整合

### 3.1 为数据扩充 / 转换编写辅助函数


In [4]:
import transforms as T

def get_transform(train):
    transforms = [] 
    transforms.append(T.ToTensor())
    if train:  
        transforms.append(T.RandomHorizontalFlip(0.5))   
    return T.Compose(transforms)


In [19]:
from engine import train_one_epoch, evaluate 
import utils

def main():
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # 数据集只有两个类 - 背景和人
    dataset = PennFudanDataset('./data/PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('./data/PennFudanPed', get_transform(train=False))
    
    # 在训练和测试集中拆分数据集
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset, indices[-50:])
    
    # 定义训练和验证数据加载器
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size = 1, shuffle = True, num_workers = 0,
        collate_fn = utils.collate_fn)
    
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size = 1, shuffle = False, num_workers = 0,
        collate_fn = utils.collate_fn)  
    
    num_classes = 2
    model = get_model_instance_segmentation(num_classes)
    
    model.to(device)
    
    # 优化器
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr = 0.005,
                               momentum = 0.9, weight_decay = 0.0005)
    
    # 学习率调整
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                  step_size = 3,
                                                  gamma = 0.1)
    
    epochs = 10
    for epoch in range(epochs):
        # train_one_epoch(model, optimizer, data_loader, device, epoch, 10)
        # lr_scheduler.step()
        try:
            train_one_epoch(model, optimizer, data_loader, device, epoch, 10)
            lr_scheduler.step()
            evaluate(model, data_loader_test, device = device)
        except RuntimeError as exception:
            if "out of memory" in str(exception):
                print("WARNING: out of memory")
            if hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
            else:
                raise exception
        # evaluate(model, data_loader_test, device = device)
    print("Finish!")

In [9]:
import torchvision

In [20]:
main()

Finish!
