# <span id='00'>目录</span>
- [JSON和预处理](#l1)
- [数据集制作](#01)
- [模型加载与训练](#02)
- [预测集和模型预测](#03)

In [1]:
import torch
from torchvision.io.image import read_image
from torchvision.transforms import functional as F
from torch.utils.data import Dataset, DataLoader
import json
import pandas as pd

## <span id='l1'>[json有5个部分组成：](#00)</span>
'info'，'license'，'image'，'annotations'，'categories'

'info'（信息）：
该部分包含一些关于数据集的元信息，例如数据集的名称、版本、描述等。这是一个提供有关数据集整体信息的可选字段。

'license'（许可证）：
存放与数据集相关的许可证信息，包括数据集的使用权限、限制条件等。这通常是一个包含许可证详细信息的字典或文本字段。

'image'（图像信息）：
这一部分描述了每张图像的信息，通常是一个包含多个图像信息的列表。每个图像信息包括图像的ID、文件名、高度、宽度等。在目标检测中，可能还包含有关图像拍摄时间、摄影设备等信息。

'annotations'（标注信息）：
包含有关目标物体的标注信息。每个标注项通常包括目标的边界框坐标、类别标签、标注ID等。这是目标检测任务中非常重要的部分，用于训练和评估模型。

'categories'（类别信息）：
包含数据集中所有可能的目标类别信息。每个类别通常由一个唯一的ID、类别名称、以及可能的其他属性组成。这个部分用于将模型预测的类别ID映射回类别名称，以便结果的可读性。

# <span id=01>[制作训练集](#00)</span>

In [2]:
def load_json(json_file):
    with open(json_file, 'r') as f:
        json_dict = json.load(f)
    return json_dict

class CustomDataset(Dataset):
    
    def __init__(self, json_file):
        self.json_dict = load_json(json_file)
        self._len = len(self.json_dict['images'])
        self.df_annotations = pd.concat([pd.DataFrame([obj]) for obj in self.json_dict['annotations']], ignore_index=True)
    
    def __len__(self):
        return self._len
    
    def __getitem__(self, idx):
        image = read_image('datasets/images/' + self.json_dict['images'][idx]['file_name'])
        image = image / 255
        
        df_image_id = self.df_annotations[self.df_annotations['image_id'] == idx+1]
        boxes = torch.tensor(df_image_id['bbox'].tolist(), dtype=torch.float16)
        boxes[:, 2:4] = boxes[:, 0:2] + boxes[:, 2:4]
        labels = torch.tensor(df_image_id['category_id'].tolist(), dtype=torch.int64)
        targets = {
            'boxes': boxes,
            'labels': labels
        }
        
        return image, targets

In [3]:
train_set = CustomDataset('datasets/annotations/train.json')
# train_set[2100]
# 在windows下就不要考虑多线程了，不考虑num_workers，需要Linux下考虑
# 为什么batch只能设置为1，因为如果>1的话要求不同图片对应的tensor的shape必须是相同的
# 但每张图片的label数量又是不同的，这个之后可以考虑换个方法解决
train_dl = DataLoader(dataset=train_set, batch_size=1, shuffle=True) ##########################################################
train_dl

<torch.utils.data.dataloader.DataLoader at 0x1e1c88b6290>

# <span id=02>[模型加载与训练](#00)</span>

In [4]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.models import ResNet50_Weights

num_epochs = 200

weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn_v2(
    weights=weights,
#     num_classes=23,
#     weights_backbone=ResNet50_Weights.DEFAULT,
#     trainable_backbone_layers=5 # ,
    box_score_thresh=0.80
)


# 获取模型的分类器的输入特征数
in_features = model.roi_heads.box_predictor.cls_score.in_features

# 将模型的分类器替换为新的分类器，适应你的目标类别数
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes=23)

device = torch.device('cuda', 0)
model.to(device)

# 定义优化器和学习率调度器
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.95, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

In [5]:
for epoch in range(num_epochs):
    model.train()
    for images, targets in train_dl:        
        images = images.to(device)
        b = targets["boxes"]
        b = torch.squeeze(b, 0)
        l = targets["labels"]
        l = torch.squeeze(l, 0)
        targets = [{'boxes': b.to(device),'labels': l.to(device)}]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
    lr_scheduler.step()
    print('Epoch = ', epoch +1, '/', num_epochs, ':')
    print('loss_classifier = ', loss_dict['loss_classifier'])
    print('loss_box_reg = ', loss_dict['loss_box_reg'])
    print('loss_objectness = ', loss_dict['loss_objectness'])
    print('loss_rpn_box_reg = ', loss_dict['loss_rpn_box_reg'])

print("Training complete.")

Epoch =  1 / 200 :
loss_classifier =  tensor(0.2090, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.2016, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0405, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0077, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  2 / 200 :
loss_classifier =  tensor(0.6024, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.2952, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0209, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0284, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  3 / 200 :
loss_classifier =  tensor(0.6515, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.6835, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0501, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg 

Epoch =  25 / 200 :
loss_classifier =  tensor(0.0309, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1073, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0024, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0488, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  26 / 200 :
loss_classifier =  tensor(0.1036, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1775, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0003, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0093, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  27 / 200 :
loss_classifier =  tensor(0.1377, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1398, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0006, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_r

Epoch =  49 / 200 :
loss_classifier =  tensor(0.0523, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1475, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0127, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0033, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  50 / 200 :
loss_classifier =  tensor(0.1059, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1221, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0031, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0019, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  51 / 200 :
loss_classifier =  tensor(0.0998, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.2706, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0004, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_r

Epoch =  73 / 200 :
loss_classifier =  tensor(0.0326, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0623, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0075, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.1167, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  74 / 200 :
loss_classifier =  tensor(0.1196, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.2906, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0021, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0342, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  75 / 200 :
loss_classifier =  tensor(0.0045, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0082, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0001, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_r

Epoch =  97 / 200 :
loss_classifier =  tensor(0.2260, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.2452, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0032, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0562, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  98 / 200 :
loss_classifier =  tensor(0.0909, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1817, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0057, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0567, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  99 / 200 :
loss_classifier =  tensor(0.0217, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0601, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0001, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_r

Epoch =  121 / 200 :
loss_classifier =  tensor(0.0441, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0816, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0018, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0044, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  122 / 200 :
loss_classifier =  tensor(0.0724, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1514, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0018, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0810, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  123 / 200 :
loss_classifier =  tensor(0.1086, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1567, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0145, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_bo

Epoch =  145 / 200 :
loss_classifier =  tensor(0.1245, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1374, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0288, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0575, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  146 / 200 :
loss_classifier =  tensor(0.0065, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0144, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(8.5295e-05, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0013, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  147 / 200 :
loss_classifier =  tensor(0.0194, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0269, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0005, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rp

Epoch =  169 / 200 :
loss_classifier =  tensor(0.2669, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1690, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0027, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0540, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  170 / 200 :
loss_classifier =  tensor(0.1699, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.3126, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0010, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0329, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  171 / 200 :
loss_classifier =  tensor(0.0330, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.0732, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0006, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_bo

Epoch =  193 / 200 :
loss_classifier =  tensor(0.0327, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1144, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0008, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0488, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  194 / 200 :
loss_classifier =  tensor(0.0484, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.1352, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0002, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_box_reg =  tensor(0.0027, device='cuda:0', grad_fn=<DivBackward0>)
Epoch =  195 / 200 :
loss_classifier =  tensor(0.1666, device='cuda:0', grad_fn=<NllLossBackward0>)
loss_box_reg =  tensor(0.2256, device='cuda:0', grad_fn=<DivBackward0>)
loss_objectness =  tensor(0.0115, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
loss_rpn_bo

In [6]:
# 保存模型参数
torch.save(model.state_dict(), './faster_rcnn_fcn_v1.pth')

# <span id='03'>[预测集与模型预测](#00)</span>

In [6]:
class TestDataset(Dataset):
    
    def __init__(self, json_file):
        self.json_dict = load_json(json_file)
        self._len = len(self.json_dict['images'])
    
    def __len__(self):
        return self._len
    
    def __getitem__(self, idx):
        image = read_image('datasets/images/' + self.json_dict['images'][idx]['file_name'])
        image = image / 255
        image_name = self.json_dict['images'][idx]['file_name']
        
        return image, image_name

In [7]:
test_set = TestDataset('datasets/annotations/test.json')
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       

In [8]:
# 预测并且把结果写入txt
for i,j in test_set:
    i = torch.unsqueeze(i,0).to(device)
    pred = model(i)
    pred = pred[0]
    its = len(pred['labels'])
    j = str(j)[:-4]
    file_path = 'submission/'+str(j)+'.txt'
    with open(file_path, 'w') as file:
        for it in range(its):
            file.write(str(int(pred['labels'][it]))+':'+str(float(pred['boxes'][it][0]))+' '+
                       str(float(pred['boxes'][it][1]))+' '+str(float(pred['boxes'][it][2]))+' '+
                       str(float(pred['boxes'][it][3]))+' '+str(float(pred['scores'][it]))+'\n')