脚本：YOLO ➜ COCO JSON for Faster R-CNN

In [1]:
"""
# This block is used to convert YOLO-format labels to COCO-format annotations
# It has been disabled to avoid unnecessary regeneration during training

import os
import shutil
import json
from tqdm import tqdm

# Force a switch to the ObjectDetection catalog
os.chdir("C:/Users/admin/Downloads/Code/ObjectDetection")
print("当前目录:", os.getcwd())

# Input path
yolo_root = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset"
output_root = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset_faster"
os.makedirs(output_root, exist_ok=True)

def convert_split(split):
    print(f"Converting {split} set...")
    
    # Create new structures
    image_dir = os.path.join(output_root, "images", split)
    ann_dir = os.path.join(output_root, "annotations")
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(ann_dir, exist_ok=True)

    label_dir = os.path.join(yolo_root, "labels", split)
    source_image_dir = os.path.join(yolo_root, "images", split)

    # Category information (copied from smartcity.yaml in YOLOv5)
    # Category information: written directly to COCO 80 class tags
    # Category information: written directly to COCO 80 class tags
    categories = [
        {"id": i, "name": name} for i, name in enumerate([
            'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
            'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
            'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
            'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
            'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
            'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
            'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
            'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
            'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
            'toothbrush'
        ])
    ]



    images = []
    annotations = []
    ann_id = 1
    for idx, file_name in enumerate(tqdm(os.listdir(label_dir))):
        if not file_name.endswith(".txt"):
            continue
        img_name = file_name.replace(".txt", ".jpg")
        src_img_path = os.path.join(source_image_dir, img_name)
        dst_img_path = os.path.join(image_dir, img_name)
        shutil.copyfile(src_img_path, dst_img_path)

        height, width = 640, 640  # If the original size is known it can be replaced

        image_info = {
            "id": idx,
            "file_name": img_name,
            "width": width,
            "height": height
        }
        images.append(image_info)

        # Read YOLO tags
        label_path = os.path.join(label_dir, file_name)
        with open(label_path, 'r') as f:
            for line in f.readlines():
                parts = line.strip().split()
                if len(parts) != 5:
                    continue
                class_id = int(parts[0])
                x_center, y_center, w, h = map(float, parts[1:])
                x = x_center - w / 2
                y = y_center - h / 2

                ann = {
                    "id": ann_id,
                    "image_id": idx,
                    "category_id": class_id,
                    "bbox": [x * width, y * height, w * width, h * height],
                    "area": w * h * width * height,
                    "iscrowd": 0
                }
                annotations.append(ann)
                ann_id += 1

    coco_json = {
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

    output_path = os.path.join(ann_dir, f"annotations_{split}.json")
    with open(output_path, 'w') as f:
        json.dump(coco_json, f)

    print(f"{split} annotations saved to {output_path}")

# Execute two splits
convert_split("train")
convert_split("val")

"""

'\n# This block is used to convert YOLO-format labels to COCO-format annotations\n# It has been disabled to avoid unnecessary regeneration during training\n\nimport os\nimport shutil\nimport json\nfrom tqdm import tqdm\n\n# Force a switch to the ObjectDetection catalog\nos.chdir("C:/Users/admin/Downloads/Code/ObjectDetection")\nprint("当前目录:", os.getcwd())\n\n# Input path\nyolo_root = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset"\noutput_root = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset_faster"\nos.makedirs(output_root, exist_ok=True)\n\ndef convert_split(split):\n    print(f"Converting {split} set...")\n    \n    # Create new structures\n    image_dir = os.path.join(output_root, "images", split)\n    ann_dir = os.path.join(output_root, "annotations")\n    os.makedirs(image_dir, exist_ok=True)\n    os.makedirs(ann_dir, exist_ok=True)\n\n    label_dir = os.path.join(yolo_root, "labels", split)\n    source_image_dir = os.path.join(yolo_root, "images", split)\

Step 1：环境配置与导入库

In [None]:
import os
import torch
import torchvision
from torchvision.datasets import CocoDetection
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader
from PIL import Image
import matplotlib.pyplot as plt

# Dataset locations: images live under <data_dir>/images/<split>, and the
# COCO-format annotation files under <data_dir>/annotations.
data_dir = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset_faster"
_images_root = os.path.join(data_dir, "images")
_annotations_root = os.path.join(data_dir, "annotations")

train_img_dir = os.path.join(_images_root, "train")
val_img_dir = os.path.join(_images_root, "val")
train_ann_path = os.path.join(_annotations_root, "annotations_train.json")
val_ann_path = os.path.join(_annotations_root, "annotations_val.json")

# COCO dataset wrapper that yields samples in the torchvision detection format
class CocoTransform(torchvision.datasets.CocoDetection):
    """CocoDetection variant returning ``(image_tensor, target_dict)`` pairs.

    Each target is ``{'boxes': FloatTensor[N, 4], 'labels': Int64Tensor[N]}``
    with boxes converted from COCO ``[x, y, w, h]`` to ``[x1, y1, x2, y2]``.
    Annotations whose width or height is not positive are dropped.
    """

    # Added to every annotation's category_id to obtain the training label.
    # torchvision detection models reserve label 0 for the background class,
    # while the annotation files used here number their categories from 0
    # (category id == YOLO class index). Without the shift, category 0
    # ("person") would be trained as background and label 80 never used.
    # Set to 0 for annotation files whose category ids already start at 1.
    label_offset = 1

    def __getitem__(self, index):
        """Return the image as a tensor plus its boxes/labels target dict."""
        img, target = super().__getitem__(index)
        img = F.to_tensor(img)

        boxes = []
        labels = []
        for obj in target:
            x, y, w, h = obj['bbox']
            if w <= 0 or h <= 0:
                continue  # skip degenerate boxes
            boxes.append([x, y, x + w, y + h])
            labels.append(obj['category_id'] + self.label_offset)

        if not boxes:
            # No valid box in this image: return an explicit empty target with
            # the (0, 4) / (0,) shapes used for images without objects.
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
        else:
            boxes = torch.tensor(boxes, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.int64)

        target = {
            'boxes': boxes,
            'labels': labels,
        }
        return img, target


# Build the datasets and their DataLoaders
def _collate_detection_batch(batch):
    """Regroup ``[(img, target), ...]`` into ``(imgs, targets)`` tuples.

    Images keep their individual sizes, so they are not stacked into one tensor.
    """
    return tuple(zip(*batch))


train_dataset = CocoTransform(train_img_dir, train_ann_path)
val_dataset = CocoTransform(val_img_dir, val_ann_path)

train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=_collate_detection_batch,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=_collate_detection_batch,
)

print(f"Train set size: {len(train_dataset)}, Val set size: {len(val_dataset)}")

"""
# This block is used to batch verify the presence of illegal boxes
# It has been disabled to avoid unnecessary regeneration during training

# 结束前 —— 批量验证是否存在非法 box
invalid_count = 0

for images, targets in train_loader:
    for t in targets:
        boxes = t['boxes']
        if boxes.shape[0] == 0:
            invalid_count += 1
        elif (boxes[:, 2] <= boxes[:, 0]).any() or (boxes[:, 3] <= boxes[:, 1]).any():
            invalid_count += 1

print(f" Data validation is complete. Number of illegal labels: {invalid_count}")

"""


loading annotations into memory...
Done (t=2.40s)
creating index...
index created!
loading annotations into memory...
Done (t=0.51s)
creating index...
index created!
Train set size: 105558, Val set size: 11708


'\n# This block is used to batch verify the presence of illegal boxes\n# It has been disabled to avoid unnecessary regeneration during training\n\n# 结束前 —— 批量验证是否存在非法 box\ninvalid_count = 0\n\nfor images, targets in train_loader:\n    for t in targets:\n        boxes = t[\'boxes\']\n        if boxes.shape[0] == 0:\n            invalid_count += 1\n        elif (boxes[:, 2] <= boxes[:, 0]).any() or (boxes[:, 3] <= boxes[:, 1]).any():\n            invalid_count += 1\n\nprint(f" Data validation is complete. Number of illegal labels: {invalid_count}")\n\n'

Step 2：模型定义与训练

In [3]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights

# Pick the compute device first: GPU when available, otherwise CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Faster R-CNN (ResNet-50 + FPN) with its default pretrained weights
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)

# Replace the box-classification head (COCO: 80 classes + background)
num_classes = 81
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Move the model to the chosen device; as the cell's last expression this
# also echoes the model summary
model.to(device)


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

Optimizer 与训练参数设置

In [4]:
# Optimizer: SGD over the parameters that still require gradients
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

# Learning-rate scheduler (optional): multiply the LR by 0.1 every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)


训练 Loop（建议5个 epoch）

In [None]:
from tqdm.notebook import tqdm
import time
import os

# Checkpoints are written here, one file per epoch
checkpoint_dir = "C:/Users/admin/Downloads/Code/ObjectDetection/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

num_epochs = 5
train_loss_list = []  # average training loss of each finished epoch

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    completed_batches = 0  # batches that went through forward, backward and step
    skipped_batches = 0    # batches abandoned because of an exception
    start_time = time.time()

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_idx, (images, targets) in enumerate(progress):
        try:
            # Move the batch to the training device
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward pass: in train mode the model returns a dict of losses
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_value = losses.item()

            # Backward pass and parameter update
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            # Only batches that completed the update contribute to the average
            epoch_loss += loss_value
            completed_batches += 1

            # Show the running loss on the progress bar instead of printing one
            # line per batch, which floods the cell output
            progress.set_postfix(loss=f"{loss_value:.4f}")

        except Exception as e:
            # Best effort: report the failure and carry on with the next batch
            skipped_batches += 1
            print(f"Skipped a batch due to error: {str(e)}")
            continue

    # Update the learning rate
    lr_scheduler.step()

    # Average over the batches that actually trained; dividing by
    # len(train_loader) would understate the loss whenever batches were skipped
    avg_loss = epoch_loss / completed_batches if completed_batches else float("nan")
    train_loss_list.append(avg_loss)

    print(f"Epoch {epoch+1} finished. Average Loss: {avg_loss:.4f}, Time: {time.time() - start_time:.2f}s")
    if skipped_batches:
        print(f"Skipped {skipped_batches} of {len(train_loader)} batches in epoch {epoch+1}")

    # Save the model weights after every epoch
    checkpoint_path = os.path.join(checkpoint_dir, f"fasterrcnn_epoch{epoch+1}.pt")
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Model saved to: {checkpoint_path}")


Epoch 1/5:   0%|          | 0/52779 [00:00<?, ?it/s]