脚本：将 YOLO 标注转换为 COCO JSON（供 Faster R-CNN 训练使用）

In [6]:
import os
import shutil
import json
from tqdm import tqdm

# Switch the working directory to the ObjectDetection project folder.
os.chdir("C:/Users/admin/Downloads/Code/ObjectDetection")
print("当前目录:", os.getcwd())

# yolo_root: the YOLO-format dataset that is read.
# output_root: where the COCO-format copy is written (same path plus "_faster").
yolo_root = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset"
output_root = yolo_root + "_faster"
os.makedirs(output_root, exist_ok=True)

# COCO 80-class names in YOLO class-index order (index 0 is 'person').
_COCO_CLASS_NAMES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]


def convert_split(split):
    """Convert one YOLO-format split into a COCO-style annotation JSON.

    Reads ``<yolo_root>/labels/<split>/*.txt`` and the matching ``.jpg`` files
    under ``<yolo_root>/images/<split>``, copies each image to
    ``<output_root>/images/<split>`` and writes
    ``<output_root>/annotations/annotations_<split>.json``.

    Args:
        split: Name of the split sub-directory, e.g. ``"train"`` or ``"val"``.

    Notes:
        * Boxes are scaled with each image's real pixel size (read from the
          file) rather than an assumed 640x640.
        * Category ids are written as ``YOLO class index + 1`` so that id 0
          stays free for the background class used by torchvision detectors.
        * Label files whose image is missing are skipped and counted instead
          of aborting the whole conversion.
    """
    # Local import: Pillow is only needed here to read image dimensions.
    from PIL import Image

    print(f"Converting {split} set...")

    # Output layout.
    image_dir = os.path.join(output_root, "images", split)
    ann_dir = os.path.join(output_root, "annotations")
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(ann_dir, exist_ok=True)

    label_dir = os.path.join(yolo_root, "labels", split)
    source_image_dir = os.path.join(yolo_root, "images", split)

    # 1-based category ids (0 is reserved for background).
    categories = [
        {"id": class_index + 1, "name": name}
        for class_index, name in enumerate(_COCO_CLASS_NAMES)
    ]

    images = []
    annotations = []
    ann_id = 1
    missing_images = 0
    skipped_labels = 0

    # Sorted so that image/annotation ids are reproducible between runs.
    label_files = sorted(
        name for name in os.listdir(label_dir) if name.endswith(".txt")
    )
    for file_name in tqdm(label_files):
        # splitext only touches the extension, unlike str.replace.
        img_name = os.path.splitext(file_name)[0] + ".jpg"
        src_img_path = os.path.join(source_image_dir, img_name)
        if not os.path.isfile(src_img_path):
            missing_images += 1
            continue
        dst_img_path = os.path.join(image_dir, img_name)
        shutil.copyfile(src_img_path, dst_img_path)

        # Real pixel size; YOLO coordinates are normalised to these.
        with Image.open(src_img_path) as img:
            width, height = img.size

        image_id = len(images)
        images.append({
            "id": image_id,
            "file_name": img_name,
            "width": width,
            "height": height
        })

        # Read the YOLO labels: "<class> <x_center> <y_center> <w> <h>".
        label_path = os.path.join(label_dir, file_name)
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) != 5:
                    continue
                class_id = int(parts[0])
                if not 0 <= class_id < len(_COCO_CLASS_NAMES):
                    skipped_labels += 1
                    continue
                x_center, y_center, w, h = map(float, parts[1:])

                # Convert to pixel corners and clip to the image bounds.
                x_min = max(0.0, (x_center - w / 2) * width)
                y_min = max(0.0, (y_center - h / 2) * height)
                x_max = min(float(width), (x_center + w / 2) * width)
                y_max = min(float(height), (y_center + h / 2) * height)
                box_w = x_max - x_min
                box_h = y_max - y_min
                if box_w <= 0 or box_h <= 0:
                    # Empty boxes are rejected by detection models.
                    skipped_labels += 1
                    continue

                annotations.append({
                    "id": ann_id,
                    "image_id": image_id,
                    "category_id": class_id + 1,
                    "bbox": [x_min, y_min, box_w, box_h],
                    "area": box_w * box_h,
                    "iscrowd": 0
                })
                ann_id += 1

    coco_json = {
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

    output_path = os.path.join(ann_dir, f"annotations_{split}.json")
    with open(output_path, 'w') as f:
        json.dump(coco_json, f)

    if missing_images or skipped_labels:
        print(
            f"Warning: skipped {missing_images} label file(s) without an image "
            f"and {skipped_labels} invalid label line(s)"
        )
    print(f"{split} annotations saved to {output_path}")

# Convert both dataset splits, train first and then val.
for split_name in ("train", "val"):
    convert_split(split_name)


当前目录: C:\Users\admin\Downloads\Code\ObjectDetection
Converting train set...


100%|██████████| 105558/105558 [17:05<00:00, 102.94it/s]


train annotations saved to C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset_faster\annotations\annotations_train.json
Converting val set...


100%|██████████| 11708/11708 [01:43<00:00, 113.15it/s]


val annotations saved to C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset_faster\annotations\annotations_val.json


Step 1：环境配置、导入库与数据集加载

In [7]:
import os
import torch
import torchvision
from torchvision.datasets import CocoDetection
from torchvision.transforms import functional as F
from torch.utils.data import DataLoader
from PIL import Image
import matplotlib.pyplot as plt

# Locations of the converted dataset: images and annotation JSON per split.
data_dir = "C:/Users/admin/Downloads/Code/ObjectDetection/coco_subset_faster"
_images_root = os.path.join(data_dir, "images")
_annotations_root = os.path.join(data_dir, "annotations")

train_img_dir = os.path.join(_images_root, "train")
val_img_dir = os.path.join(_images_root, "val")
train_ann_path = os.path.join(_annotations_root, "annotations_train.json")
val_ann_path = os.path.join(_annotations_root, "annotations_val.json")

# COCO dataset wrapper that yields samples in the torchvision detection format.
class CocoTransform(torchvision.datasets.CocoDetection):
    """``CocoDetection`` that returns ``(image_tensor, target_dict)`` pairs.

    ``target_dict`` has:
        * ``boxes``: float32 tensor of shape ``(N, 4)`` in ``[x1, y1, x2, y2]``
          pixel coordinates (``(0, 4)`` when the image has no objects).
        * ``labels``: int64 tensor of shape ``(N,)`` with values in ``1..K``,
          where ``K`` is the number of categories in the annotation file.
          Label 0 is left free for the background class.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``CocoDetection`` and build the label map."""
        super().__init__(*args, **kwargs)
        # Map the file's category ids (0- or 1-based, possibly sparse) onto
        # contiguous labels starting at 1.
        self._label_of_category = {
            cat_id: label
            for label, cat_id in enumerate(sorted(self.coco.getCatIds()), start=1)
        }

    def __getitem__(self, index):
        """Return the image as a tensor and its boxes/labels as tensors."""
        img, target = super().__getitem__(index)
        img = F.to_tensor(img)

        boxes = []
        labels = []
        for obj in target:
            x, y, w, h = obj['bbox']
            if w <= 0 or h <= 0:
                # Zero-area boxes are rejected by the detection model.
                continue
            # COCO stores [x, y, w, h]; the model expects corner coordinates.
            boxes.append([x, y, x + w, y + h])
            labels.append(self._label_of_category[obj['category_id']])

        target = {
            # reshape keeps the (N, 4) layout even when there are no boxes.
            'boxes': torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            'labels': torch.tensor(labels, dtype=torch.int64),
        }
        return img, target

def _collate_detection_batch(batch):
    """Regroup a list of (image, target) pairs into an (images, targets) tuple."""
    return tuple(zip(*batch))


# Datasets and loaders; only the training loader shuffles.
train_dataset = CocoTransform(train_img_dir, train_ann_path)
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=_collate_detection_batch,
)

val_dataset = CocoTransform(val_img_dir, val_ann_path)
val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=_collate_detection_batch,
)

print(f"Train set size: {len(train_dataset)}, Val set size: {len(val_dataset)}")


loading annotations into memory...
Done (t=2.59s)
creating index...
index created!
loading annotations into memory...
Done (t=0.21s)
creating index...
index created!
Train set size: 105558, Val set size: 11708


Step 2：模型定义与训练

In [8]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Load the pretrained Faster R-CNN (ResNet-50 + FPN) base model.
model = fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the box classification head: 80 COCO classes + 1 background class.
num_classes = 81
box_predictor = model.roi_heads.box_predictor
in_features = box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to C:\Users\admin/.cache\torch\hub\checkpoints\fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:01<00:00, 119MB/s]  


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

Optimizer 与训练参数设置

In [9]:
# Optimise only the parameters that require gradients.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

# Optional learning-rate schedule: multiply the LR by 0.1 every 3 scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)


训练 Loop（建议5个 epoch）

In [None]:
import time

num_epochs = 5
train_loss_list = []  # mean training loss of each finished epoch

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    start_time = time.time()

    for images, targets in train_loader:
        # Move the batch to the training device.
        images = [img.to(device) for img in images]
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        # In training mode the model returns a dict of loss terms; sum them.
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())
        epoch_loss += losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch.
    lr_scheduler.step()
    avg_loss = epoch_loss / len(train_loader)
    train_loss_list.append(avg_loss)
    elapsed = time.time() - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Time: {elapsed:.2f}s")
