In [9]:
import torch
from utils.yolov1 import YOLOv1
from ultralytics import YOLO

## YOLOv1

In [10]:
# Build the YOLOv1 model (20 classes by default, matching the VOC dataset) with num_bboxes=2.
yolov1 = YOLOv1(num_bboxes=2, num_classes=20)

print(yolov1)
# NOTE: the two shape lines below are fixed text; they are not measured from a forward pass.
print("输入尺寸: (64, 3, 448, 448)", "输出尺寸: (64, 7, 7, 30)", sep="\n")

YOLOv1(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): LeakyReLU(negative_slope=0.1)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): LeakyReLU(negative_slope=0.1)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))
    (7): LeakyReLU(negative_slope=0.1)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): LeakyReLU(negative_slope=0.1)
    (10): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (11): LeakyReLU(negative_slope=0.1)
    (12): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): LeakyReLU(negative_slope=0.1)
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15): Sequential(
      (0): Conv2d(512, 256,

## YOLOv3

- YOLOv2改进(相对于YOLOv1)：
  - 骨干网络：使用更深的网络（Darknet-19），使用BN层
  - 特征融合：Passthrough层简单融合
  - k-means聚类5个锚框
  - 检测尺度：单尺度(448x448) + 多尺度训练
  - 分类：Softmax 单标签
  - 小目标检测：高分辨率训练 + Passthrough层
  - 精度/速度：召回率提升，速度提升
- YOLOv3改进(相对于YOLOv2)：
  - 骨干网络：使用更深的网络（Darknet-53），使用残差连接
  - 特征融合：FPN多尺度特征融合（3尺度输出）
  - k-means聚类9个锚框
  - 检测尺度：三尺度预测（以416x416输入为例，输出13x13、26x26、52x52三种尺度的特征图）+ 多尺度训练（输入分辨率如320x320、416x416、608x608）
  - 分类：独立的Logistic回归 多标签
  - 小目标检测：FPN多尺度检测
  - 精度/速度：mAP提升，速度略有下降

In [11]:
# Load the pretrained YOLOv3 model from the local checkpoint, then print its module tree.
yolov3 = YOLO("models/yolov3u.pt")
print(yolov3)

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): Bottleneck(
        (cv1): Conv(
          (conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, mome

## YOLOv5

In [12]:
# Checkpoint names listed by the notebook author as alternatives for this cell:
# yolov5nu, yolov5su, yolov5mu, yolov5lu, yolov5xu, yolov5n6u, yolov5s6u, yolov5m6u, yolov5l6u, yolov5x6u
# The ".pt" suffix is spelled out so the weight path is explicit and consistent
# with the other model-loading cells in this notebook.
yolov5 = YOLO("models/yolov5nu.pt")
print(yolov5)

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3(
        (cv1): Conv(
          (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_st

## YOLOv8

In [13]:
# Load the `yolov8n.pt` checkpoint and print its module tree.
yolov8 = YOLO("models/yolov8n.pt")
print(yolov8)
# Disabled example call. NOTE(review): `save_label` is not defined or imported in the
# visible part of this notebook — confirm where it lives before re-enabling.
# save_label(yolov8, "./labs/assets/dog_bike_car.jpg", save_label_dir="./labs/assets/labels")

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

### C2f层

- YOLOv8中的C2f层是CSP Bottleneck的加速版本，使用了更高效的通道分割和融合方式。
- 输入通过cv1卷积层后，在通道维度上被均分为两部分。
- 两部分都会被保留；其中第二部分还会依次通过n个串联的Bottleneck块（每个块以上一个块的输出作为输入），并且每个块的输出也都会被保留。
- 最后将这 2+n 个部分在通道维度上拼接（共 (2+n)·c 个通道），并通过cv2卷积层输出结果。
- 下面是C2f层的实现代码：

In [14]:
import torch
import torch.nn as nn
from ultralytics.nn.modules import Conv, Bottleneck
class C2f(nn.Module):
    """Faster implementation of the CSP Bottleneck block with 2 convolutions.

    The output of ``cv1`` is cut into two halves along dimension 1. Both halves
    are kept, the second half is additionally fed through a chain of ``n``
    Bottleneck blocks (each block consumes the previous block's output), and
    every intermediate result is concatenated before being passed to ``cv2``.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = False, g: int = 1, e: float = 0.5):
        """Create the two convolutions and the list of Bottleneck blocks.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Number of groups for the convolutions.
            e (float): Expansion ratio.
        """
        super().__init__()
        hidden = int(c2 * e)  # hidden channel count; 50% of the output channels by default
        self.c = hidden
        self.cv1 = Conv(c1, 2 * hidden, 1, 1)
        # cv2 consumes the two halves plus one tensor per Bottleneck block.
        self.cv2 = Conv((2 + n) * hidden, c2, 1)  # optional act=FReLU(c2)
        bottlenecks = [
            Bottleneck(hidden, hidden, shortcut, g, k=((3, 3), (3, 3)), e=1.0)
            for _ in range(n)
        ]
        self.m = nn.ModuleList(bottlenecks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of C2f, splitting the cv1 output with ``chunk()``."""
        # Cut the cv1 output into two parts along dimension 1 (channels).
        branches = list(self.cv1(x).chunk(2, 1))
        # Each Bottleneck block processes the most recently produced tensor.
        for block in self.m:
            branches.append(block(branches[-1]))
        # Concatenate every part along dimension 1 and fuse with cv2.
        return self.cv2(torch.cat(branches, dim=1))

    def forward_split(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass that uses ``split()`` instead of ``chunk()``."""
        kept, chained = self.cv1(x).split((self.c, self.c), 1)
        branches = [kept, chained]
        for block in self.m:
            branches.append(block(branches[-1]))
        return self.cv2(torch.cat(branches, dim=1))