# YOLOv4 from scratch with PyTorch

https://arxiv.org/abs/2004.10934

### Imports

In [7]:
import torch
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
from model import YOLO, yolov4_config
from dataset import YOLOv4Dataset
from loss import YOLOv4Loss
from utils import (
    load_checkpoint,
    trainable_parameters,
)
from train import train_yolov4
from test import test_model

### Constants

In [8]:
LEARNING_RATE = 2e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 8
WEIGHT_DECAY = 0
EPOCHS = 100
NUM_WORKERS = 6
PIN_MEMORY = True
LOAD_MODEL = False
LOAD_MODEL_FILE = "yolov4-20.pt"
LOAD_MODEL_EPOCH = 0
# https://www.kaggle.com/datasets/aladdinpersson/pascalvoc-yolo
IMG_DIR = "/data/Datasets/pascalvoc/images"
LABEL_DIR = "/data/Datasets/pascalvoc/labels"
TRAIN_CSV = "/data/Datasets/pascalvoc/train.csv"
TEST_CSV = "/data/Datasets/pascalvoc/test.csv"
IMAGE_SIZE = 416
S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
NUM_CLASSES = 20
ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]

### Initialization of the model

In [9]:
model = YOLO(yolov4_config).to(DEVICE)
params = trainable_parameters(model)
print(f"Trainable parameters: {params}")
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
loss_fn = YOLOv4Loss(num_classes=20)

if LOAD_MODEL:
    load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)

Layer 0: ConvMish with 3 input channels and 32 output channels
Layer 1: ConvMish with 32 input channels and 64 output channels
Layer 2: CSPBlock with 64 input channels and 64 output channels
Layer 3: ConvMish with 64 input channels and 128 output channels
Layer 4: CSPBlock with 128 input channels and 128 output channels
Layer 5: ConvMish with 128 input channels and 256 output channels
Layer 6: CSPBlock with 256 input channels and 256 output channels
Layer 7: ConvMish with 256 input channels and 512 output channels
Layer 8: CSPBlock with 512 input channels and 512 output channels
Layer 9: ConvMish with 512 input channels and 1024 output channels
Layer 10: CSPBlock with 1024 input channels and 1024 output channels
Layer 11: SPP with 1024 input channels and 512 output channels
Layer 12: ConvMish with 512 input channels and 512 output channels
Layer 13: ConvMish with 512 input channels and 1024 output channels
Layer 14: ConvMish with 1024 input channels and 512 output channels
Layer 15: Co

### Initialization of the training dataset/dataloader

In [10]:
train_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=0, value=[0, 0, 0]),
        A.Rotate(limit=10, p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.Blur(blur_limit=3, p=0.1),
        A.OneOf([
            A.OpticalDistortion(p=0.3),
            A.GridDistortion(p=0.1),
            #A.IAAPiecewiseAffine(p=0.3),
        ], p=0.2),
        A.OneOf([
            A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.9),
            A.ToGray(p=0.01),
        ], p=0.3),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255.0),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[],),
)
test_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=0, value=[0, 0, 0]),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255.0),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)
train_dataset = YOLOv4Dataset(
    "/data/Datasets/pascalvoc/100examples.csv",
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    anchors=ANCHORS,
    split_sizes=S,
    num_classes=NUM_CLASSES,
    transform=train_transforms,
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
)

### Initialization of the testing dataset/dataloader

In [11]:
test_dataset = YOLOv4Dataset(
    "/data/Datasets/pascalvoc/100examples.csv",
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    anchors=ANCHORS,
    split_sizes=S,
    num_classes=NUM_CLASSES,
    transform=test_transforms,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
)

### Model training

In [12]:
train_yolov4(model, train_loader, test_loader, optimizer, loss_fn, DEVICE, EPOCHS, S, ANCHORS, IMAGE_SIZE)

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:11<00:00,  1.00it/s, ciou=2.7, cls=1.08, conf=0.324, total=4.1]  


Epoch [1/100]


100%|██████████| 12/12 [00:07<00:00,  1.71it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.09it/s, ciou=2.79, cls=0.689, conf=0.278, total=3.76]


Epoch [2/100]


100%|██████████| 12/12 [00:09<00:00,  1.22it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.00it/s, ciou=2.46, cls=0.399, conf=0.302, total=3.16]


Epoch [3/100]


100%|██████████| 12/12 [00:12<00:00,  1.06s/it]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.06it/s, ciou=2.27, cls=0.277, conf=0.277, total=2.83]


Epoch [4/100]


100%|██████████| 12/12 [00:22<00:00,  1.89s/it]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.06it/s, ciou=2.14, cls=0.28, conf=0.227, total=2.65] 


Epoch [5/100]


100%|██████████| 12/12 [00:06<00:00,  1.77it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.05it/s, ciou=2.06, cls=0.242, conf=0.25, total=2.55] 


Epoch [6/100]


100%|██████████| 12/12 [00:09<00:00,  1.22it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.03it/s, ciou=1.99, cls=0.238, conf=0.227, total=2.45]


Epoch [7/100]


100%|██████████| 12/12 [00:06<00:00,  1.79it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:12<00:00,  1.05s/it, ciou=1.96, cls=0.237, conf=0.216, total=2.41]


Epoch [8/100]


100%|██████████| 12/12 [00:06<00:00,  1.94it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.14it/s, ciou=1.97, cls=0.234, conf=0.218, total=2.43]


Epoch [9/100]


100%|██████████| 12/12 [00:06<00:00,  1.89it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.11it/s, ciou=nan, cls=0.242, conf=0.267, total=nan] 


Epoch [10/100]


100%|██████████| 12/12 [00:06<00:00,  1.74it/s]


Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 12/12 [00:10<00:00,  1.15it/s, ciou=1.96, cls=0.24, conf=0.255, total=2.46] 


Epoch [11/100]


100%|██████████| 12/12 [00:05<00:00,  2.06it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.12it/s, ciou=1.99, cls=0.231, conf=0.245, total=2.46]


Epoch [12/100]


100%|██████████| 12/12 [00:06<00:00,  1.92it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.14it/s, ciou=2.03, cls=0.223, conf=0.259, total=2.51]


Epoch [13/100]


100%|██████████| 12/12 [00:05<00:00,  2.07it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.09it/s, ciou=2.2, cls=0.24, conf=0.358, total=2.8]   


Epoch [14/100]


100%|██████████| 12/12 [00:06<00:00,  1.79it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.11it/s, ciou=2.43, cls=0.241, conf=0.395, total=3.06]


Epoch [15/100]


100%|██████████| 12/12 [00:05<00:00,  2.00it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.11it/s, ciou=2.39, cls=0.23, conf=0.351, total=2.97] 


Epoch [16/100]


100%|██████████| 12/12 [00:07<00:00,  1.65it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:11<00:00,  1.08it/s, ciou=2.44, cls=0.232, conf=0.389, total=3.06]


Epoch [17/100]


100%|██████████| 12/12 [00:05<00:00,  2.17it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.11it/s, ciou=2.51, cls=0.229, conf=0.435, total=3.18]


Epoch [18/100]


100%|██████████| 12/12 [00:06<00:00,  1.99it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.16it/s, ciou=2.41, cls=0.235, conf=0.397, total=3.04]


Epoch [19/100]


100%|██████████| 12/12 [00:06<00:00,  1.96it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.14it/s, ciou=2.5, cls=0.233, conf=0.433, total=3.17] 


Epoch [20/100]


100%|██████████| 12/12 [00:05<00:00,  2.18it/s]


Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 12/12 [00:10<00:00,  1.16it/s, ciou=2.48, cls=0.226, conf=0.412, total=3.12]


Epoch [21/100]


100%|██████████| 12/12 [00:05<00:00,  2.09it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.12it/s, ciou=nan, cls=nan, conf=nan, total=nan]      


Epoch [22/100]


100%|██████████| 12/12 [00:05<00:00,  2.28it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.18it/s, ciou=nan, cls=nan, conf=nan, total=nan]


Epoch [23/100]


100%|██████████| 12/12 [00:05<00:00,  2.39it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:09<00:00,  1.24it/s, ciou=nan, cls=nan, conf=nan, total=nan]


Epoch [24/100]


100%|██████████| 12/12 [00:06<00:00,  1.94it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.20it/s, ciou=nan, cls=nan, conf=nan, total=nan]


Epoch [25/100]


100%|██████████| 12/12 [00:05<00:00,  2.37it/s]


Test mAP: 0.0000


100%|██████████| 12/12 [00:10<00:00,  1.16it/s, ciou=nan, cls=nan, conf=nan, total=nan]


Epoch [26/100]


 67%|██████▋   | 8/12 [00:03<00:01,  2.01it/s]


KeyboardInterrupt: 

### Plot some predictions on the test dataset

In [None]:
test_model(10, model, test_loader, DEVICE, split_size=SPLIT_SIZE, num_boxes=NUM_BOXES, num_classes=NUM_CLASSES)

NameError: name 'SPLIT_SIZE' is not defined