# YOLOv3 from scratch with PyTorch

https://arxiv.org/abs/1506.02640

### Imports

In [1]:
import torch
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
from model import YOLO, yolov3_config
from dataset import YOLOv3Dataset
from loss import YOLOv3Loss
from utils import (
    load_checkpoint,
    trainable_parameters,
)
from train import train_yolov3
from test import test_model

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.12 (you have 1.4.10). Upgrade using: pip install --upgrade albumentations


### Constants

In [2]:
LEARNING_RATE = 2e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
WEIGHT_DECAY = 0
EPOCHS = 100
NUM_WORKERS = 6
PIN_MEMORY = True
LOAD_MODEL = False
LOAD_MODEL_FILE = "yolov3-20.pt"
LOAD_MODEL_EPOCH = 0
# https://www.kaggle.com/datasets/aladdinpersson/pascalvoc-yolo
IMG_DIR = "/data/Datasets/pascalvoc/images"
LABEL_DIR = "/data/Datasets/pascalvoc/labels"
TRAIN_CSV = "/data/Datasets/pascalvoc/train.csv"
TEST_CSV = "/data/Datasets/pascalvoc/test.csv"
IMAGE_SIZE = 416
S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
NUM_CLASSES = 20
ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]

### Initialization of the model

In [3]:
model = YOLO(yolov3_config).to(DEVICE)
params = trainable_parameters(model)
print(f"Trainable parameters: {params}")
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
loss_fn = YOLOv3Loss()

if LOAD_MODEL:
    load_checkpoint(torch.load(LOAD_MODEL_FILE), model, optimizer)

Layer 0: Conv with 3 input channels and 32 output channels
Layer 1: Conv with 32 input channels and 64 output channels
Layer 2: Bottleneck with 64 input channels and 64 output channels
Layer 3: Conv with 64 input channels and 128 output channels
Layer 4: Bottleneck with 128 input channels and 128 output channels
Layer 5: Bottleneck with 128 input channels and 128 output channels
Layer 6: Conv with 128 input channels and 256 output channels
Layer 7: Bottleneck with 256 input channels and 256 output channels
Layer 8: Bottleneck with 256 input channels and 256 output channels
Layer 9: Bottleneck with 256 input channels and 256 output channels
Layer 10: Bottleneck with 256 input channels and 256 output channels
Layer 11: Bottleneck with 256 input channels and 256 output channels
Layer 12: Bottleneck with 256 input channels and 256 output channels
Layer 13: Bottleneck with 256 input channels and 256 output channels
Layer 14: Bottleneck with 256 input channels and 256 output channels
Layer 1

### Initialization of the training dataset/dataloader

In [4]:
train_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=int(IMAGE_SIZE * 1.1)),
        A.PadIfNeeded(
            min_height=int(IMAGE_SIZE * 1.1),
            min_width=int(IMAGE_SIZE * 1.1)
        ),
        A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
        A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
        A.ShiftScaleRotate(
            rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT
        ),
        A.HorizontalFlip(p=0.5),
        A.Blur(p=0.1),
        A.CLAHE(p=0.1),
        A.Posterize(p=0.1),
        A.ToGray(p=0.1),
        A.ChannelShuffle(p=0.05),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255,),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[],),
)

test_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(min_height=IMAGE_SIZE, min_width=IMAGE_SIZE),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255,),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)

train_dataset = YOLOv3Dataset(
    "/data/Datasets/pascalvoc/100examples.csv",
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    anchors=ANCHORS,
    split_sizes=S,
    num_classes=NUM_CLASSES,
    #transform=train_transforms,
    transform=test_transforms,
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
)

test_dataset = YOLOv3Dataset(
    "/data/Datasets/pascalvoc/100examples.csv",
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    anchors=ANCHORS,
    split_sizes=S,
    num_classes=NUM_CLASSES,
    transform=test_transforms,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
)

  Expected `Union[float, tuple[float, float]]` but got `list` - serialized value may not be as expected
  Expected `Union[float, tuple[float, float]]` but got `list` - serialized value may not be as expected
  Expected `Union[float, tuple[float, float]]` but got `list` - serialized value may not be as expected
  Expected `Union[float, tuple[float, float]]` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


### Model training

In [5]:
train_yolov3(model, train_loader, test_loader, optimizer, loss_fn, DEVICE, EPOCHS, S, ANCHORS)

100%|██████████| 6/6 [00:07<00:00,  1.30s/it, loss=47.8, mean_loss=53.2]


Epoch [1/100], Mean loss: 53.1945


100%|██████████| 6/6 [00:05<00:00,  1.10it/s, loss=51.3, mean_loss=52.8]


Epoch [2/100], Mean loss: 52.8289


100%|██████████| 6/6 [00:05<00:00,  1.15it/s, loss=45.7, mean_loss=47.2]


Epoch [3/100], Mean loss: 47.1945


100%|██████████| 6/6 [00:05<00:00,  1.04it/s, loss=40.8, mean_loss=44.4]


Epoch [4/100], Mean loss: 44.4332


100%|██████████| 6/6 [00:05<00:00,  1.05it/s, loss=40.9, mean_loss=41.5]


Epoch [5/100], Mean loss: 41.4559


100%|██████████| 6/6 [00:05<00:00,  1.02it/s, loss=41.6, mean_loss=39.9]


Epoch [6/100], Mean loss: 39.8634


100%|██████████| 6/6 [00:07<00:00,  1.33s/it, loss=38.4, mean_loss=38.2]


Epoch [7/100], Mean loss: 38.2439


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=33.8, mean_loss=36.5]


Epoch [8/100], Mean loss: 36.5444


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=32.6, mean_loss=35.5]


Epoch [9/100], Mean loss: 35.5074


100%|██████████| 6/6 [00:09<00:00,  1.51s/it, loss=35.7, mean_loss=35.5]


Epoch [10/100], Mean loss: 35.4586


100%|██████████| 6/6 [01:28<00:00, 14.70s/it]


Processed all batches, found 5753 pred boxes and 778748 true boxes
Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 6/6 [00:09<00:00,  1.55s/it, loss=32.4, mean_loss=34]  


Epoch [11/100], Mean loss: 33.9589


100%|██████████| 6/6 [00:09<00:00,  1.63s/it, loss=34, mean_loss=33.7]  


Epoch [12/100], Mean loss: 33.7384


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=32.7, mean_loss=33.5]


Epoch [13/100], Mean loss: 33.4641


100%|██████████| 6/6 [00:10<00:00,  1.72s/it, loss=32.8, mean_loss=33.6]


Epoch [14/100], Mean loss: 33.6302


100%|██████████| 6/6 [00:10<00:00,  1.72s/it, loss=31.9, mean_loss=33.1]


Epoch [15/100], Mean loss: 33.0554


100%|██████████| 6/6 [00:09<00:00,  1.57s/it, loss=32.1, mean_loss=32.5]


Epoch [16/100], Mean loss: 32.5255


100%|██████████| 6/6 [00:09<00:00,  1.50s/it, loss=30.8, mean_loss=32]  


Epoch [17/100], Mean loss: 31.9850


100%|██████████| 6/6 [00:09<00:00,  1.61s/it, loss=30.9, mean_loss=32]  


Epoch [18/100], Mean loss: 31.9796


100%|██████████| 6/6 [00:10<00:00,  1.68s/it, loss=31.5, mean_loss=31.2]


Epoch [19/100], Mean loss: 31.1826


100%|██████████| 6/6 [00:09<00:00,  1.51s/it, loss=32, mean_loss=30.8]  


Epoch [20/100], Mean loss: 30.7831


100%|██████████| 6/6 [01:22<00:00, 13.81s/it]


Processed all batches, found 5643 pred boxes and 778748 true boxes
Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 6/6 [00:09<00:00,  1.57s/it, loss=28.8, mean_loss=29.9]


Epoch [21/100], Mean loss: 29.8796


100%|██████████| 6/6 [00:09<00:00,  1.59s/it, loss=28.6, mean_loss=29.4]


Epoch [22/100], Mean loss: 29.4316


100%|██████████| 6/6 [00:08<00:00,  1.48s/it, loss=30.9, mean_loss=29.9]


Epoch [23/100], Mean loss: 29.9444


100%|██████████| 6/6 [00:08<00:00,  1.46s/it, loss=29.5, mean_loss=29.8]


Epoch [24/100], Mean loss: 29.8111


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=30.2, mean_loss=29.1]


Epoch [25/100], Mean loss: 29.0524


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=28.8, mean_loss=29.1]


Epoch [26/100], Mean loss: 29.0568


100%|██████████| 6/6 [00:09<00:00,  1.51s/it, loss=28.8, mean_loss=28.4]


Epoch [27/100], Mean loss: 28.3949


100%|██████████| 6/6 [00:08<00:00,  1.40s/it, loss=28.7, mean_loss=28]  


Epoch [28/100], Mean loss: 28.0454


100%|██████████| 6/6 [00:09<00:00,  1.52s/it, loss=29.6, mean_loss=28.7]


Epoch [29/100], Mean loss: 28.7254


100%|██████████| 6/6 [00:09<00:00,  1.53s/it, loss=27.7, mean_loss=27.8]


Epoch [30/100], Mean loss: 27.8390


100%|██████████| 6/6 [01:12<00:00, 12.09s/it]


Processed all batches, found 5543 pred boxes and 778748 true boxes
Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 6/6 [00:08<00:00,  1.43s/it, loss=26.8, mean_loss=28.2]


Epoch [31/100], Mean loss: 28.2056


100%|██████████| 6/6 [00:08<00:00,  1.42s/it, loss=26.7, mean_loss=28]  


Epoch [32/100], Mean loss: 27.9829


100%|██████████| 6/6 [00:08<00:00,  1.42s/it, loss=28.6, mean_loss=28.1]


Epoch [33/100], Mean loss: 28.0603


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=27, mean_loss=28]    


Epoch [34/100], Mean loss: 28.0382


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=26.1, mean_loss=30.7]


Epoch [35/100], Mean loss: 30.7343


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=28.1, mean_loss=28]  


Epoch [36/100], Mean loss: 27.9771


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=27.7, mean_loss=28.3]


Epoch [37/100], Mean loss: 28.3370


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=27.5, mean_loss=27.9]


Epoch [38/100], Mean loss: 27.8952


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=27.4, mean_loss=27.5]


Epoch [39/100], Mean loss: 27.4712


100%|██████████| 6/6 [00:09<00:00,  1.51s/it, loss=30.3, mean_loss=27.9]


Epoch [40/100], Mean loss: 27.9233


100%|██████████| 6/6 [01:32<00:00, 15.39s/it]


Processed all batches, found 5665 pred boxes and 778748 true boxes
Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 6/6 [00:09<00:00,  1.51s/it, loss=27.1, mean_loss=27.4]


Epoch [41/100], Mean loss: 27.3988


100%|██████████| 6/6 [00:09<00:00,  1.52s/it, loss=28.2, mean_loss=27.3]


Epoch [42/100], Mean loss: 27.3312


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=26.6, mean_loss=27.4]


Epoch [43/100], Mean loss: 27.4385


100%|██████████| 6/6 [00:08<00:00,  1.35s/it, loss=28.4, mean_loss=27.4]


Epoch [44/100], Mean loss: 27.3511


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=26.5, mean_loss=27.1]


Epoch [45/100], Mean loss: 27.1226


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=28.4, mean_loss=26.9]


Epoch [46/100], Mean loss: 26.9207


100%|██████████| 6/6 [00:08<00:00,  1.50s/it, loss=27.7, mean_loss=26.7]


Epoch [47/100], Mean loss: 26.7341


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=27.1, mean_loss=27]  


Epoch [48/100], Mean loss: 27.0008


100%|██████████| 6/6 [00:09<00:00,  1.53s/it, loss=28.3, mean_loss=26.9]


Epoch [49/100], Mean loss: 26.9322


100%|██████████| 6/6 [00:08<00:00,  1.38s/it, loss=25.5, mean_loss=26.6]


Epoch [50/100], Mean loss: 26.5881


100%|██████████| 6/6 [01:10<00:00, 11.73s/it]


Processed all batches, found 5607 pred boxes and 778748 true boxes
Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 6/6 [00:09<00:00,  1.53s/it, loss=26.2, mean_loss=26.8]


Epoch [51/100], Mean loss: 26.7579


100%|██████████| 6/6 [00:08<00:00,  1.39s/it, loss=26.7, mean_loss=26.7]


Epoch [52/100], Mean loss: 26.7299


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=27.7, mean_loss=26.9]


Epoch [53/100], Mean loss: 26.8847


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=26.5, mean_loss=26.9]


Epoch [54/100], Mean loss: 26.8813


100%|██████████| 6/6 [00:08<00:00,  1.43s/it, loss=26.8, mean_loss=26.4]


Epoch [55/100], Mean loss: 26.4289


100%|██████████| 6/6 [00:08<00:00,  1.42s/it, loss=26.7, mean_loss=26.3]


Epoch [56/100], Mean loss: 26.2841


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=26.1, mean_loss=26.3]


Epoch [57/100], Mean loss: 26.2704


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=26.5, mean_loss=26.4]


Epoch [58/100], Mean loss: 26.3619


100%|██████████| 6/6 [00:08<00:00,  1.46s/it, loss=26.5, mean_loss=26.2]


Epoch [59/100], Mean loss: 26.2098


100%|██████████| 6/6 [00:08<00:00,  1.42s/it, loss=26.7, mean_loss=26]  


Epoch [60/100], Mean loss: 26.0099


100%|██████████| 6/6 [01:12<00:00, 12.12s/it]


Processed all batches, found 5595 pred boxes and 778748 true boxes
Test mAP: 0.0000
=> Saving checkpoint


100%|██████████| 6/6 [00:08<00:00,  1.43s/it, loss=25.5, mean_loss=25.9]


Epoch [61/100], Mean loss: 25.9314


100%|██████████| 6/6 [00:08<00:00,  1.41s/it, loss=25.1, mean_loss=26]  


Epoch [62/100], Mean loss: 25.9901


100%|██████████| 6/6 [00:08<00:00,  1.41s/it, loss=25.3, mean_loss=26]  


Epoch [63/100], Mean loss: 25.9947


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=25.6, mean_loss=25.8]


Epoch [64/100], Mean loss: 25.8075


100%|██████████| 6/6 [00:08<00:00,  1.45s/it, loss=25.3, mean_loss=25.8]


Epoch [65/100], Mean loss: 25.8256


100%|██████████| 6/6 [00:08<00:00,  1.44s/it, loss=25, mean_loss=25.7]  


Epoch [66/100], Mean loss: 25.6614


100%|██████████| 6/6 [00:08<00:00,  1.46s/it, loss=25.3, mean_loss=25.5]


Epoch [67/100], Mean loss: 25.5039


100%|██████████| 6/6 [00:08<00:00,  1.48s/it, loss=24.2, mean_loss=25.4]


Epoch [68/100], Mean loss: 25.3687


100%|██████████| 6/6 [00:08<00:00,  1.49s/it, loss=25.8, mean_loss=25.2]


Epoch [69/100], Mean loss: 25.2416


100%|██████████| 6/6 [00:08<00:00,  1.47s/it, loss=25.8, mean_loss=25.3]


Epoch [70/100], Mean loss: 25.3224


 83%|████████▎ | 5/6 [01:08<00:13, 13.93s/it]

### Initialization of the testing dataset/dataloader

In [None]:
test_dataset = YOLOv3Dataset(
    "/data/Datasets/pascalvoc/test.csv",
    img_dir=IMG_DIR,
    label_dir=LABEL_DIR,
    anchors=ANCHORS,
    split_sizes=S,
    num_classes=NUM_CLASSES,
    transform=test_transforms,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
)

### Plot some predictions on the test dataset

In [None]:
test_model(10, model, test_loader, DEVICE, split_size=SPLIT_SIZE, num_boxes=NUM_BOXES, num_classes=NUM_CLASSES)

NameError: name 'SPLIT_SIZE' is not defined