# Imports

In [5]:
import os
import random
import mmcv
import mmdet
from mmdet.utils import register_all_modules
from mmengine import Config
from mmdet.apis import init_detector, inference_detector
from mmdet.registry import DATASETS
from mmengine.runner import Runner
import urllib.request
import torch
import warnings
warnings.filterwarnings('ignore')

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.1.0+cu121
CUDA available: True


# PASCAL VOC Dataset

In [2]:
data_root = 'VOC2012/'
splits_dir = os.path.join(data_root, 'ImageSets/Main')

print("Available split files:")
for f in sorted(os.listdir(splits_dir)):
    if f.endswith('.txt'):
        filepath = os.path.join(splits_dir, f)
        with open(filepath, 'r') as file:
            lines = file.readlines()
        print(f"  {f:20s} - {len(lines):5d} images")

Available split files:
  aeroplane_train.txt  -  5717 images
  aeroplane_trainval.txt - 11540 images
  aeroplane_val.txt    -  5823 images
  bicycle_train.txt    -  5717 images
  bicycle_trainval.txt - 11540 images
  bicycle_val.txt      -  5823 images
  bird_train.txt       -  5717 images
  bird_trainval.txt    - 11540 images
  bird_val.txt         -  5823 images
  boat_train.txt       -  5717 images
  boat_trainval.txt    - 11540 images
  boat_val.txt         -  5823 images
  bottle_train.txt     -  5717 images
  bottle_trainval.txt  - 11540 images
  bottle_val.txt       -  5823 images
  bus_train.txt        -  5717 images
  bus_trainval.txt     - 11540 images
  bus_val.txt          -  5823 images
  car_train.txt        -  5717 images
  car_trainval.txt     - 11540 images
  car_val.txt          -  5823 images
  cat_train.txt        -  5717 images
  cat_trainval.txt     - 11540 images
  cat_val.txt          -  5823 images
  chair_train.txt      -  5717 images
  chair_trainval.txt   - 

# Train / Val / Test splits

In [7]:
import os

# Set random seed for reproducibility
random.seed(42)

# Paths
val_file = 'VOC2012/ImageSets/Main/val.txt'
val_custom_file = 'VOC2012/ImageSets/Main/val_custom.txt'
test_custom_file = 'VOC2012/ImageSets/Main/test_custom.txt'

# Read val.txt
print("Reading val.txt...")
with open(val_file, 'r') as f:
    val_samples = [line.strip() for line in f.readlines()]

print(f"Total samples in val.txt: {len(val_samples)}")

# Split in half
random.shuffle(val_samples)
mid_point = len(val_samples) // 2
val_custom_samples = val_samples[:mid_point]
test_custom_samples = val_samples[mid_point:]

print(f"Samples for val_custom.txt: {len(val_custom_samples)}")
print(f"Samples for test_custom.txt: {len(test_custom_samples)}")

# Write val_custom.txt
print(f"\nWriting to {val_custom_file}...")
with open(val_custom_file, 'w') as f:
    f.write('\n'.join(val_custom_samples))

# Write test_custom.txt
print(f"Writing to {test_custom_file}...")
with open(test_custom_file, 'w') as f:
    f.write('\n'.join(test_custom_samples))

print("\nCustom splits created successfully!")
print("\nSummary:")
print(f"Original val.txt:     {len(val_samples):5d} samples")
print(f"New val_custom.txt:   {len(val_custom_samples):5d} samples")
print(f"New test_custom.txt:  {len(test_custom_samples):5d} samples")

# Verify no overlap
overlap = set(val_custom_samples) & set(test_custom_samples)
print(f"Verification:")
print(f"Overlap between val and test: {len(overlap)} samples")
print(f"Total samples preserved: {len(val_custom_samples) + len(test_custom_samples)} = {len(val_samples)}")

Reading val.txt...
Total samples in val.txt: 5823
Samples for val_custom.txt: 2911
Samples for test_custom.txt: 2912

Writing to VOC2012/ImageSets/Main/val_custom.txt...
Writing to VOC2012/ImageSets/Main/test_custom.txt...

Custom splits created successfully!

Summary:
Original val.txt:      5823 samples
New val_custom.txt:    2911 samples
New test_custom.txt:   2912 samples
Verification:
Overlap between val and test: 0 samples
Total samples preserved: 5823 = 5823


# YOLOX

## Model download

In [4]:
# YOLOX-s model (you can change to yolox_tiny, yolox_m, yolox_l, yolox_x)
model_url = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth'
model_path = 'checkpoints/yolox_s_8x8_300e_coco.pth'

if not os.path.exists(model_path):
    print("Downloading pretrained YOLOX model...")
    urllib.request.urlretrieve(model_url, model_path)
    print("Download complete!")
else:
    print("Pretrained model already exists!")

Downloading pretrained YOLOX model...
Download complete!


## Configuration file

In [39]:
# Create custom config for YOLOX on VOC
cfg_content = """
# Set default scope
default_scope = 'mmdet'

# Model settings
model = dict(
    type='YOLOX',
    data_preprocessor=dict(
        type='DetDataPreprocessor',
        pad_size_divisor=32,
        batch_augments=[
            dict(
                type='BatchSyncRandomResize',
                random_size_range=(480, 800),
                size_divisor=32,
                interval=10)
        ]),
    backbone=dict(
        type='CSPDarknet',
        deepen_factor=0.33,
        widen_factor=0.5,
        out_indices=(2, 3, 4),
        use_depthwise=False,
        spp_kernal_sizes=(5, 9, 13),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish')),
    neck=dict(
        type='YOLOXPAFPN',
        in_channels=[128, 256, 512],
        out_channels=128,
        num_csp_blocks=1,
        use_depthwise=False,
        upsample_cfg=dict(scale_factor=2, mode='nearest'),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish')),
    bbox_head=dict(
        type='YOLOXHead',
        num_classes=20,
        in_channels=128,
        feat_channels=128,
        stacked_convs=2,
        strides=(8, 16, 32),
        use_depthwise=False,
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
        loss_cls=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            reduction='sum',
            loss_weight=1.0),
        loss_bbox=dict(
            type='IoULoss',
            mode='square',
            eps=1e-16,
            reduction='sum',
            loss_weight=5.0),
        loss_obj=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            reduction='sum',
            loss_weight=1.0),
        loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)),
    train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
    test_cfg=dict(
        score_thr=0.01,
        nms=dict(type='nms', iou_threshold=0.65)))

# Dataset settings
dataset_type = 'VOCDataset'
data_root = 'VOC2012/'

# Training pipeline (simplified without Mosaic/MixUp)
train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='PhotoMetricDistortion',
        brightness_delta=32,
        contrast_range=(0.5, 1.5),
        saturation_range=(0.5, 1.5),
        hue_delta=18),
    dict(
        type='Pad',
        pad_to_square=True,
        pad_val=dict(img=(114.0, 114.0, 114.0))),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
    dict(type='PackDetInputs')
]

# Testing pipeline
test_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(
        type='Pad',
        pad_to_square=True,
        pad_val=dict(img=(114.0, 114.0, 114.0))),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor'))
]

BATCH_SIZE = 24

# Data loaders
train_dataloader = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='ImageSets/Main/train.txt',
        data_prefix=dict(sub_data_root=''),
        filter_cfg=dict(filter_empty_gt=True, min_size=32),
        pipeline=train_pipeline))

val_dataloader = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='ImageSets/Main/val_custom.txt',
        data_prefix=dict(sub_data_root=''),
        test_mode=True,
        pipeline=test_pipeline))

test_dataloader = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='ImageSets/Main/test_custom.txt',
        data_prefix=dict(sub_data_root=''),
        test_mode=True,
        pipeline=test_pipeline))

# Evaluator
val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points')
test_evaluator = val_evaluator

# Optimizer
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=5e-4,
        nesterov=True),
    paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))

# Learning rate config
max_epochs = 7 #100
num_last_epochs = 1 #15
interval = 10

param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.001,
        by_epoch=True,
        begin=0,
        end=5),
    dict(
        type='CosineAnnealingLR',
        eta_min=0.0001,
        begin=5,
        T_max=max_epochs - num_last_epochs,
        end=max_epochs - num_last_epochs,
        by_epoch=True),
    dict(
        type='ConstantLR',
        by_epoch=True,
        factor=1,
        begin=max_epochs - num_last_epochs,
        end=max_epochs,
    )
]

# Training config
train_cfg = dict(
    type='EpochBasedTrainLoop',
    max_epochs=max_epochs,
    val_interval=interval)

val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

# Hooks
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=BATCH_SIZE),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook',
        interval=interval,
        max_keep_ckpts=3,
        save_best='auto'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'))

# Custom hooks - REMOVED YOLOXModeSwitchHook since we don't use Mosaic/MixUp
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        priority=49)
]

# Environment settings
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'),
)

# Visualizer
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# Log processor
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)

# Log level
log_level = 'INFO'

# Load from pretrained checkpoint
load_from = 'checkpoints/yolox_s_8x8_300e_coco.pth'

# Resume training
resume = False

# Runtime settings
work_dir = './work_dirs/yolox_voc'
"""

# Save config
os.makedirs('configs/yolox', exist_ok=True)
with open('configs/yolox/yolox_s_voc.py', 'w') as f:
    f.write(cfg_content)

print("Configuration file created successfully!")
print("Config saved to: configs/yolox/yolox_s_voc.py")

Configuration file created successfully!
Config saved to: configs/yolox/yolox_s_voc.py


## Load and Verify Configuration File

In [3]:
# Register all MMDetection modules with mmdet scope
register_all_modules(init_default_scope=True)

# Load the config
cfg = Config.fromfile('configs/yolox/yolox_s_voc.py')

# Set work directory
os.makedirs(cfg.work_dir, exist_ok=True)

print("Configuration loaded successfully!")
print(f"Default scope: {cfg.get('default_scope', 'Not set')}")
print(f"Work directory: {cfg.work_dir}")
print(f"Max epochs: {cfg.max_epochs}")
print(f"Batch size: {cfg.train_dataloader['batch_size']}")

Configuration loaded successfully!
Default scope: mmdet
Work directory: ./work_dirs/yolox_voc
Max epochs: 7
Batch size: 24


## Train Model

In [4]:
# Build the runner
runner = Runner.from_cfg(cfg)

# Start training
print("Starting training...")
runner.train()

12/10 16:35:57 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.11.13 (main, Sep 18 2025, 19:46:39) [Clang 20.1.4 ]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 854631069
    GPU 0: NVIDIA RTX A6000
    CUDA_HOME: None
    GCC: cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
    PyTorch: 2.1.0+cu121
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 12.1
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_

YOLOX(
  (data_preprocessor): DetDataPreprocessor(
    (batch_augments): ModuleList(
      (0): BatchSyncRandomResize()
    )
  )
  (backbone): CSPDarknet(
    (stem): Focus(
      (conv): ConvModule(
        (conv): Conv2d(12, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (activate): Swish()
      )
    )
    (stage1): Sequential(
      (0): ConvModule(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (activate): Swish()
      )
      (1): CSPLayer(
        (main_conv): ConvModule(
          (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (activate): Swish()
        )
        (short_conv): C

# Faster R-CNN

# DETR