In [1]:
!pip install ultralytics


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from ultralytics import YOLO
import torch

### Clear GPU Memory Before Starting Training

#### This ensures we start with clean memory and no memory leaks.

In [3]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()     # Releases unreferenced cached memory
    torch.cuda.ipc_collect()     # Cleans up any interprocess memory from dead processes

### Load the Pretrained YOLOv8 Model

#### You can choose 'yolov8s.pt' for smaller models or 'yolov8m.pt' for balanced accuracy.

In [4]:
model = YOLO('yolov8m.pt')

### Automatically Select the Best Available Device

#### This ensures that if a CUDA-capable GPU (like your L40S) is available, it's used.

In [None]:
device = 0 if torch.cuda.is_available() else 'cpu'
print(f"Device in use: {device}")
model.to(device)

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(48, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(48, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_

### Begin Training the Model with Optimized Settings

#### These parameters are chosen to improve convergence, utilize your hardware well,
#### and apply techniques like mixed precision and EMA for stability and performance.

In [None]:
results = model.train(
    # Dataset
    data='/teamspace/studios/this_studio/data/yolo-extracted/data.yaml',         # Path to dataset config file (classes, train/val paths, etc.)

    # Training Setup
    epochs=200,                       # Total training cycles (more epochs = better performance if not overfitting)
    imgsz=640,                        # Higher input resolution can help with small objects (adjust batch size if needed)
    batch=16,                         # Number of samples per GPU iteration (balance with VRAM)
    cache=True,                       # Load entire dataset into RAM (you have 128 GB, so it speeds things up)

    # Hardware Optimization
    workers=8,                        # Number of dataloader subprocesses (8–12 works well with 16 vCPUs)
    amp=True,                         # Mixed precision training (faster, less memory, same accuracy)
    device=device,                    # Use GPU if available, otherwise fallback to CPU

    # Learning Rate & Optimizer
    lr0=0.005,                        # Initial learning rate
    lrf=0.0005,                       # Final learning rate (decayed over time)
    warmup_epochs=5,                  # Start with slower learning rate to stabilize early training
    warmup_momentum=0.75,             # Momentum during warmup phase
    optimizer='AdamW',                # Weight-decay-aware Adam optimizer (generally better for generalization)
    weight_decay=0.0005,              # L2 regularization (prevents overfitting)

    # Validation
    val=True,                         # Run validation after every epoch to monitor performance

    # Model Saving & Early Stopping
    save_period=5,                    # Save model every 5 epochs
    patience=15,                      # Stop early if no improvement in validation loss for 15 epochs

    # Additional Optimization
    cos_lr=True,                      # Use cosine decay for smoother convergence to minima
    name='yolov8m-unBalanced-Dataset' # Name of the model
)

New https://pypi.org/project/ultralytics/8.3.122 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.114 🚀 Python-3.10.10 torch-2.2.1+cu121 CPU (Intel Xeon Platinum 8259CL 2.50GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8m.pt, data=/teamspace/studios/this_studio/data/yolo-extracted/data.yaml, epochs=200, time=None, patience=15, batch=16, imgsz=640, save=True, save_period=5, cache=True, device=cpu, workers=8, project=None, name=yolov8m-unBalanced-Dataset5, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=True, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=

Model summary: 169 layers, 25,859,794 parameters, 25,859,778 gradients, 79.1 GFLOPs

Transferred 469/475 items from pretrained weights
Freezing layer 'model.22.dfl.conv.weight'
[34m[1mtrain: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 2008.4±480.2 MB/s, size: 161.6 KB)


[34m[1mtrain: [0mScanning /teamspace/studios/this_studio/data/yolo-extracted/labels/train.cache... 5931 images, 6 backgrounds, 0 corrupt: 100%|██████████| 5931/5931 [00:00<?, ?it/s]




[34m[1mtrain: [0mCaching images (4.5GB RAM): 100%|██████████| 5931/5931 [00:38<00:00, 155.65it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))
[34m[1mval: [0mFast image access ✅ (ping: 0.7±1.6 ms, read: 1655.8±795.1 MB/s, size: 137.4 KB)


[34m[1mval: [0mScanning /teamspace/studios/this_studio/data/yolo-extracted/labels/val.cache... 1050 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1050/1050 [00:00<?, ?it/s]




[34m[1mval: [0mCaching images (0.8GB RAM): 100%|██████████| 1050/1050 [00:07<00:00, 148.50it/s]


Plotting labels to runs/detect/yolov8m-unBalanced-Dataset5/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.005, momentum=0.937) with parameter groups 77 weight(decay=0.0), 84 weight(decay=0.0005), 83 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/yolov8m-unBalanced-Dataset5[0m
Starting training for 200 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/200         0G     0.9886      1.267      1.239         34        640:  34%|███▎      | 125/371 [1:10:16<2:18:17, 33.73s/it]


### Evaluate the model

In [None]:
metrics = model.val(data='data.yaml')  # Path to the validation dataset YAML file

### Export the trained model

In [None]:
model.export(format="onnx")  # or use format="engine" for TensorRT