In [1]:
# Set up dataset path
import os

# Path to your dataset
DATASET_PATH = "/home/ubuntu/yolov8_dataset"

# Create data.yaml if not already created
yaml_content = f"""
train: {DATASET_PATH}/train/images
val: {DATASET_PATH}/valid/images

nc: 1
names: ['crater']
"""

with open('/home/ubuntu/data.yaml', 'w') as f:
    f.write(yaml_content)

print("Dataset configuration complete!")


Dataset configuration complete!


In [2]:
from ultralytics import YOLO
import torch

In [None]:
from ultralytics import YOLO
import torch

# Verify GPU availability
print(f"GPU available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"GPU names: {[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]}")

# Load a YOLOv8 model
model = YOLO('yolov8m.pt')  # Load medium-sized model (you can also try 'yolov8l.pt' or 'yolov8x.pt' for larger models)

# Set training parameters
results = model.train(
    data='/home/ubuntu/data.yaml',
    imgsz=640,
    epochs=100,
    batch=64,  # Max batch size for A100XL (~80GB VRAM)
    device=0,
    workers=6,  # Optimal for 4 vCPUs (workers = vCPUs * 1.5)
    patience=50,
    project='lunar_craters',
    name='a100_optimized_run',
    exist_ok=True,
    pretrained=True,
    amp=True,  # Critical for A100 performance
    optimizer='AdamW',  # Better for large batches than default SGD
    lr0=3e-4,  # Adjust learning rate for larger batch
    cos_lr=True,  # Cosine learning rate scheduler
    overlap_mask=False,  # Reduce CPU overhead
    cache='ram',  # Use 64GB RAM for caching
    single_cls=True,  # If lunar craters are single class
    hsv_h=0.01,  # Reduce augmentation intensity
    hsv_s=0.5,
    hsv_v=0.5,
    fliplr=0.3,  # Simplify augmentations
    mosaic=0.8,  # Partial mosaic to balance speed/accuracy
    mixup=0.1,  # Light regularization
    close_mosaic=5  # Earlier mosaic cutoff
)


# Display training results summary
print(results)


GPU available: True
Number of GPUs: 1
GPU names: ['NVIDIA A100-SXM4-80GB']
Ultralytics 8.3.152 🚀 Python-3.10.12 torch-2.7.1+cu126 CUDA:0 (NVIDIA A100-SXM4-80GB, 81153MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=64, bgr=0.0, box=7.5, cache=ram, cfg=None, classes=None, close_mosaic=5, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=True, cutmix=0.0, data=/home/ubuntu/data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=True, fliplr=0.3, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.01, hsv_s=0.5, hsv_v=0.5, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.0003, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.1, mode=train, model=yolov8m.pt, momentum=0.937, mosaic=0.8, multi_scale=False, name=a100_optimized_run, nbs=64, nms=False, opset=None, optimize=Fals

[34m[1mtrain: [0mScanning /home/ubuntu/yolov8_dataset/train/labels.cache... 13955 images, 122 backgrounds, 0 corrupt: 100%|██████████| 13955/13955 [00:00<?, ?it/s][0m

[34m[1mtrain: [0m/home/ubuntu/yolov8_dataset/train/images/Lunar_A-17-39_jpg.rf.bcc50cf4104dc65891c090e827f4fda8.jpg: 1 duplicate labels removed
[34m[1mtrain: [0m/home/ubuntu/yolov8_dataset/train/images/Lunar_C-7-3_jpg.rf.351a0cea48fef91ea3da9a652f3a2345.jpg: 1 duplicate labels removed







[34m[1mtrain: [0mCaching images (16.0GB RAM): 100%|██████████| 13955/13955 [00:17<00:00, 794.67it/s][0m 


[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 106.8±110.3 MB/s, size: 74.3 KB)


[34m[1mval: [0mScanning /home/ubuntu/yolov8_dataset/valid/labels.cache... 3712 images, 48 backgrounds, 0 corrupt: 100%|██████████| 3740/3740 [00:00<?, ?it/s][0m

[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_A-13-15_jpg.rf.38ef2ac1d37ccf9f2f4846ee6acbc964.jpg: 1 duplicate labels removed
[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_A-17-15_jpg.rf.55d8b7aab578aee27a3bd6c92a9f6f0f.jpg: 1 duplicate labels removed
[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_A-17-32_jpg.rf.b0a6c910ec04ef3997cc513f4c020d83.jpg: 1 duplicate labels removed
[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_A-6-21_jpg.rf.6a66c495e5afc19b3ea5d61471ee1757.jpg: 1 duplicate labels removed
[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_A-6-30_jpg.rf.0660e8cdeaf99f3d42a1c85dc66982b6.jpg: 1 duplicate labels removed
[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_A-9-23_jpg.rf.da25911b4cee3b7e28fec6d16ea4eac0.jpg: 1 duplicate labels removed
[34m[1mval: [0m/home/ubuntu/yolov8_dataset/valid/images/Lunar_B-11-42_jpg.rf.27a2816b311b4e1355bb85abeb7c2777.jpg: 1 duplicate lab






[34m[1mval: [0mCaching images (4.3GB RAM): 100%|██████████| 3740/3740 [00:06<00:00, 534.39it/s][0m 


Plotting labels to lunar_craters/a100_optimized_run/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.0003, momentum=0.937) with parameter groups 77 weight(decay=0.0), 84 weight(decay=0.0005), 83 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 6 dataloader workers
Logging results to [1mlunar_craters/a100_optimized_run[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100      71.1G        nan        nan        nan        391        640: 100%|██████████| 219/219 [06:41<00:00,  1.83s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [02:05<00:00,  4.19s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      2/100      41.4G        nan        nan        nan        226        640: 100%|██████████| 219/219 [06:14<00:00,  1.71s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:48<00:00,  3.61s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      3/100      41.2G        nan        nan        nan        824        640: 100%|██████████| 219/219 [06:31<00:00,  1.79s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:44<00:00,  3.47s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      4/100      35.1G        nan        nan        nan        173        640: 100%|██████████| 219/219 [05:58<00:00,  1.63s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:40<00:00,  3.34s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      5/100      41.5G        nan        nan        nan        487        640: 100%|██████████| 219/219 [05:50<00:00,  1.60s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:40<00:00,  3.35s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      6/100      41.4G        nan        nan        nan        384        640: 100%|██████████| 219/219 [05:49<00:00,  1.59s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:41<00:00,  3.38s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      7/100      42.1G        nan        nan        nan        140        640: 100%|██████████| 219/219 [06:03<00:00,  1.66s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:40<00:00,  3.36s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      8/100      35.1G        nan        nan        nan        262        640: 100%|██████████| 219/219 [05:56<00:00,  1.63s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 30/30 [01:36<00:00,  3.22s/it]

                   all       3740     249688          0          0          0          0






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      9/100      35.1G        nan        nan        nan       6568        640:   9%|▊         | 19/219 [00:30<05:19,  1.60s/it]

In [None]:
# In a new cell
%load_ext tensorboard
%tensorboard --logdir lunar_craters


In [None]:
!cp lunar_craters/a100_optimized_run/weights/best.pt ~/best_crater_model.pt
print("Best model saved to ~/best_crater_model.pt")