In [1]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-32GB (UUID: GPU-ca3aed3d-2f15-4bb6-90df-13b26ca00a91)


In [2]:
!nvidia-smi

Wed Dec 10 18:12:28 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:18:00.0 Off |                    0 |
| N/A   39C    P0              42W / 300W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
%%writefile single_gpu_ddp_amp_benchmark.py
import os
import time
import socket
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision import transforms
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# AMP Imports (Robust check for newer PyTorch versions)
try:
    from torch.amp import autocast, GradScaler
    USE_NEW_AMP = True
except ImportError:
    from torch.cuda.amp import autocast, GradScaler
    USE_NEW_AMP = False

# ============================================================
# 1. SETUP / CLEANUP
# ============================================================

def find_free_port():
    """Ask the OS for a currently unused TCP port and return its number.

    Binding to port 0 lets the kernel pick any free port. The probe socket
    is closed before returning, so the port is only reserved for the
    duration of this call.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(('', 0))
        _host, port = probe.getsockname()
        return port
    finally:
        probe.close()

def setup_dist(rank, world_size, port):
    """Join the NCCL process group for this worker and select its GPU.

    The rendezvous endpoint is exported through MASTER_ADDR / MASTER_PORT
    (always localhost on ``port``); after the group is initialised the
    process's current CUDA device is set to index ``rank``.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': str(port)}
    os.environ.update(rendezvous)
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup_dist():
    """Destroy the default process group if one is currently initialised.

    The guard makes the call a no-op when distributed support is missing or
    no group was ever created, so it is safe to call from error-handling or
    early-exit paths without raising a second exception.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

# ============================================================
# 2. DATASET & MODEL
# ============================================================

class OralCancerDataset(Dataset):
    """Image/label dataset backed by a DataFrame and an id -> file path map.

    Each row of ``dataframe`` must provide an ``'id'`` column (the key looked
    up in ``path_map``) and a ``'label'`` column convertible to ``int``.
    """

    def __init__(self, dataframe, path_map, transform=None):
        """Store the inputs; the frame's index is reset so ``iloc`` is 0..N-1.

        Args:
            dataframe: table with ``'id'`` and ``'label'`` columns.
            path_map: mapping from an ``'id'`` value to an image file path.
            transform: optional callable applied to the PIL image.
        """
        self.data = dataframe.reset_index(drop=True)
        self.path_map = path_map
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        ``image`` is the (optionally transformed) RGB image and ``label`` is
        an ``int``.
        """
        row = self.data.iloc[idx]
        img_id = row['id']
        label = int(row['label'])

        img_path = self.path_map.get(img_id)
        if img_path is None:
            # NOTE(review): ids missing from path_map silently become a blank
            # 96x96 placeholder that keeps its real label - confirm this
            # best-effort fallback is intended.
            image = Image.new('RGB', (96, 96))
        else:
            # convert() returns an independent, fully loaded image, so the
            # underlying file can be closed deterministically right here.
            with Image.open(img_path) as src:
                image = src.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

class SimpleCNN(nn.Module):
    """Three conv/BN/ReLU/max-pool stages followed by a two-layer classifier.

    Every pooling stage halves the spatial size, and ``fc1`` expects
    ``128 * 12 * 12`` features, i.e. a 3-channel 96x96 input. The network
    emits one raw logit per sample (shape ``(batch, 1)``).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 12 * 12, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Map a ``(batch, 3, 96, 96)`` tensor to ``(batch, 1)`` logits.

        Raises:
            RuntimeError: if the spatial size does not produce the
                ``128 * 12 * 12`` features that ``fc1`` expects.
        """
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 128*12*12), a wrongly sized input can no longer be
        # silently re-chunked into extra "samples"; it fails at fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# ============================================================
# 3. TRAINING WORKER (DDP + AMP + RESUME)
# ============================================================

def _amp_autocast():
    """Return a fresh CUDA autocast context for whichever AMP API was imported."""
    if USE_NEW_AMP:
        return autocast(device_type="cuda")
    return autocast()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def _run_epoch(model, loader, criterion, device, optimizer=None, scaler=None):
    """Run one pass over ``loader`` and return ``(loss_sum, correct, total)``.

    When ``optimizer`` (and ``scaler``) are given, every batch performs a
    scaled backward pass and an optimizer step; otherwise the pass runs with
    gradients disabled. The caller is responsible for ``model.train()`` /
    ``model.eval()``.
    """
    training = optimizer is not None
    loss_sum, correct, total = 0.0, 0, 0

    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images = images.to(device, non_blocking=True)
            labels = labels.float().unsqueeze(1).to(device, non_blocking=True)

            if training:
                optimizer.zero_grad()

            with _amp_autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            if training:
                # Scale the loss for the backward pass, then step through the
                # scaler and let it update its scale factor.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            # loss is a batch mean; weight it by batch size so the epoch
            # average is per-sample.
            loss_sum += loss.item() * images.size(0)
            preds = (torch.sigmoid(outputs) >= 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return loss_sum, correct, total


def train_worker(rank, world_size, df, path_map, batch_size, num_epochs, csv_path, port):
    """Per-process entry point: train SimpleCNN with DDP + AMP, resumably.

    Args:
        rank: index of this process; also used as its CUDA device index.
        world_size: total number of participating processes.
        df: DataFrame with ``'id'`` and ``'label'`` columns.
        path_map: mapping from image id to file path.
        batch_size: per-process batch size for both loaders.
        num_epochs: total number of epochs a finished run contains.
        csv_path: CSV file that rank 0 appends one metrics row to per epoch.
        port: TCP port used for the localhost rendezvous.

    Side effects (rank 0 only): prints progress, writes
    ``checkpoints_ddp_amp_fresh/1gpu_ddp_amp_batch{batch_size}.pt`` after
    every epoch and appends to ``csv_path``. If that checkpoint already
    exists, training resumes from the epoch stored in it.
    """
    setup_dist(rank, world_size, port)
    device = torch.device(f"cuda:{rank}")

    # --- Transforms ---
    IMG_SIZE = 96
    train_tf = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3),
    ])
    val_tf = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3),
    ])

    # --- Split (fixed seed, so every rank derives the same partition) ---
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

    train_dataset = OralCancerDataset(train_df, path_map, transform=train_tf)
    val_dataset   = OralCancerDataset(val_df,   path_map, transform=val_tf)

    # --- Samplers (required for DDP so each rank sees its own shard) ---
    # NOTE(review): with world_size > 1 the sampler may pad shards with
    # repeated samples, which would then be counted in the metrics - confirm
    # before using this for multi-GPU evaluation.
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
    val_sampler   = DistributedSampler(val_dataset, num_replicas=world_size, rank=rank, shuffle=False)

    # --- Loaders ---
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler,
                              num_workers=0, pin_memory=True)
    val_loader   = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler,
                              num_workers=0, pin_memory=True)

    # --- Model setup ---
    model = SimpleCNN().to(device)
    model = DDP(model, device_ids=[rank])

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scaler = GradScaler()

    # --- Checkpoint & resume logic ---
    ckpt_dir = "checkpoints_ddp_amp_fresh"
    if rank == 0:
        os.makedirs(ckpt_dir, exist_ok=True)

    # Ensure the directory exists before the other ranks proceed.
    dist.barrier()

    ckpt_path = os.path.join(ckpt_dir, f"1gpu_ddp_amp_batch{batch_size}.pt")

    start_epoch = 0
    if os.path.exists(ckpt_path):
        if rank == 0:
            print(f"[RESUME] Found checkpoint for Batch {batch_size}. Loading...")

        # Load straight onto this rank's device.
        map_location = f"cuda:{rank}"
        checkpoint = torch.load(ckpt_path, map_location=map_location)

        model.module.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        scaler.load_state_dict(checkpoint['scaler_state'])
        start_epoch = checkpoint['epoch']

        if rank == 0:
            print(f"[RESUME] Resuming from Epoch {start_epoch}")
    else:
        if rank == 0:
            print(f"[START] No checkpoint found. Starting fresh for Batch {batch_size}")

    if start_epoch >= num_epochs:
        if rank == 0:
            print(f"Batch {batch_size} already completed {start_epoch} epochs. Skipping.")
        cleanup_dist()
        return

    # --- Training loop ---
    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.reset_peak_memory_stats(device)
        epoch_start = time.time()

        # 1. Train (epoch_time covers the training pass only)
        model.train()
        train_loss_sum, train_correct, train_total = _run_epoch(
            model, train_loader, criterion, device, optimizer, scaler
        )
        epoch_time = time.time() - epoch_start

        # 2. Validation
        model.eval()
        val_loss_sum, val_correct, val_total = _run_epoch(
            model, val_loader, criterion, device
        )

        # 3. Aggregation (summed across ranks). float64 keeps sample counts
        # exact well beyond the 2**24 limit of float32.
        metrics_tensor = torch.tensor([
            train_loss_sum, train_correct, train_total,
            val_loss_sum, val_correct, val_total
        ], dtype=torch.float64, device=device)

        dist.all_reduce(metrics_tensor, op=dist.ReduceOp.SUM)

        (sum_train_loss, sum_train_correct, sum_train_total,
         sum_val_loss, sum_val_correct, sum_val_total) = metrics_tensor.tolist()

        # An empty split yields NaN instead of a ZeroDivisionError.
        g_train_loss = _safe_ratio(sum_train_loss, sum_train_total)
        g_train_acc  = _safe_ratio(sum_train_correct, sum_train_total)
        g_val_loss   = _safe_ratio(sum_val_loss, sum_val_total)
        g_val_acc    = _safe_ratio(sum_val_correct, sum_val_total)

        peak_mem = torch.cuda.max_memory_allocated(device)
        peak_mem_gb = peak_mem / (1024**3)

        if rank == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Batch {batch_size} | "
                  f"TrainAcc={g_train_acc:.4f} ValAcc={g_val_acc:.4f} | "
                  f"Time={epoch_time:.2f}s Mem={peak_mem_gb:.2f}GB")

            # Save checkpoint (stores the NEXT epoch index to resume from).
            # Write to a temporary file and swap it in so an interrupted save
            # can never leave a truncated checkpoint at ckpt_path.
            tmp_ckpt_path = ckpt_path + ".tmp"
            torch.save({
                'epoch': epoch + 1,
                'model_state': model.module.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'scaler_state': scaler.state_dict()
            }, tmp_ckpt_path)
            os.replace(tmp_ckpt_path, ckpt_path)

            # Log to CSV
            row = {
                "mode": "ddp_amp",
                "gpu_count": world_size, # Should be 1
                "batch_size": batch_size,
                "epoch": epoch + 1,
                "train_loss": g_train_loss,
                "train_acc": g_train_acc,
                "val_loss": g_val_loss,
                "val_acc": g_val_acc,
                "epoch_time": epoch_time,
                "peak_mem_bytes": peak_mem,
                "peak_mem_gb": peak_mem_gb
            }

            # Emit the header only when the file is being created.
            file_exists = os.path.exists(csv_path)
            pd.DataFrame([row]).to_csv(csv_path, mode="a", header=not file_exists, index=False)

    cleanup_dist()

# ============================================================
# 4. MAIN
# ============================================================

if __name__ == "__main__":
    # Script entry point: run the single-GPU DDP+AMP benchmark once per
    # configured batch size, each run in freshly spawned worker processes.
    mp.set_start_method("spawn", force=True)

    CSV_FILE = "DDP_AMP_metrics.csv"

    # 1. Load data
    print("Loading Dataset...")
    df = pd.read_csv("oral_cancer_balanced.csv")

    # Map image file name -> full path for every image under Data/,
    # pruning any directory named "val" from the walk.
    path_map = {}
    for root, dirs, files in os.walk("Data"):
        if "val" in dirs:
            dirs.remove("val")
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                path_map[file] = os.path.join(root, file)

    # 2. Configurations
    BATCH_SIZES = [64, 128, 512]
    NUM_EPOCHS = 10
    GPU_COUNT = 1

    print(f"Starting Single GPU DDP+AMP Benchmark. Saving to {CSV_FILE}")

    failed_batch_sizes = []
    for bs in BATCH_SIZES:
        # Find a fresh port for each run to avoid "Address already in use"
        port = find_free_port()
        print(f"\n=== Starting Run: Batch Size {bs} (Port {port}) ===")

        try:
            mp.spawn(
                train_worker,
                args=(GPU_COUNT, df, path_map, bs, NUM_EPOCHS, CSV_FILE, port),
                nprocs=GPU_COUNT,
                join=True
            )
        except Exception as e:
            # Best effort: keep going with the remaining batch sizes, but
            # remember the failure so the final summary is truthful.
            print(f"Error running batch size {bs}: {e}")
            failed_batch_sizes.append(bs)

    if failed_batch_sizes:
        print(f"\nRuns failed for batch sizes {failed_batch_sizes}. "
              f"Check {CSV_FILE} for partial results.")
    else:
        print(f"\nAll runs completed. Check {CSV_FILE} for results.")

Overwriting single_gpu_ddp_amp_benchmark.py


In [4]:
!python single_gpu_ddp_amp_benchmark.py

Loading Dataset...
Starting Single GPU DDP+AMP Benchmark. Saving to DDP_AMP_metrics.csv

=== Starting Run: Batch Size 64 (Port 34255) ===
[RESUME] Found checkpoint for Batch 64. Loading...
[RESUME] Resuming from Epoch 10
Batch 64 already completed 10 epochs. Skipping.

=== Starting Run: Batch Size 128 (Port 53961) ===
[RESUME] Found checkpoint for Batch 128. Loading...
[RESUME] Resuming from Epoch 8
Epoch 9/10 | Batch 128 | TrainAcc=0.9248 ValAcc=0.9255 | Time=545.91s Mem=0.56GB
Epoch 10/10 | Batch 128 | TrainAcc=0.9292 ValAcc=0.9173 | Time=251.51s Mem=0.56GB

=== Starting Run: Batch Size 512 (Port 48881) ===
[START] No checkpoint found. Starting fresh for Batch 512
Epoch 1/10 | Batch 512 | TrainAcc=0.8054 ValAcc=0.8299 | Time=276.78s Mem=1.86GB
Epoch 2/10 | Batch 512 | TrainAcc=0.8494 ValAcc=0.8638 | Time=296.92s Mem=1.86GB
Epoch 3/10 | Batch 512 | TrainAcc=0.8625 ValAcc=0.8603 | Time=275.33s Mem=1.86GB
Epoch 4/10 | Batch 512 | TrainAcc=0.8745 ValAcc=0.8825 | Time=264.62s Mem=1.86GB
E