In [17]:
%%writefile main.py

import os
import numpy as np
import torch

import torchvision
from torchvision import datasets, models, transforms

from tqdm import tqdm
from torch.utils import data

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import copy
import random
import time
import os
import json

from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp

# Seed applied to every random number generator seeded below.
SEED = 1234
# Base directory: the dataset, checkpoint and results paths in this file are built from it.
ROOT = "."
# MODEL_NAME, EPOCHS and SENARIO are used to name the results JSON file.
MODEL_NAME = "VGG16"
# NOTE: this label is hard-coded; it is not derived from the number of GPUs actually used.
SENARIO = "1GPU"
# Default number of training epochs for main_train.
EPOCHS = 3
# Default per-process batch size for the data loaders.
BATCH_SIZE = 64


# Seed Python, NumPy and PyTorch (CPU and current CUDA device) for reproducibility.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

"""# 2. Initialize the DDP Environment"""

def setup(rank, world_size):
    """Initialise the NCCL process group for this process.

    Args:
        rank: Index of this process; also used as the index of the GPU it owns.
        world_size: Total number of processes taking part in training.
    """
    os.environ['MASTER_ADDR'] = 'localhost'  # Change this to the master node's IP address if using multiple machines
    os.environ['MASTER_PORT'] = '12345'  # Pick a free port on the master node
    # Make this rank's GPU the current CUDA device before any collective runs,
    # so that work relying on the "current device" lands on the GPU this
    # process owns instead of defaulting to GPU 0.
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the default process group, if one was created.

    Safe to call when no process group exists (for example after a failed
    or skipped ``setup``); in that case it does nothing.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

"""# 3. Define a Model."""


# define the CNN architecture
# This instance is created once at import time; create_model() modifies it in place
# and returns this same object.
# NOTE(review): `pretrained=True` is reported as deprecated by recent torchvision
# releases in favour of the `weights=` argument — confirm against the installed version.
vgg16 = models.vgg16(pretrained=True)


def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def create_model():
    """Adapt the module-level ``vgg16`` network to a 10-class problem.

    The parameters of ``vgg16.features`` are frozen and the layer at
    ``vgg16.classifier[6]`` is replaced by a new ``nn.Linear`` with 10
    outputs. The number of trainable parameters is printed.

    Returns:
        The module-level ``vgg16`` object itself (modified in place, not a copy).
    """
    # Freeze the feature-extractor weights so they receive no gradients.
    for frozen_param in vgg16.features.parameters():
        frozen_param.requires_grad = False

    # Swap the last classifier layer for one with 10 outputs.
    head_inputs = vgg16.classifier[6].in_features
    vgg16.classifier[6] = nn.Linear(head_inputs, 10)

    print(f'The model has {count_parameters(vgg16):,} trainable parameters')
    return vgg16

"""# 4. Create a Dummy Dataset"""

def create_dataloader(rank, world_size, batch_size=BATCH_SIZE, root = ROOT, max_length = 256):
    """Build the CIFAR-10 train / validation / test data loaders for one process.

    Args:
        rank: Index of the calling process.
        world_size: Total number of processes.
        batch_size: Batch size used by all three loaders.
        root: Base directory; the dataset is stored under ``{root}/data``.
        max_length: Unused; kept so existing callers keep working.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``. The train and
        validation loaders use a ``DistributedSampler`` so each process sees its
        own shard; the test loader iterates the full test set on every process.
    """
    data_transform = transforms.Compose([transforms.RandomResizedCrop(224),
                                         transforms.ToTensor()])
    outdir = f"{root}/data"

    # Only rank 0 is allowed to download. `download=True` is a no-op when the
    # files are already present and valid, so it is called unconditionally:
    # checking only that the directory exists would skip the download after an
    # interrupted or partial one.
    if rank == 0:
        datasets.CIFAR10(outdir, train=True, download=True)
        datasets.CIFAR10(outdir, train=False, download=True)

    dist.barrier()  # Ensure all processes wait for the dataset to be downloaded

    # After the barrier the files are on disk; the ranks only read them, so no
    # two processes ever write the dataset directory at the same time.
    train_data = datasets.CIFAR10(outdir, train=True,
                                  download=False, transform=data_transform)
    test_data = datasets.CIFAR10(outdir, train=False,
                                 download=False, transform=data_transform)

    ## create the validation split
    VALID_RATIO = 0.9

    n_train_examples = int(len(train_data) * VALID_RATIO)
    n_valid_examples = len(train_data) - n_train_examples
    # Use a dedicated, explicitly seeded generator: every process must produce
    # the very same split (the distributed samplers shard by index), regardless
    # of how much of the global RNG stream was consumed beforehand.
    split_generator = torch.Generator().manual_seed(SEED)
    train_data, valid_data = data.random_split(train_data,
                                               [n_train_examples, n_valid_examples],
                                               generator=split_generator)

    if rank == 0:
        print(f'Number of training examples: {len(train_data)}')
        print(f'Number of validation examples: {len(valid_data)}')
        print(f'Number of testing examples: {len(test_data)}')

    ## Creating Data Loaders
    train_sampler = DistributedSampler(train_data, num_replicas=world_size, rank=rank, shuffle=True)
    val_sampler = DistributedSampler(valid_data, num_replicas=world_size, rank=rank)

    train_dataloader = data.DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, pin_memory=True) #use num_workers > 0 for better performance
    val_dataloader = data.DataLoader(valid_data, batch_size=batch_size, sampler=val_sampler, pin_memory=True) #use num_workers > 0 for better performance
    test_dataloader = data.DataLoader(test_data, batch_size=batch_size, shuffle=False, pin_memory=True) #no sampling for test dataset
    return train_dataloader, val_dataloader, test_dataloader

"""# 5. Implement the Training Loop

## a. Helper functions
"""

# JSON file that log_results reads and rewrites; its name encodes the model,
# epoch count and scenario label defined at the top of the file.
RESULTS_FILE = f"{ROOT}/{MODEL_NAME}_{EPOCHS}epochs_{SENARIO}.json"

def log_results(scenario, results, results_file=None):
    """
    Save results to a JSON file for comparison across scenarios.

    The file holds one JSON object mapping scenario names to their results;
    an existing entry with the same scenario name is overwritten.

    Args:
        scenario: Key under which ``results`` is stored.
        results: JSON-serialisable object to store.
        results_file: Path of the JSON file. Defaults to ``RESULTS_FILE``.

    Note:
        This is a read-modify-write cycle. Concurrent callers (e.g. several
        ranks) must take turns, otherwise one caller's entry can be lost.
    """
    path = RESULTS_FILE if results_file is None else results_file

    all_results = {}
    try:
        with open(path, 'r') as f:
            content = f.read()
    except FileNotFoundError:
        content = ""
    # An empty file (e.g. left behind by an interrupted run) is treated like a
    # missing one; genuinely malformed JSON still raises so it is not hidden.
    if content.strip():
        all_results = json.loads(content)

    all_results[scenario] = results

    # Write to a per-process temporary file and swap it in atomically, so a
    # crash mid-write never leaves a truncated results file behind.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(all_results, f, indent=4)
    os.replace(tmp_path, path)

def calculate_accuracy(y_pred, y):
    """Return the fraction of rows of ``y_pred`` whose arg-max along dim 1 equals ``y``.

    The result is a float tensor: number of matches divided by ``y.shape[0]``.
    """
    predicted = y_pred.argmax(1, keepdim=True)
    # Reshape the targets to the prediction's shape before comparing.
    matches = torch.eq(predicted, y.view_as(predicted))
    return matches.sum().float() / y.shape[0]

def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers; fractional seconds are truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

"""## b. train function"""
def train(model, iterator, optimizer, criterion, rank):
    """Run one training pass over ``iterator``.

    Each batch is moved to device ``rank``, pushed through the model, and used
    for one optimizer step. Rank 0 prints a progress line every 50 batches.

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch loss and accuracy values
        summed and divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for step, (x, y) in enumerate(tqdm(iterator, desc=f"Training on the rank {rank}...", leave=False)):
        x, y = x.to(rank), y.to(rank)

        # Forward pass and metrics for this batch.
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        acc = calculate_accuracy(y_pred, y)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

        if rank == 0 and step % 50 == 0:
            print(f"- On Training: {step} was passed over  {len(iterator)}")

    return running_loss / len(iterator), running_acc / len(iterator)


"""## c. Validation function"""
def evaluate(model, iterator, criterion, rank, mode = "Evaluating"):
    """Run one gradient-free pass over ``iterator`` with the model in eval mode.

    ``mode`` is only used as a label in the progress bar and the progress
    lines that rank 0 prints every 50 batches.

    Returns:
        ``(mean_loss, mean_accuracy)``: the per-batch loss and accuracy values
        summed and divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for step, (x, y) in enumerate(tqdm(iterator, desc=f"{mode} on the rank {rank} ...", leave=False)):
            x, y = x.to(rank), y.to(rank)

            y_pred = model(x)
            loss = criterion(y_pred, y)
            acc = calculate_accuracy(y_pred, y)

            running_loss += loss.item()
            running_acc += acc.item()

            if rank == 0 and step % 50 == 0:
                print(f"- On {mode}: {step} was passed over  {len(iterator)}")

    return running_loss / len(iterator), running_acc / len(iterator)


"""## d. Main loop"""

# Directory where main_train stores the model checkpoint.
outdir = f'{ROOT}/model/'
# exist_ok avoids the check-then-create race when several processes import
# this module at the same time.
os.makedirs(outdir, exist_ok=True)

def main_train(rank, world_size, root = outdir, num_epochs = EPOCHS, model_name = MODEL_NAME):
    """Train, validate and test the model on one process of a DDP job.

    Args:
        rank: Index of this process; also the index of the GPU it uses.
        world_size: Total number of processes in the job.
        root: Directory prefix (with trailing separator) for the checkpoint file.
        num_epochs: Number of training epochs.
        model_name: Label used in the scenario name written to the results file.

    Side effects:
        Rank 0 writes the checkpoint ``{root}mlp-model.pt``; every rank logs its
        own timing/loss/accuracy series through ``log_results``.
    """
    ## a. Set up the distributed process groups
    setup(rank, world_size)
    print(f"Process {rank} initialized.")

    ## b. Create Model, DataLoader
    train_dataloader, val_dataloader, test_dataloader = create_dataloader(rank, world_size)
    model = create_model().to(rank)

    ## c. Wrap the model with DistributedDataParallel
    ddp_model = DDP(model, device_ids=[rank])

    ## d. Loss and Optimizer
    criterion = nn.CrossEntropyLoss().to(rank) # Move loss to GPU
    optimizer = optim.Adam(ddp_model.parameters(), lr=0.01)

    checkpoint_path = f'{root}mlp-model.pt'
    train_sampler = getattr(train_dataloader, "sampler", None)

    ## e. Training Loop
    best_valid_loss = float('inf')
    training_times = []
    train_losses = []
    train_accurcy = []
    validation_times = []
    validation_losses = []
    validation_accurcy = []

    epoch_times = []

    for epoch in range(num_epochs):
        # A DistributedSampler derives its shuffle from the epoch number;
        # without this call every epoch would replay the same batch order.
        if hasattr(train_sampler, "set_epoch"):
            train_sampler.set_epoch(epoch)

        start_epoch_time = time.monotonic()
        start_time = time.monotonic()

        train_loss, train_acc = train(ddp_model, train_dataloader, optimizer, criterion, rank)
        train_time = time.monotonic() - start_time
        training_times.append(train_time)
        train_losses.append(train_loss)
        train_accurcy.append(train_acc)

        start_time = time.monotonic()
        valid_loss, valid_acc = evaluate(ddp_model, val_dataloader, criterion, rank)
        val_time = time.monotonic() - start_time
        validation_times.append(val_time)
        validation_losses.append(valid_loss)
        validation_accurcy.append(valid_acc)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # DDP keeps the replicas identical, so one copy is enough. Letting
            # every rank write the same file at once can corrupt it, so only
            # rank 0 saves (based on the loss of its own validation shard).
            if rank == 0:
                torch.save(ddp_model.state_dict(), checkpoint_path)

        end_time = time.monotonic()
        e_time = end_time - start_epoch_time
        epoch_times.append(e_time)
        epoch_mins, epoch_secs = epoch_time(start_epoch_time, end_time)

        print(f'--------------|     On process {rank}      |----------------')
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    ## f. test after train
    # Wait until rank 0 has finished writing before any rank reads the file.
    dist.barrier()
    # The checkpoint was saved from rank 0's GPU; remap it onto this rank's
    # GPU so the other processes do not allocate memory on GPU 0.
    state_dict = torch.load(checkpoint_path, map_location=f'cuda:{rank}')
    ddp_model.load_state_dict(state_dict)
    start_time = time.monotonic()
    test_loss, test_acc = evaluate(ddp_model, test_dataloader, criterion, rank, mode = "Testing")
    test_time = time.monotonic() - start_time
    print(f'Test results on process {rank}: Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    # Log results
    results = {
        "world_size": world_size,
        "rank": rank,
        "training_times": training_times,
        "train_losses": train_losses,
        "train_accurcy": train_accurcy,
        "validation_times": validation_times,
        "validation_losses": validation_losses,
        "validation_accurcy": validation_accurcy,
        "test_time": test_time,
        "test_loss": test_loss,
        "test_acc": test_acc,
        "epoch_times": epoch_times
     }

    scenario = f"model_{model_name}_epochs_{num_epochs}_{world_size}_GPUs_rank_{rank}"
    # log_results reads, updates and rewrites one shared JSON file. The ranks
    # take turns, separated by barriers, so no rank's entry is overwritten by
    # a concurrent writer.
    for writer_rank in range(world_size):
        if rank == writer_rank:
            log_results(scenario, results)
        dist.barrier()

    cleanup()
    print(f'Process {rank} finished training.')

"""# 6. Main Execution"""
if __name__ == "__main__":

    def main():
        """Start training on every CUDA device that PyTorch can see."""
        n_gpus = torch.cuda.device_count()
        print(f'Total number of devices detected: {n_gpus}')

        if n_gpus < 1:
            print('no GPUs found. Please make sure you have configured CUDA correctly')
            return

        if n_gpus == 1:
            # A single GPU needs no extra processes: run rank 0 right here.
            main_train(rank=0, world_size=1)
            return

        # One process per GPU; each call receives its process index as the
        # first argument, followed by the values in `args`.
        mp.spawn(main_train, args=(n_gpus,), nprocs=n_gpus, join=True)

    main()

Overwriting main.py


In [18]:
!python main.py

Total number of devices detected: 1
Process 0 initialized.
[rank0]:[W110 10:31:29.975779797 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Files already downloaded and verified
Files already downloaded and verified
Number of training examples: 45000
Number of validation examples: 5000
Number of testing examples: 10000
The model has 119,586,826 trainable parameters
Training on the rank 0...:   0% 0/704 [00:00<?, ?it/s]- On Training: 0 was passed over  704
Training on the rank 0...:   7% 50/704 [00:23<04:47,  2.28it/s]- On Training: 50 was passed over  704
Training on the rank 0...:  14% 100/704 [00:45<04:27,  2.26it/s]- On Training: 100 was passed over  704
Training on the rank 0...:  21% 150/704 [01:07<04:1

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

def _runs_ordered_by_rank(results):
    """Return the per-scenario result dicts sorted by their recorded ``rank``."""
    # sorted() is stable, so entries lacking a "rank" field keep file order.
    return sorted(results.values(), key=lambda run: run.get('rank', 0))


def _metric_series(run, name):
    """Fetch a per-epoch series, also accepting the legacy 'accurcy' key spelling."""
    for key in (name, name.replace('accuracy', 'accurcy')):
        if key in run:
            return run[key]
    raise KeyError(name)


def _plot_comparison(epochs, one_gpu, rank0, rank1, ylabel, title):
    """Draw one figure comparing the 1-GPU run with both ranks of the 2-GPU run."""
    average = np.mean([rank0, rank1], axis=0)

    plt.figure(figsize=(10, 6))
    plt.plot(epochs, one_gpu, label='1 GPU', marker='o')
    plt.plot(epochs, rank0, label='2 GPUs (Rank 0)', marker='o')
    plt.plot(epochs, rank1, label='2 GPUs (Rank 1)', marker='o')
    plt.plot(epochs, average, label='2 GPUs (Average)', marker='o', linestyle='--')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()


def plot_training_metrics_seq2seq(json_file_1gpu, json_file_2gpus):
    """
    Generates plots comparing training metrics from 1-GPU and 2-GPU runs.
    Each plot shows the 1-GPU run, rank 0 and rank 1 of the 2-GPU run, and the
    average of the two ranks.

    Args:
        json_file_1gpu: Results JSON of the 1-GPU run; its first entry is used.
        json_file_2gpus: Results JSON of the 2-GPU run; it must contain an
            entry for each of the two ranks.

    Raises:
        KeyError: If a required metric is missing from a result entry.
        IndexError: If a file holds fewer entries than required.
    """
    # Load 1-GPU data
    with open(json_file_1gpu, 'r') as f:
        data_1gpu = json.load(f)
    run_1gpu = list(data_1gpu.values())[0]

    # Load 2-GPU data. The ranks are identified through the "rank" field
    # stored with each entry instead of relying on the order in which the
    # processes happened to write the file.
    with open(json_file_2gpus, 'r') as f:
        data_2gpus = json.load(f)
    runs_2gpus = _runs_ordered_by_rank(data_2gpus)
    run_rank0, run_rank1 = runs_2gpus[0], runs_2gpus[1]

    # (metric key, y-axis label, figure title), in plotting order.
    metrics = [
        ('training_times', 'Training Time (s)', 'Training Time Comparison'),
        ('train_losses', 'Training Loss', 'Training Loss Comparison'),
        ('validation_times', 'Validation Time (s)', 'Validation Time Comparison'),
        ('validation_losses', 'Validation Loss', 'Validation Loss Comparison'),
        ('epoch_times', 'Epoch Time (s)', 'Epochs Time Comparison'),
        ('train_accuracy', 'Training Accuracy', 'Training Accuracy Comparison'),
        ('validation_accuracy', 'Validation Accuracy', 'Validation Accuracy Comparison'),
    ]

    # Extract every series up front so a missing metric fails before any
    # figure is shown.
    series = [
        (
            _metric_series(run_1gpu, key),
            _metric_series(run_rank0, key),
            _metric_series(run_rank1, key),
            ylabel,
            title,
        )
        for key, ylabel, title in metrics
    ]

    # Epochs for x-axis
    epochs = list(range(len(series[0][0])))

    for one_gpu, rank0, rank1, ylabel, title in series:
        _plot_comparison(epochs, one_gpu, rank0, rank1, ylabel, title)

if __name__ == "__main__":
    # Compare the 1-GPU and 2-GPU result files.
    # NOTE(review): these paths name LeNet / 10-epoch result files under /content —
    # confirm they match the files the training run actually produced.
    plot_training_metrics_seq2seq('/content/LeNet_10epochs_1GPU.json', '/content/LeNet_10epochs_2GPU.json')