In [1]:
%%writefile main.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import _LRScheduler
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision import models

from sklearn import decomposition
from sklearn import manifold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from tqdm.notebook import tqdm, trange
import matplotlib.pyplot as plt
import numpy as np

import copy
import random
import time
import os
import json
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP

# Reproducibility: feed the same seed to every RNG the script draws from
# (Python's `random`, NumPy, PyTorch CPU, PyTorch CUDA), in that order.
SEED = 1234
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)
del _seed_rng  # keep the module namespace free of the loop variable
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

def setup(rank, world_size, backend="nccl", master_addr="localhost", master_port="12355"):
    """Join the default ``torch.distributed`` process group.

    Args:
        rank: Index of the calling process within the job.
        world_size: Total number of processes taking part in the job.
        backend: Backend name handed to ``dist.init_process_group``.
            Defaults to ``"nccl"`` (the value previously hard-coded).
        master_addr: Rendezvous host exported as ``MASTER_ADDR``.
        master_port: Rendezvous port exported as ``MASTER_PORT``. Pass a
            different value when another job on the host already uses the
            default port.
    """
    # The rendezvous settings are published through the environment before
    # the process group is created.
    os.environ['MASTER_ADDR'] = str(master_addr)
    os.environ['MASTER_PORT'] = str(master_port)
    dist.init_process_group(backend, rank=rank, world_size=world_size)
def cleanup():
    """Destroy the default process group if one is currently active.

    Calling this when no group was initialised (or calling it twice) is a
    no-op instead of an error, so it is safe to use in shutdown paths.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

# ---------------------------------------------------------------------------
# Dataset preparation. Everything in this section runs at import time, so it
# is executed again by every process that imports this module (the recorded
# run prints the statistics once for the launcher and once per worker).
# ---------------------------------------------------------------------------

# Define root directory
# Used both as the CIFAR-10 download/cache location and as the default
# output directory of `main_train`.
ROOT = '.'

# Load CIFAR-10 dataset
# First load has no transform: it is only used to compute normalisation stats.
train_data = datasets.CIFAR10(root=ROOT, train=True, download=True)

# Calculate mean and std for normalization
# Reduce over the first three axes of `train_data.data`, leaving one value per
# entry of the last axis, then divide by 255.
# NOTE(review): presumably `.data` holds raw 0-255 pixels laid out as
# (N, H, W, C), so these are per-channel stats on a 0-1 scale - confirm.
means = train_data.data.mean(axis=(0, 1, 2)) / 255
stds = train_data.data.std(axis=(0, 1, 2)) / 255
print(f'Calculated means: {means}')
print(f'Calculated stds: {stds}')

# Define data transformations
# Training pipeline: light augmentation (rotation, horizontal flip, padded
# random crop) followed by tensor conversion and normalisation.
train_transforms = transforms.Compose([
    transforms.RandomRotation(5),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomCrop(32, padding=2),
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds)
])

# Evaluation pipeline: no augmentation, same normalisation as training.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=means, std=stds)
])

# Apply transformations to datasets
# `train_data` is rebound here; the untransformed copy above is discarded.
train_data = datasets.CIFAR10(ROOT, train=True, download=True, transform=train_transforms)
test_data = datasets.CIFAR10(ROOT, train=False, download=True, transform=test_transforms)

# Split training data into training and validation sets
# NOTE: despite its name, VALID_RATIO is the fraction KEPT FOR TRAINING
# (0.9 -> 90% train / 10% validation), as the two lines below show.
VALID_RATIO = 0.9
n_train_examples = int(len(train_data) * VALID_RATIO)
n_valid_examples = len(train_data) - n_train_examples
# No explicit generator is passed, so the split depends on the global torch
# RNG state at this point (seeded at the top of the file).
train_data, valid_data = data.random_split(train_data, [n_train_examples, n_valid_examples])

# Apply test transformations to validation data
# Deep-copy first so that swapping the transform on the validation split's
# underlying dataset does not also change the transform used by `train_data`.
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

# Define AlexNet model
class AlexNet(nn.Module):
    """AlexNet-style CNN: five 3x3 convolutions followed by a three-layer MLP.

    The classifier's first layer expects ``256 * 2 * 2`` flattened features,
    i.e. the convolutional stack must reduce its input to a 2x2 map (which is
    what a 3-channel 32x32 image produces).

    Args:
        output_dim: Number of logits produced by the final linear layer.
    """

    def __init__(self, output_dim):
        super().__init__()

        # Layer order (and therefore Sequential indices / state_dict keys)
        # is significant; pooling is applied before the ReLU where present.
        conv_stack = [
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=2, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=192, out_channels=384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU(inplace=True),
        ]
        self.features = nn.Sequential(*conv_stack)

        mlp_head = [
            nn.Dropout(p=0.5),
            nn.Linear(in_features=256 * 2 * 2, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=4096, out_features=output_dim),
        ]
        self.classifier = nn.Sequential(*mlp_head)

    def forward(self, x):
        """Return ``(logits, flattened_features)`` for a batch ``x``."""
        feature_maps = self.features(x)
        # Collapse everything except the batch dimension.
        flat = feature_maps.view(feature_maps.shape[0], -1)
        logits = self.classifier(flat)
        return logits, flat

def create_model(output_dim=10):
    """Build a fresh, untrained ``AlexNet``.

    Args:
        output_dim: Number of output classes. Defaults to 10, the value that
            used to be hard-coded, so existing zero-argument calls are
            unaffected.

    Returns:
        The newly constructed ``AlexNet`` instance (not moved to any device).
    """
    return AlexNet(output_dim)

# Initialize model parameters
def initialize_parameters(m):
    """Weight-initialisation hook meant to be used with ``Module.apply``.

    * ``nn.Conv2d``: Kaiming-normal weights (ReLU non-linearity).
    * ``nn.Linear``: Xavier-normal weights scaled by the ReLU gain.
    * Any other module type is left untouched.

    Biases are zeroed when present; layers created with ``bias=False`` are
    handled without error.

    Args:
        m: The module currently visited by ``apply``.
    """
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
    elif isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight.data, gain=nn.init.calculate_gain('relu'))
    else:
        return

    # `bias` is None for layers built with bias=False.
    if m.bias is not None:
        nn.init.constant_(m.bias.data, 0)

# Count trainable parameters
def count_parameters(model):
    """Return how many scalar values in ``model`` are trainable.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Define distributed data loader for GPU
def create_dataloader_gpu(rank, world_size):
    """Build the train/validation/test loaders for one distributed worker.

    The training loader draws from a ``DistributedSampler`` configured with
    ``world_size`` replicas and this worker's ``rank``. The validation and
    test loaders are ordinary sequential loaders over the full module-level
    ``valid_data`` / ``test_data`` sets.

    NOTE(review): ``drop_last=True`` is also set on the two evaluation
    loaders, so a final partial batch is not evaluated.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``
    """
    BATCH_SIZE = 256  # per-process batch size
    shard_sampler = DistributedSampler(train_data, num_replicas=world_size, rank=rank)
    eval_options = dict(batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
    loaders = (
        data.DataLoader(train_data, batch_size=BATCH_SIZE, sampler=shard_sampler, drop_last=True),
        data.DataLoader(valid_data, **eval_options),
        data.DataLoader(test_data, **eval_options),
    )
    return loaders

# Define data loader for CPU
def create_dataloader_cpu():
    """Build the train/validation/test loaders for single-process use.

    The training loader shuffles the module-level ``train_data``; the
    validation and test loaders iterate ``valid_data`` / ``test_data`` in
    order.

    NOTE(review): ``drop_last=True`` is also set on the two evaluation
    loaders, so a final partial batch is not evaluated.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``
    """
    BATCH_SIZE = 256
    common = dict(batch_size=BATCH_SIZE, drop_last=True)
    loaders = (
        data.DataLoader(train_data, shuffle=True, **common),
        data.DataLoader(valid_data, shuffle=False, **common),
        data.DataLoader(test_data, shuffle=False, **common),
    )
    return loaders


# Define training step for GPU
def train_gpu(model, iterator, optimizer, criterion, device, batch_size):
    """Run one training epoch, moving each batch to ``device`` first.

    Args:
        model: Module whose forward pass returns a ``(logits, features)`` pair.
        iterator: Sized iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable applied to ``(logits, targets)``.
        device: Target device for the batches (``torch.device``, device
            string, or integer ordinal).
        batch_size: Accepted for interface compatibility; not used.

    Returns:
        ``(mean batch loss, mean batch accuracy, samples per second)`` where
        the means are taken over ``len(iterator)`` batches.
    """
    # Progress is logged only by the worker on device ordinal 0. A
    # `torch.device` never compares equal to the int 0, so the ordinal has to
    # be extracted explicitly for the check to ever succeed.
    if isinstance(device, (str, torch.device)):
        device_ordinal = torch.device(device).index
    else:
        device_ordinal = device
    log_progress = device_ordinal == 0

    model.train()
    epoch_loss = 0
    epoch_acc = 0
    total_samples = 0
    num_batches = len(iterator)
    start_time = time.monotonic()

    for step, (x, y) in enumerate(iterator):
        x = x.to(device)
        y = y.to(device)
        total_samples += y.shape[0]

        optimizer.zero_grad()
        y_hat, _ = model(x)
        loss = criterion(y_hat, y)
        acc = calculate_accuracy(y_hat, y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        if log_progress and step % 50 == 0:
            print(f'train {step} over {num_batches}')

    elapsed = time.monotonic() - start_time
    samples_per_second = total_samples / elapsed
    return epoch_loss / num_batches, epoch_acc / num_batches, samples_per_second


# Define evaluation step for GPU
def evaluate_gpu(model, iterator, criterion, device, batch_size):
    """Score ``model`` on ``iterator`` without gradient tracking.

    Each batch is moved to ``device`` before the forward pass.

    Args:
        model: Module whose forward pass returns a ``(logits, features)`` pair.
        iterator: Sized iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(logits, targets)``.
        device: Device the batches are moved to.
        batch_size: Accepted for interface compatibility; not used.

    Returns:
        ``(mean batch loss, mean batch accuracy, samples per second)``.
        Both means are unweighted averages over ``len(iterator)`` batches.
    """
    model.eval()
    loss_sum, acc_sum, seen = 0, 0, 0
    began = time.monotonic()

    with torch.no_grad():
        for inputs, targets in iterator:
            inputs, targets = inputs.to(device), targets.to(device)
            seen += targets.shape[0]
            logits, _ = model(inputs)
            batch_loss = criterion(logits, targets)
            batch_acc = calculate_accuracy(logits, targets)
            loss_sum += batch_loss.item()
            acc_sum += batch_acc.item()

    elapsed = time.monotonic() - began
    throughput = seen / elapsed
    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches, throughput


# Define training step for CPU
def train_cpu(model, iterator, optimizer, criterion, batch_size):
    """Run one training epoch, using each batch on whatever device it is on.

    Args:
        model: Module whose forward pass returns a ``(logits, features)`` pair.
        iterator: Sized iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable applied to ``(logits, targets)``.
        batch_size: Accepted for interface compatibility; not used.

    Returns:
        ``(mean batch loss, mean batch accuracy, samples per second)``.
        Both means are unweighted averages over ``len(iterator)`` batches.
    """
    model.train()
    running_loss, running_acc, seen = 0, 0, 0
    began = time.monotonic()

    for inputs, targets in iterator:
        seen += targets.shape[0]
        optimizer.zero_grad()
        logits, _ = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_acc = calculate_accuracy(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    elapsed = time.monotonic() - began
    throughput = seen / elapsed
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches, throughput


# Define evaluation step for CPU
def evaluate_cpu(model, iterator, criterion, batch_size):
    """Score ``model`` on ``iterator`` without gradient tracking.

    Batches are used on whatever device they already live on.

    Args:
        model: Module whose forward pass returns a ``(logits, features)`` pair.
        iterator: Sized iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(logits, targets)``.
        batch_size: Accepted for interface compatibility; not used.

    Returns:
        ``(mean batch loss, mean batch accuracy, samples per second)``.
        Both means are unweighted averages over ``len(iterator)`` batches.
    """
    model.eval()
    loss_sum, acc_sum, seen = 0, 0, 0
    began = time.monotonic()

    with torch.no_grad():
        for inputs, targets in iterator:
            seen += targets.shape[0]
            logits, _ = model(inputs)
            batch_loss = criterion(logits, targets)
            batch_acc = calculate_accuracy(logits, targets)
            loss_sum += batch_loss.item()
            acc_sum += batch_acc.item()

    elapsed = time.monotonic() - began
    throughput = seen / elapsed
    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches, throughput


def calculate_accuracy(y_hat, y, topk=(1,)):
    """Top-k accuracy of ``y_hat`` against the integer labels ``y``.

    Accuracies are computed for every ``k`` in ``topk``, but only the one for
    the FIRST entry of ``topk`` is returned (as a 0-dim float32 tensor).

    Args:
        y_hat: 2-D score tensor; candidates are ranked along dimension 1.
        y: Label tensor with one entry per row of ``y_hat``.
        topk: Iterable of ``k`` values; the largest determines how many
            ranked predictions are extracted.
    """
    largest_k = max(topk)
    n_rows = y.size(0)

    # Indices of the `largest_k` best-scoring classes per row, transposed so
    # that row i of `ranked` holds every sample's i-th best guess.
    ranked = y_hat.topk(largest_k, dim=1, largest=True, sorted=True).indices.t()
    hits = ranked.eq(y.view(1, -1).expand_as(ranked))

    scores = [
        hits[:k].flatten().sum(dtype=torch.float32) / n_rows
        for k in topk
    ]
    return scores[0]

# Define epoch time calculator
def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole parts.

    Returns:
        ``(minutes, seconds)`` as ints, each truncated toward zero.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - whole_minutes * 60)
    return whole_minutes, leftover_seconds


  

# Main training function
def main_train(rank, world_size, root=ROOT, num_epochs=3):
    """Train, validate and test the model as one DDP worker bound to one GPU.

    The worker joins the process group, wraps the model in
    ``DistributedDataParallel`` on ``cuda:{rank}``, trains for ``num_epochs``
    epochs, reloads the checkpoint with the best validation loss, evaluates
    it on the test set, and appends its metrics to a shared JSON file.

    Coordination between workers:
      * only rank 0 writes the checkpoint; every rank waits on a barrier
        before reading it back;
      * ranks update the shared JSON file one at a time, in rank order;
      * the training sampler is re-seeded each epoch via ``set_epoch`` so
        each epoch uses a different shuffle.

    Args:
        rank: Index of this worker; also the CUDA device ordinal it uses.
        world_size: Total number of workers.
        root: Directory for ``tut4-model.pt`` and ``project3_2gpus.json``.
        num_epochs: Number of training epochs.
    """
    BATCH_SIZE = 256
    checkpoint_path = os.path.join(root, 'tut4-model.pt')
    results_file = os.path.join(root, 'project3_2gpus.json')

    setup(rank, world_size)
    print(f"Process {rank} initialized.")

    # Bind the process to its own GPU so collectives (including barriers)
    # run on the right device.
    device = torch.device(f'cuda:{rank}')
    torch.cuda.set_device(device)

    model = create_model().to(device)
    model.apply(initialize_parameters)
    ddp_model = DDP(model, device_ids=[rank])

    train_dataloader, val_dataloader, test_dataloader = create_dataloader_gpu(rank, world_size)
    print('dataloaded')
    train_sampler = getattr(train_dataloader, 'sampler', None)

    # Define loss and optimizer
    LR = 5e-4
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(ddp_model.parameters(), lr=LR)

    best_valid_loss = float('inf')

    # Per-epoch metric history
    training_times = []
    train_losses = []
    train_accuracies = []
    validation_times = []
    validation_losses = []
    validation_accuracies = []
    epoch_times = []
    train_throughputs = []
    validation_throughputs = []

    for epoch in range(num_epochs):
        print('epoch ', epoch)

        # Without this the distributed sampler yields the same ordering in
        # every epoch.
        if hasattr(train_sampler, 'set_epoch'):
            train_sampler.set_epoch(epoch)

        start_time = time.monotonic()
        train_loss, train_acc, train_throughput = train_gpu(
            ddp_model, train_dataloader, optimizer, criterion, device, BATCH_SIZE)
        train_end_time = time.monotonic()
        print('finish trained  ', epoch)
        valid_loss, valid_acc, valid_throughput = evaluate_gpu(
            ddp_model, val_dataloader, criterion, device, BATCH_SIZE)
        valid_end_time = time.monotonic()

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # A single writer avoids several processes truncating and
            # rewriting the same file at once.
            if rank == 0:
                torch.save(ddp_model.state_dict(), checkpoint_path)

        end_time = time.monotonic()
        epoch_time_seconds = end_time - start_time
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

        # Measured phase durations (previously each was approximated as half
        # of the epoch time).
        training_times.append(train_end_time - start_time)
        train_losses.append(train_loss)
        train_accuracies.append(train_acc*100)
        validation_times.append(valid_end_time - train_end_time)
        validation_losses.append(valid_loss)
        validation_accuracies.append(valid_acc*100)
        epoch_times.append(epoch_time_seconds)
        train_throughputs.append(train_throughput)
        validation_throughputs.append(valid_throughput)

    # Make sure rank 0 has finished writing before anyone reads the file.
    dist.barrier()

    # Test the best checkpoint; map_location keeps the loaded tensors on this
    # worker's own GPU instead of the device they were saved from.
    start_time = time.monotonic()
    ddp_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_acc, test_throughput = evaluate_gpu(
        ddp_model, test_dataloader, criterion, device, BATCH_SIZE)
    end_time = time.monotonic()
    test_time = end_time - start_time
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    cm = _test_confusion_matrix(ddp_model, test_dataloader, device)

    # Keys are kept exactly as downstream readers of the JSON expect them.
    # NOTE(review): the entry name says "vgg11" although the model built
    # above comes from create_model(); "accurcy" is also misspelled.
    results = {
        f"model_vgg11_epochs_{num_epochs}_{world_size}_{device.type}_rank_{rank}": {
            "world_size": world_size,
            "rank": rank,
            "device": device.type,
            "training_times": training_times,
            "train_losses": train_losses,
            "train_accurcy": train_accuracies,
            "validation_times": validation_times,
            "validation_losses": validation_losses,
            "validation_accurcy": validation_accuracies,
            "test_time": test_time,
            "test_loss": test_loss,
            "test_acc": test_acc*100,
            "epoch_times": epoch_times,
            "train_throughputs": train_throughputs,
            "validation_throughputs": validation_throughputs,
            "test_throughput": test_throughput,
            "confusion_matrix": cm.tolist()
        }
    }

    print(f'Process {rank} finished training on {device.type}.')

    _write_results_in_rank_order(results, results_file, rank, world_size)

    cleanup()


def _test_confusion_matrix(model, dataloader, device):
    """Return the confusion matrix of ``model``'s argmax predictions on ``dataloader``."""
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        for x, y in dataloader:
            y_hat, _ = model(x.to(device))
            all_preds.extend(torch.argmax(y_hat, dim=1).cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    return confusion_matrix(all_labels, all_preds)


def _write_results_in_rank_order(results, results_file, rank, world_size):
    """Merge ``results`` into the shared JSON file, one rank at a time.

    Each rank performs its read-modify-write while the others wait on a
    barrier, so no rank can overwrite another rank's entry.
    """
    for writer_rank in range(world_size):
        if rank == writer_rank:
            # Start from whatever is already stored; a missing or unreadable
            # file is treated as empty.
            try:
                with open(results_file, 'r') as f:
                    all_results = json.load(f)
            except (FileNotFoundError, json.JSONDecodeError):
                all_results = {}

            all_results.update(results)

            with open(results_file, 'w') as f:
                json.dump(all_results, f, indent=4)
            print(f"Results saved to {results_file}")
        dist.barrier()


# Main execution
if __name__ == "__main__":
    def main():
        """Launch one training worker per visible CUDA device.

        With several GPUs the workers are started through ``mp.spawn``; with
        exactly one GPU the single worker runs in this process. Without any
        GPU nothing is trained and a message says so.
        """
        world_size = torch.cuda.device_count()
        print(f'Total number of devices detected: {world_size}')
        # world_size -= 1  # uncomment to force a single-GPU run on a 2-GPU machine
        if world_size > 1:
            mp.spawn(main_train, args=(world_size,), nprocs=world_size, join=True)
        elif world_size == 1:
            main_train(rank=0, world_size=1)
        else:
            # The old message claimed a CPU run, but no CPU path is wired up
            # here, so report what actually happens.
            print('No GPUs found. Training requires at least one CUDA device; nothing was run.')

    main()

Writing main.py


In [2]:
!python main.py

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz
100%|███████████████████████| 170498071/170498071 [00:02<00:00, 58274587.76it/s]
Extracting ./cifar-10-python.tar.gz to .
Calculated means: [0.49139968 0.48215841 0.44653091]
Calculated stds: [0.24703223 0.24348513 0.26158784]
Files already downloaded and verified
Files already downloaded and verified
Number of training examples: 45000
Number of validation examples: 5000
Number of testing examples: 10000
Total number of devices detected: 2
Files already downloaded and verified
Files already downloaded and verified
Calculated means: [0.49139968 0.48215841 0.44653091]
Calculated stds: [0.24703223 0.24348513 0.26158784]
Calculated means: [0.49139968 0.48215841 0.44653091]
Calculated stds: [0.24703223 0.24348513 0.26158784]
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Number of training exa