In [1]:
import cudf
import cupy
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import os

In [2]:
torch.cuda.empty_cache()

!nvidia-smi

Thu Nov  7 12:38:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-32GB           Off | 00000000:3B:00.0 Off |                    0 |
| N/A   29C    P0              35W / 250W |      4MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           Off | 00000000:D8:00.0 Off |  

In [3]:
INPUT_IMAGE_CHANNELS = 3
INPUT_IMAGE_DIM = 32

CONV_LAYER1_OUTPUT_CHANNELS = 256
CONV_LAYER1_KERNEL_SIZE = 5
CONV_LAYER1_STRIDE = 1
CONV_LAYER1_PADDING = 0

CONV_MAX_POOL_1_KERNEL_SIZE = 2
CONV_MAX_POOL_1_PADDING_SIZE = 0
CONV_MAX_POOL_1_STRIDE_SIZE = 2

CONV_LAYER2_OUTPUT_CHANNELS = 128
CONV_LAYER2_KERNEL_SIZE = 5
CONV_LAYER2_STRIDE = 1
CONV_LAYER2_PADDING = 2

CONV_MAX_POOL_2_KERNEL_SIZE = 2
CONV_MAX_POOL_2_PADDING_SIZE = 0
CONV_MAX_POOL_2_STRIDE_SIZE = 2

CONV_LAYER3_OUTPUT_CHANNELS = 256
CONV_LAYER3_KERNEL_SIZE = 5
CONV_LAYER3_STRIDE = 1
CONV_LAYER3_PADDING = 2

CONV_MAX_POOL_3_KERNEL_SIZE = 2
CONV_MAX_POOL_3_PADDING_SIZE = 0
CONV_MAX_POOL_3_STRIDE_SIZE = 1

DROP_OUT_RATE = 0.25

LINEAR_LAYER_1_OUTPUT_SIZE = 16834
LINEAR_LAYER_2_OUTPUT_SIZE = 512

# NUMBER OF CLASSES
LINEAR_LAYER_3_OUTPUT_SIZE = 10 # 10 for the amount of classes that are in the dataset

In [4]:
class serialized_CNN_Model(nn.Module):
    def __init__(self, num_gpus):
        super(serialized_CNN_Model, self).__init__()

        # CONV2D LAYER1 AND CHANGE IN IMAGE DIMENSIONS
        self.conv_layer1 = nn.Conv2d(INPUT_IMAGE_CHANNELS, CONV_LAYER1_OUTPUT_CHANNELS, CONV_LAYER1_KERNEL_SIZE, CONV_LAYER1_STRIDE, CONV_LAYER1_PADDING)
        self.image_dimension = (INPUT_IMAGE_DIM - ((CONV_LAYER1_KERNEL_SIZE) - (2 * CONV_LAYER1_PADDING)))//CONV_LAYER1_STRIDE + 1
        self.image_channel_size = CONV_LAYER1_OUTPUT_CHANNELS
        print(self.image_dimension, self.image_channel_size)


        # MAX POOLING LAYER 1, Change in image dimensions
        self.maxPooling1 = nn.MaxPool2d(CONV_MAX_POOL_1_KERNEL_SIZE, CONV_MAX_POOL_1_STRIDE_SIZE, CONV_MAX_POOL_1_PADDING_SIZE)
        self.image_dimension = (self.image_dimension - ((CONV_MAX_POOL_1_KERNEL_SIZE) - (2 * CONV_MAX_POOL_1_PADDING_SIZE)))//CONV_MAX_POOL_1_STRIDE_SIZE + 1
        print(self.image_dimension, self.image_channel_size)

        
        # CONV2D LAYER2 AND CHANGE IN IMAGE DIMENSIONS
        self.conv_layer2 = nn.Conv2d(CONV_LAYER1_OUTPUT_CHANNELS, CONV_LAYER2_OUTPUT_CHANNELS, CONV_LAYER2_KERNEL_SIZE, CONV_LAYER2_STRIDE, CONV_LAYER2_PADDING)
        self.image_dimension = (self.image_dimension - ((CONV_LAYER2_KERNEL_SIZE) - (2 * CONV_LAYER2_PADDING)))//CONV_LAYER2_STRIDE + 1
        self.image_channel_size = CONV_LAYER2_OUTPUT_CHANNELS
        print(self.image_dimension, self.image_channel_size)


        # MAX POOLING LAYER 2 AND CHANGE IN IMAGE DIMENSIONS
        self.maxPooling2 = nn.MaxPool2d(CONV_MAX_POOL_2_KERNEL_SIZE, CONV_MAX_POOL_2_STRIDE_SIZE, CONV_MAX_POOL_2_PADDING_SIZE)
        self.image_dimension = (self.image_dimension - ((CONV_MAX_POOL_2_KERNEL_SIZE) - (2 * CONV_MAX_POOL_2_PADDING_SIZE)))//CONV_MAX_POOL_2_STRIDE_SIZE + 1
        print(self.image_dimension, self.image_channel_size)

        
        # CONV2D LAYER 3 AND CHANGE IN IMAGE DIMENSIONS
        self.conv_layer3 = nn.Conv2d(CONV_LAYER2_OUTPUT_CHANNELS, CONV_LAYER3_OUTPUT_CHANNELS, CONV_LAYER3_KERNEL_SIZE, CONV_LAYER3_STRIDE, CONV_LAYER3_PADDING)
        self.image_dimension = (self.image_dimension - ((CONV_LAYER3_KERNEL_SIZE) - (2 * CONV_LAYER3_PADDING)))//CONV_LAYER3_STRIDE + 1
        self.image_channel_size = CONV_LAYER3_OUTPUT_CHANNELS
        print(self.image_dimension, self.image_channel_size)
        

        # MAX POOLING LAYER 3 AND CHANGE IN IIMAGE DIMENSIONS
        self.maxPooling3 = nn.MaxPool2d(CONV_MAX_POOL_3_KERNEL_SIZE, CONV_MAX_POOL_3_STRIDE_SIZE, CONV_MAX_POOL_3_PADDING_SIZE)
        self.image_dimension = (self.image_dimension - ((CONV_MAX_POOL_3_KERNEL_SIZE) - (2 * CONV_MAX_POOL_3_PADDING_SIZE)))//CONV_MAX_POOL_3_STRIDE_SIZE + 1
        print(self.image_dimension, self.image_channel_size)


        # Since we flatten the image after the CONV2D Layers, we need to calculate the size of the feature
        # Vector going into the nn.Linear layer
        self.fc1_input_size = self.image_dimension * self.image_dimension * self.image_channel_size
        
        # Fully connected Layers
        self.fc1 = nn.Linear(self.fc1_input_size, LINEAR_LAYER_1_OUTPUT_SIZE)
        print(self.fc1_input_size, LINEAR_LAYER_1_OUTPUT_SIZE)
        
        self.fc2 = nn.Linear(LINEAR_LAYER_1_OUTPUT_SIZE, LINEAR_LAYER_2_OUTPUT_SIZE)
        print(LINEAR_LAYER_1_OUTPUT_SIZE, LINEAR_LAYER_2_OUTPUT_SIZE)

        
        self.fc3 = nn.Linear(LINEAR_LAYER_2_OUTPUT_SIZE, LINEAR_LAYER_3_OUTPUT_SIZE)
        print(LINEAR_LAYER_2_OUTPUT_SIZE, LINEAR_LAYER_3_OUTPUT_SIZE)


        self.layers = [self.conv_layer1, self.maxPooling1, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.conv_layer2, self.maxPooling2, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.conv_layer3, self.maxPooling3, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       torch.nn.Flatten(),
                       self.fc1, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.fc2, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.fc3,
                      ]
        
        self.model_layers = [torch.nn.Sequential(*self.layers)]

        self.model_layers[0].to(f'cuda:{0}')

    def forward(self, x):

        for i in range(len(self.model_layers)):
            x = x.to(f'cuda:{i}')
            x = self.model_layers[i](x)

        return x.to('cpu')


In [5]:
class Pipeline_parallel_CNN_Model(nn.Module):
    def __init__(self, num_gpus):
        super(Pipeline_parallel_CNN_Model, self).__init__()

        self.num_gpus = num_gpus
        
        # CONV2D LAYER1 AND CHANGE IN IMAGE DIMENSIONS
        self.conv_layer1 = nn.Conv2d(INPUT_IMAGE_CHANNELS, CONV_LAYER1_OUTPUT_CHANNELS, CONV_LAYER1_KERNEL_SIZE, CONV_LAYER1_STRIDE, CONV_LAYER1_PADDING)
        self.image_dimension = (INPUT_IMAGE_DIM - ((CONV_LAYER1_KERNEL_SIZE) - (2 * CONV_LAYER1_PADDING)))//CONV_LAYER1_STRIDE + 1
        self.image_channel_size = CONV_LAYER1_OUTPUT_CHANNELS
        print(self.image_dimension, self.image_channel_size)


        # MAX POOLING LAYER 1, Change in image dimensions
        self.maxPooling1 = nn.MaxPool2d(CONV_MAX_POOL_1_KERNEL_SIZE, CONV_MAX_POOL_1_STRIDE_SIZE, CONV_MAX_POOL_1_PADDING_SIZE)
        self.image_dimension = (self.image_dimension - ((CONV_MAX_POOL_1_KERNEL_SIZE) - (2 * CONV_MAX_POOL_1_PADDING_SIZE)))//CONV_MAX_POOL_1_STRIDE_SIZE + 1
        print(self.image_dimension, self.image_channel_size)

        
        # CONV2D LAYER2 AND CHANGE IN IMAGE DIMENSIONS
        self.conv_layer2 = nn.Conv2d(CONV_LAYER1_OUTPUT_CHANNELS, CONV_LAYER2_OUTPUT_CHANNELS, CONV_LAYER2_KERNEL_SIZE, CONV_LAYER2_STRIDE, CONV_LAYER2_PADDING)
        self.image_dimension = (self.image_dimension - ((CONV_LAYER2_KERNEL_SIZE) - (2 * CONV_LAYER2_PADDING)))//CONV_LAYER2_STRIDE + 1
        self.image_channel_size = CONV_LAYER2_OUTPUT_CHANNELS
        print(self.image_dimension, self.image_channel_size)


        # MAX POOLING LAYER 2 AND CHANGE IN IMAGE DIMENSIONS
        self.maxPooling2 = nn.MaxPool2d(CONV_MAX_POOL_2_KERNEL_SIZE, CONV_MAX_POOL_2_STRIDE_SIZE, CONV_MAX_POOL_2_PADDING_SIZE)
        self.image_dimension = (self.image_dimension - ((CONV_MAX_POOL_2_KERNEL_SIZE) - (2 * CONV_MAX_POOL_2_PADDING_SIZE)))//CONV_MAX_POOL_2_STRIDE_SIZE + 1
        print(self.image_dimension, self.image_channel_size)

        
        # CONV2D LAYER 3 AND CHANGE IN IMAGE DIMENSIONS
        self.conv_layer3 = nn.Conv2d(CONV_LAYER2_OUTPUT_CHANNELS, CONV_LAYER3_OUTPUT_CHANNELS, CONV_LAYER3_KERNEL_SIZE, CONV_LAYER3_STRIDE, CONV_LAYER3_PADDING)
        self.image_dimension = (self.image_dimension - ((CONV_LAYER3_KERNEL_SIZE) - (2 * CONV_LAYER3_PADDING)))//CONV_LAYER3_STRIDE + 1
        self.image_channel_size = CONV_LAYER3_OUTPUT_CHANNELS
        print(self.image_dimension, self.image_channel_size)
        

        # MAX POOLING LAYER 3 AND CHANGE IN IIMAGE DIMENSIONS
        self.maxPooling3 = nn.MaxPool2d(CONV_MAX_POOL_3_KERNEL_SIZE, CONV_MAX_POOL_3_STRIDE_SIZE, CONV_MAX_POOL_3_PADDING_SIZE)
        self.image_dimension = (self.image_dimension - ((CONV_MAX_POOL_3_KERNEL_SIZE) - (2 * CONV_MAX_POOL_3_PADDING_SIZE)))//CONV_MAX_POOL_3_STRIDE_SIZE + 1
        print(self.image_dimension, self.image_channel_size)
        

        # Since we flatten the image after the CONV2D Layers, we need to calculate the size of the feature
        # Vector going into the nn.Linear layer
        self.fc1_input_size = self.image_dimension * self.image_dimension * self.image_channel_size
        
        # Fully connected Layers
        self.fc1 = nn.Linear(self.fc1_input_size, LINEAR_LAYER_1_OUTPUT_SIZE)
        print(self.fc1_input_size, LINEAR_LAYER_1_OUTPUT_SIZE)
        
        
        self.fc2 = nn.Linear(LINEAR_LAYER_1_OUTPUT_SIZE, LINEAR_LAYER_2_OUTPUT_SIZE)
        print(LINEAR_LAYER_1_OUTPUT_SIZE, LINEAR_LAYER_2_OUTPUT_SIZE)

        
        self.fc3 = nn.Linear(LINEAR_LAYER_2_OUTPUT_SIZE, LINEAR_LAYER_3_OUTPUT_SIZE)
        print(LINEAR_LAYER_2_OUTPUT_SIZE, LINEAR_LAYER_3_OUTPUT_SIZE)


        self.layers = [self.conv_layer1, self.maxPooling1, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.conv_layer2, self.maxPooling2, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.conv_layer3, self.maxPooling3, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       torch.nn.Flatten(),
                       self.fc1, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.fc2, nn.Dropout(DROP_OUT_RATE), nn.ReLU(),
                       self.fc3,
                      ]

        layer_len = len(self.layers)
        split_stride = layer_len//self.num_gpus + 1
        cuda_ind = 0
        self.model_layers = []
        
        for i in range(0, layer_len, split_stride):
            print(f'cuda:{cuda_ind}')
            self.model_layers.append(nn.Sequential(*self.layers[i : i + split_stride]).to(f'cuda:{cuda_ind}'))
            cuda_ind += 1


    def forward(self, x):

        for i in range(self.num_gpus):
            x = x.to(f'cuda:{i}')
            x = self.model_layers[i](x)

        return x.to('cpu')
        

In [6]:
# Dataset Constants
DATASET_SIZE = 50000

# Training Hyperparams
BATCH_SIZE = 10
EPOCHS = 5

# Optimizer Hyperparams
LEARNING_RATE = 10e-3
SGD_MOMENTUM = 0.9 # How much of past velocity to maintain in gradient update

# LR Scheduler Hyperparams
GAMMA = 0.9 # Multiplies previous LR by 0.1
STEP_SIZE = 50000//BATCH_SIZE # Amount of steps before LR is decreased



# if __name__ == "__main__":

device = 'cuda' if torch.cuda.is_available() else 'cpu'
world_size = torch.cuda.device_count()

# In case we have zero cuda GPUs, no way to train the following
if world_size < 1:
    print("No Cuda Devices")
    exit()

dataset_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Download dataset if it is not present on the system
DOWNLOAD_DATASET = False
if 'testing' not in os.listdir():
    DOWNLOAD_DATASET = True
    os.mkdir('testing')
if 'dataset' not in os.listdir('testing'):
    DOWNLOAD_DATASET = True
    os.mkdir('testing/dataset')
    
if 'training' not in os.listdir():
    DOWNLOAD_DATASET = True
    os.mkdir('training')
if 'dataset' not in os.listdir('training'):
    DOWNLOAD_DATASET = True
    os.mkdir('training/dataset')

training_data = datasets.CIFAR10(root="training/dataset/", train=True, download=DOWNLOAD_DATASET, transform=dataset_transforms)
testing_data = datasets.CIFAR10(root="training/dataset/", train=False, download=DOWNLOAD_DATASET, transform=dataset_transforms)
datal = DataLoader(training_data, batch_size=BATCH_SIZE)

TRAIN_SERIALIZED = False
TRAIN_PIPELINE_PARALLEL = True

In [7]:
if TRAIN_SERIALIZED:
    model = serialized_CNN_Model(2)
    model.to(device)
    
    # Optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=SGD_MOMENTUM, nesterov=True)
    
    # LR Scheduler
    learning_rate_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
    
    # Loss func
    criterion = nn.CrossEntropyLoss()
    
    import tqdm
    for epoch in range(EPOCHS):
        model.train()
        for X, y in tqdm.tqdm(datal):
            
            optimizer.zero_grad()
            
            prediction = model(X)
            loss = criterion(prediction, y)
            loss.backward()
            optimizer.step()
            
            learning_rate_scheduler.step()
    
        model.eval()
        test_data_loader = DataLoader(testing_data, batch_size=BATCH_SIZE)
        total_loss = 0
        for X, y in test_data_loader:
            
            prediction = model(X)
            loss = criterion(prediction, y)
            total_loss += loss
    
        print(f'epoch {epoch} total loss: {total_loss}')

    del model
    del optimizer
    del learning_rate_scheduler
    del criterion

    torch.cuda.empty_cache()

In [8]:
if TRAIN_PIPELINE_PARALLEL:
    model = Pipeline_parallel_CNN_Model(2)
    
    # Optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=SGD_MOMENTUM, nesterov=True)
    
    # LR Scheduler
    learning_rate_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
    
    # Loss func
    criterion = nn.CrossEntropyLoss()
    
    import tqdm
    for epoch in range(EPOCHS):
        model.train()
        for X, y in tqdm.tqdm(datal):
            
            optimizer.zero_grad()
            
            prediction = model(X)
            loss = criterion(prediction, y)
            loss.backward()
            optimizer.step()
            
            learning_rate_scheduler.step()
    
        model.eval()
        test_data_loader = DataLoader(testing_data, batch_size=BATCH_SIZE)
        total_loss = 0
        for X, y in test_data_loader:
            
            prediction = model(X)
            loss = criterion(prediction, y)
            total_loss += loss
    
        print(f'epoch {epoch} total loss: {total_loss}')

    del model
    del optimizer
    del learning_rate_scheduler
    del criterion

    torch.cuda.empty_cache()

28 256
14 256
14 128
7 128
7 256
6 256
9216 16834
16834 512
512 10
cuda:0
cuda:1


100%|██████████| 5000/5000 [01:01<00:00, 80.95it/s]


epoch 0 total loss: 1585.2926025390625


100%|██████████| 5000/5000 [01:01<00:00, 81.24it/s]


epoch 1 total loss: 1558.2706298828125


100%|██████████| 5000/5000 [01:01<00:00, 81.29it/s]


epoch 2 total loss: 1360.1629638671875


100%|██████████| 5000/5000 [01:01<00:00, 81.29it/s]


epoch 3 total loss: 1312.5411376953125


100%|██████████| 5000/5000 [01:01<00:00, 81.30it/s]


epoch 4 total loss: 1298.23974609375


In [9]:
!nvidia-smi

Thu Nov  7 12:44:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-32GB           Off | 00000000:3B:00.0 Off |                    0 |
| N/A   31C    P0              36W / 250W |  18046MiB / 32768MiB |     14%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           Off | 00000000:D8:00.0 Off |  