## CB02-4 Part Six: GPU

### 01 PyTorch: GPU version

In [1]:
import torch

In [2]:
# Installed PyTorch build string; a '+cuXXX' suffix marks a CUDA-enabled build.
torch.__version__

'2.2.0+cu121'

In [3]:
# True when this PyTorch build can use a CUDA (NVIDIA) GPU on this machine.
torch.cuda.is_available()

True

In [4]:
# MPS is PyTorch's Apple-GPU (Metal) backend; this reports whether it is usable here.
torch.backends.mps.is_available()

False

In [5]:
# torch.tensor() places new tensors on the CPU unless a device is given.
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])
# `*` is element-wise multiplication; the result is displayed as the cell output.
tensor1 * tensor2

tensor([ 4, 10, 18])

In [6]:
# Tensor.to() returns the moved tensor, so the names are rebound to the GPU copies.
tensor1 = tensor1.to('cuda')
tensor2 = tensor2.to('cuda')
# Both operands now live on the GPU, and so does the displayed result.
tensor1 * tensor2

tensor([ 4, 10, 18], device='cuda:0')

In [7]:
# Transfer one tensor back to the CPU. A new name is used so that `tensor1`
# stays on the GPU and this cell gives the same result every time it is re-run.
tensor1_cpu = tensor1.to('cpu')

# Mixing a CPU tensor with a GPU tensor in one operation is an error. The
# exception is the point of this demonstration, so it is caught and printed
# instead of being left to abort a "Restart & Run All".
try:
    tensor1_cpu * tensor2
except RuntimeError as err:
    print(f'RuntimeError: {err}')

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Note: All tensors must be on the same device before they can be combined in an operation.

### 02 Single-GPU Training

In [5]:
import torch

class mlp(torch.nn.Module):
    """Small multilayer perceptron with two ReLU hidden layers of width 16.

    Args:
        num_inputs: Number of input features per sample.
        num_outputs: Number of output units (one logit per class).
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        hidden_units = 16
        self.layers = torch.nn.Sequential(
            # input -> 1st hidden layer
            torch.nn.Linear(num_inputs, hidden_units),
            torch.nn.ReLU(),
            # 1st hidden -> 2nd hidden layer
            torch.nn.Linear(hidden_units, hidden_units),
            torch.nn.ReLU(),
            # 2nd hidden -> output layer (raw logits, no softmax)
            torch.nn.Linear(hidden_units, num_outputs),
        )

    def forward(self, x):
        """Return the raw logits produced by passing ``x`` through the stack."""
        return self.layers(x)

# Toy two-class problem: each row is one 2-D sample, labels are class indices.
x_train = torch.tensor(
    [[-1.2, 3.1], [-0.9, 2.9], [-0.5, 2.6], [2.3, -1.1], [2.7, -1.5]]
)
y_train = torch.tensor([0, 0, 0, 1, 1])

# Held-out samples, one per class.
x_test = torch.tensor([[-0.8, 2.8], [2.6, -1.6]])
y_test = torch.tensor([0, 1])

from torch.utils.data import Dataset, DataLoader
class ToyDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        x: Features, indexable along the first dimension (one entry per sample).
        y: Labels with a ``shape`` attribute; ``y.shape[0]`` is the sample count.

    Raises:
        ValueError: If ``x`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, x, y):
        # __len__ is derived from the labels only, so a mismatch would otherwise
        # go unnoticed until a sample is silently dropped or indexing fails.
        if len(x) != len(y):
            raise ValueError(
                f'features and labels must have the same number of samples, '
                f'got {len(x)} and {len(y)}'
            )
        self.features = x
        self.labels = y

    def __len__(self):
        """Return the number of samples."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

# Wrap the raw tensors so they can be served by a DataLoader.
train_dataset = ToyDataset(x=x_train, y=y_train)
test_dataset = ToyDataset(x=x_test, y=y_test)

from torch.utils.data import DataLoader

# Seed the global RNG before the loaders are built.
torch.manual_seed(0)

# Training batches: shuffled, loaded in the main process (num_workers=0).
# drop_last=True discards the final incomplete batch, so with 5 samples and
# batch_size=2 each epoch yields two full batches.
train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, num_workers=0, drop_last=True
)

# Test batches: fixed order, nothing dropped.
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, num_workers=0)

In [6]:
torch.manual_seed(0)
model = mlp(2, 2)

# 1. Define the target device. Fall back to the CPU when no CUDA device is
#    available so the cell still runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Transfer the model onto that device. nn.Module.to() moves the parameters
#    of the module itself, so the result does not need to be re-assigned.
model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=1)

num_epochs = 3

for epoch in range(num_epochs):

    model.train()  # set the model to training mode: redundant for this example

    for idx, (features, labels) in enumerate(train_loader):
        # 3. Transfer the batch onto the same device as the model; mixing
        #    devices in one operation raises a RuntimeError.
        features = features.to(device)
        labels = labels.to(device)

        # clear the gradients for every batch
        optimizer.zero_grad()

        # forward pass
        logits = model(features)

        # compute the loss
        loss = torch.nn.functional.cross_entropy(logits, labels)

        # backward pass
        loss.backward()

        # update weights & biases through SGD
        optimizer.step()

        # .item() extracts the Python float from the 0-dim loss tensor
        print(f'Epoch: {epoch}, Batch: {idx}, Loss: {loss.item():.2f}')

    model.eval()  # set the model to evaluation mode: redundant for this example

Epoch: 0, Batch: 0, Loss: 0.80
Epoch: 0, Batch: 1, Loss: 0.82
Epoch: 1, Batch: 0, Loss: 0.30
Epoch: 1, Batch: 1, Loss: 0.22
Epoch: 2, Batch: 0, Loss: 0.00
Epoch: 2, Batch: 1, Loss: 0.00


### 03 Multi-GPU Training with accelerate

Run `accelerate config` once in a terminal to create the default configuration under `~/.cache/huggingface/accelerate`.

In [7]:
import os

# NOTE: CUDA_VISIBLE_DEVICES is only read when CUDA is first initialized in
# the process. If an earlier cell in this kernel has already used the GPU, the
# assignment below has no effect; restart the kernel and run this cell before
# any other CUDA call for the restriction to apply.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'

from accelerate import Accelerator

torch.manual_seed(0)
model = mlp(2, 2)

# 1. Initialize the accelerator.
# NOTE(review): run directly inside a notebook kernel this is a single
# process, so training uses only `accelerator.device`. To really spread the
# work over several GPUs the training code has to be started once per device,
# e.g. with `accelerate launch` or `accelerate.notebook_launcher` - confirm
# how this notebook is meant to be launched.
accelerator = Accelerator()

device = accelerator.device

model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=1)

num_epochs = 3

# 2. Prepare using the accelerator. The prepared loader is bound to a new name
#    so the original `train_loader` is left untouched and this cell stays
#    re-runnable without wrapping an already-wrapped loader.
model, optimizer, accel_train_loader = accelerator.prepare(model, optimizer, train_loader)

for epoch in range(num_epochs):

    model.train()  # set the model to training mode: redundant for this example

    for idx, (features, labels) in enumerate(accel_train_loader):
        # No manual .to(device) here: with the default device placement the
        # prepared loader already yields batches on `accelerator.device`.

        # clear the gradients for every batch
        optimizer.zero_grad()

        # forward pass
        logits = model(features)

        # compute the loss
        loss = torch.nn.functional.cross_entropy(logits, labels)

        # 3. Backward pass through the accelerator (replaces loss.backward())
        accelerator.backward(loss)

        # update weights & biases through SGD
        optimizer.step()

        # .item() extracts the Python float from the 0-dim loss tensor
        print(f'Epoch: {epoch}, Batch: {idx}, Loss: {loss.item():.2f}')

    model.eval()  # set the model to evaluation mode: redundant for this example

Epoch: 0, Batch: 0, Loss: 0.80
Epoch: 0, Batch: 1, Loss: 0.82
Epoch: 1, Batch: 0, Loss: 0.30
Epoch: 1, Batch: 1, Loss: 0.22
Epoch: 2, Batch: 0, Loss: 0.00
Epoch: 2, Batch: 1, Loss: 0.00


  torch.utils._pytree._register_pytree_node(


In [8]:
# Indices of the CUDA devices reported by torch.cuda.device_count().
num_gpus = torch.cuda.device_count()
active_gpus = [gpu_index for gpu_index in range(num_gpus)]
active_gpus

[0, 1, 2]