# PyTorch: CUDA vs CPU

Step 1: Import the libraries and check whether CUDA is available

In [1]:
import sys

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import time

In [3]:
# Prefer the GPU when PyTorch reports CUDA support; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


Step 2: Prepare the dataset and define the architecture

In [4]:
# --- Training configuration ---
num_epochs = 3
batch_size = 64
learning_rate = 0.001

# --- Model dimensions ---
# Each image is later reshaped to (sequence_length, input_size) before it is
# fed to the RNN, i.e. one image row per time step.
sequence_length = 28
input_size = 28
hidden_size = 128
num_layers = 2
num_classes = 10

# --- Data pipeline ---
# Convert images to tensors, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# MNIST training split; downloaded into ./data/ if it is not already there.
train_dataset = torchvision.datasets.MNIST(
    root="./data/",
    train=True,
    transform=transform,
    download=True,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [5]:
class RNN(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear classifier on the last time step.

    Args:
        input_size: Number of features in each time step of the input.
        hidden_size: Number of features in the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Zero initial hidden state (a plain RNN has no cell state). It is
        # allocated directly on x's device and in x's dtype, so there is no
        # CPU allocation followed by a copy, and non-float32 inputs work.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            device=x.device,
            dtype=x.dtype,
        )

        # Run the recurrent stack; the final hidden state is not needed.
        out, _ = self.rnn(x, h0)

        # Classify from the top layer's output at the last time step.
        return self.fc(out[:, -1, :])

Step 3: Function to train the model

In [6]:
def train_model(device, num_epochs=3):
    """Train a freshly initialised RNN on ``train_loader`` and log progress.

    Reads the notebook-level hyperparameters (``input_size``, ``hidden_size``,
    ``num_layers``, ``num_classes``, ``learning_rate``, ``sequence_length``)
    and the notebook-level ``train_loader``. The trained model is not
    returned; the function exists to exercise a full training run on the
    given device.

    Args:
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
    """
    # Create a model and move it to the specified device
    model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Loop-invariant: the number of batches does not change during training.
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # Flatten the loader's image batch to (batch, seq, feature).
            images = images.reshape(-1, sequence_length, input_size).to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar loss once per batch; each .item() call copies
            # the value back to the host.
            batch_loss = loss.item()

            # Log progress on a single, continuously overwritten line
            sys.stdout.write(f'\rEpoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{num_batches}], Loss: {batch_loss:.4f}')
            sys.stdout.flush()

Step 4: Timing function


In [7]:
def time_training(device):
    """Return the wall-clock seconds one ``train_model`` run takes on ``device``.

    Uses ``time.perf_counter`` (a monotonic, high-resolution clock) rather
    than ``time.time``, which can jump if the system clock is adjusted.
    For CUDA devices the queue is synchronised before starting and before
    stopping the clock so that only work belonging to this run is measured
    and all of it has finished.

    Args:
        device: Device to train on, forwarded unchanged to ``train_model``.

    Returns:
        Elapsed time in seconds as a float.
    """
    target = torch.device(device)
    on_cuda = target.type == "cuda"

    if on_cuda:
        # Drain any previously queued GPU work so it is not billed to this run.
        torch.cuda.synchronize(target)
    start_time = time.perf_counter()

    train_model(device, num_epochs=num_epochs)

    if on_cuda:
        # CUDA kernels are launched asynchronously; wait for them to finish.
        torch.cuda.synchronize(target)
    return time.perf_counter() - start_time

## Comparison of `CPU` vs `CUDA`

Train on CPU

In [8]:
# Baseline: train on the CPU regardless of which hardware is available.
print("Training on CPU")
cpu_time = time_training(device=torch.device("cpu"))
print(f"\nCPU training time: {cpu_time:.4f} seconds")

Training on CPU
Epoch [3/3], Batch [938/938], Loss: 0.2415
CPU training time: 65.2470 seconds


Train on GPU (if available)

In [9]:
# Repeat the same training run on the GPU; skipped when CUDA is unavailable.
if not torch.cuda.is_available():
    print("\nGPU is not available.")
else:
    print("Training on GPU")
    gpu_time = time_training(device=torch.device("cuda"))
    print(f"\nGPU training time: {gpu_time:.4f} seconds")

Training on GPU
Epoch [3/3], Batch [938/938], Loss: 0.2803
GPU training time: 45.8349 seconds


Relative speed-up of the GPU run over the CPU run

In [10]:
# gpu_time is only assigned when the GPU run was executed, so guard on the
# same condition to keep this cell from raising NameError on CPU-only machines.
if torch.cuda.is_available():
    print(f"GPU is {((cpu_time / gpu_time)-1)*100:.2f}% faster than CPU.")
else:
    print("GPU is not available; no speed-up to report.")

GPU is 42.35% faster than CPU.
