# PyTorch: CUDA vs CPU

Step 1: Import the libraries and check whether CUDA is available

In [1]:
import sys

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import time

In [3]:
# Prefer the GPU when PyTorch reports a usable CUDA device; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


Step 2: Prepare the dataset and define the architecture

In [8]:
# --- Training hyperparameters ---
learning_rate = 0.001  # step size handed to the optimizer
batch_size = 64        # samples per mini-batch produced by the loader
num_epochs = 3         # full passes over the training set

# --- Preprocessing: convert each image to a tensor, then normalize its single
# channel with mean 0.5 and standard deviation 0.5 ---
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# --- MNIST training split, stored under ./data/ (download=True fetches it
# when it is not already there) ---
train_dataset = torchvision.datasets.MNIST(
    root="./data/",
    train=True,
    download=True,
    transform=transform,
)

# Shuffled mini-batch iterator over the training split.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [9]:
class CNN(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Architecture: two 3x3 convolutions (1 -> 16 -> 32 channels, padding 1 so
    the spatial size is preserved), each followed by ReLU and a 2x2 max pool
    that halves the spatial size (28 -> 14 -> 7), then two fully connected
    layers (32*7*7 -> 128 -> 10).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(32 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Compute unnormalized class scores.

        Args:
            x: Input tensor of shape (batch, 1, 28, 28).

        Returns:
            Tensor of shape (batch, 10) holding one raw score per class.

        Raises:
            RuntimeError: If the flattened feature size does not match what
                ``fc1`` expects (i.e. the input is not 28x28).
        """
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. The earlier
        # view(-1, 32 * 7 * 7) would silently fold surplus spatial elements
        # into the batch dimension for a wrong-sized input; keeping the batch
        # dimension fixed makes such an input fail loudly in fc1 instead.
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

Step 3: Function to train the model

In [10]:
def train_model(device, num_epochs=3):
    """Train a freshly constructed ``CNN`` on the notebook's ``train_loader``.

    The model is created inside this function, moved to ``device``, and
    optimized with Adam (learning rate taken from the notebook-level
    ``learning_rate``) against a cross-entropy loss. Progress is written to
    stdout as a single line that is overwritten after every batch.

    Args:
        device: Device on which the model and each batch are placed.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        The trained ``CNN`` instance. Callers that only need the side effect
        of training may ignore it.
    """
    # Create a model and move it to the specified device
    model = CNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # The loader length does not change during training, so look it up once.
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar loss exactly once per batch: .item() copies the
            # value to the host, which on a CUDA device presumably means
            # waiting for queued kernels to finish, so repeating it only adds
            # overhead to the run being timed.
            batch_loss = loss.item()

            # Log progress ('\r' rewrites the same line; no trailing newline)
            sys.stdout.write(f'\rEpoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{num_batches}], Loss: {batch_loss:.4f}')
            sys.stdout.flush()

    return model

Step 4: Timing function


In [11]:
def time_training(device):
    """Measure how long one ``train_model`` run takes on ``device``.

    The measured interval covers everything ``train_model`` does, including
    constructing the model and moving it to ``device``.

    Args:
        device: Device to train on (a ``torch.device`` or a device string).

    Returns:
        Elapsed wall-clock time in seconds as a float.
    """
    target = torch.device(device)
    on_cuda = target.type == "cuda"

    # CUDA work is queued asynchronously, so wait for the device to be idle
    # before starting and again before stopping the clock; otherwise the
    # measurement could include earlier work or miss work still in flight.
    if on_cuda:
        torch.cuda.synchronize(target)
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()
    train_model(device, num_epochs=num_epochs)
    if on_cuda:
        torch.cuda.synchronize(target)
    end_time = time.perf_counter()
    return end_time - start_time

## Comparison of `CPU` vs `CUDA`

Train on CPU

In [12]:
# Baseline: time a full training run with the model and data kept on the CPU.
print("Training on CPU")
cpu_time = time_training(device=torch.device("cpu"))
# The leading newline ends the in-place progress line left by the training loop.
print(f"\nCPU training time: {cpu_time:.4f} seconds")

Training on CPU
Epoch [3/3], Batch [938/938], Loss: 0.0453
CPU training time: 58.1443 seconds


Train on GPU (if available)

In [13]:
# Repeat the timed run on the GPU, but only when a CUDA device is present.
if not torch.cuda.is_available():
    print("\nGPU is not available.")
else:
    print("Training on GPU")
    gpu_time = time_training(device=torch.device("cuda"))
    # The leading newline ends the in-place progress line left by the training loop.
    print(f"\nGPU training time: {gpu_time:.4f} seconds")

Training on GPU
Epoch [3/3], Batch [938/938], Loss: 0.0265
GPU training time: 47.3114 seconds


Difference

In [14]:
# gpu_time is only assigned when the GPU run actually happened, so guard the
# comparison; without this the cell raises NameError on a CPU-only machine.
if torch.cuda.is_available():
    # Relative speed-up: how much larger the CPU time is than the GPU time.
    print(f"GPU is {((cpu_time / gpu_time)-1)*100:.2f}% faster than CPU.")
else:
    print("GPU is not available, so there is no GPU timing to compare against.")

GPU is 22.90% faster than CPU.
