# PyTorch 2.0

<img src="pytorch2 - 1.png">

In [2]:
import torch
torch.__version__

'2.0.0'

## Quick code examples

### Before PyTorch 2.0

In [3]:
import torch
import torchvision

model = torchvision.models.resnet50()

### After PyTorch 2.0

In [4]:
import torch 
import torchvision

model = torchvision.models.resnet50()   
compiled_model = torch.compile(model)

RuntimeError: Windows not yet supported for torch.compile

## 1. Getting Started

Why get GPU info?

PyTorch 2.0 features work best on newer NVIDIA GPUs.

To find our if your GPU is compatible, see NVIDIA GPU compatibility score -  https://developer.nvidia.com/cuda-gpus

If your GPU has score of 8.0+ , it can leverage most or all of new PyTorch 2.0 features.

In [7]:
# Make sure we're using a NVIDIA GPU
if torch.cuda.is_available():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

  # Get GPU name
  gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv
  gpu_name = gpu_name[1]
  GPU_NAME = gpu_name.replace(" ", "_") # remove underscores for easier saving
  print(f'GPU name: {GPU_NAME}')

  # Get GPU capability score
  GPU_SCORE = torch.cuda.get_device_capability()
  print(f"GPU capability score: {GPU_SCORE}")
  if GPU_SCORE >= (8, 0):
    print(f"GPU score higher than or equal to (8, 0), PyTorch 2.x speedup features available.")
  else:
    print(f"GPU score lower than (8, 0), PyTorch 2.x speedup features will be limited (PyTorch 2.x speedups happen most on newer GPUs).")
  
  # Print GPU info
  print(f"GPU information:\n{gpu_info}")

else:
  print("PyTorch couldn't find a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

GPU name: NVIDIA_GeForce_RTX_4090
GPU capability score: (8, 9)
GPU score higher than or equal to (8, 0), PyTorch 2.x speedup features available.
GPU information:
Wed May 31 11:17:05 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090      WDDM  | 00000000:01:00.0  On |                  Off |
|  0%   42C    P8              20W / 450W |    647MiB / 24564MiB |      1%      Default |
|                                         |                      |                  N/

### 1.1 Globally set devices

Previously we've set device of our tensor/model using `.to(device)`

But in PyTorch 2.0, it is possible to set the device with context manager as well as a global device.

In [8]:
import torch

# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set the device with context manager (PyTorch 2.x+)
with torch.device("cuda"):
    # All tensors or PyTorch objects created in the context manager will be on target device
    layer = torch.nn.Linear(20,30)
    print(f"Layer weights are on devie: {layer.weight.device}")
    print(f"Layer creating data on device: {layer(torch.randn(120,20)).device}")

Layer weights are on devie: cuda:0
Layer creating data on device: cuda:0


In [9]:
import torch

# Set the device globally
torch.set_default_device("cuda")

layer = torch.nn.Linear(20,30)
print(f"Layer weights are on devie: {layer.weight.device}")
print(f"Layer creating data on device: {layer(torch.randn(120,20)).device}")

Layer weights are on devie: cuda:0
Layer creating data on device: cuda:0


## 2. Setting up the experiments

Time to test speed!

TO keep things simple, we will run 4 experiments.

* Model: ResNet 50
* Data: Cifar10
* Epochs: 5(single run) and 3x5(multi run)
* Batch Size: 128
* Image: 224

In [10]:
import torch
import torchvision

print(f"PyTorch version: {torch.__version__}")
print(f"TorchVision version: {torchvision.__version__}")

# Set the target device
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

PyTorch version: 2.0.0
TorchVision version: 0.15.0
Using device: cuda


<img src="pytorch2 - 2.png">

### 2.1 Create model and transforms

In [12]:
# Create model weights and transforms
model_weights = torchvision.models.ResNet50_Weights.DEFAULT
tranforms = model_weights.transforms()

tranforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [13]:
# Create model
model = torchvision.models.resnet50(weights=model_weights)
model

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to C:\Users\HIMANSHU GOSWAMI/.cache\torch\hub\checkpoints\resnet50-11ad3fa6.pth
100%|█████████████████████████████████████████████████████████████████████████████| 97.8M/97.8M [00:05<00:00, 19.2MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [14]:
# Count the number of parameters in the model

total_params = sum(torch.numel(param) for param in model.parameters())
total_params

25557032

**Note:** PyTorch 2.0 *relative* speedups will be most noticable when as much of the GPU as possble is being used.
    This means a larger model may take longer to train on the whole but will relatively faster.
    
    Eg. a model with 1M parameters may take ~10 minutes to train but model with 25M parameters may take ~20 minutes to train.

In [16]:
def create_model(num_classes=10):
    """
    Create  a resnet50 model with transform and returns them both.
    """
    model_weights = torchvision.models.ResNet50_Weights.DEFAULT
    transforms = model_weights.transforms()
    model = torchvision.models.resnet50(weights=model_weights)
    
    # Adjust the head layer
    model.fc = torch.nn.Linear(in_features=2048,
                              out_features=num_classes)
    
    return model, transforms

model, transforms = create_model()

transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

### 2.2 Speedups are most noticable when a large portion of GPU is being used

Since modern GPUs are so *fast* at performing operations, you will often notice the majority of *relative* speedups when as much data as possible is on the GPU.

In practice, you generally want to use as much of your GPU memory as possible.

* Increasing the batch size.
* Increasing data size - 224,224  instead of 32,32
* Increase the model size - for example instead of using model with 1M parameters, use a model with 10M parameters.
* Decrease data transfer - since bandwidth costs (CPU <=> GPU) will slow down a GPU

As a result of doing the above, your relative speedups should be better.

Eg. overall training time may take longer but not linearly.

Resource for improving model speed = https://sebastianraschka.com/blog/2023/pytorch-faster.html

**Note:** This concept of using as much data on GPU as possible isn't restricted specifically to PyTorch 2.0.

### 2.3 Checking the memory limits of our GPU

In [17]:
# Check available GPU memory and total GPU memory
total_free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
print(f"Total free GPU Memory: {round(total_free_gpu_memory * 1e-9, 3)} GB")
print(f"Total GPU Memory: {round(total_gpu_memory * 1e-9, 3)} GB")

Total free GPU Memory: 22.926 GB
Total GPU Memory: 25.757 GB


* If the GPU has 16GB+ of free memory, set batch size to 128

In [18]:
# Set batch size depending on amount of GPU memory
total_free_gpu_memory_gb = round(total_free_gpu_memory * 1e-9, 3)
if total_free_gpu_memory_gb >= 16:
    BATCH_SIZE = 128 # Note: you could experiment with higher values here if you like.
    IMAGE_SIZE = 224
    print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batch size of {BATCH_SIZE} and image size {IMAGE_SIZE}")
else:
    BATCH_SIZE = 32
    IMAGE_SIZE = 128
    print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batch size of {BATCH_SIZE} and image size {IMAGE_SIZE}")

GPU memory available is 22.926 GB, using batch size of 128 and image size 224


In [20]:
transforms.crop_size = IMAGE_SIZE
transforms.resize_size = IMAGE_SIZE 
print(f"Updated data transforms:\n{transforms}")

Updated data transforms:
ImageClassification(
    crop_size=224
    resize_size=224
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)


### 2.4 More potential speedups with TF32

TF32 = TensorFloat32

TensorFloat32 = a datatype that bridges Float32 and Float16

What we want in computing:
1. Fast model training
2. Accurate model training

TensorFloat32 = a datatype from NVIDIA which combines float32 and float16

In [21]:
if GPU_SCORE >= (8, 0):
    print(f"[INFO] Using GPU with score: {GPU_SCORE}, enabling TensorFloat32 (TF32) computing (faster on new GPUs)")
    torch.backends.cuda.matmul.allow_tf32 = True
else:
    print(f"[INFO] Using GPU with score: {GPU_SCORE}, TensorFloat32 (TF32) not available, to use it you need a GPU with score >= (8, 0)")
    torch.backends.cuda.matmul.allow_tf32 = False

[INFO] Using GPU with score: (8, 9), enabling TensorFloat32 (TF32) computing (faster on new GPUs)
