<a href="https://colab.research.google.com/github/GonMazzini/Loads_Surrogate_Transferability/blob/main/MemoryCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Memory in PyTorch
# A basic notebook to understand the GPU allocated memory.

#by GonMazzini     
## https://github.com/GonMazzini

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Simple example with a torch tensor.

In [4]:
x = torch.randn(5000,5000, dtype = torch.float64).to('cuda')

In [5]:
5000*5000*64/(1024*1024*8) # MB

190.73486328125

In [6]:
torch.cuda.get_device_name(device=0)

'Tesla K80'

In [8]:
torch.cuda.memory_allocated()/(1024*1024)  # MB

190.73486328125

In [10]:
torch.cuda.memory_reserved()/(1024*1024) # MB

192.0

Example with a model

https://medium.com/deep-learning-for-protein-design/a-comprehensive-guide-to-memory-usage-in-pytorch-b9b7c78031d3

In [11]:
class MLP(nn.Module):
    """ A feedforward network designed for tuning number of layers and hidden units.
    By @GonMazzini"""
    def __init__(self, input_dim, output_dim, n_hidLayers, hidden_size):
        super(MLP, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_size = hidden_size
        self.n_hidLayers = n_hidLayers
        current_dim = input_dim
        self.layers = nn.ModuleList()
        
        for hdim in [self.hidden_size]*self.n_hidLayers:
            self.layers.append(nn.Linear(current_dim, hdim))
            current_dim = hdim
        self.layers.append(nn.Linear(current_dim, output_dim))

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        out = F.relu(self.layers[-1](x))
        return out 

In [13]:
model = MLP(6,8,2,50)
model

MLP(
  (layers): ModuleList(
    (0): Linear(in_features=6, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=8, bias=True)
  )
)

In [27]:
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
pytorch_total_params

3308

In [15]:
x = torch.randn(1,pytorch_total_params, dtype = torch.float64).to('cuda')

In [19]:
1*3308*64/(1024*1024*8)  # MB

0.025238037109375

In [18]:
torch.cuda.memory_allocated(device)/(1024*1024)  # MB
# Returns the current GPU memory occupied by tensors in bytes for a given device.

0.025390625

In [11]:
0.025390625*1024*1024 # 26624.0

26624.0

In [36]:
in_size=6
out_size=8
hidden_size=50

model = nn.Sequential(nn.Linear(in_size, hidden_size),
                    *[nn.Linear(hidden_size, hidden_size) for _ in range(1)],
                    nn.Linear(hidden_size, out_size))
model

Sequential(
  (0): Linear(in_features=6, out_features=50, bias=True)
  (1): Linear(in_features=50, out_features=50, bias=True)
  (2): Linear(in_features=50, out_features=8, bias=True)
)

In [31]:
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
pytorch_total_params

3308

In [9]:
def estimate_memory(model, sample_input, optimizer_type=torch.optim.Adam, batch_size=1, use_amp=False, device=0):
    """Predict the maximum memory usage of the model. 
    Args:
        optimizer_type (Type): the class name of the optimizer to instantiate
        model (nn.Module): the neural network model
        sample_input (torch.Tensor): A sample input to the network. It should be 
            a single item, not a batch, and it will be replicated batch_size times.
        batch_size (int): the batch size
        use_amp (bool): whether to estimate based on using mixed precision
        device (torch.device): the device to use
    """
    # Reset model and optimizer
    model.cpu()
    optimizer = optimizer_type(model.parameters(), lr=.001)
    a = torch.cuda.memory_allocated(device)
    model.to(device)
    b = torch.cuda.memory_allocated(device)
    model_memory = b - a
    model_input = sample_input.unsqueeze(0).repeat(batch_size, 1)
    output = model(model_input.to(device)).sum()
    c = torch.cuda.memory_allocated(device)
    if use_amp:
        amp_multiplier = .5
    else:
        amp_multiplier = 1
    forward_pass_memory = (c - b)*amp_multiplier
    gradient_memory = model_memory
    if isinstance(optimizer, torch.optim.Adam):
        o = 2
    elif isinstance(optimizer, torch.optim.RMSprop):
        o = 1
    elif isinstance(optimizer, torch.optim.SGD):
        o = 0
    else:
        raise ValueError("Unsupported optimizer. Look up how many moments are" +
            "stored by your optimizer and add a case to the optimizer checker.")
    gradient_moment_memory = o*gradient_memory
    total_memory = model_memory + forward_pass_memory + gradient_memory + gradient_moment_memory

    return total_memory

def test_memory(in_size=6, out_size=8, hidden_size=50, hidden_layers = 1, optimizer_type=torch.optim.Adam, batch_size=1, use_amp=False, device=0):
    sample_input = torch.randn(batch_size, in_size, dtype=torch.float32)
    model = nn.Sequential(nn.Linear(in_size, hidden_size),
                        *[nn.Linear(hidden_size, hidden_size) for _ in range(hidden_layers)],
                        nn.Linear(hidden_size, out_size))
    
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'number of model parameters /byme: {pytorch_total_params}')

    max_mem_est = estimate_memory(model, sample_input[0], optimizer_type=optimizer_type, batch_size=batch_size, use_amp=use_amp)
    print(f'Maximum Memory Estimate: {max_mem_est/(1024*1024)} MB')
    optimizer = optimizer_type(model.parameters(), lr=.001)
    print("Beginning mem:", torch.cuda.memory_allocated(device), "Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number")
    model.to(device)
    print("After model to device:", torch.cuda.memory_allocated(device))
    for i in range(3):
        print("Iteration", i)
        with torch.cuda.amp.autocast(enabled=use_amp):
            a = torch.cuda.memory_allocated(device)
            out = model(sample_input.to(device)).sum() # Taking the sum here just to get a scalar output
            b = torch.cuda.memory_allocated(device)
        print("1 - After forward pass", torch.cuda.memory_allocated(device))
        print("2 - Memory consumed by forward pass", b - a)
        out.backward()
        print("3 - After backward pass", torch.cuda.memory_allocated(device))
        optimizer.step()
        print("4 - After optimizer step", torch.cuda.memory_allocated(device))

In [22]:
 torch.cuda.memory_allocated(device)

0

In [13]:
test_memory(in_size=6, out_size=8, hidden_size=512, hidden_layers = 4 ,batch_size = 256)

number of model parameters /byme: 1058312
Maximum Memory Estimate: 18.65673828125 MB
Beginning mem: 4233728 Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number
After model to device: 4233728
Iteration 0
1 - After forward pass 6861824
2 - Memory consumed by forward pass 2628096
3 - After backward pass 8467968
4 - After optimizer step 16935424
Iteration 1
1 - After forward pass 19563008
2 - Memory consumed by forward pass 2627584
3 - After backward pass 16935424
4 - After optimizer step 16935424
Iteration 2
1 - After forward pass 19563008
2 - Memory consumed by forward pass 2627584
3 - After backward pass 16935424
4 - After optimizer step 16935424


In [23]:
test_memory(batch_size = 64)

number of model parameters: 5858
Beginning mem: 0 Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number
After model to device: 26112
Iteration 0
1 - After forward pass 66560
2 - Memory consumed by forward pass 40448
3 - After backward pass 52736
4 - After optimizer step 104960
Iteration 1
1 - After forward pass 144896
2 - Memory consumed by forward pass 39936
3 - After backward pass 104960
4 - After optimizer step 104960
Iteration 2
1 - After forward pass 144896
2 - Memory consumed by forward pass 39936
3 - After backward pass 104960
4 - After optimizer step 104960


In [5]:
test_memory(batch_size = 32)

number of model parameters /byme: 3308
Maximum Memory Estimate" 0.07275390625
Beginning mem: 15360 Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number
After model to device: 15360
Iteration 0
1 - After forward pass 30208
2 - Memory consumed by forward pass 14848
3 - After backward pass 31232
4 - After optimizer step 61952
Iteration 1
1 - After forward pass 76288
2 - Memory consumed by forward pass 14336
3 - After backward pass 61952
4 - After optimizer step 61952
Iteration 2
1 - After forward pass 76288
2 - Memory consumed by forward pass 14336
3 - After backward pass 61952
4 - After optimizer step 61952
