<a href="https://colab.research.google.com/github/GonMazzini/Loads_Surrogate_Transferability/blob/main/MemoryCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Memory in PyTorch
A basic notebook for understanding how much GPU memory PyTorch allocates.

By GonMazzini  
https://github.com/GonMazzini

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = (
    'cuda'
    if torch.cuda.is_available()
    else 'cpu'
)

### Simple example with a torch tensor.

In [4]:
# 5000 x 5000 float64 tensor, moved to the notebook's `device` rather than a
# hard-coded 'cuda' so the cell agrees with the device selection cell.
x = torch.randn(5000, 5000, dtype=torch.float64).to(device)

In [5]:
# Expected footprint: 5000 * 5000 elements at 8 bytes (float64) each, in MiB.
5000 * 5000 * 8 / 1024**2

190.73486328125

In [6]:
# Human-readable name of CUDA device 0.
torch.cuda.get_device_name(0)

'Tesla K80'

In [8]:
# Bytes currently occupied by tensors on the current CUDA device, in MiB.
torch.cuda.memory_allocated() / 1024**2

190.73486328125

In [10]:
# Bytes held by PyTorch's caching allocator on the current CUDA device, in MiB.
torch.cuda.memory_reserved() / 1024**2

192.0

### Example with a model.

In [11]:
class MLP(nn.Module):
    """Fully connected network with a tunable stack of equal-width hidden layers.

    The network is ``n_hidLayers`` linear layers of width ``hidden_size``
    followed by one linear output layer. ReLU is applied after every layer,
    the output layer included. By @GonMazzini

    Args:
        input_dim: number of input features.
        output_dim: number of output features.
        n_hidLayers: number of hidden linear layers.
        hidden_size: width of each hidden layer.
    """

    def __init__(self, input_dim, output_dim, n_hidLayers, hidden_size):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_size = hidden_size
        self.n_hidLayers = n_hidLayers

        # Consecutive pairs of this list are the (in, out) sizes of each layer.
        widths = [input_dim, *([hidden_size] * n_hidLayers), output_dim]
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        """Pass ``x`` through every layer, applying ReLU after each one."""
        for linear in self.layers:
            x = F.relu(linear(x))
        return x

In [13]:
# 6 inputs -> two hidden layers of 50 units -> 8 outputs.
model = MLP(input_dim=6, output_dim=8, n_hidLayers=2, hidden_size=50)
model

MLP(
  (layers): ModuleList(
    (0): Linear(in_features=6, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=8, bias=True)
  )
)

In [14]:
# Total number of scalar entries across the model's trainable parameters.
pytorch_total_params = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
pytorch_total_params

3308

In [15]:
# One float64 value per model parameter, moved to the notebook's `device`
# rather than a hard-coded 'cuda'. Rebinding `x` drops the reference to the
# tensor previously stored under that name.
x = torch.randn(1, pytorch_total_params, dtype=torch.float64).to(device)

In [19]:
# Expected footprint: 3308 elements at 8 bytes (float64) each, in MiB.
3308 * 8 / 1024**2

0.025238037109375

In [18]:
# Bytes currently occupied by tensors on `device`, in MiB.
torch.cuda.memory_allocated(device=device) / 1024**2

0.025390625

In [7]:
# The same 6 -> 50 -> 50 -> 8 stack of linear layers, written with nn.Sequential.
in_size, out_size, hidden_size = 6, 8, 50

model = nn.Sequential(
    nn.Linear(in_size, hidden_size),
    nn.Linear(hidden_size, hidden_size),  # the single hidden-to-hidden layer
    nn.Linear(hidden_size, out_size),
)
model

Sequential(
  (0): Linear(in_features=6, out_features=50, bias=True)
  (1): Linear(in_features=50, out_features=50, bias=True)
  (2): Linear(in_features=50, out_features=8, bias=True)
)

In [3]:
def test_memory(in_size=6, out_size=8, hidden_size=50, optimizer_type=torch.optim.Adam, batch_size=1, use_amp=False, device=0):
    sample_input = torch.randn(batch_size, in_size, dtype=torch.float32)
    model = nn.Sequential(nn.Linear(in_size, hidden_size),
                        *[nn.Linear(hidden_size, hidden_size) for _ in range(2)],
                        nn.Linear(hidden_size, out_size))
    
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'model parameters: {pytorch_total_params}')

    # max_mem_est = estimate_memory(model, sample_input[0], optimizer_type=optimizer_type, batch_size=batch_size, use_amp=use_amp)
    # print("Maximum Memory Estimate", max_mem_est)
    optimizer = optimizer_type(model.parameters(), lr=.001)
    print("Beginning mem:", torch.cuda.memory_allocated(device), "Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number")
    model.to(device)
    print("After model to device:", torch.cuda.memory_allocated(device))
    for i in range(3):
        print("Iteration", i)
        with torch.cuda.amp.autocast(enabled=use_amp):
            a = torch.cuda.memory_allocated(device)
            out = model(sample_input.to(device)).sum() # Taking the sum here just to get a scalar output
            b = torch.cuda.memory_allocated(device)
        print("1 - After forward pass", torch.cuda.memory_allocated(device))
        print("2 - Memory consumed by forward pass", b - a)
        out.backward()
        print("3 - After backward pass", torch.cuda.memory_allocated(device))
        optimizer.step()
        print("4 - After optimizer step", torch.cuda.memory_allocated(device))

In [4]:
 torch.cuda.memory_allocated(device=device)  # bytes currently occupied by tensors on `device`

0

In [5]:
# Run the memory trace with a batch of 64 samples; all other settings stay at their defaults.
test_memory(batch_size=64)

Beginning mem: 0 Note - this may be higher than 0, which is due to PyTorch caching. Don't worry too much about this number
After model to device: 8237568
Iteration 0
1 - After forward pass 13409280
2 - Memory consumed by forward pass 5171712
3 - After backward pass 16475648
4 - After optimizer step 32950784
Iteration 1
1 - After forward pass 38121984
2 - Memory consumed by forward pass 5171200
3 - After backward pass 32950784
4 - After optimizer step 32950784
Iteration 2
1 - After forward pass 38121984
2 - Memory consumed by forward pass 5171200
3 - After backward pass 32950784
4 - After optimizer step 32950784
