In [1]:
import sys
sys.path.append('..')

In [2]:
# IPython shell escape: print the NVIDIA status table (driver version, GPU memory
# usage, utilisation) as a baseline before any model has been placed on the GPU.
!nvidia-smi

Thu Oct 13 21:41:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 27%   34C    P8    21W / 260W |      0MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from transformers import GPT2Config
import torch
from torch import nn
import copy
import time
from tqdm import tqdm

from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from light_attention.models.gpt2 import LightGPT2Attention
from light_attention.profile import estimate_layer_memory

In [4]:
# Seed handed to torch.manual_seed() at the top of the correctness and speed
# cells, so each attention variant is built and fed from the same RNG state.
RANDOM_SEED = 3407

# HuggingFace GPT2Attention vs LightGPT2Attention

## Memory

In [4]:
# Single-layer GPT-2 configuration. Of the dropout probabilities set here, only
# attn_pdrop is non-zero.
configuration = GPT2Config(
    n_layer=1,
    embd_pdrop=0,
    attn_pdrop=0.1,
    resid_pdrop=0,
    summary_first_dropout=0,
    initializer_range=0.1,
)
attn = GPT2Attention(configuration)

# Shape of the profiled input, named (b, seq, emb) as in the rest of the notebook.
b, seq, emb = 10, 1024, 768
input_shape = (b, seq, emb)

# The profiler receives a deep copy, so `attn` itself is not the object it works on.
# Without graphviz installed, pass fout=None to skip building/rendering the graph.
estimate_layer_memory(
    copy.deepcopy(attn),
    device='cuda',
    input_shape=input_shape,
    fout='huggingface_graph',
    verbose=False,
)


Before placing the model on GPU
MA 0.0 MB         Max_MA 0.0 MB         CA 0.0 MB         Max_CA 0.0 MB 

After placing the model on GPU:
MA 10.0122 MB         Max_MA 10.0122 MB         CA 22.0 MB         Max_CA 22.0 MB 

Params (empirical) 10.0122 MB

After input batch generation, before forward pass:
MA 40.0122 MB         Max_MA 40.0122 MB         CA 52.0 MB         Max_CA 52.0 MB 

Graph has been saved in huggingface_graph.pdf.

After backward:
MA 1751.0127 MB         Max_MA 1751.0127 MB         CA 1854.0 MB         Max_CA 1854.0 MB 

Activations (empirical) 1741.0005 MB


In [4]:
# Same single-layer GPT-2 configuration as used for the HuggingFace layer, plus
# the two switches read by LightGPT2Attention.
configuration = GPT2Config(
    n_layer=1,
    embd_pdrop=0,
    attn_pdrop=0.1,
    resid_pdrop=0,
    summary_first_dropout=0,
    initializer_range=0.1,
)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
attn = LightGPT2Attention(configuration)

# Shape of the profiled input, named (b, seq, emb) as in the rest of the notebook.
b, seq, emb = 10, 1024, 768
input_shape = (b, seq, emb)

# The profiler receives a deep copy, so `attn` itself is not the object it works on.
# Without graphviz installed, pass fout=None to skip building/rendering the graph.
estimate_layer_memory(
    copy.deepcopy(attn),
    device='cuda',
    input_shape=input_shape,
    fout='light_graph',
    verbose=False,
)


Before placing the model on GPU
MA 0.0 MB         Max_MA 0.0 MB         CA 0.0 MB         Max_CA 0.0 MB 

After placing the model on GPU:
MA 10.0122 MB         Max_MA 10.0122 MB         CA 22.0 MB         Max_CA 22.0 MB 

Params (empirical) 10.0122 MB

After input batch generation, before forward pass:
MA 40.0122 MB         Max_MA 40.0122 MB         CA 52.0 MB         Max_CA 52.0 MB 

Graph has been saved in light_graph.pdf.

After backward:
MA 851.0127 MB         Max_MA 851.0127 MB         CA 1344.0 MB         Max_CA 1344.0 MB 

Activations (empirical) 841.0005 MB


## Correctness

In [5]:
# Reset the RNG before anything random happens, so the weights and the input
# drawn below start from a known state.
torch.manual_seed(RANDOM_SEED)

# Reference layer: the stock HuggingFace attention module, moved to the GPU.
configuration = GPT2Config(
    n_layer=1,
    embd_pdrop=0,
    attn_pdrop=0.1,
    resid_pdrop=0,
)
attn1 = GPT2Attention(configuration).cuda()

b, seq, emb = 10, 1024, 768

# Wrapping the random input in nn.Parameter makes it a leaf tensor that requires
# grad, so x1.grad is filled in by the backward pass.
x1 = nn.Parameter(
    torch.randn((b, seq, emb), dtype=torch.float, device='cuda')
)

# The module returns a tuple; element 0 is used as the output tensor.
y1 = attn1(x1)[0]

# Reduce the output to a scalar purely to have something to back-propagate.
y1.cos().mean().backward()

In [6]:
# IPython shell escape: print the NVIDIA status table to see how much GPU memory
# is in use after the forward/backward pass of the HuggingFace layer.
!nvidia-smi

Thu Oct 13 21:41:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 27%   38C    P2    97W / 260W |   3654MiB / 11019MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
# Reset the RNG to the same seed used for the reference layer, so this layer's
# weights and input are drawn from the same starting state.
torch.manual_seed(RANDOM_SEED)

# Layer under test: LightGPT2Attention with both memory-saving switches enabled.
configuration = GPT2Config(
    n_layer=1,
    embd_pdrop=0,
    attn_pdrop=0.1,
    resid_pdrop=0,
)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
attn2 = LightGPT2Attention(configuration).cuda()

b, seq, emb = 10, 1024, 768

# Wrapping the random input in nn.Parameter makes it a leaf tensor that requires
# grad, so x2.grad is filled in by the backward pass.
x2 = nn.Parameter(
    torch.randn((b, seq, emb), dtype=torch.float, device='cuda')
)

# The module returns a tuple; element 0 is used as the output tensor.
y2 = attn2(x2)[0]

# Reduce the output to a scalar purely to have something to back-propagate.
y2.cos().mean().backward()

In [8]:
# Displayed as a 5-tuple of booleans, in this order:
# inputs, c_attn weights, outputs, c_attn weight gradients, input gradients.
(
    torch.allclose(x1, x2),
    torch.allclose(attn1.c_attn.weight, attn2.c_attn.weight),
    torch.allclose(y1, y2),
    torch.allclose(attn1.c_attn.weight.grad, attn2.c_attn.weight.grad),
    torch.allclose(x1.grad, x2.grad),
)

(True, True, False, False, False)

In [18]:
# Display the c_attn weight gradients of both layers side by side for inspection.
# NOTE(review): this cell's execution count (18) is later than the benchmark cells
# that rebind attn1/attn2, so the stored output may not belong to the models built
# in the Correctness section - re-run the notebook top to bottom to confirm.
attn1.c_attn.weight.grad, attn2.c_attn.weight.grad

(tensor([[-5.4295e-09,  3.3732e-08,  1.2488e-08,  ..., -5.4792e-07,
          -4.2262e-08,  8.8687e-08],
         [-9.5828e-09,  5.4152e-08, -4.3143e-09,  ..., -1.7547e-07,
           9.1430e-08, -1.7308e-07],
         [ 5.6753e-08,  4.4316e-08,  1.2221e-08,  ..., -2.4896e-07,
           1.4978e-07, -1.4345e-07],
         ...,
         [-6.6360e-09, -2.7853e-09,  4.5166e-08,  ...,  3.2868e-07,
          -2.6284e-07,  8.2507e-08],
         [ 2.0470e-08,  9.8933e-09,  3.8972e-08,  ..., -5.0272e-07,
          -1.0428e-07,  2.8344e-07],
         [-4.0574e-08,  5.9461e-08,  2.4368e-08,  ...,  5.9955e-07,
           4.4250e-07,  1.9155e-07]], device='cuda:0'),
 tensor([[-5.4295e-09,  3.3732e-08,  1.2488e-08,  ..., -5.4792e-07,
          -4.2262e-08,  8.8687e-08],
         [-9.5828e-09,  5.4152e-08, -4.3143e-09,  ..., -1.7547e-07,
           9.1430e-08, -1.7308e-07],
         [ 5.6753e-08,  4.4315e-08,  1.2221e-08,  ..., -2.4896e-07,
           1.4978e-07, -1.4345e-07],
         ...,
        

## Speed

In [5]:
# Device used by the speed benchmark cells. This is a torch.device object, not a
# plain string.
device = torch.device('cuda')

### Forward

In [11]:
# Benchmark: average wall-clock time of one GPT2Attention forward pass.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
attn1 = GPT2Attention(configuration).to(device)
# eval() is not called, so the module stays in its default training mode and the
# attention dropout (attn_pdrop=0.1) is part of what gets timed. Call attn1.eval()
# here to time inference mode instead.

samples = 100
b = 10
seq = 1024
emb = 768

# Generate every input up front so sample generation is not part of the timing.
xs = torch.randn((samples, b,seq,emb), dtype=torch.float, device=device)

# Warm-up: one untimed pass so one-off costs (e.g. the first memory allocation)
# do not leak into the measurement.
with torch.no_grad():
    y = attn1(xs[0])[0]
    # cuda operations are asynchronous
    torch.cuda.synchronize(device)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the system clock and can be adjusted mid-run.
time1 = time.perf_counter()
with torch.no_grad():
    for x in tqdm(xs):
        y = attn1(x)[0]
        # cuda operations are asynchronous: wait for the queued kernels to finish
        # so that the elapsed time covers the actual computation
        torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

  0%|          | 0/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:01<00:00, 61.72it/s]

Forward pass takes 0.0162 seconds on average. Computed for 100 samples.





In [13]:
# IPython shell escape: print the NVIDIA status table to check GPU memory usage
# while the benchmark tensors and models are resident on the device.
!nvidia-smi

Thu Oct 13 20:41:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 35%   47C    P2    94W / 260W |   8656MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Benchmark: average wall-clock time of one LightGPT2Attention forward pass.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = False
attn2 = LightGPT2Attention(configuration).to(device)
# eval() is not called here, so the module is timed in whatever mode it is
# constructed in (presumably training mode, with attention dropout active -
# TODO confirm). Call attn2.eval() here to time inference mode instead.

samples = 100
b = 10
seq = 1024
emb = 768

# Generate every input up front so sample generation is not part of the timing.
xs = torch.randn((samples, b,seq,emb), dtype=torch.float, device=device)

# Warm-up: one untimed pass so one-off costs (e.g. the first memory allocation)
# do not leak into the measurement.
with torch.no_grad():
    y = attn2(xs[0])[0]
    # cuda operations are asynchronous
    torch.cuda.synchronize(device)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the system clock and can be adjusted mid-run.
time1 = time.perf_counter()
with torch.no_grad():
    for x in tqdm(xs):
        y = attn2(x)[0]
        # cuda operations are asynchronous: wait for the queued kernels to finish
        # so that the elapsed time covers the actual computation
        torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

  0%|          | 0/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:01<00:00, 61.40it/s]

Forward pass takes 0.0163 seconds on average. Computed for 100 samples.





### Backward

In [14]:
# Benchmark: average wall-clock time of one GPT2Attention forward + backward pass.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
attn1 = GPT2Attention(configuration).to(device)

samples = 100
b = 10
seq = 1024
emb = 768

# Generate every input up front so sample generation is not part of the timing.
xs = torch.randn((samples, b,seq,emb), dtype=torch.float, device=device)

# Warm-up: one untimed forward/backward pass so one-off costs (e.g. the first
# memory allocation) do not leak into the measurement.
# The check goes through torch.device(...).type because a torch.device never
# compares equal to a plain string: `device == 'cuda'` is always False and would
# silently skip the warm-up. Normalising first also accepts a string `device`.
if torch.device(device).type == 'cuda':
    y = attn1(xs[0])[0]
    y.mean().backward()
    # cuda operations are asynchronous
    torch.cuda.synchronize(device)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the system clock and can be adjusted mid-run.
# Gradients are never zeroed, so they accumulate across iterations; only the
# elapsed time is used here.
time1 = time.perf_counter()
for x in tqdm(xs):
    y = attn1(x)[0]
    y.mean().backward()
    # cuda operations are asynchronous: wait for the queued kernels to finish
    # so that the elapsed time covers the actual computation
    torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Backward-forward loop takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

100%|██████████| 100/100 [00:04<00:00, 23.91it/s]

Backward-forward loop takes 0.0419 seconds on average. Computed for 100 samples.





In [15]:
# Benchmark: average wall-clock time of one LightGPT2Attention forward + backward pass.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = False
attn2 = LightGPT2Attention(configuration).to(device)

samples = 100
b = 10
seq = 1024
emb = 768

# Generate every input up front so sample generation is not part of the timing.
xs = torch.randn((samples, b,seq,emb), dtype=torch.float, device=device)

# Warm-up: one untimed forward/backward pass so one-off costs (e.g. the first
# memory allocation) do not leak into the measurement.
# The check goes through torch.device(...).type because a torch.device never
# compares equal to a plain string: `device == 'cuda'` is always False and would
# silently skip the warm-up. Normalising first also accepts a string `device`.
if torch.device(device).type == 'cuda':
    y = attn2(xs[0])[0]
    y.mean().backward()
    # cuda operations are asynchronous
    torch.cuda.synchronize(device)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the system clock and can be adjusted mid-run.
# Gradients are never zeroed, so they accumulate across iterations; only the
# elapsed time is used here.
time1 = time.perf_counter()
for x in tqdm(xs):
    y = attn2(x)[0]
    y.mean().backward()
    # cuda operations are asynchronous: wait for the queued kernels to finish
    # so that the elapsed time covers the actual computation
    torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Backward-forward loop takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

100%|██████████| 100/100 [00:04<00:00, 20.93it/s]

Backward-forward loop takes 0.0478 seconds on average. Computed for 100 samples.



