In [1]:
import sys
sys.path.append('..')

In [2]:
import os
os.environ["PATH"] += os.pathsep + '/trinity/home/d.cherniuk/libs/graphviz-2.50.0/bin'

In [3]:
from transformers import GPT2Config
import torch
from torch import nn
import copy
import time

from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from light_attention.attention import LightAttention
from light_attention.profile import estimate_layer_memory

In [4]:
RANDOM_SEED = 10

# HuggingFace GPT2Attention vs Custom LightAttention

## Memory

In [7]:
configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0, summary_first_dropout=0, initializer_range=0.1)
attn = GPT2Attention(configuration)

b = 10
seq = 1024
emb = 768
input_shape = (b,seq,emb)
estimate_layer_memory(copy.deepcopy(attn), 'cuda', (b, seq, emb), fout='huggingface_graph', verbose=False)


Before placing the model on GPU
MA 0.0 MB         Max_MA 0.0 MB         CA 0.0 MB         Max_CA 0.0 MB 

After placing the model on GPU:
MA 10.0122 MB         Max_MA 10.0122 MB         CA 22.0 MB         Max_CA 22.0 MB 

Params (empirical) 10.0122 MB

Params (analytical, torch) 9.0117 MB

After input batch generation, before forward pass:
MA 40.0122 MB         Max_MA 40.0122 MB         CA 52.0 MB         Max_CA 52.0 MB 

Graph has been saved in huggingface_graph.pdf.

After backward:
MA 1721.0127 MB         Max_MA 1721.0127 MB         CA 1854.0 MB         Max_CA 1854.0 MB 


Activations (analytical, torchviz) 1830.0 MB
Activations (empirical) 1711.0005 MB


In [5]:
configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0, summary_first_dropout=0, initializer_range=0.1)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = False
attn = LightAttention(configuration)

b = 10
seq = 1024
emb = 768
input_shape = (b,seq,emb)
estimate_layer_memory(copy.deepcopy(attn), 'cuda', (b, seq, emb), fout='light_graph', verbose=False)


Before placing the model on GPU
MA 0.0 MB         Max_MA 0.0 MB         CA 0.0 MB         Max_CA 0.0 MB 

After placing the model on GPU:
MA 10.0122 MB         Max_MA 10.0122 MB         CA 22.0 MB         Max_CA 22.0 MB 

Params (empirical) 10.0122 MB

Params (analytical, torch) 9.0117 MB

After input batch generation, before forward pass:
MA 40.0122 MB         Max_MA 40.0122 MB         CA 52.0 MB         Max_CA 52.0 MB 

Graph has been saved in light_graph.pdf.

After backward:
MA 1301.0127 MB         Max_MA 1301.0127 MB         CA 1824.0 MB         Max_CA 1824.0 MB 


Activations (analytical, torchviz) 1350.0 MB
Activations (empirical) 1291.0005 MB


## Correctness

In [6]:
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
attn1 = GPT2Attention(configuration).cuda()

b = 10
seq = 1024
emb = 768
x1 = torch.randn((b,seq,emb), dtype=torch.float, device='cuda')
x1 = nn.Parameter(x1)

y1 = attn1(x1)[0]

loss1 = y1.mean().backward()

In [7]:
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
attn2 = LightAttention(configuration).cuda()

b = 10
seq = 1024
emb = 768
x2 = torch.randn((b,seq,emb), dtype=torch.float, device='cuda')
x2 = nn.Parameter(x2)

y2 = attn2(x2)[0]

loss2 = y2.mean().backward()

In [8]:
torch.allclose(x1, x2), \
torch.allclose(attn1.c_attn.weight, attn2.c_attn.weight), \
torch.allclose(y1, y2), \
torch.allclose(attn1.c_attn.weight.grad, attn2.c_attn.weight.grad), \
torch.allclose(x1.grad, x2.grad)

(True, True, True, True, True)

# Speed

In [10]:
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
attn1 = GPT2Attention(configuration).cuda()

samples = 100
b = 10
seq = 1024
emb = 768

# to mitigate the time needed for sample generation
xs = torch.randn((samples, b,seq,emb), dtype=torch.float, device='cuda')

# time1 = time.time()
# with torch.no_grad():
#     for x in xs: 
#         y = attn1(x)[0]    
# time2 = time.time()
# print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

time1 = time.time()
for x in xs: 
    y = attn1(x)[0]
    y.mean().backward()
time2 = time.time()

print(f'Backward-forward loop takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

Backward-forward loop takes 0.0239 seconds on average. Computed for 100 samples.


In [9]:
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=1, embd_pdrop=0, attn_pdrop=0.1, resid_pdrop=0)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
attn2 = LightAttention(configuration).cuda()

samples = 100
b = 10
seq = 1024
emb = 768

# to mitigate the time needed for sample generation
xs = torch.randn((samples, b,seq,emb), dtype=torch.float, device='cuda')

# time1 = time.time()
# with torch.no_grad():
#     for x in xs: 
#         y = attn2(x)[0]    
# time2 = time.time()
# print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

time1 = time.time()
for x in xs: 
    y = attn2(x)[0]
    y.mean().backward()
time2 = time.time()

print(f'Backward-forward loop takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

Backward-forward loop takes 0.03 seconds on average. Computed for 100 samples.
