In [1]:
import sys

# Put the parent directory on the import path (presumably so the local
# `light_attention` package resolves — confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
# Show the GPU model, driver/CUDA version and current memory usage before running anything.
!nvidia-smi

Thu Oct 13 23:41:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 28%   37C    P5    29W / 260W |      0MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
import transformers
import torch
from torch import nn
import copy
import time
from tqdm import tqdm
import gc

from light_attention.models.gpt2 import LightGPT2LMHeadModel, LightGPT2Model, LightGPT2Attention
from light_attention.profile import estimate_layer_memory, mem_usage

In [4]:
# Record the library versions this notebook was run with, plus the CUDA device
# object (constructing it does not by itself check that a GPU is present).
(
    torch.__version__,
    transformers.__version__,
    torch.device("cuda"),
)

('1.11.0+cu102', '4.19.2', device(type='cuda'))

In [5]:
# Seed handed to torch.manual_seed at the top of every experiment cell so runs are repeatable.
RANDOM_SEED = 15835

# HuggingFace GPT2Model vs LightGPT2Model

# Memory

### GPT2-small

#### Vanilla

In [5]:
# GPT2-small, vanilla HuggingFace GPT2Model: have estimate_layer_memory report
# the parameter and activation memory for one batch.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=12,
    n_layer=12,
    n_positions=1024,
    n_embd=768,
)
model = GPT2Model(configuration)
b, seq, emb = 4, configuration.n_positions, configuration.n_embd
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

Params (empirical) 487.4688 MB

After input batch generation, before forward pass:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

After backward:
MA 12119.2896 MB         Max_MA 12119.2896 MB         CA 12654.0 MB         Max_CA 12654.0 MB 

Activations (empirical) 11631.7896 MB


#### Light

In [5]:
# GPT2-small, LightGPT2Model: same measurement as the vanilla cell, with both
# light-attention config flags switched on.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=12,
    n_layer=12,
    n_positions=1024,
    n_embd=768,
)
# Extra attributes set on the config before LightGPT2Model is built from it;
# their exact effect is defined inside light_attention (not shown here).
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b, seq = 4, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

Params (empirical) 487.4688 MB

After input batch generation, before forward pass:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

After backward:
MA 7799.2896 MB         Max_MA 7799.2896 MB         CA 7902.0 MB         Max_CA 7902.0 MB 

Activations (empirical) 7311.7896 MB


### GPT2-medium

In [29]:
# GPT2-medium, vanilla HuggingFace GPT2Model: have estimate_layer_memory report
# the parameter and activation memory for one batch.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=16,
    n_layer=24,
    n_positions=1024,
    n_embd=1024,
)
model = GPT2Model(configuration)
b, seq = 4, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

Params (empirical) 1377.5547 MB

Params (analytical, torch) 1353.543 MB

After input batch generation, before forward pass:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

After backward:
MA 32351.126 MB         Max_MA 32351.126 MB         CA 33558.0 MB         Max_CA 33558.0 MB 


Activations (analytical, torchviz) 32485.571 MB
Activations (empirical) 30973.5396 MB


In [36]:
# GPT2-medium, LightGPT2Model: same measurement as the vanilla medium cell,
# with both light-attention config flags switched on.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=16,
    n_layer=24,
    n_positions=1024,
    n_embd=1024,
)
# Extra attributes set on the config before LightGPT2Model is built from it;
# their exact effect is defined inside light_attention (not shown here).
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b, seq = 4, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

Params (empirical) 1377.5547 MB

Params (analytical, torch) 1353.543 MB

After input batch generation, before forward pass:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

After backward:
MA 20831.126 MB         Max_MA 20831.126 MB         CA 20870.0 MB         Max_CA 20870.0 MB 


Activations (analytical, torchviz) 20197.5711 MB
Activations (empirical) 19453.5396 MB


### GPT2-large

In [33]:
# GPT2-large, vanilla HuggingFace GPT2Model: have estimate_layer_memory report
# the parameter and activation memory for one batch.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=20,
    n_layer=36,
    n_positions=1024,
    n_embd=1280,
)
model = GPT2Model(configuration)
b, seq = 4, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

Params (empirical) 3061.3125 MB

Params (analytical, torch) 2952.6904 MB

After input batch generation, before forward pass:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

After backward:
MA 61104.6338 MB         Max_MA 61104.6338 MB         CA 63376.0 MB         Max_CA 63376.0 MB 


Activations (analytical, torchviz) 60887.3214 MB
Activations (empirical) 58043.2896 MB


In [35]:
# GPT2-large, LightGPT2Model: same measurement as the vanilla large cell,
# with both light-attention config flags switched on.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=20,
    n_layer=36,
    n_positions=1024,
    n_embd=1280,
)
# Extra attributes set on the config before LightGPT2Model is built from it;
# their exact effect is defined inside light_attention (not shown here).
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b, seq = 4, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

Params (empirical) 3061.3125 MB

Params (analytical, torch) 2952.6904 MB

After input batch generation, before forward pass:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

After backward:
MA 39504.6338 MB         Max_MA 39504.6338 MB         CA 39616.0 MB         Max_CA 39616.0 MB 


Activations (analytical, torchviz) 37847.3215 MB
Activations (empirical) 36443.2896 MB


### GPT2-xl

In [7]:
# GPT2-xl, vanilla HuggingFace GPT2Model: have estimate_layer_memory report
# the parameter and activation memory for one batch.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=25,
    n_layer=48,
    n_positions=1024,
    n_embd=1600,
)
model = GPT2Model(configuration)
# Batch size here is 2, whereas the smaller model sizes above use 4.
b, seq = 2, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0156 MB         Max_MA 0.0156 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 6124.4683 MB         Max_MA 6124.4683 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

Params (empirical) 6124.4526 MB

Params (analytical, torch) 5941.8152 MB

After input batch generation, before forward pass:
MA 6124.4683 MB         Max_MA 6124.4683 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

After backward:
MA 54503.8359 MB         Max_MA 54503.8359 MB         CA 56948.0 MB         Max_CA 56948.0 MB 


Activations (analytical, torchviz) 50729.6655 MB
Activations (empirical) 48379.3677 MB


In [9]:
# GPT2-xl, LightGPT2Model: memory measurement with both light-attention
# config flags switched on.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=25,
    n_layer=48,
    n_positions=1024,
    n_embd=1600,
)
# Extra attributes set on the config before LightGPT2Model is built from it;
# their exact effect is defined inside light_attention (not shown here).
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
# NOTE(review): the vanilla GPT2-xl cell uses b = 2 while this one uses b = 4,
# so the two XL results are not measured at the same batch size — confirm
# this is intentional (e.g. the vanilla model not fitting at b = 4).
b, seq = 4, configuration.n_positions
# Batch of random token ids, created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is profiled, so `model` itself is never handed to the profiler.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached-but-unused GPU memory before the next measurement.
torch.cuda.empty_cache()


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 6124.4839 MB         Max_MA 6124.4839 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

Params (empirical) 6124.4526 MB

Params (analytical, torch) 5941.8152 MB

After input batch generation, before forward pass:
MA 6124.4839 MB         Max_MA 6124.4839 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

After backward:
MA 67049.7734 MB         Max_MA 67049.7734 MB         CA 67132.0 MB         Max_CA 67132.0 MB 


Activations (analytical, torchviz) 63059.322 MB
Activations (empirical) 60925.2896 MB


# Correctness

In [6]:
# CUDA device handle used by the correctness cells below when moving models to the GPU.
device = torch.device('cuda')

In [8]:
# Reference run: a single-layer HuggingFace GPT2Model in eval mode. Its output
# (y1) and the gradients left on model1 are what the Light variants are
# compared against further down.
torch.manual_seed(RANDOM_SEED)
b, seq = 4, 1024
x1 = torch.randint(0, 50257, size=(b, seq), device='cuda')
configuration = GPT2Config(n_layer=1)
# .to() and .eval() both return the module, so the calls can be chained.
model1 = GPT2Model(configuration).to(device).eval()
y1 = model1(x1).last_hidden_state
# Reduce to a scalar so backward() populates .grad on the parameters.
y1.cos().mean().backward()

#### LightGPT2Model through model class...

In [9]:
# Light run via the model class: same seed, same input shape and same scalar
# loss as the reference cell, but built as a LightGPT2Model.
torch.manual_seed(RANDOM_SEED)
b, seq = 4, 1024
x2 = torch.randint(0, 50257, size=(b, seq), device='cuda')
configuration = GPT2Config(n_layer=1)
# Extra attributes set on the config before LightGPT2Model is built from it;
# their exact effect is defined inside light_attention (not shown here).
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
# .to() and .eval() both return the module, so the calls can be chained.
model2 = LightGPT2Model(configuration).to(device).eval()
y2 = model2(x2).last_hidden_state
# Reduce to a scalar so backward() populates .grad on the parameters.
y2.cos().mean().backward()

#### ... or through attention module substitution

In [12]:
# Alternative route to a Light model: build a plain GPT2Model, then replace
# each block's attention module with LightGPT2Attention carrying model1's
# attention weights. The remaining weights are expected to match model1
# because both models are constructed right after the same seed — TODO confirm.
torch.manual_seed(RANDOM_SEED)
b = 4
seq = 1024
x2 = torch.randint(0, 50257, size=(b, seq), device='cuda')
configuration = GPT2Config(n_layer=1)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model2 = GPT2Model(configuration)
for i, block in enumerate(model2.h):
    reference_attn = model1.h[i].attn
    light_attn = LightGPT2Attention(configuration).cuda()
    # detach().clone() gives model2 its own copy of each tensor; a bare
    # detach() would leave both models sharing the same underlying storage.
    light_attn.c_attn.weight = nn.Parameter(reference_attn.c_attn.weight.detach().clone(), requires_grad=True)
    light_attn.c_attn.bias = nn.Parameter(reference_attn.c_attn.bias.detach().clone(), requires_grad=True)
    light_attn.c_proj.weight = nn.Parameter(reference_attn.c_proj.weight.detach().clone(), requires_grad=True)
    light_attn.c_proj.bias = nn.Parameter(reference_attn.c_proj.bias.detach().clone(), requires_grad=True)
    block.attn = light_attn
model2 = model2.cuda()
# model1 was put in eval mode; without this, model2 would run in training mode
# (dropout active) and its output/gradients could not match the reference.
model2.eval()
y2 = model2(x2).last_hidden_state
# Reduce to a scalar so backward() populates .grad on the parameters.
y2.cos().mean().backward()

In [10]:
# Reference (model1) vs Light (model2): every entry is expected to be True.
(
    # identical input tokens
    torch.allclose(x1, x2),
    # identical attention projection weights
    torch.allclose(model1.h[0].attn.c_attn.weight, model2.h[0].attn.c_attn.weight),
    # identical forward output
    torch.allclose(y1, y2),
    # identical gradients on the attention projection
    torch.allclose(model1.h[0].attn.c_attn.weight.grad, model2.h[0].attn.c_attn.weight.grad),
    # identical gradients on the first MLP layer
    torch.allclose(model1.h[0].mlp.c_fc.weight.grad, model2.h[0].mlp.c_fc.weight.grad),
)

(True, True, True, True, True)

In [11]:
# Display both attention-projection gradients next to each other for a visual check.
(
    model1.h[0].attn.c_attn.weight.grad,
    model2.h[0].attn.c_attn.weight.grad,
)

(tensor([[-2.3828e-06,  1.9859e-06,  1.1530e-06,  ...,  1.9360e-06,
          -8.0775e-07,  4.3802e-06],
         [ 1.8664e-06,  1.9181e-06,  8.7818e-07,  ..., -7.8725e-07,
           1.3553e-06, -7.7554e-06],
         [-2.2365e-06,  6.1567e-07,  8.5674e-07,  ..., -9.5081e-06,
          -8.7413e-06, -8.6345e-06],
         ...,
         [ 2.6643e-06, -1.0185e-06, -1.7076e-06,  ...,  3.5832e-06,
           6.7438e-06,  9.4983e-06],
         [ 5.2340e-07, -4.3724e-07, -1.2882e-06,  ..., -2.0538e-06,
          -7.1473e-06,  8.3647e-06],
         [-4.2918e-07,  1.4737e-06,  1.9334e-06,  ..., -2.1098e-06,
           3.3350e-06,  5.5057e-06]], device='cuda:0'),
 tensor([[-2.3828e-06,  1.9859e-06,  1.1530e-06,  ...,  1.9360e-06,
          -8.0775e-07,  4.3802e-06],
         [ 1.8664e-06,  1.9181e-06,  8.7818e-07,  ..., -7.8725e-07,
           1.3553e-06, -7.7554e-06],
         [-2.2365e-06,  6.1567e-07,  8.5674e-07,  ..., -9.5081e-06,
          -8.7413e-06, -8.6345e-06],
         ...,
        

# Speed

In [7]:
# CUDA device handle used by the forward-timing cells below for models, inputs and synchronisation.
device = torch.device('cuda')

## Forward

In [14]:
# Time the forward pass of a 12-layer vanilla GPT2Model, averaged over
# `samples` random batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=12)
model1 = GPT2Model(configuration).to(device)
# NOTE(review): eval() is not called, so the module stays in its default
# training mode while being timed (dropout presumably active) — confirm intended.
samples = 100
b = 8
seq = 1024
xs = torch.randint(0, 50257, size=(samples, b, seq), device=device)

# Warm-up run so one-off memory allocation is not part of the measurement.
with torch.no_grad():
    y = model1(xs[0])[0]
# CUDA operations are asynchronous: wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter() is the monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
with torch.no_grad():
    for x in tqdm(xs):
        y = model1(x)[0]
        # Block until this batch has actually finished on the GPU.
        torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

100%|██████████| 100/100 [00:27<00:00,  3.64it/s]

Forward pass takes 0.275 seconds on average. Computed for 100 samples.





In [15]:
# Time the forward pass of a 12-layer LightGPT2Model, averaged over
# `samples` random batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=12)
# Extra attributes set on the config before LightGPT2Model is built from it;
# their exact effect is defined inside light_attention (not shown here).
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model2 = LightGPT2Model(configuration).to(device)
# NOTE(review): eval() is not called, so the module stays in its default
# training mode while being timed (dropout presumably active) — confirm intended.
samples = 100
b = 8
seq = 1024
xs = torch.randint(0, 50257, size=(samples, b, seq), device=device)

# Warm-up run so one-off memory allocation is not part of the measurement.
with torch.no_grad():
    y = model2(xs[0])[0]
# CUDA operations are asynchronous: wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter() is the monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
with torch.no_grad():
    for x in tqdm(xs):
        y = model2(x)[0]
        # Block until this batch has actually finished on the GPU.
        torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

100%|██████████| 100/100 [00:28<00:00,  3.57it/s]

Forward pass takes 0.28 seconds on average. Computed for 100 samples.





In [6]:
# CUDA device handle used by the backward-timing cells below for models, inputs and synchronisation.
device = torch.device('cuda')

## Backward

In [9]:
# Time one forward + backward step of a 10-layer vanilla GPT2Model, averaged
# over `samples` random batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=10)
model1 = GPT2Model(configuration).to(device)
samples = 100
b = 4
seq = 1024
xs = torch.randint(0, 50257, size=(samples, b, seq), device=device)

# Warm-up step so one-off memory allocation is not part of the measurement.
y = model1(xs[0])[0]
y.mean().backward()
# CUDA operations are asynchronous: wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter() is the monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
for x in tqdm(xs):
    y = model1(x)[0]
    y.mean().backward()
    # Block until this step has actually finished on the GPU.
    torch.cuda.synchronize(device)
time2 = time.perf_counter()
# The timed loop covers both passes, so the message says so.
print(f'Forward + backward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

100%|██████████| 100/100 [00:32<00:00,  3.09it/s]

Forward pass takes 0.324 seconds on average. Computed for 100 samples.





In [7]:
# Time one forward + backward step of a 10-layer LightGPT2Model, averaged
# over `samples` random batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=10)
configuration.use_dropmatmul = True
# NOTE(review): every other Light cell in this notebook sets
# use_lightsoftmax = True; here it is False — confirm this is intentional.
configuration.use_lightsoftmax = False
model2 = LightGPT2Model(configuration).to(device)
samples = 100
b = 4
seq = 1024
xs = torch.randint(0, 50257, size=(samples, b, seq), device=device)

# Warm-up step so one-off memory allocation is not part of the measurement.
y = model2(xs[0])[0]
y.mean().backward()
# CUDA operations are asynchronous: wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter() is the monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
for x in tqdm(xs):
    y = model2(x)[0]
    y.mean().backward()
    # Block until this step has actually finished on the GPU.
    torch.cuda.synchronize(device)
time2 = time.perf_counter()
# The timed loop covers both passes, so the message says so.
print(f'Forward + backward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

100%|██████████| 100/100 [00:34<00:00,  2.88it/s]

Forward pass takes 0.348 seconds on average. Computed for 100 samples.



