In [1]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the `light_attention` package imported
# further down resolve when the notebook lives in a subfolder — confirm repo layout.
sys.path.append('..')

In [2]:
# Shell out to nvidia-smi so the GPU, driver and CUDA version used for the run are captured in the output.
!nvidia-smi

Wed Oct 26 21:16:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:1C:00.0 Off |                    0 |
| N/A   30C    P0    40W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
import transformers
import torch
from torch import nn
import copy
import time
from tqdm import tqdm
import gc

from light_attention.models.gpt2 import LightGPT2LMHeadModel, LightGPT2Model, LightGPT2Attention
from light_attention.profile import estimate_layer_memory, mem_usage

In [4]:
# Library versions and the target device, displayed together as one tuple.
(
    torch.__version__,
    transformers.__version__,
    torch.device("cuda"),
)

('1.11.0+cu102', '4.23.1', device(type='cuda'))

In [5]:
# Seed passed to torch.manual_seed at the top of every experiment cell.
RANDOM_SEED = 15_835

# HuggingFace GPT2Model vs LightGPT2Model

# Memory

### GPT2-small

### Vanilla

In [None]:
# Memory estimate for the stock HuggingFace GPT2Model at GPT2-small size.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(n_head=12, n_layer=12, n_positions=1024, n_embd=768)
model = GPT2Model(configuration)
b = 4
seq = configuration.n_positions
# Random token ids of shape (b, seq), drawn from the config's vocabulary range.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is handed to the profiler, so `model` itself is not the object being profiled.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached, unused CUDA allocator blocks before the next measurement cell.
torch.cuda.empty_cache()

### Light

In [None]:
# GPT2-small geometry again, this time built as a LightGPT2Model.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=12,
    n_layer=12,
    n_positions=1024,
    n_embd=768,
)
# Both light-attention flags are set to True on the config before the model is built.
configuration.use_dropmatmul = configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b, seq = 4, configuration.n_positions
x = torch.randint(
    0,
    configuration.vocab_size,
    size=(b, seq),
    device='cuda',
)
# The profiler receives a deep copy of the model together with the token batch.
estimate_layer_memory(
    copy.deepcopy(model),
    x,
    device='cuda',
    input_shape=None,
)
torch.cuda.empty_cache()

### GPT2-medium

In [None]:
# Stock GPT2Model at GPT2-medium size (16 heads, 24 layers, 1024-wide embeddings).
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=16,
    n_layer=24,
    n_positions=1024,
    n_embd=1024,
)
model = GPT2Model(configuration)
b, seq = 4, configuration.n_positions
x = torch.randint(
    0,
    configuration.vocab_size,
    size=(b, seq),
    device='cuda',
)
# The profiler receives a deep copy of the model together with the token batch.
estimate_layer_memory(
    copy.deepcopy(model),
    x,
    device='cuda',
    input_shape=None,
)
torch.cuda.empty_cache()

In [None]:
# LightGPT2Model at GPT2-medium size (16 heads, 24 layers, 1024-wide embeddings).
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=16,
    n_layer=24,
    n_positions=1024,
    n_embd=1024,
)
# Both light-attention flags are set to True on the config before the model is built.
configuration.use_dropmatmul = configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b, seq = 4, configuration.n_positions
x = torch.randint(
    0,
    configuration.vocab_size,
    size=(b, seq),
    device='cuda',
)
estimate_layer_memory(
    copy.deepcopy(model),
    x,
    device='cuda',
    input_shape=None,
)
torch.cuda.empty_cache()

### GPT2-large

In [None]:
# Stock GPT2Model at GPT2-large size (20 heads, 36 layers, 1280-wide embeddings).
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=20,
    n_layer=36,
    n_positions=1024,
    n_embd=1280,
)
model = GPT2Model(configuration)
b, seq = 4, configuration.n_positions
x = torch.randint(
    0,
    configuration.vocab_size,
    size=(b, seq),
    device='cuda',
)
estimate_layer_memory(
    copy.deepcopy(model),
    x,
    device='cuda',
    input_shape=None,
)
torch.cuda.empty_cache()

In [None]:
# LightGPT2Model at GPT2-large size (20 heads, 36 layers, 1280-wide embeddings).
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=20,
    n_layer=36,
    n_positions=1024,
    n_embd=1280,
)
# Both light-attention flags are set to True on the config before the model is built.
configuration.use_dropmatmul = configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b, seq = 4, configuration.n_positions
x = torch.randint(
    0,
    configuration.vocab_size,
    size=(b, seq),
    device='cuda',
)
estimate_layer_memory(
    copy.deepcopy(model),
    x,
    device='cuda',
    input_shape=None,
)
torch.cuda.empty_cache()

### GPT2-xl

In [None]:
# Stock GPT2Model at GPT2-xl size (25 heads, 48 layers, 1600-wide embeddings).
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(
    n_head=25,
    n_layer=48,
    n_positions=1024,
    n_embd=1600,
)
model = GPT2Model(configuration)
# This size is profiled with a batch of 2 rather than the 4 used for the smaller models.
b, seq = 2, configuration.n_positions
x = torch.randint(
    0,
    configuration.vocab_size,
    size=(b, seq),
    device='cuda',
)
estimate_layer_memory(
    copy.deepcopy(model),
    x,
    device='cuda',
    input_shape=None,
)
torch.cuda.empty_cache()

In [None]:
# Memory estimate for LightGPT2Model at GPT2-xl size (25 heads, 48 layers, 1600-wide embeddings).
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(n_head=25, n_layer=48, n_positions=1024, n_embd=1600)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
# Batch size 2 matches the vanilla GPT2-xl measurement; with different batch sizes
# the two memory estimates would not be comparable.
b = 2
seq = configuration.n_positions
# Random token ids of shape (b, seq), drawn from the config's vocabulary range.
x = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is handed to the profiler, so `model` itself is not the object being profiled.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
# Release cached, unused CUDA allocator blocks before the next measurement cell.
torch.cuda.empty_cache()

# Correctness

In [6]:
# Device handle used by the correctness cells when moving models to the GPU.
device = torch.device("cuda")

## FP32

In [19]:
# FP32 reference run: a one-layer stock GPT2Model, forward and backward on random tokens.
torch.manual_seed(RANDOM_SEED)
b, seq = 4, 1024
x1 = torch.randint(
    0,
    50257,
    size=(b, seq),
    device='cuda',
)
configuration = GPT2Config(n_layer=1)
model1 = GPT2Model(configuration)
model1 = model1.to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.
y1 = model1(x1).last_hidden_state
# Collapse the hidden states to a scalar and backpropagate to populate the .grad fields.
y1.cos().mean().backward()

#### LightGPT2Model through model class...

In [7]:
# FP32 run of the light variant, built directly through the LightGPT2Model class.
torch.manual_seed(RANDOM_SEED)
b, seq = 4, 1024
x2 = torch.randint(
    0,
    50257,
    size=(b, seq),
    device='cuda',
)
configuration = GPT2Config(n_layer=1)
# Both light-attention flags are set to True on the config before the model is built.
configuration.use_dropmatmul = configuration.use_lightsoftmax = True
model2 = LightGPT2Model(configuration)
model2 = model2.to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.
y2 = model2(x2).last_hidden_state
# Collapse the hidden states to a scalar and backpropagate to populate the .grad fields.
y2.cos().mean().backward()

#### ... or through attention module substitution

In [20]:
# Alternative construction: start from a stock GPT2Model and replace each block's
# attention module with LightGPT2Attention, carrying over the attention weights of `model1`.
torch.manual_seed(RANDOM_SEED)
b = 4
seq = 1024
x2 = torch.randint(0, 50257, size=(b, seq), device='cuda')
configuration = GPT2Config(n_layer=1)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model2 = GPT2Model(configuration)
for i in range(len(model2.h)):
    # detach() alone returns a tensor that shares storage with the source parameter,
    # which would make model2's attention weights aliases of model1's. clone() gives
    # model2 its own copy, so the weight comparison below is a real check and an
    # in-place update of one model cannot silently modify the other.
    weight_attn = model1.h[i].attn.c_attn.weight.detach().clone()
    bias_attn = model1.h[i].attn.c_attn.bias.detach().clone()
    weight_proj = model1.h[i].attn.c_proj.weight.detach().clone()
    bias_proj = model1.h[i].attn.c_proj.bias.detach().clone()
    model2.h[i].attn = LightGPT2Attention(configuration).cuda()
    model2.h[i].attn.c_attn.weight = nn.Parameter(weight_attn, requires_grad=True)
    model2.h[i].attn.c_attn.bias = nn.Parameter(bias_attn, requires_grad=True)
    model2.h[i].attn.c_proj.weight = nn.Parameter(weight_proj, requires_grad=True)
    model2.h[i].attn.c_proj.bias = nn.Parameter(bias_proj, requires_grad=True)
model2 = model2.cuda()
# eval() is not called here, so the module keeps its freshly-constructed mode.
y2 = model2(x2).last_hidden_state
# Collapse the hidden states to a scalar and backpropagate to populate the .grad fields.
y2.cos().mean().backward()

torch.float32


In [21]:
# Agreement checks between the two FP32 runs: inputs, weights, outputs, then gradients.
ref_block, light_block = model1.h[0], model2.h[0]
(
    torch.allclose(x1, x2),
    torch.allclose(ref_block.attn.c_attn.weight, light_block.attn.c_attn.weight),
    torch.allclose(ref_block.mlp.c_fc.weight, light_block.mlp.c_fc.weight),
    torch.allclose(y1, y2),
    torch.allclose(ref_block.attn.c_attn.weight.grad, light_block.attn.c_attn.weight.grad),
    torch.allclose(ref_block.mlp.c_fc.weight.grad, light_block.mlp.c_fc.weight.grad),
)

(True, True, True, True, True, True)

In [22]:
# Show both c_attn weight gradients side by side for visual inspection.
(
    model1.h[0].attn.c_attn.weight.grad,
    model2.h[0].attn.c_attn.weight.grad,
)

(tensor([[ 1.6000e-06,  1.1021e-06, -2.9144e-06,  ..., -6.6696e-06,
          -1.1884e-06, -3.4231e-06],
         [ 2.7922e-06, -3.8577e-08,  3.6139e-06,  ..., -6.0986e-06,
           3.3991e-06,  6.3748e-06],
         [-3.2562e-06,  6.4851e-07, -1.1973e-07,  ..., -8.6639e-06,
           5.2016e-06, -1.7093e-06],
         ...,
         [-1.8265e-06,  2.2623e-06,  1.7213e-06,  ...,  4.6443e-07,
          -7.9710e-06, -6.7724e-06],
         [-7.3301e-07, -6.3993e-07,  1.1762e-06,  ..., -1.6340e-06,
          -2.1716e-06,  1.2650e-05],
         [-2.7116e-06, -6.4420e-09, -2.2616e-06,  ...,  7.5805e-06,
           1.0454e-05,  1.2966e-05]], device='cuda:0'),
 tensor([[ 1.6000e-06,  1.1021e-06, -2.9144e-06,  ..., -6.6696e-06,
          -1.1884e-06, -3.4231e-06],
         [ 2.7922e-06, -3.8577e-08,  3.6139e-06,  ..., -6.0986e-06,
           3.3991e-06,  6.3748e-06],
         [-3.2562e-06,  6.4851e-07, -1.1973e-07,  ..., -8.6639e-06,
           5.2016e-06, -1.7093e-06],
         ...,
        

## Mixed Precision

In [23]:
# Mixed-precision reference run: one-layer stock GPT2Model under float16 autocast.
torch.manual_seed(RANDOM_SEED)
b, seq = 4, 1024
x1 = torch.randint(
    0,
    50257,
    size=(b, seq),
    device='cuda',
)
configuration = GPT2Config(n_layer=1)
model1 = GPT2Model(configuration)
model1 = model1.to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.

# Forward, scalar reduction and backward all execute inside the autocast region.
with torch.autocast(
    device_type='cuda',
    dtype=torch.float16,
):
    y1 = model1(x1).last_hidden_state
    y1.cos().mean().backward()

In [24]:
# Mixed-precision run of the light variant: one-layer LightGPT2Model under float16 autocast.
torch.manual_seed(RANDOM_SEED)
b, seq = 4, 1024
x2 = torch.randint(
    0,
    50257,
    size=(b, seq),
    device='cuda',
)
configuration = GPT2Config(n_layer=1)
# Both light-attention flags are set to True on the config before the model is built.
configuration.use_dropmatmul = configuration.use_lightsoftmax = True
model2 = LightGPT2Model(configuration)
model2 = model2.to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.
# Forward, scalar reduction and backward all execute inside the autocast region.
with torch.autocast(
    device_type='cuda',
    dtype=torch.float16,
):
    y2 = model2(x2).last_hidden_state
    y2.cos().mean().backward()

torch.float32


In [25]:
# Agreement checks between the two mixed-precision runs.
# The c_attn gradient comparison uses an explicit absolute tolerance of 1e-6.
ref_block, light_block = model1.h[0], model2.h[0]
(
    torch.allclose(x1, x2),
    torch.allclose(ref_block.attn.c_attn.weight, light_block.attn.c_attn.weight),
    torch.allclose(ref_block.mlp.c_fc.weight, light_block.mlp.c_fc.weight),
    torch.allclose(y1, y2),
    torch.allclose(
        ref_block.attn.c_attn.weight.grad,
        light_block.attn.c_attn.weight.grad,
        atol=1e-6,
    ),
    torch.allclose(ref_block.mlp.c_fc.weight.grad, light_block.mlp.c_fc.weight.grad),
)

(True, True, True, True, True, True)

In [26]:
# Show both c_attn weight gradients from the mixed-precision runs side by side.
(
    model1.h[0].attn.c_attn.weight.grad,
    model2.h[0].attn.c_attn.weight.grad,
)

(tensor([[ 5.9605e-07,  3.5763e-07, -2.3842e-07,  ..., -8.5831e-06,
          -2.6226e-06, -1.7881e-06],
         [ 5.9605e-07, -3.5763e-07,  8.9407e-07,  ..., -7.7486e-06,
           4.3511e-06,  6.3181e-06],
         [ 1.1921e-07,  2.9802e-07,  2.3842e-07,  ..., -1.0729e-05,
           6.5565e-06, -4.4107e-06],
         ...,
         [ 0.0000e+00, -4.7684e-07, -8.9407e-07,  ..., -1.2517e-06,
          -7.6890e-06, -4.7684e-06],
         [ 5.3644e-07, -2.3842e-07, -4.7684e-07,  ..., -6.5565e-06,
          -7.7486e-07,  1.3173e-05],
         [-1.1921e-07, -1.7881e-07,  2.9802e-07,  ...,  7.9274e-06,
           1.1086e-05,  9.7752e-06]], device='cuda:0'),
 tensor([[ 5.9605e-07,  3.5763e-07, -2.3842e-07,  ..., -8.5831e-06,
          -2.5630e-06, -1.7881e-06],
         [ 4.7684e-07, -2.3842e-07,  1.0133e-06,  ..., -7.7486e-06,
           4.2915e-06,  6.3181e-06],
         [ 1.1921e-07,  3.5763e-07,  2.3842e-07,  ..., -1.0729e-05,
           6.4373e-06, -4.4107e-06],
         ...,
        

# Speed

## Forward

In [None]:
# Device handle used by the forward-speed cells.
device = torch.device("cuda")

In [None]:
# Forward-pass latency of the stock 12-layer GPT2Model, averaged over `samples` batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=12)
model1 = GPT2Model(configuration).to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.
samples = 100
b = 8
# Sequence length and vocabulary size come from the config rather than hard-coded literals.
seq = configuration.n_positions
xs = torch.randint(0, configuration.vocab_size, size=(samples, b, seq), device=device)

# Warm-up run so one-off allocation cost is kept out of the timed loop.
with torch.no_grad():
    y = model1(xs[0])[0]
# CUDA kernels launch asynchronously; wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
with torch.no_grad():
    for x in tqdm(xs):
        y = model1(x)[0]
        # Wait for the GPU to finish this batch so the elapsed time covers the real work.
        torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

In [None]:
# Forward-pass latency of the 12-layer LightGPT2Model, averaged over `samples` batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=12)
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model2 = LightGPT2Model(configuration).to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.
samples = 100
b = 8
# Sequence length and vocabulary size come from the config rather than hard-coded literals.
seq = configuration.n_positions
xs = torch.randint(0, configuration.vocab_size, size=(samples, b, seq), device=device)

# Warm-up run so one-off allocation cost is kept out of the timed loop.
with torch.no_grad():
    y = model2(xs[0])[0]
# CUDA kernels launch asynchronously; wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
with torch.no_grad():
    for x in tqdm(xs):
        y = model2(x)[0]
        # Wait for the GPU to finish this batch so the elapsed time covers the real work.
        torch.cuda.synchronize(device)
time2 = time.perf_counter()
print(f'Forward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

## Backward

In [None]:
# Device handle used by the backward-speed cells.
device = torch.device("cuda")

In [None]:
# Forward + backward latency of the stock 10-layer GPT2Model, averaged over `samples` batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=10)
model1 = GPT2Model(configuration).to(device)
# eval() is not called here, so the module keeps its freshly-constructed mode.
samples = 100
b = 4
# Sequence length and vocabulary size come from the config rather than hard-coded literals.
seq = configuration.n_positions
xs = torch.randint(0, configuration.vocab_size, size=(samples, b, seq), device=device)

# Warm-up run so one-off allocation cost is kept out of the timed loop.
y = model1(xs[0])[0]
y.mean().backward()
# CUDA kernels launch asynchronously; wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
for x in tqdm(xs):
    y = model1(x)[0]
    torch.cuda.synchronize(device)
    y.mean().backward()
    # Wait for the backward kernels as well before moving to the next batch.
    torch.cuda.synchronize(device)
time2 = time.perf_counter()
# The timed loop covers both passes, so the message says so.
print(f'Forward + backward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')

In [None]:
# Forward + backward latency of the 10-layer LightGPT2Model, averaged over `samples` batches.
torch.manual_seed(RANDOM_SEED)

configuration = GPT2Config(n_layer=10)
configuration.use_dropmatmul = True
# Enabled to match every other LightGPT2Model configuration in this notebook.
configuration.use_lightsoftmax = True
model2 = LightGPT2Model(configuration).to(device)
# eval() is deliberately NOT called: the GPT2Model baseline it is compared against is
# timed without eval(), and switching only one of the two models would skew the comparison.
samples = 100
b = 4
# Sequence length and vocabulary size come from the config rather than hard-coded literals.
seq = configuration.n_positions
xs = torch.randint(0, configuration.vocab_size, size=(samples, b, seq), device=device)

# Warm-up run so one-off allocation cost is kept out of the timed loop.
y = model2(xs[0])[0]
y.mean().backward()
# CUDA kernels launch asynchronously; wait for them before starting the clock.
torch.cuda.synchronize(device)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
time1 = time.perf_counter()
for x in tqdm(xs):
    y = model2(x)[0]
    torch.cuda.synchronize(device)
    y.mean().backward()
    # Wait for the backward kernels as well before moving to the next batch.
    torch.cuda.synchronize(device)
time2 = time.perf_counter()
# The timed loop covers both passes, so the message says so.
print(f'Forward + backward pass takes {((time2-time1) / samples):.3} seconds on average. Computed for {samples} samples.')