In [1]:
import sys
# Put the parent directory on the module search path; presumably this is what
# lets the local `light_attention` package (imported in the next cell) resolve
# when the notebook is run from its own folder.
sys.path.append('..')

In [2]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
import transformers
import torch
from torch import nn
import copy
import time
import tqdm
import gc

from light_attention.models.gpt2 import LightGPT2LMHeadModel, LightGPT2Model
from light_attention.profile import estimate_layer_memory, mem_usage

2022-06-28 13:37:15.743104: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# Record the library versions the numbers below were produced with.
# Note: torch.device("cuda") only builds a device handle; it does not verify
# that a CUDA GPU is actually available.
torch.__version__, transformers.__version__, torch.device("cuda")

('1.10.0+cu111', '4.20.1', device(type='cuda'))

In [4]:
# Seed handed to torch.manual_seed at the top of every experiment cell.
RANDOM_SEED = 15835

# Memory

### GPT2-small

### Vanilla

In [5]:
# Memory profile of the stock Hugging Face GPT2Model at GPT2-small dimensions.
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=12, n_layer=12, n_positions=1024, n_embd=768)
model = GPT2Model(configuration)
b = 4  # batch size
seq = configuration.n_positions  # profile at the maximum configured sequence length
emb = configuration.n_embd  # NOTE(review): not used by any cell visible here
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
# NOTE(review): input_shape=None presumably means "use `x` as the input" — confirm in light_attention.profile.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

Params (empirical) 487.4688 MB

After input batch generation, before forward pass:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

After backward:
MA 12119.2896 MB         Max_MA 12119.2896 MB         CA 12654.0 MB         Max_CA 12654.0 MB 

Activations (empirical) 11631.7896 MB


### Light

In [5]:
# Memory profile of LightGPT2Model at GPT2-small dimensions (same sizes and
# batch as the vanilla GPT2-small cell).
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=12, n_layer=12, n_positions=1024, n_embd=768)
# Extra attributes attached to the config; presumably read by LightGPT2Model to
# switch on its memory-saving matmul/softmax variants — confirm in light_attention.models.gpt2.
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b = 4  # batch size
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

Params (empirical) 487.4688 MB

After input batch generation, before forward pass:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

After backward:
MA 7799.2896 MB         Max_MA 7799.2896 MB         CA 7902.0 MB         Max_CA 7902.0 MB 

Activations (empirical) 7311.7896 MB


### GPT2-medium

In [29]:
# Memory profile of the stock GPT2Model at GPT2-medium dimensions.
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=16, n_layer=24, n_positions=1024, n_embd=1024)
model = GPT2Model(configuration)
b = 4  # batch size
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

Params (empirical) 1377.5547 MB

Params (analytical, torch) 1353.543 MB

After input batch generation, before forward pass:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

After backward:
MA 32351.126 MB         Max_MA 32351.126 MB         CA 33558.0 MB         Max_CA 33558.0 MB 


Activations (analytical, torchviz) 32485.571 MB
Activations (empirical) 30973.5396 MB


In [36]:
# Memory profile of LightGPT2Model at GPT2-medium dimensions (same sizes and
# batch as the vanilla GPT2-medium cell).
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=16, n_layer=24, n_positions=1024, n_embd=1024)
# Extra attributes attached to the config; presumably read by LightGPT2Model to
# switch on its memory-saving matmul/softmax variants — confirm in light_attention.models.gpt2.
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b = 4  # batch size
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

Params (empirical) 1377.5547 MB

Params (analytical, torch) 1353.543 MB

After input batch generation, before forward pass:
MA 1377.5864 MB         Max_MA 1377.5864 MB         CA 1380.0 MB         Max_CA 1380.0 MB 

After backward:
MA 20831.126 MB         Max_MA 20831.126 MB         CA 20870.0 MB         Max_CA 20870.0 MB 


Activations (analytical, torchviz) 20197.5711 MB
Activations (empirical) 19453.5396 MB


### GPT2-large

In [33]:
# Memory profile of the stock GPT2Model at GPT2-large dimensions.
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=20, n_layer=36, n_positions=1024, n_embd=1280)
model = GPT2Model(configuration)
b = 4  # batch size
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

Params (empirical) 3061.3125 MB

Params (analytical, torch) 2952.6904 MB

After input batch generation, before forward pass:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

After backward:
MA 61104.6338 MB         Max_MA 61104.6338 MB         CA 63376.0 MB         Max_CA 63376.0 MB 


Activations (analytical, torchviz) 60887.3214 MB
Activations (empirical) 58043.2896 MB


In [35]:
# Memory profile of LightGPT2Model at GPT2-large dimensions (same sizes and
# batch as the vanilla GPT2-large cell).
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=20, n_layer=36, n_positions=1024, n_embd=1280)
# Extra attributes attached to the config; presumably read by LightGPT2Model to
# switch on its memory-saving matmul/softmax variants — confirm in light_attention.models.gpt2.
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
b = 4  # batch size
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0317 MB         Max_MA 0.0317 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

Params (empirical) 3061.3125 MB

Params (analytical, torch) 2952.6904 MB

After input batch generation, before forward pass:
MA 3061.3442 MB         Max_MA 3061.3442 MB         CA 3138.0 MB         Max_CA 3138.0 MB 

After backward:
MA 39504.6338 MB         Max_MA 39504.6338 MB         CA 39616.0 MB         Max_CA 39616.0 MB 


Activations (analytical, torchviz) 37847.3215 MB
Activations (empirical) 36443.2896 MB


### GPT2-xl

In [7]:
# Memory profile of the stock GPT2Model at GPT2-xl dimensions.
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=25, n_layer=48, n_positions=1024, n_embd=1600)
model = GPT2Model(configuration)
# Batch size is 2 here, whereas the smaller models above are profiled with 4.
# NOTE(review): presumably reduced so the vanilla model fits in GPU memory — confirm.
b = 2
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0156 MB         Max_MA 0.0156 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 6124.4683 MB         Max_MA 6124.4683 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

Params (empirical) 6124.4526 MB

Params (analytical, torch) 5941.8152 MB

After input batch generation, before forward pass:
MA 6124.4683 MB         Max_MA 6124.4683 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

After backward:
MA 54503.8359 MB         Max_MA 54503.8359 MB         CA 56948.0 MB         Max_CA 56948.0 MB 


Activations (analytical, torchviz) 50729.6655 MB
Activations (empirical) 48379.3677 MB


In [9]:
# Memory profile of LightGPT2Model at GPT2-xl dimensions.
torch.manual_seed(RANDOM_SEED)  # reseed so weight init and the random batch are reproducible
configuration = GPT2Config(n_head=25, n_layer=48, n_positions=1024, n_embd=1600)
# Extra attributes attached to the config; presumably read by LightGPT2Model to
# switch on its memory-saving matmul/softmax variants — confirm in light_attention.models.gpt2.
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
model = LightGPT2Model(configuration)
# NOTE(review): batch size is 4 here but 2 in the vanilla GPT2-xl cell, so the
# two "Activations" figures are NOT directly comparable (this run processes twice
# as many tokens). Presumably intentional, to show the light model fits a batch
# the vanilla one cannot — confirm, or rerun both with the same `b`.
b = 4
seq = configuration.n_positions  # profile at the maximum configured sequence length
# Random token ids of shape (b, seq), created directly on the GPU.
x = torch.randint(0, configuration.vocab_size, size=(b,seq), device='cuda')
# The helper receives its own deep copy of the model and prints the report shown below.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)
torch.cuda.empty_cache()  # release cached GPU memory that is no longer in use


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 6124.4839 MB         Max_MA 6124.4839 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

Params (empirical) 6124.4526 MB

Params (analytical, torch) 5941.8152 MB

After input batch generation, before forward pass:
MA 6124.4839 MB         Max_MA 6124.4839 MB         CA 6140.0 MB         Max_CA 6140.0 MB 

After backward:
MA 67049.7734 MB         Max_MA 67049.7734 MB         CA 67132.0 MB         Max_CA 67132.0 MB 


Activations (analytical, torchviz) 63059.322 MB
Activations (empirical) 60925.2896 MB


# Correctness

In [6]:
# Reference run for the correctness check: a one-layer stock GPT2Model,
# one forward pass and one backward pass on a random batch.
torch.manual_seed(RANDOM_SEED)  # same seed as the light run so inputs and weights match
b = 2
seq = 1024
# Build the config first so the vocabulary size comes from it rather than a
# hard-coded 50257 (the GPT2Config default); constructing a config draws no
# random numbers, so the sampled batch and the initial weights are unchanged.
configuration = GPT2Config(n_layer=1)
# Random token ids of shape (b, seq), created directly on the GPU.
x1 = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
model1 = GPT2Model(configuration)
model1 = model1.cuda()
# NOTE(review): the model is left in training mode, so any dropout in the
# config is active; the later comparison relies on both runs consuming the
# RNG identically after reseeding — confirm this is intended.
y1 = model1(x1).last_hidden_state
# Arbitrary scalar function of the output, used only to obtain gradients.
y1.cos().mean().backward()

In [7]:
# Light run for the correctness check: a one-layer LightGPT2Model with both
# light-attention flags on, same seed and batch shape as the reference run.
torch.manual_seed(RANDOM_SEED)  # same seed as the reference run so inputs and weights match
b = 2
seq = 1024
# Build the config first so the vocabulary size comes from it rather than a
# hard-coded 50257 (the GPT2Config default); constructing a config draws no
# random numbers, so the sampled batch and the initial weights are unchanged.
configuration = GPT2Config(n_layer=1)
# Extra attributes attached to the config; presumably read by LightGPT2Model to
# switch on its memory-saving matmul/softmax variants — confirm in light_attention.models.gpt2.
configuration.use_dropmatmul = True
configuration.use_lightsoftmax = True
# Random token ids of shape (b, seq), created directly on the GPU.
x2 = torch.randint(0, configuration.vocab_size, size=(b, seq), device='cuda')
model2 = LightGPT2Model(configuration)
model2 = model2.cuda()
# NOTE(review): the model is left in training mode, so any dropout in the
# config is active; the later comparison relies on both runs consuming the
# RNG identically after reseeding — confirm this is intended.
y2 = model2(x2).last_hidden_state
# Same scalar function as the reference run, used only to obtain gradients.
y2.cos().mean().backward()

In [8]:
# Sanity checks between the reference run (model1) and the light run (model2).
# The displayed tuple is, in order:
#   1. identical input token ids
#   2. identical initial attention projection weights
#   3. matching forward outputs
#   4. matching gradients of the attention projection weights
#   5. matching gradients of the first MLP projection weights
# Token ids are integers, so they are compared exactly with torch.equal;
# floating-point tensors use torch.allclose with its default tolerances.
(
    torch.equal(x1, x2),
    torch.allclose(model1.h[0].attn.c_attn.weight, model2.h[0].attn.c_attn.weight),
    torch.allclose(y1, y2),
    torch.allclose(model1.h[0].attn.c_attn.weight.grad, model2.h[0].attn.c_attn.weight.grad),
    torch.allclose(model1.h[0].mlp.c_fc.weight.grad, model2.h[0].mlp.c_fc.weight.grad),
)

(True, True, True, True, True)

In [9]:
# Show the attention projection gradients of both models side by side for a
# visual check; in the printed output they differ at most in the last digit.
model1.h[0].attn.c_attn.weight.grad, model2.h[0].attn.c_attn.weight.grad

(tensor([[ 1.5888e-06,  2.3713e-06,  1.1521e-06,  ...,  9.5301e-06,
          -3.9004e-06,  3.4029e-06],
         [ 2.2943e-06,  4.1748e-07, -1.6698e-06,  ...,  4.8337e-06,
          -1.2031e-06, -1.3247e-05],
         [-2.0791e-06, -5.0380e-07, -7.1940e-07,  ..., -1.0359e-05,
          -8.5861e-06, -8.6150e-06],
         ...,
         [ 3.3493e-06,  2.2633e-06,  1.2692e-07,  ..., -3.9649e-06,
           4.8613e-06,  5.5402e-06],
         [-3.0494e-06,  8.7274e-07, -8.1078e-07,  ...,  4.1371e-06,
          -1.0492e-05, -1.2938e-07],
         [-1.7868e-06,  1.8953e-06, -2.4562e-06,  ...,  1.7462e-05,
           6.1537e-06,  2.0687e-05]], device='cuda:0'),
 tensor([[ 1.5888e-06,  2.3713e-06,  1.1521e-06,  ...,  9.5301e-06,
          -3.9004e-06,  3.4029e-06],
         [ 2.2943e-06,  4.1749e-07, -1.6698e-06,  ...,  4.8337e-06,
          -1.2031e-06, -1.3247e-05],
         [-2.0791e-06, -5.0379e-07, -7.1940e-07,  ..., -1.0359e-05,
          -8.5861e-06, -8.6150e-06],
         ...,
        