In [1]:
import sys
# Put the parent of the notebook's working directory on the import path (presumably so the
# `light_attention` package imported below resolves — confirm against the repo layout).
# Guarded so that re-running this cell does not accumulate duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import os
# Expose the Graphviz executables to this process (presumably needed by the torchviz-based
# estimate in light_attention.profile — TODO confirm).
# The default is a machine-specific install location; set the GRAPHVIZ_BIN_DIR environment
# variable before starting the kernel to point at a different Graphviz `bin` directory.
GRAPHVIZ_BIN_DIR = os.environ.get('GRAPHVIZ_BIN_DIR', '/trinity/home/d.cherniuk/libs/graphviz-2.50.0/bin')
_current_path = os.environ.get("PATH", "")
# Append only when missing so that re-running the cell does not keep growing PATH;
# an unset/empty PATH is handled instead of raising KeyError.
if GRAPHVIZ_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        _current_path + os.pathsep + GRAPHVIZ_BIN_DIR if _current_path else GRAPHVIZ_BIN_DIR
    )

In [3]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
import torch
from torch import nn
import copy
import time
import tqdm
from prettytable import PrettyTable

from light_attention.attention import LightGPT2LMHeadModel, LightGPT2Model
from light_attention.profile import estimate_layer_memory, mem_usage

In [4]:
# Seed handed to torch.manual_seed() at the start of every model-building cell below,
# so the vanilla and light runs start from the same RNG state.
RANDOM_SEED = 15835

# Memory

## Vanilla

In [5]:
# Baseline: stock GPT-2 backbone built from the default configuration, seeded right before
# construction. The model is not moved to the GPU here; the profiling call in the following
# cell is given device='cuda'.
configuration = GPT2Config()
torch.manual_seed(RANDOM_SEED)
model = GPT2Model(configuration)

In [6]:
# Profile the vanilla model on a batch of random token ids; the sequence length is taken
# from configuration.n_positions and token ids are drawn from [0, vocab_size).
b, seq = 4, configuration.n_positions
x = torch.randint(low=0, high=configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is handed to the profiler rather than the notebook-level `model` object.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

Params (empirical) 487.4688 MB

Params (analytical, torch) 474.7002 MB

After input batch generation, before forward pass:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

After backward:
MA 12119.2896 MB         Max_MA 12119.2896 MB         CA 12654.0 MB         Max_CA 12654.0 MB 


Activations (analytical, torchviz) 12195.8207 MB
Activations (empirical) 11631.7896 MB


## Light

In [5]:
# Light variant: same seed and default GPT-2 configuration as the vanilla cell, with both
# light-attention switches enabled on the config before LightGPT2Model is built.
# The model is not moved to the GPU here; the profiling call in the following cell is
# given device='cuda'.
torch.manual_seed(RANDOM_SEED)
configuration = GPT2Config(use_dropmatmul=True, use_lightsoftmax=True)
model = LightGPT2Model(configuration)

In [6]:
# Profile the light model with the same batch shape as the vanilla run: 4 sequences whose
# length is configuration.n_positions, token ids drawn from [0, vocab_size).
b, seq = 4, configuration.n_positions
x = torch.randint(low=0, high=configuration.vocab_size, size=(b, seq), device='cuda')
# A deep copy is handed to the profiler rather than the notebook-level `model` object.
estimate_layer_memory(copy.deepcopy(model), x, device='cuda', input_shape=None)


Before placing the model on GPU
MA 0.0312 MB         Max_MA 0.0312 MB         CA 2.0 MB         Max_CA 2.0 MB 

After placing the model on GPU:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

Params (empirical) 487.4688 MB

Params (analytical, torch) 474.7002 MB

After input batch generation, before forward pass:
MA 487.5 MB         Max_MA 487.5 MB         CA 542.0 MB         Max_CA 542.0 MB 

After backward:
MA 7799.2896 MB         Max_MA 7799.2896 MB         CA 7902.0 MB         Max_CA 7902.0 MB 


Activations (analytical, torchviz) 7587.8207 MB
Activations (empirical) 7311.7896 MB


In [8]:
# Side-by-side summary of the two profiling runs. The figures are transcribed by hand from
# the "After backward" (Max_MA / Max_CA) and "Activations (empirical)" lines printed above,
# rounded to 0.1 MB — update them if the profiling cells are re-run.
table = PrettyTable(
    ["Model", "Max Memory Allocated, MB", "Max Memory Reserved, MB", "Activations Memory, MB"]
)
table.title = "batch_size=4, seq_length=1024, emb_size=768, blocks=12"
table.add_row(["Vanilla gpt2", 12119.3, 12654.0, 11631.8])
table.add_row(["Light gpt2", 7799.3, 7902.0, 7311.8])
print(table)

+--------------------------------------------------------------------------------------------+
|                   batch_size=4, seq_length=1024, emb_size=768, blocks=12                   |
+--------------+--------------------------+-------------------------+------------------------+
|    Model     | Max Memory Allocated, MB | Max Memory Reserved, MB | Activations Memory, MB |
+--------------+--------------------------+-------------------------+------------------------+
| Vanilla gpt2 |         12119.3          |         12654.0         |        11631.8         |
|  Light gpt2  |          7799.3          |          7902.0         |         7311.8         |
+--------------+--------------------------+-------------------------+------------------------+


# Correctness

In [5]:
# Reference run: a single-layer vanilla GPT-2 on a seeded batch of random token ids.
torch.manual_seed(RANDOM_SEED)
b, seq = 2, 1024
x1 = torch.randint(low=0, high=50257, size=(b, seq), device='cuda')
configuration = GPT2Config(n_layer=1)
model1 = GPT2Model(configuration).cuda()
y1 = model1(x1).last_hidden_state
# Backpropagate a scalar reduction of the output so the parameter .grad fields are filled in
# for the comparison cell.
y1.cos().mean().backward()

In [10]:
# Light run: same seed, batch shape and single-layer setup as the reference cell, but built
# with LightGPT2Model and both light-attention switches turned on.
torch.manual_seed(RANDOM_SEED)
b, seq = 2, 1024
x2 = torch.randint(low=0, high=50257, size=(b, seq), device='cuda')
configuration = GPT2Config(n_layer=1, use_dropmatmul=True, use_lightsoftmax=True)
model2 = LightGPT2Model(configuration).cuda()
y2 = model2(x2).last_hidden_state
# Backpropagate a scalar reduction of the output so the parameter .grad fields are filled in
# for the comparison cell.
y2.cos().mean().backward()

In [11]:
# Compare the reference and light runs pairwise; every entry of the displayed tuple should be
# True. Order: inputs, c_attn weights, outputs, c_attn weight grads, mlp.c_fc weight grads.
tuple(
    torch.allclose(reference, light)
    for reference, light in (
        (x1, x2),
        (model1.h[0].attn.c_attn.weight, model2.h[0].attn.c_attn.weight),
        (y1, y2),
        (model1.h[0].attn.c_attn.weight.grad, model2.h[0].attn.c_attn.weight.grad),
        (model1.h[0].mlp.c_fc.weight.grad, model2.h[0].mlp.c_fc.weight.grad),
    )
)

(True, True, True, True, True)