In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils import quantise_model,get_model_memory_size, compute_quantisation_mse, Perplexity
from datasets import load_dataset
import torch

# Tokenizer and weights are both loaded from the same GPT-2 checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# model = model.half()

# WikiText-2 (raw) test split, keeping only entries longer than 30 characters.
text = [
    entry
    for entry in load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"]
    if len(entry) > 30
]
# get_model_memory_size(model)
# model, parameter_mapping = quantise_model(model, chunk_size=1024)
# get_model_memory_size(model, parameter_mapping)


In [7]:
# Smoke-test the token-embedding layer on a dummy (1, 10) batch of token id 1.
t = torch.ones((1, 10), dtype=torch.long)
# Give the embedding its own name instead of rebinding `model`: overwriting
# `model` made this cell fail on a second run (the embedding layer is not
# accessed through `.transformer`) and left later cells that use `model`
# holding only the embedding layer.
wte = model.transformer.wte
# Call the module itself rather than `.forward` so registered hooks still run.
wte(t)

tensor([[[ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
         [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
         [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
         ...,
         [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
         [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
         [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432]]],
       grad_fn=<EmbeddingBackward0>)

In [2]:
# Move the model to the CUDA device; the cells below read torch.cuda memory counters.
model = model.to('cuda')

In [2]:
# CUDA memory currently allocated by tensors, converted from bytes to decimal megabytes.
torch.cuda.memory_allocated()/1000000

0.000512

In [3]:
# CUDA memory held by PyTorch's caching allocator, converted from bytes to decimal megabytes.
torch.cuda.memory_reserved()/1000000

2.097152

In [None]:
# Keep only the first 100 entries of `text`; cells below that use `text` run on this subset.
text = text[0:100]

In [None]:
# Display the first (name, parameter) pair yielded by the model's named_parameters().
list(model.named_parameters())[0]

In [None]:
# Evaluate `model` on `text` with the project's Perplexity helper (imported from utils).
perplexity = Perplexity()
# NOTE(review): `_compute` is an underscore-prefixed method; confirm whether a public
# entry point exists on Perplexity that should be called instead.
p = perplexity._compute(text, model, tokenizer)
# The result is indexed like a dict; only its 'mean_perplexity' entry is printed.
print(p['mean_perplexity'])


In [None]:
# Move every entry's 'scales' and 'locations' tensors in parameter_mapping to the GPU.
# NOTE(review): the only visible assignment of `parameter_mapping` is the commented-out
# quantise_model(...) line in the first cell, so this cell depends on that line being run.
for entry in parameter_mapping.values():
    for field in ('scales', 'locations'):
        entry[field] = entry[field].to('cuda')

In [None]:
# Qualitative check: generate a short continuation with the current `model`.
from transformers import pipeline, set_seed
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
# Fix the seed before generating.
set_seed(42)
# Request a single returned sequence, with max_length set to 30.
generator("Once upon a time, ", max_length=30, num_return_sequences=1)


0:1000
fp32: 135.18940559005736 (40 seconds, 9533MiB memory, 100% GPU util)
fp16: 135.193484375 (15 seconds, 4847MiB memory, 100% GPU util)
int8_1024: 191.00541243076324 (took 9 minutes, used approx. 9361MiB memory, very low GPU util (20% max)). Putting the parameter_mapping tensors on the GPU actually slows inference down further, to 12 minutes.

0:100
fp32: 188.55674095153807
int8_256: 238.52101341247558
int8_1024: 299.4311764335632

Moving the model itself to CUDA (memory used):
base: 642
fp16: 368 (this is 1.15 times bigger than expected)
int8_1024: 242 (this is 1.5 times bigger than expected), and this does not include the 100MB taken by parameter_mapping, which should only be about 2MB

In [1]:
import math
from utils import scalar_quantisation
import torch


# Start recording CUDA allocation history; the snapshot is dumped at the end of this cell.
# NOTE(review): `_record_memory_history` is an underscore-prefixed torch API — confirm it
# is available in the installed torch version.
torch.cuda.memory._record_memory_history()
# Chunk length, read as a global by q() and q_inplace() below.
chunk_size = 1000
# 10,000,000 random values, created with torch.rand and then copied to the CUDA device.
t = torch.rand((10000000))
t = t.to('cuda')

def q(t):
    """Quantise ``t`` chunk by chunk into a new int8 tensor of the same shape.

    The tensor is flattened and split into consecutive chunks of at most
    ``chunk_size`` elements (``chunk_size`` is read from the notebook's global
    scope). Each chunk is passed to ``scalar_quantisation`` (imported from
    ``utils``), which is expected to return a
    ``(quantised_chunk, scale, location)`` triple — TODO confirm against utils.

    Args:
        t: Tensor to quantise. This function itself never writes to it.

    Returns:
        An int8 tensor with the same shape as ``t`` holding the quantised
        chunks. The per-chunk scales and locations are computed but not
        returned.
    """
    shape = t.shape
    t_flat = t.flatten()
    n_elements = len(t_flat)
    # Allocate the output directly as int8 instead of building a buffer in
    # t's dtype and converting it afterwards.
    t_q = torch.zeros_like(t_flat, dtype=torch.int8)
    n_chunks = math.ceil(n_elements / chunk_size)
    scales = torch.zeros(n_chunks)
    locations = torch.zeros(n_chunks)

    for chunk_id in range(n_chunks):
        left = chunk_id * chunk_size
        # The last chunk may be shorter than chunk_size.
        right = min(n_elements, (chunk_id + 1) * chunk_size)

        t_q[left:right], scales[chunk_id], locations[chunk_id] = scalar_quantisation(t_flat[left:right])

    # Return the quantised values, not the unquantised `t_flat`.
    return t_q.reshape(shape)

def q_inplace(t):
    """Quantise ``t`` chunk by chunk, reusing the flattened tensor as the work buffer.

    Each chunk of at most ``chunk_size`` elements (a notebook-level global) is
    passed to ``scalar_quantisation`` and the first returned value is written
    back over the same slice of the flattened tensor. The buffer is cast to
    int8 only after every chunk has been processed.

    NOTE(review): ``flatten()`` may return a view of ``t`` rather than a copy;
    if it does, the caller's tensor is overwritten as well — confirm this is
    intended.

    Args:
        t: Tensor to quantise.

    Returns:
        An int8 tensor with the same shape as ``t``. The per-chunk scales and
        locations are computed but not returned.
    """
    original_shape = t.shape
    flat = t.flatten()
    total = len(flat)
    n_chunks = math.ceil(total / chunk_size)
    scales = torch.zeros(n_chunks)
    locations = torch.zeros(n_chunks)

    # Step through the chunk start offsets; the final chunk is clipped to the tensor end.
    for idx, start in enumerate(range(0, total, chunk_size)):
        stop = min(total, start + chunk_size)
        flat[start:stop], scales[idx], locations[idx] = scalar_quantisation(flat[start:stop])

    return flat.reshape(original_shape).type(torch.int8)

# Run the chunked quantiser on the CUDA tensor while allocation history is being recorded.
# q_inplace() is defined above but is not called in this cell.
qq = q(t)


# Write the recorded allocation history to "q.pickle" in the working directory.
torch.cuda.memory._dump_snapshot("q.pickle")



In [1]:
import torch

# Allocate two 100,000,000-element random tensors, copy each to CUDA, and keep them
# reachable only through the dict `d`.
# NOTE(review): nothing here appears to need autograd, so the no_grad context is
# likely redundant — confirm.
with torch.no_grad():

    d = {'a': torch.rand((100000000)).to('cuda'), 'b':torch.rand((100000000)).to('cuda')}

In [2]:
# Drop both dict entries and then the dict itself, removing these references to the tensors.
with torch.no_grad():

    del d['a']
    del d['b']
    del d

In [3]:
# Ask PyTorch's caching allocator to release cached CUDA memory it is no longer using.
torch.cuda.empty_cache()

In [4]:
# NOTE(review): `b` is not assigned in any cell visible in this notebook; the recorded
# output must come from earlier kernel state, so this cell is expected to raise
# NameError on Restart & Run All unless another cell defines it.
b

tensor([0.8888, 0.2352, 0.0999,  ..., 0.0175, 0.4840, 0.5250], device='cuda:0')

In [5]:
# NOTE(review): `a` is not assigned in any cell visible in this notebook; the recorded
# output must come from earlier kernel state, so this cell is expected to raise
# NameError on Restart & Run All unless another cell defines it.
a

tensor([0.9091, 0.5111, 0.9969,  ..., 0.5063, 0.5423, 0.8254], device='cuda:0')