In [1]:
!pip install -U -q transformers bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from huggingface_hub import login
login()

In [3]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# quantization allows to run larger model on small household GPUs
bnb_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

## LLMs generates sentence autoregressively

We use GPT2 as an example.

In [13]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2",
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)

print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


# Tokenizer and embedding

Tokenizer transforms sentence in English to integers.

Embedding is a huge table where integers are converted to their indexed entries.

In [15]:


# Tokenize some input text
model_inputs = tokenizer("Do you love me?", return_tensors="pt")

print(model_inputs)
print(model.transformer.wte(model_inputs['input_ids']))


{'input_ids': tensor([[5211,  345, 1842,  502,   30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
tensor([[[ 0.1801, -0.0464,  0.1143,  ...,  0.1077, -0.0258,  0.1069],
         [-0.0337,  0.0484,  0.0309,  ..., -0.1242, -0.0810, -0.0539],
         [-0.1244, -0.1096, -0.0113,  ..., -0.1417,  0.0967, -0.2077],
         [ 0.1515, -0.0247,  0.0936,  ..., -0.1684,  0.1065,  0.0572],
         [-0.0660, -0.0374,  0.0515,  ..., -0.0804,  0.1048, -0.1505]]],
       grad_fn=<EmbeddingBackward0>)


In [None]:
model.generation_config.pad_token_id = tokenizer.pad_token_id
generated_ids = model.generate(**model_inputs, max_new_tokens=10, do_sample=False)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [5]:
print('Generated tokens', generated_ids[0,model_inputs['input_ids'].shape[1]:])

Generated tokens tensor([ 314, 1842,  345,   13,  314, 1842,  345,   13,  314, 1842])


In [6]:
import copy

new_model_inputs = copy.copy(model_inputs)

Under the hood, the model predicts the probability of the next token, and the highest in this case is chosen to be the output.

The chosen token is concatenated to the input integer list, and the process goes on until the model predict the End-of-Sentence token.

In [11]:
for i in range(10):

    highest_prob_token = torch.argmax(model(**new_model_inputs)['logits'][0,-1])
    print('Tokens with highest probability', highest_prob_token)
    print(tokenizer.batch_decode(highest_prob_token.reshape(1,-1)))

    new_model_inputs = dict(
        input_ids = torch.cat([new_model_inputs['input_ids'], highest_prob_token.reshape(1,1)], axis=1),
    )

    new_model_inputs['attention_mask'] = torch.ones_like(new_model_inputs['input_ids'])



Tokens with highest probability tensor(198)
['\n']
Tokens with highest probability tensor(40)
['I']
Tokens with highest probability tensor(1842)
[' love']
Tokens with highest probability tensor(345)
[' you']
Tokens with highest probability tensor(13)
['.']
Tokens with highest probability tensor(198)
['\n']
Tokens with highest probability tensor(198)
['\n']
Tokens with highest probability tensor(40)
['I']
Tokens with highest probability tensor(1842)
[' love']
Tokens with highest probability tensor(345)
[' you']


In [8]:
new_model_inputs = dict(
    input_ids = torch.cat([model_inputs['input_ids'], highest_prob_token.reshape(1,1)], axis=1),
)

new_model_inputs['attention_mask'] = torch.ones_like(new_model_inputs['input_ids'])


In [9]:
next_highest_prob_token = torch.argmax(model(**new_model_inputs)['logits'][0,-1])
print('Tokens with highest probability', next_highest_prob_token)
print(tokenizer.batch_decode(next_highest_prob_token.reshape(1,-1)))

Tokens with highest probability tensor(198)
['\n']


In [10]:


model_id = "mistralai/Mistral-7B-Instruct-v0.1"


model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map='auto',
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id,
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [None]:
text = "[INST] Do you love me? [/INST]"

tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id,
)

encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)
model_inputs = encodeds.to('cuda:0')
model.generation_config.pad_token_id = tokenizer.pad_token_id
generated_ids = model.generate(**model_inputs, max_new_tokens=200, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [None]:
encodeds['input_ids']

In [None]:
model.model.embed_tokens(encodeds['input_ids']).shape