In [None]:
!pip install -U -q transformers bitsandbytes

In [None]:
from huggingface_hub import login
# Log in to the Hugging Face Hub so that later `from_pretrained` downloads are
# authenticated (presumably prompts for an access token — confirm for your
# environment).
login()

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# 4-bit quantization makes it possible to run larger models on small consumer
# GPUs. Weights are stored as 4-bit NF4 with double quantization enabled, and
# bfloat16 is used as the compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

## LLMs generate sentences autoregressively

We use GPT2 as an example.

In [None]:
# Load GPT-2 with default settings (no quantization, no device map), so the
# model lives on the same device as the tokenizer's output tensors below.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# `quantization_config` and `device_map` are model-loading options and have no
# meaning for a tokenizer, so they are not passed here. `trust_remote_code` is
# dropped as well: nothing in this cell needs code from the model repository.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Show the module tree of the network.
print(model)

# Tokenizer and embedding

The tokenizer converts an English sentence into a sequence of integers (token IDs).

The embedding is a large lookup table: each token ID is used as an index, and the entry (vector) stored at that index is returned.

In [None]:


# Turn a prompt into PyTorch tensors that the model can consume.
prompt = "Do you love me?"
model_inputs = tokenizer(prompt, return_tensors="pt")
print(model_inputs)

# Feed the token IDs through the model's `wte` embedding layer and show the
# resulting vectors.
token_embeddings = model.transformer.wte(model_inputs['input_ids'])
print(token_embeddings)


In [None]:
# Some tokenizers (GPT-2 among them) define no padding token, in which case
# `tokenizer.pad_token_id` is None and assigning it configures nothing.
# Fall back to the end-of-sequence id so generation has an explicit pad id.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = pad_token_id

# Greedy decoding (`do_sample=False`) of at most 10 new tokens.
generated_ids = model.generate(**model_inputs, max_new_tokens=10, do_sample=False)

# Convert the token IDs (prompt + continuation) back to text.
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [None]:
# Skip the first `prompt_length` positions of the first sequence and print
# only what follows them.
prompt_length = model_inputs['input_ids'].shape[1]
print('Generated tokens', generated_ids[0, prompt_length:])

In [None]:
import copy

# Shallow copy of the tokenized prompt: `new_model_inputs` is the working
# variable for the manual decoding steps, while `model_inputs` keeps referring
# to the original prompt encoding.
new_model_inputs = copy.copy(model_inputs)

Under the hood, the model predicts a probability for every possible next token; in this case (greedy decoding) the token with the highest probability is chosen as the output.

The chosen token is appended to the list of input integers, and the process repeats until the model predicts the end-of-sequence token (or a maximum number of new tokens is reached).

In [None]:
# Greedy decoding by hand: at most 10 steps, each one picking the most likely
# next token and appending it to the input for the following step.
for step in range(10):

    # Inference only: disable autograd so no computation graph is recorded
    # for the forward pass.
    with torch.no_grad():
        next_token_logits = model(**new_model_inputs)['logits'][0, -1]

    # Scores at the last position of the first sequence -> index of the best token.
    highest_prob_token = torch.argmax(next_token_logits)
    print('Tokens with highest probability', highest_prob_token)
    print(tokenizer.batch_decode(highest_prob_token.reshape(1,-1)))

    # Append the chosen token along the sequence dimension.
    new_model_inputs = dict(
        input_ids = torch.cat([new_model_inputs['input_ids'], highest_prob_token.reshape(1,1)], dim=1),
    )

    # Every position is a real token, so the attention mask is all ones.
    new_model_inputs['attention_mask'] = torch.ones_like(new_model_inputs['input_ids'])

    # Stop early once the model emits the end-of-sequence token.
    if highest_prob_token.item() == tokenizer.eos_token_id:
        break



In [None]:
# Build inputs made of the ORIGINAL prompt plus one extra token.
# NOTE(review): `highest_prob_token` is whatever value an earlier cell left
# behind (after the decoding loop that is its last pick, not the first
# predicted token) — confirm this is the token intended here.
next_input_ids = torch.cat(
    [model_inputs['input_ids'], highest_prob_token.reshape(1, 1)],
    dim=1,
)
new_model_inputs = {
    'input_ids': next_input_ids,
    # All positions are real tokens, hence a mask of ones.
    'attention_mask': torch.ones_like(next_input_ids),
}


In [None]:
# Single decoding step on `new_model_inputs`. Autograd is disabled because
# this is inference only and no computation graph is needed.
with torch.no_grad():
    next_token_logits = model(**new_model_inputs)['logits'][0, -1]

# Index of the highest-scoring token at the last position, then its text form.
next_highest_prob_token = torch.argmax(next_token_logits)
print('Tokens with highest probability', next_highest_prob_token)
print(tokenizer.batch_decode(next_highest_prob_token.reshape(1,-1)))

In [None]:


# From here on `model` and `tokenizer` are rebound to a different checkpoint,
# replacing the GPT-2 objects used in the cells above.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the weights with the 4-bit quantization config defined earlier and let
# `device_map='auto'` decide where the layers are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Prompt wrapped in the [INST] ... [/INST] instruction markers.
text = "[INST] Do you love me? [/INST]"

# The tokenizer for `model_id` was already loaded together with the model, so
# it is reused here instead of being loaded a second time.
# NOTE(review): `add_special_tokens=False` means the tokenizer adds no special
# tokens (e.g. a beginning-of-sequence token) around the prompt — confirm that
# this matches the prompt format the model expects.
encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)

# Move the inputs to wherever the model lives rather than hard-coding 'cuda:0'.
model_inputs = encodeds.to(model.device)

# `tokenizer.pad_token_id` may be None when the tokenizer defines no padding
# token; fall back to the end-of-sequence id so the setting takes effect.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = pad_token_id

# Sampling is enabled, so the generated text differs from run to run.
generated_ids = model.generate(**model_inputs, max_new_tokens=200, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [None]:
# Display the token IDs the tokenizer produced for the prompt.
encodeds['input_ids']

In [None]:
# Pass the prompt's token IDs through the model's `embed_tokens` layer and show
# the shape of the result (presumably batch x sequence length x hidden size —
# confirm against the displayed output).
model.model.embed_tokens(encodeds['input_ids']).shape