In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub id of the checkpoint used throughout the notebook.
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

In [3]:
# Load the tokenizer and the causal LM for MODEL_ID, then move the model to DEVICE.
# NOTE(review): `trust_remote_code=True` allows Python code shipped in the Hub
# repository to run locally -- confirm it is still required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype="auto",  # let the library pick the dtype for the checkpoint
)
model = model.to(DEVICE)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 15.07it/s]


In [37]:
# Polish prompt: an instruction to answer only from the given context, followed
# by a `TEXT` passage and a `QUERY` about it.
# The three pieces are joined into ONE line (no newlines). The four spaces at
# the start of the 2nd and 3rd pieces are part of the prompt on purpose: they
# reproduce the exact string (and therefore the exact tokenisation) that the
# recorded outputs in this notebook were produced with.
input_text = (
    "Bazując na kontekście odpowiedz na pytania - nie używaj swojej wiedzy wewnętrznej. "
    "    `TEXT`: 'Korona Kielce nigdy nie wygrała ekstraklasy. W 2012 roku zdobyła swoje najwyższe 5 miejsce w historii.' "
    "    `QUERY`: W którym roku Korona Kielce wygrała ekstraklasę? Podaj tylko rok."
)

# Tokenise the prompt into PyTorch tensors and move them to DEVICE.
# NOTE: despite its name, `input_ids` holds the whole tokenizer output; other
# cells read both its "input_ids" and "attention_mask" entries.
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.to(DEVICE)

In [5]:
# Reverse lookup table (token id -> token string) built from the tokenizer vocabulary.
inverted_tokenizer = {
    token_id: token for token, token_id in tokenizer.get_vocab().items()
}

In [52]:
# Number of tokens in the (single) encoded prompt.
input_ids["input_ids"].shape[-1]

107

In [48]:
# Single forward pass over the prompt that also returns the attention maps
# (read later through `pred.attentions`).
# - This is inference only, so autograd is switched off; otherwise the graph
#   and the intermediate activations are kept alive for no benefit.
# - The module is invoked as `model(...)` rather than `model.forward(...)` so
#   that any registered forward hooks still run.
with torch.no_grad():
    pred = model(
        input_ids=input_ids.get("input_ids"),
        attention_mask=input_ids.get("attention_mask"),
        output_attentions=True,
    )

In [61]:
# Shape of one attention map from the final entry of `pred.attentions`:
# singleton axes are squeezed away, then the last slice along the first
# remaining axis is taken (presumably the last attention head -- TODO confirm).
final_layer_attn = pred.attentions[-1]
final_layer_attn.squeeze()[-1].shape

torch.Size([107, 107])

In [66]:
# Generate up to 50 new tokens and keep the attention maps of every step.
# - `do_sample=False` requests deterministic greedy decoding. The previous
#   `temperature=0.0` is not a valid sampling temperature: it is merely ignored
#   (with a warning) while sampling is off and raises once sampling is enabled.
# - `use_cache=False` is kept as in the original run; the per-step attention
#   printed further down is a full lower-triangular matrix, which presumably
#   relies on the KV cache being disabled -- TODO confirm.
# - `return_dict_in_generate=True` exposes `output.sequences` and
#   `output.attentions`, both used by later cells.
output = model.generate(
    **input_ids,
    output_attentions=True,
    max_new_tokens=50,
    do_sample=False,
    use_cache=False,
    return_dict_in_generate=True,
)



In [67]:
# Decode only the newly generated tokens.
# The continuation is cut at the prompt's token length instead of slicing the
# decoded string by `len(input_text)`: the character-based slice silently
# returns the wrong text whenever decoding does not reproduce the prompt
# character-for-character (e.g. added special tokens or normalised whitespace).
prompt_len = input_ids["input_ids"].shape[1]
tokenizer.decode(output.sequences[0][prompt_len:]).strip()

'#### Response:\n2012\n\n\n<|endoftext|>'

In [68]:
# Display the token ids of the encoded prompt.
input_ids.get("input_ids")

tensor([[  350, 13581, 23341,  1055,  4139, 12681, 18391,  2413, 12248, 16426,
          1055,   282,  3637,  4807,   448,  4930,   318, 30042,  5693,  1175,
         16768,  1324, 29926,   281,  1000,  1537,   591,  1233, 30023, 11615,
          6197, 29889,   268,   421, 16975,  6998,   525, 29968,   272,  2681,
           476,   709,   346,   302,   335,  4518,  4930,  5018,  3874,  4065,
         14921,  4151, 29895,  3333, 29891, 29889,   399, 29871, 29906, 29900,
         29896, 29906,  4042, 26853,  4065, 16768,  1324,  8823, 12822, 29267,
           911, 29871, 29945, 16045,   281,  3603,  2236,  6169,   268,   421,
         13356, 24422,  6998,   399, 11593, 29885,  4042, 12555,  2681,   476,
           709,   346,  5018,  3874,  4065, 14921,  4151, 29895,  3333, 30023,
         29973,  8594,  1175, 28918,  2901, 28159, 29889]], device='cuda:0')

In [69]:
# Print every generated token (prompt tokens skipped) next to its step index,
# so that a step number can be matched to an entry of `output.attentions`.
prompt_len = len(input_ids["input_ids"][0])
new_token_ids = output.sequences[0][prompt_len:].tolist()
for i, tok_id in enumerate(new_token_ids):
    print(f'{i}. token = {inverted_tokenizer[tok_id]}')

0. token = <0x0A>
1. token = <0x0A>
2. token = ####
3. token = ▁Response
4. token = :
5. token = <0x0A>
6. token = 2
7. token = 0
8. token = 1
9. token = 2
10. token = <0x0A>
11. token = <0x0A>
12. token = <0x0A>
13. token = <|endoftext|>


In [73]:
# Attention map recorded at generation step 6 -- in the token listing printed
# by the previous cell, step 6 is the first digit of the generated year.
# `[-1]` selects the final entry for that step (presumably the last layer --
# TODO confirm); after squeezing singleton axes, the trailing `[-1]` keeps the
# last slice along the first remaining axis (presumably the last head).
step_attentions = output.attentions[6]
step_attentions[-1].squeeze()[-1]

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [9.0234e-01, 9.7168e-02, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [6.0547e-01, 4.0527e-02, 3.5547e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [2.9883e-01, 8.2016e-05, 5.4598e-05,  ..., 1.6992e-01, 0.0000e+00,
         0.0000e+00],
        [1.1133e-01, 2.6822e-05, 1.7262e-04,  ..., 1.6211e-01, 2.5781e-01,
         0.0000e+00],
        [7.0801e-02, 4.9591e-05, 7.1049e-05,  ..., 1.3281e-01, 2.4707e-01,
         3.1836e-01]], device='cuda:0', dtype=torch.bfloat16)

In [16]:
# Shape of the input-embedding weight matrix.
# The parameter is looked up directly by its qualified name: `.shape` is
# metadata, so there is no need to build the full state dict or copy the
# weights to the CPU, and a wrong name fails with a clear error instead of
# `None` from `dict.get`.
model.get_parameter("model.embed_tokens.weight").shape

torch.Size([32064, 3072])

In [21]:
# Vocabulary size reported by the tokenizer. The recorded value (32000) is
# smaller than the 32064 rows of the embedding / lm_head matrices shown in the
# neighbouring cells -- NOTE(review): presumably added special tokens and
# padding rows account for the difference; confirm with `len(tokenizer)`.
tokenizer.vocab_size

32000

In [20]:
# Shape of the output-projection (`lm_head`) weight matrix.
model.get_parameter("lm_head.weight").shape

torch.Size([32064, 3072])

In [7]:
# Full shape of the final entry of `pred.attentions`.
# NOTE(review): the saved output below (24 x 24) does not match the 107-token
# prompt used elsewhere in this notebook, so it presumably comes from an
# earlier run with a different prompt -- re-run the notebook top to bottom.
pred.attentions[-1].size()

torch.Size([1, 32, 24, 24])