In [1]:
import torch

In [2]:
torch.__version__

'2.3.1+cu118'

In [3]:
import transformers

In [4]:
transformers.__version__

'4.41.2'

In [5]:
from transformers import AutoTokenizer

In [6]:
from transformers import pipeline
pipe = pipeline("text-generation", model="openai-community/gpt2")




In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [8]:
sentence = "i skip across the"

In [9]:
input_ids = tokenizer(sentence, return_tensors='pt').input_ids

In [10]:
input_ids

tensor([[   72, 14267,  1973,   262]])

In [11]:
# words --> tokens --> Unique ID --> vector embeddings inside the model internally

In [12]:
tokenizer.decode(3711)

'iced'

In [13]:
for token_id in input_ids[0]:
    print(tokenizer.decode(token_id))

i
 skip
 across
 the


In [14]:
from transformers import AutoModelForCausalLM

In [15]:
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

In [16]:
input_ids

tensor([[   72, 14267,  1973,   262]])

In [17]:
outputs = gpt2(input_ids)

In [18]:
outputs.logits.shape

torch.Size([1, 4, 50257])

In [19]:
final_logits = gpt2(input_ids).logits[0,-1]

In [20]:
final_logits

tensor([-87.7328, -86.4928, -89.6479,  ..., -88.9680, -90.8354, -86.7778],
       grad_fn=<SelectBackward0>)

In [21]:
final_logits.shape

torch.Size([50257])

In [22]:
final_logits.argmax() # token ID --> Index location logits

tensor(4675)

In [23]:
tokenizer.decode(final_logits.argmax())

' street'

In [24]:
top10_logits = torch.topk(final_logits,10)

In [25]:
for index in top10_logits.indices:
    print(tokenizer.decode(index))

 street
 river
 room
 country
 road
 floor
 line
 border
 bridge
 city


In [26]:
final_logits.softmax(dim=0)

tensor([8.9118e-07, 3.0796e-06, 1.3129e-07,  ..., 2.5911e-07, 4.0040e-08,
        2.3158e-06], grad_fn=<SoftmaxBackward0>)

In [27]:
top10 = torch.topk(final_logits.softmax(dim=0),10)

In [28]:
for value, index in zip(top10.values, top10.indices):
    print(f"{tokenizer.decode(index)} -- {value.item():.1%}")

 street -- 2.5%
 river -- 2.3%
 room -- 2.1%
 country -- 1.7%
 road -- 1.6%
 floor -- 1.4%
 line -- 1.4%
 border -- 1.4%
 bridge -- 1.2%
 city -- 1.0%


In [29]:
output_ids = gpt2.generate(input_ids, max_new_tokens=20, do_sample=True, top_k=5, temperature=1.3)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [30]:
decoded_text = tokenizer.decode(output_ids[0])

In [31]:
decoded_text

'i skip across the river, the river was so full, it could barely contain the entire river, the whole river was'

In [32]:
print(decoded_text)

i skip across the river, the river was so full, it could barely contain the entire river, the whole river was


In [33]:
# help(gpt2)

In [34]:
from transformers import pipeline
pipe = pipeline("text-generation", model="openai-community/gpt2")

In [35]:
pipe("I went to the happy store today and bought a ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I went to the happy store today and bought a iphone," he said.\n\nFellow employee, Dr. Kishon, told Metro-Review.com he purchased a phone from Erika at 10 AM on Thursday following the two'}]