In [1]:
import rootutils
root_path = rootutils.setup_root(".", indicator=".project-root", pythonpath=True)

In [None]:
import torch

In [3]:
# Hyperparameters shared by every model-building cell in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=1024,  # Context length
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    dropout=0.1,          # Dropout rate
    # NOTE(review): the key is spelled "qvk_bias" (not "qkv_bias"); presumably it toggles
    # bias terms on the query/key/value projections — confirm against the model code
    # before renaming, since the model reads this key at runtime.
    qvk_bias=False,
)

In [4]:
# Smoke test: push one random batch through a single transformer block.
# NOTE(review): TransformerBlock is not imported or defined in any cell visible here —
# confirm where it comes from so the notebook survives Restart Kernel -> Run All.
torch.manual_seed(123)  # seed first so the random input below is reproducible
# Uniform random values in [0, 1); the last dim matches GPT_CONFIG_124M["emb_dim"].
# Presumably laid out as (batch, tokens, emb_dim) — TODO confirm against TransformerBlock.
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

In [5]:
output

tensor([[[-0.0053,  0.0976, -0.1119,  ...,  1.2888,  0.2626,  0.6683],
         [ 0.0028, -0.2366,  0.1721,  ...,  0.5953,  0.2498,  0.7447],
         [ 0.4675,  0.4470,  0.1792,  ...,  1.2521,  0.3048,  0.7748],
         [ 0.0664,  0.7225,  0.9206,  ...,  0.4790,  0.7428,  0.7014]],

        [[ 0.3623,  1.2142,  0.5221,  ...,  0.1853,  0.0114, -0.5029],
         [-0.0224,  0.7787,  0.2769,  ...,  0.1735,  0.5418,  0.1144],
         [ 0.7427,  0.4012,  0.3209,  ...,  0.3268,  0.7522, -0.1639],
         [ 0.5743,  0.6240,  0.4408,  ...,  1.1961,  1.2648,  0.2242]]],
       grad_fn=<AddBackward0>)

In [6]:
output.shape

torch.Size([2, 4, 768])

In [7]:
# Build the full model from the shared configuration dict.
# NOTE(review): GPTModel is not imported or defined in any cell visible here —
# confirm where it comes from so the notebook survives Restart Kernel -> Run All.
model = GPTModel(GPT_CONFIG_124M)

In [38]:
print("Token embedding layer shape:", model.emb_layer.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])


In [8]:
import tiktoken

# Tokenizer for the "gpt2" encoding shipped with tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt on its own, then stack the results into one tensor with a row
# per prompt. torch.stack requires rows of equal length; both prompts encode to four
# ids here (see the printed tensor).
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)
print(batch)


tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [9]:
batch.shape

torch.Size([2, 4])

In [10]:
model(batch)

tensor([[[-0.0476,  0.3486,  0.3194,  ...,  0.1212, -0.0145,  0.2356],
         [ 0.6108, -1.0272,  0.7170,  ...,  0.8626,  0.3677, -0.1301],
         [-0.1292, -0.3700,  0.0999,  ..., -0.3548, -0.6217, -0.1343],
         [-0.7753, -0.0209,  0.2924,  ..., -0.2467,  0.5945, -0.5021]],

        [[-0.2779,  0.0502,  0.9543,  ...,  0.1103,  0.4312,  0.0555],
         [-0.3889, -1.0040,  0.6679,  ...,  0.4921,  0.0253, -0.8168],
         [-0.5116, -1.2918,  0.8687,  ..., -0.6738, -0.5146,  0.2423],
         [-1.0213, -0.5296, -0.0295,  ...,  0.3974, -0.2751, -0.7770]]],
       grad_fn=<UnsafeViewBackward0>)

In [11]:
model(batch).shape

torch.Size([2, 4, 50257])

In [39]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` token ids to every sequence in ``idx``.

    Args:
        model: Callable that receives a (batch, n_tokens) tensor of token ids and
            returns logits indexed as (batch, n_tokens, vocab_size).
        idx: Tensor of token ids holding the current context, (batch, n_tokens).
        max_new_tokens: Number of tokens to generate.
        context_size: Maximum number of trailing tokens passed to the model per step.

    Returns:
        Tensor of token ids of shape (batch, n_tokens + max_new_tokens): the input
        followed by the generated ids.
    """
    tokens = idx
    # No gradients are needed for generation.
    with torch.no_grad():
        for _step in range(max_new_tokens):
            # Step 1: keep only the most recent `context_size` tokens as model input.
            window = tokens[:, -context_size:]

            # Steps 2-3: run the model and keep the logits of the last position only,
            # (batch, n_tokens, vocab_size) -> (batch, vocab_size).
            last_logits = model(window)[:, -1, :]

            # Step 4: convert the logits to probabilities.
            probabilities = torch.softmax(last_logits, dim=-1)

            # Step 5: pick the index of the most probable token, shape (batch, 1).
            next_token = probabilities.argmax(dim=-1, keepdim=True)

            # Step 6: append the chosen token to the running context.
            tokens = torch.cat((tokens, next_token), dim=1)

    return tokens



In [49]:
# Prompt used to seed text generation.
start_context = "Hello, my friend"
# Token ids for the prompt, as a plain Python list.
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
# Add a leading axis of size 1 so the prompt becomes a batch holding one sequence.
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 616, 1545]
encoded_tensor.shape: torch.Size([1, 4])


In [54]:
# Switch the module to evaluation mode before generating (nn.Module.eval()).
# Presumably this disables the model's dropout — confirm against the GPTModel definition.
model.eval()
# Greedily generate 10 new tokens after the prompt, using the model's full context length.
out = generate_text_simple(model=model,idx=encoded_tensor,max_new_tokens=10, context_size=GPT_CONFIG_124M["context_length"])

In [55]:
print("Output:", out)
print("Output length:", len(out[0]))
print("Out shape:", out.shape)

Output: tensor([[15496,    11,   616,  1545, 44684,  6759, 39602, 33753,  2126, 46260,
         16911, 46633,   400, 25329]])
Output length: 14
Out shape: torch.Size([1, 14])


In [56]:
tokenizer.decode(out.tolist()[0])

'Hello, my friend Vargmat Rout Ober ideacorruptionomas 372thumbles'

In [57]:
def text_to_tokens_ids(text, tokenizer):
    """Encode ``text`` into a batch containing a single token-id sequence.

    Args:
        text: String to tokenize. The literal ``<|endoftext|>`` is passed to the
            tokenizer as an allowed special token.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that returns
            a list of integer token ids.

    Returns:
        Integer (``torch.long``) tensor of shape (1, n_tokens).
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Pin the dtype: without it, an empty id list would produce a float32 tensor
    # (torch.tensor([]) defaults to float), which is not a valid token-id tensor.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)

def token_ids_to_text(tokens_ids, tokenizer):
    """Decode a single sequence of token ids back into text.

    Args:
        tokens_ids: Tensor of token ids, either (1, n_tokens) or (n_tokens,).
        tokenizer: Object exposing ``decode(list_of_ids)`` that returns a string.

    Returns:
        The decoded string.
    """
    flat = tokens_ids.squeeze(0)
    # squeeze(0) collapses a 1-D tensor holding a single id into a 0-dim scalar,
    # whose .tolist() is a bare int rather than a list; restore the sequence axis
    # so the tokenizer always receives a list of ids.
    if flat.dim() == 0:
        flat = flat.unsqueeze(0)
    return tokenizer.decode(flat.tolist())


In [58]:
text_to_tokens_ids(start_context, tokenizer)

tensor([[15496,    11,   616,  1545]])

In [59]:
token_ids_to_text(text_to_tokens_ids("Hello, how are you?", tokenizer), tokenizer)

'Hello, how are you?'

In [60]:
token_ids_to_text(out, tokenizer)

'Hello, my friend Vargmat Rout Ober ideacorruptionomas 372thumbles'

In [63]:
# Two example input sequences of three token ids each. The `targets` tensor defined in
# the next cell holds the same sequences shifted one position ahead.
inputs = torch.tensor([[16833, 3626, 6100],[40, 1107, 588]])

In [64]:
# Next-token targets for `inputs`: each row is the matching input row shifted left by
# one position, ending with the token that should follow the input sequence.
targets = torch.tensor([[3626, 6100, 345 ], [1107, 588, 11311]])

In [65]:
# Forward pass without autograd tracking; only the output values are inspected below.
with torch.no_grad():
    logits = model(inputs)

In [66]:
logits.shape

torch.Size([2, 3, 50257])

In [67]:
# Normalise the logits over the last axis (size 50257, the vocabulary) so every
# position holds a probability distribution over tokens.
probs = torch.softmax(logits, dim=-1)
probs

tensor([[[4.0523e-05, 1.5334e-05, 3.3921e-05,  ..., 1.9002e-05,
          3.3693e-05, 2.2031e-05],
         [2.5165e-05, 5.3138e-06, 3.1071e-05,  ..., 3.1750e-05,
          2.2444e-05, 9.2135e-06],
         [1.7155e-05, 9.1176e-06, 2.2495e-05,  ..., 1.0265e-05,
          8.0308e-06, 1.3215e-05]],

        [[2.5355e-05, 3.5654e-05, 3.1680e-05,  ..., 1.9384e-05,
          1.8915e-05, 1.5493e-05],
         [3.7725e-05, 1.4277e-05, 3.9441e-05,  ..., 4.6905e-05,
          2.1555e-05, 1.1217e-05],
         [9.2747e-06, 4.8651e-06, 4.1455e-05,  ..., 2.1042e-05,
          1.2174e-05, 3.1666e-05]]])

In [68]:
probs.shape

torch.Size([2, 3, 50257])

In [26]:
inputs

tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])

In [70]:
# Most probable token id at every position. keepdim=True keeps a trailing axis of
# size 1, so the result has shape [2, 3, 1].
token_ids  = torch.argmax(probs, dim = -1, keepdim = True)
token_ids

tensor([[[ 2107],
         [32179],
         [16402]],

        [[ 6017],
         [13489],
         [30643]]])

In [73]:
token_ids[0]

tensor([[ 2107],
        [32179],
        [16402]])

In [74]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1:"
 f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  livehzPO


In [75]:
print(f"Targets batch 2: {token_ids_to_text(targets[1], tokenizer)}")
print(f"Outputs batch 2:"
 f" {token_ids_to_text(token_ids[1].flatten(), tokenizer)}")

Targets batch 2:  really like chocolate
Outputs batch 2: alty spottedIFIC


In [31]:
# Probability the model assigns to each correct next token of the first sequence.
text_idx = 0
# Pair every position with its target id. The position indices are derived from the
# target length instead of being hard-coded, so the cell keeps working if the example
# sequences change length.
target_probas_1 = probs[
    text_idx,
    torch.arange(targets.shape[1], device=targets.device),
    targets[text_idx],
]
target_probas_1

tensor([6.9085e-06, 1.3188e-05, 1.1656e-05])

In [85]:
probs.shape

torch.Size([2, 3, 50257])

In [89]:
targets[text_idx]

tensor([ 1107,   588, 11311])

In [92]:
probs.shape

torch.Size([2, 3, 50257])

In [91]:
# Probability the model assigns to each correct next token of the second sequence.
text_idx = 1
# Pair every position with its target id. The position indices are derived from the
# target length instead of being hard-coded, so the cell keeps working if the example
# sequences change length.
target_probas_2 = probs[
    text_idx,
    torch.arange(targets.shape[1], device=targets.device),
    targets[text_idx],
]
target_probas_2

tensor([1.2733e-05, 3.6689e-05, 1.1817e-05])

In [90]:
probs[1, 0 ,1107]

tensor(1.2733e-05)

In [93]:
# Recomputes the softmax over the last axis of `logits`.
# NOTE(review): this is the same expression as the earlier `probs` cell and yields the
# same values — it looks redundant; consider removing one of the two cells.
probs = torch.softmax(logits, dim=-1)
probs

tensor([[[4.0523e-05, 1.5334e-05, 3.3921e-05,  ..., 1.9002e-05,
          3.3693e-05, 2.2031e-05],
         [2.5165e-05, 5.3138e-06, 3.1071e-05,  ..., 3.1750e-05,
          2.2444e-05, 9.2135e-06],
         [1.7155e-05, 9.1176e-06, 2.2495e-05,  ..., 1.0265e-05,
          8.0308e-06, 1.3215e-05]],

        [[2.5355e-05, 3.5654e-05, 3.1680e-05,  ..., 1.9384e-05,
          1.8915e-05, 1.5493e-05],
         [3.7725e-05, 1.4277e-05, 3.9441e-05,  ..., 4.6905e-05,
          2.1555e-05, 1.1217e-05],
         [9.2747e-06, 4.8651e-06, 4.1455e-05,  ..., 2.1042e-05,
          1.2174e-05, 3.1666e-05]]])

In [94]:
# Merge the batch and position axes so each row is one prediction:
# [2, 3, 50257] -> [6, 50257].
logits_flat = logits.view(-1, logits.shape[-1])
logits_flat.shape

torch.Size([6, 50257])

In [95]:
# Flatten the targets to one id per prediction row: [2, 3] -> [6].
target_flat = targets.view(-1)
target_flat.shape

torch.Size([6])

In [96]:
target_flat

tensor([ 3626,  6100,   345,  1107,   588, 11311])

In [98]:
logits_flat.shape

torch.Size([6, 50257])

In [101]:
# Cross-entropy between the flattened logits and targets, averaged over the six
# (prediction, target) pairs. cross_entropy expects raw logits, not probabilities.
loss = torch.nn.functional.cross_entropy(logits_flat, target_flat)
loss

tensor(11.2182)

In [103]:
import torch.nn.functional as F

# Manual cross-entropy, to cross-check the library value: take the log-probability of
# every target token and average the negated values.
log_probs = F.log_softmax(logits_flat, dim=-1)  # [6, 50257]
# For each row, gather the log-probability stored at that row's target id.
target_log_probs = log_probs.gather(dim=-1, index=target_flat.unsqueeze(-1)).squeeze(-1)  # [6]
loss_manual = target_log_probs.mean().neg()
loss_manual

tensor(11.2182)