In [12]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [13]:
import tiktoken
import torch
import torch.nn as nn
from GPT.Tokenization import text_to_tokens, token_to_text

In [14]:
tokenizer = tiktoken.get_encoding('gpt2')

In [15]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [16]:
from GPT.GPT_Model import GPTModel, GPT_CONFIG_124M
model = GPTModel(GPT_CONFIG_124M)

In [17]:
with torch.no_grad():
    logits = model(inputs)

In [18]:
probas = torch.softmax(logits, dim = -1) # Probability of each token in vocabulory 
print(probas.shape)

torch.Size([2, 3, 50257])


In [19]:
token_ids = torch.argmax(probas, dim = -1, keepdim=True)
print(f"Token IDs:\n {token_ids}")

Token IDs:
 tensor([[[10723],
         [48768],
         [11766]],

        [[41373],
         [47269],
         [18757]]])


In [20]:
print(f"Target Batch 1: {token_to_text(targets[0], tokenizer)}")
print(f"Output Batch 1: {token_to_text(token_ids[0].flatten(), tokenizer)}")

Target Batch 1:  effort moves you
Output Batch 1: Watch043pert


## **Cross-Entropy Loss**

In [21]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print(f"Text 1: {target_probas_1}")

text_idx = 1 
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print(f"Text 2: {target_probas_2}")

Text 1: tensor([2.8425e-05, 9.7405e-06, 8.4400e-06])
Text 2: tensor([5.3366e-06, 1.9943e-05, 1.7796e-05])


In [22]:
# Logarithm to all the token probabilities 
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

tensor([-10.4683, -11.5392, -11.6825, -12.1409, -10.8226, -10.9366])


In [23]:
# Avg probability for each token 
avg_log_prob = torch.mean(log_probas)
print(avg_log_prob)

tensor(-11.2650)


In [24]:
neg_avg_log_probs = avg_log_prob * -1
print(neg_avg_log_probs)

tensor(11.2650)


In [25]:
print(f"Logits shape: {logits.shape}")
print(f"Targets shape: {targets.shape}")

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])


In [26]:
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

print(f"Logits flat: {logits_flat.shape}")
print(f"Target flat: {targets_flat.shape}")

Logits flat: torch.Size([6, 50257])
Target flat: torch.Size([6])


In [27]:
loss = nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

tensor(11.2650)


## **Perplexity**

In [28]:
# Concept related to cross entropy loss, Perplexity is simply the exponential of the cross-entropy loss 
perplexity = torch.exp(loss)
print(perplexity)

tensor(78043.2500)


## **Evaluating LLM Performance**

In [29]:
import os
with open("C:\\Users\\hites\\OneDrive\\Desktop\\GPT\\data\\the-verdict.txt", 'r', encoding='utf-8')  as f:
    raw_text = f.read()

In [30]:
print(raw_text[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [31]:
total_characters = len(raw_text)
total_tokens = len(tokenizer.encode(raw_text))

print(f"Characters: {total_characters}")
print(f"Tokens: {total_tokens}")

Characters: 20479
Tokens: 5145


### **Dataset & DataLoader**

In [32]:
from torch.utils.data import Dataset, DataLoader 
from GPT.Dataset_and_DataLoaders import GPTDataset, create_dataloader_v1

In [33]:
# Train / Validation ratio 
train_ratio = 0.90 
split_idx = int(train_ratio * len(raw_text))
train_data = raw_text[:split_idx]
val_data = raw_text[split_idx:]

In [34]:
len(train_data), len(val_data)

(18431, 2048)

In [35]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [36]:
GPT_CONFIG_124M["context_length"]

256

In [37]:
torch.manual_seed(123)
train_loader = create_dataloader_v1(
    train_data, 
    batch_size=2, 
    max_length=GPT_CONFIG_124M["context_length"], 
    stride=GPT_CONFIG_124M["context_length"], 
    drop_last=True, 
    shuffle=True, 
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data, 
    batch_size=2, 
    max_length=GPT_CONFIG_124M["context_length"], 
    stride=GPT_CONFIG_124M["context_length"], 
    drop_last=False,  
    shuffle=False, 
    num_workers=0
)

In [38]:
train_loader, val_loader

(<torch.utils.data.dataloader.DataLoader at 0x1e2c4da2ec0>,
 <torch.utils.data.dataloader.DataLoader at 0x1e2c4da2770>)

In [39]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [40]:
print("Train Loader")
for X, y in train_loader:
    print(X.shape, y.shape)
print("Length of train_loader: ", len(train_loader))

print("\nValidation Loader\n")
for X, y in val_loader:
    print(X.shape, y.shape)
print("Length of val_loader: ", len(val_loader))


Train Loader
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
Length of train_loader:  9

Validation Loader

torch.Size([2, 256]) torch.Size([2, 256])
Length of val_loader:  1


In [41]:
train_tokens = 0 
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0 
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print(f"Training Tokens : {train_tokens}")
print(f"Val Tokens : {val_tokens}")
print(f"All tokens: {train_tokens + val_tokens}")

Training Tokens : 4608
Val Tokens : 512
All tokens: 5120


In [42]:
from GPT.GPT_Model import GPTModel
model = GPTModel(GPT_CONFIG_124M)

In [43]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

### **Calculating Loss**

In [44]:
from GPT.Loss_Calculation import calc_loss_batch, calc_loss_loader

In [45]:
torch.manual_seed(123)
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print(f"Training Loss: {train_loss}")
print(f"Val Loss: {val_loss}")

Training Loss: 11.001539760165745
Val Loss: 10.995049476623535
