## Pretraining on unlabeled data

### 1) Evaluating generative text models

In [4]:
from utils import GPTModel
import torch

GPT_CONFIG_124M = {
    "vocab_size": 200019,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [5]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();

In [6]:
import tiktoken
from utils import generate_text_simple

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

In [7]:
start_context = "Pierwszy dzień wiosny jest"
tokenizer = tiktoken.get_encoding("o200k_base")
input_ids = text_to_token_ids(start_context, tokenizer)
print("Input IDs:", input_ids)

Input IDs: tensor([[152687,   8811,   3705, 155653,    286,   2453,   3008,  12637]])


In [8]:
def token_ids_to_text(token_ids, tokenizer):
    decoded = tokenizer.decode(token_ids.squeeze(0).tolist())
    return decoded

In [9]:
token_ids_to_text(input_ids, tokenizer)
print("Token IDs to text:", token_ids_to_text)

Token IDs to text: <function token_ids_to_text at 0x7c6252be5ee0>


In [10]:
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

In [11]:
token_ids.squeeze(0)

tensor([152687,   8811,   3705, 155653,    286,   2453,   3008,  12637,  43195,
        173252,  76557,  76131,  48593,  63588, 185378,  49039, 150980, 164286])

In [12]:
token_ids_to_text(token_ids, tokenizer)

'Pierwszy dzień wiosny jest dressed ఎమ్మ(ll proactiveddietrade الأراضي исслед mike العمليات'

### 2) Calculating the text generation loss: cross-entropy and perplexity

In [13]:
inputs = text_to_token_ids("Wszystkie drogi prowadzą do", tokenizer)
print(inputs)
targets = text_to_token_ids("drogi prowadzą do Rzymu", tokenizer)
print(targets)

tensor([[    54, 148556,  51201,   6517,   6248, 104788,  21589,    621]])
tensor([[100256,   6248, 104788,  21589,    621,    460,  28178,     84]])


In [14]:
with torch.no_grad():
    logits = model(inputs)

In [15]:
logits.shape

torch.Size([1, 8, 200019])

In [16]:
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([1, 8, 200019])


In [17]:
probas

tensor([[[4.2369e-06, 1.9948e-06, 2.6355e-06,  ..., 3.0819e-06,
          5.7465e-06, 1.6902e-06],
         [6.5892e-06, 1.6625e-06, 6.2114e-06,  ..., 7.2826e-06,
          6.6397e-06, 3.6026e-06],
         [3.6242e-06, 4.5758e-06, 6.0980e-06,  ..., 3.7388e-06,
          1.1860e-05, 1.4350e-06],
         ...,
         [1.1959e-05, 3.8941e-06, 4.6082e-06,  ..., 3.9672e-06,
          7.8944e-06, 6.7520e-06],
         [5.4063e-06, 2.0371e-06, 6.5283e-06,  ..., 3.8506e-06,
          4.0004e-06, 3.1869e-06],
         [4.0427e-06, 1.7514e-06, 8.6368e-06,  ..., 1.2341e-05,
          1.1058e-06, 6.4744e-06]]])

In [18]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:", token_ids)

Token IDs: tensor([[[ 89054],
         [199706],
         [123900],
         [188689],
         [ 69453],
         [ 69048],
         [  8870],
         [122900]]])


In [19]:
print(f"Targets batch: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch: drogi prowadzą do Rzymu
Outputs batch:  dvePhilip njem verdwijnenintern Rearら сест


In [20]:
text_idx = 0
target_probas = probas[text_idx, -1, targets[text_idx]]
print("Target probabilities:", target_probas)

Target probabilities: tensor([6.1825e-06, 3.1068e-06, 7.2742e-06, 2.8316e-06, 5.9877e-06, 4.8219e-06,
        5.0395e-06, 9.6494e-06])


In [21]:
log_probs = torch.log(target_probas)
print("Log probabilities:", log_probs)

Log probabilities: tensor([-11.9938, -12.6819, -11.8312, -12.7747, -12.0258, -12.2424, -12.1982,
        -11.5486])


In [22]:
-1 * torch.mean(log_probs)

tensor(12.1621)

In [23]:
torch.nn.functional.cross_entropy(
    logits.flatten(0, 1),
    targets.flatten())

tensor(12.4225)

### 3) Calculating the training and validation set losses

In [24]:
from utils import create_dataloader_v1

text_file = "../data_brzechwa.txt"
text_data = open(text_file, "r", encoding="utf-8").read()
train_ratio = 0.9
split_idx = int(len(text_data) * train_ratio)
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [25]:
torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False, 
    num_workers=0
)

In [26]:
print("Train loader: ")
for x, y in train_loader:
    pass
print(x.shape, y.shape)

Train loader: 
torch.Size([2, 256]) torch.Size([2, 256])


In [27]:
print("Val loader: ")
for x, y in val_loader:
    pass
print(x.shape, y.shape)

Val loader: 
torch.Size([2, 256]) torch.Size([2, 256])


In [28]:
train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print(f"Train tokens: {train_tokens}")
print(f"Val tokens: {val_tokens}")
print(f"Total tokens: {train_tokens + val_tokens}")

Train tokens: 183296
Val tokens: 19968
Total tokens: 203264


In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [30]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break

    return total_loss / num_batches

In [None]:
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print(f"Train loss: {train_loss:.4f}")
print(f"Val loss: {val_loss:.4f}")


In [None]:
torch.exp(torch.tensor([]))