## Pretraining on unlabeled data

### 1) Evaluating generative text models

In [1]:
from utils import GPTModel
import torch

GPT_CONFIG_124M = {
    "vocab_size": 200019,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [2]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();

In [3]:
import tiktoken
from utils import generate_text_simple

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

In [4]:
start_context = "Pierwszy dzień wiosny jest"
tokenizer = tiktoken.get_encoding("o200k_base")
input_ids = text_to_token_ids(start_context, tokenizer)
print("Input IDs:", input_ids)

Input IDs: tensor([[152687,   8811,   3705, 155653,    286,   2453,   3008,  12637]])


In [5]:
def token_ids_to_text(token_ids, tokenizer):
    decoded = tokenizer.decode(token_ids.squeeze(0).tolist())
    return decoded

In [6]:
token_ids_to_text(input_ids, tokenizer)
print("Token IDs to text:", token_ids_to_text)

Token IDs to text: <function token_ids_to_text at 0x75621fee0ea0>


In [7]:
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

In [8]:
token_ids.squeeze(0)

tensor([152687,   8811,   3705, 155653,    286,   2453,   3008,  12637,  43195,
        173252,  76557,  76131,  48593,  63588, 185378,  49039, 150980, 164286])

In [9]:
token_ids_to_text(token_ids, tokenizer)

'Pierwszy dzień wiosny jest dressed ఎమ్మ(ll proactiveddietrade الأراضي исслед mike العمليات'

### 2) Calculating the text generation loss: cross-entropy and perplexity

In [10]:
inputs = text_to_token_ids("Wszystkie drogi prowadzą do", tokenizer)
print(inputs)
targets = text_to_token_ids("drogi prowadzą do Rzymu", tokenizer)
print(targets)

tensor([[    54, 148556,  51201,   6517,   6248, 104788,  21589,    621]])
tensor([[100256,   6248, 104788,  21589,    621,    460,  28178,     84]])


In [11]:
with torch.no_grad():
    logits = model(inputs)

In [12]:
logits.shape

torch.Size([1, 8, 200019])

In [13]:
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([1, 8, 200019])


In [14]:
probas

tensor([[[4.2369e-06, 1.9948e-06, 2.6355e-06,  ..., 3.0819e-06,
          5.7465e-06, 1.6902e-06],
         [6.5892e-06, 1.6625e-06, 6.2114e-06,  ..., 7.2826e-06,
          6.6397e-06, 3.6026e-06],
         [3.6242e-06, 4.5758e-06, 6.0980e-06,  ..., 3.7388e-06,
          1.1860e-05, 1.4350e-06],
         ...,
         [1.1959e-05, 3.8941e-06, 4.6082e-06,  ..., 3.9672e-06,
          7.8944e-06, 6.7520e-06],
         [5.4063e-06, 2.0371e-06, 6.5283e-06,  ..., 3.8506e-06,
          4.0004e-06, 3.1869e-06],
         [4.0427e-06, 1.7514e-06, 8.6368e-06,  ..., 1.2341e-05,
          1.1058e-06, 6.4744e-06]]])

In [15]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:", token_ids)

Token IDs: tensor([[[ 89054],
         [199706],
         [123900],
         [188689],
         [ 69453],
         [ 69048],
         [  8870],
         [122900]]])


In [16]:
print(f"Targets batch: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch: drogi prowadzą do Rzymu
Outputs batch:  dvePhilip njem verdwijnenintern Rearら сест


In [17]:
text_idx = 0
target_probas = probas[text_idx, -1, targets[text_idx]]
print("Target probabilities:", target_probas)

Target probabilities: tensor([6.1825e-06, 3.1068e-06, 7.2742e-06, 2.8316e-06, 5.9877e-06, 4.8219e-06,
        5.0395e-06, 9.6494e-06])


In [18]:
log_probs = torch.log(target_probas)
print("Log probabilities:", log_probs)

Log probabilities: tensor([-11.9938, -12.6819, -11.8312, -12.7747, -12.0258, -12.2424, -12.1982,
        -11.5486])


In [19]:
-1 * torch.mean(log_probs)

tensor(12.1621)

In [20]:
torch.nn.functional.cross_entropy(
    logits.flatten(0, 1),
    targets.flatten())

tensor(12.4225)

### 3) Calculating the training and validation set losses

In [21]:
from utils import create_dataloader_v1

text_file = "../data_brzechwa.txt"
text_data = open(text_file, "r", encoding="utf-8").read()
train_ratio = 0.9
split_idx = int(len(text_data) * train_ratio)
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [22]:
torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False, 
    num_workers=0
)

In [23]:
print("Train loader: ")
for x, y in train_loader:
    pass
print(x.shape, y.shape)

Train loader: 
torch.Size([2, 256]) torch.Size([2, 256])


In [24]:
print("Val loader: ")
for x, y in val_loader:
    pass
print(x.shape, y.shape)

Val loader: 
torch.Size([2, 256]) torch.Size([2, 256])


In [25]:
train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print(f"Train tokens: {train_tokens}")
print(f"Val tokens: {val_tokens}")
print(f"Total tokens: {train_tokens + val_tokens}")

Train tokens: 183296
Val tokens: 19968
Total tokens: 203264


In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [27]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break

    return total_loss / num_batches

In [28]:
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print(f"Train loss: {train_loss:.4f}")
print(f"Val loss: {val_loss:.4f}")


KeyboardInterrupt: 

In [None]:
torch.exp(torch.tensor([12.3836]))

tensor([238852.2031])

### 4) Training an LLM

In [None]:
from idna import decode
from sympy import evaluate


def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch {epoch+1}, Step {global_step}, "
                      f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
                      f"Tokens Seen: {tokens_seen}")
        
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))
    model.train()




In [30]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.1)

In [None]:
num_epochs = 10
train_loses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Pierwszy dzień wiosny jest",
    tokenizer=tokenizer
)

Epoch 1, Step 0, Train Loss: 12.1878, Val Loss: 12.2071, Tokens Seen: 512
Epoch 1, Step 5, Train Loss: 11.3739, Val Loss: 11.4418, Tokens Seen: 3072
Epoch 1, Step 10, Train Loss: 10.8490, Val Loss: 10.9868, Tokens Seen: 5632
Epoch 1, Step 15, Train Loss: 10.4299, Val Loss: 10.5681, Tokens Seen: 8192
Epoch 1, Step 20, Train Loss: 10.0242, Val Loss: 10.1256, Tokens Seen: 10752
Epoch 1, Step 25, Train Loss: 9.5087, Val Loss: 9.6780, Tokens Seen: 13312
Epoch 1, Step 30, Train Loss: 9.0586, Val Loss: 9.2317, Tokens Seen: 15872
Epoch 1, Step 35, Train Loss: 8.5883, Val Loss: 8.8167, Tokens Seen: 18432
Epoch 1, Step 40, Train Loss: 8.2973, Val Loss: 8.4601, Tokens Seen: 20992
Epoch 1, Step 45, Train Loss: 7.9086, Val Loss: 8.1707, Tokens Seen: 23552
Epoch 1, Step 50, Train Loss: 7.8568, Val Loss: 7.9584, Tokens Seen: 26112
Epoch 1, Step 55, Train Loss: 7.5224, Val Loss: 7.8080, Tokens Seen: 28672
Epoch 1, Step 60, Train Loss: 7.4544, Val Loss: 7.6940, Tokens Seen: 31232
Epoch 1, Step 65, Trai

KeyboardInterrupt: 

In [33]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5,3))

    ax1.plot(epochs_seen, train_losses, label='Train Loss', color='blue')
    ax1.plot(epochs_seen, val_losses, label='Val Loss', color='orange', linestyle='--')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend(loc='upper right')
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel('Tokens Seen')

    fig.tight_layout
    plt.savefig("losses_plot.png")
    plt.show()


epochs_seen = list(range(1, num_epochs + 1))
plot_losses(epochs_seen, tokens_seen, train_loses, val_losses)

NameError: name 'tokens_seen' is not defined