In [28]:
import yaml
import tiktoken
import torch
from torch import nn

from dataset import Data
from dataloader import get_data_loader
from embeddings import Embeddings
from transformer_block import TransformerBlock
from gpt2 import GPT2Model
from utils import text_to_tokens,tokens_to_text,generate_text
from loss import loss_fn

with open("config.yaml","r") as f:
    config = yaml.safe_load(f)

In [22]:
with open("the-verdict.txt","r") as f:
    raw_text = f.read()

len(text_to_tokens(raw_text)[0])

5145

In [23]:
train_ratio = 0.9
split_index = int(len(raw_text) * train_ratio)
train_text = raw_text[:split_index]
val_text = raw_text[split_index:]


In [24]:
train_dataset = Data(
    raw_text=train_text,
    tokenizer=tiktoken.get_encoding("gpt2"),
    context_length=config["context_window"],
    stride=config["stride"]
)

val_dataset = Data(
    raw_text=val_text,
    tokenizer=tiktoken.get_encoding("gpt2"),
    context_length=config["context_window"],
    stride=config["stride"]
)


In [25]:
train_dl = get_data_loader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"]
    )

val_dl = get_data_loader(
    val_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"]
)


In [26]:
train_tokens = 0 
for x,y in train_dl:
    train_tokens += x.numel()
print(f"Train tokens: {train_tokens}")

val_tokens = 0
for x,y in val_dl:
    val_tokens += x.numel()
print(f"Val tokens: {val_tokens}")


print(f'total tokens: {train_tokens + val_tokens}')

Train tokens: 4608
Val tokens: 512
total tokens: 5120


In [27]:
model = GPT2Model(config)

logits = model(x)
logits.shape

torch.Size([2, 256, 50257])

In [30]:
loss_fn(logits,y,device="cpu")

tensor(11.0049, grad_fn=<NllLossBackward0>)

In [8]:
generate_text(
    "every step takes you",
    model,
    "cpu",
    look_back=40,
    num_tokens_to_generate=3
)

'every step takes you togglebacker Brands'

In [36]:
optimizer = torch.optim.AdamW(model.parameters(),lr=0.001)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [37]:
for i in range(10):
    for x,y in train_dl:
        logits = model(x)
        loss = loss_fn(logits,y,device="cpu")
        print(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f"Epoch {i} loss: {loss.item()}")


tensor(10.9596, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 10.95964241027832
tensor(9.8988, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 9.898833274841309
tensor(9.2066, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 9.206628799438477
tensor(8.7733, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 8.773294448852539
tensor(8.1027, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 8.102672576904297
tensor(7.7405, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 7.740510940551758
tensor(7.4890, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 7.489040851593018
tensor(7.4083, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 7.40825891494751
tensor(7.4855, grad_fn=<NllLossBackward0>)
Epoch 0 loss: 7.48552131652832
tensor(6.3769, grad_fn=<NllLossBackward0>)
tensor(6.1044, grad_fn=<NllLossBackward0>)
tensor(6.0550, grad_fn=<NllLossBackward0>)
tensor(17.5011, grad_fn=<NllLossBackward0>)
tensor(6.8025, grad_fn=<NllLossBackward0>)
tensor(6.5340, grad_fn=<NllLossBackward0>)
tensor(13.2796, grad_fn=<NllLossBackward0>)
tensor(6.5210, grad_fn=

KeyboardInterrupt: 