In [38]:
import yaml
import tiktoken
import torch
from torch import nn

from dataset import Data
from dataloader import get_data_loader
from embeddings import Embeddings
from transformer_block import TransformerBlock
from gpt2 import GPT2Model
from utils import text_to_tokens,tokens_to_text,generate_text
from loss import loss_fn

# Load the run configuration. Later cells read the keys "context_window",
# "stride", "batch_size", "shuffle", "drop_last" and "num_workers" from it, and
# the whole mapping is handed to GPT2Model.
# The encoding is pinned so the file decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [39]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned so that non-ASCII characters decode identically on
# every platform rather than following the locale's default encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Length of the first element returned by text_to_tokens — presumably the
# token count of the full corpus (TODO confirm against utils.text_to_tokens).
len(text_to_tokens(raw_text)[0])

5145

In [40]:
# Hold out the tail of the corpus for validation. The cut is made on the
# character index of the raw string, i.e. before any tokenization happens.
train_ratio = 0.9
split_index = int(train_ratio * len(raw_text))
train_text, val_text = raw_text[:split_index], raw_text[split_index:]


In [41]:
def _make_dataset(text):
    """Wrap ``text`` in a ``Data`` set using the GPT-2 tokenizer and the
    window length / stride taken from ``config``."""
    return Data(
        raw_text=text,
        tokenizer=tiktoken.get_encoding("gpt2"),
        context_length=config["context_window"],
        stride=config["stride"],
    )


# Both splits are built with identical tokenizer and windowing settings.
train_dataset = _make_dataset(train_text)
val_dataset = _make_dataset(val_text)


In [42]:
# Training batches: shuffling and dropping of the last batch follow the config.
train_dl = get_data_loader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"]
)

# Validation batches: never shuffled and never truncated, so every evaluation
# pass sees the same samples in the same order and no held-out sample is
# discarded. (Assumes get_data_loader passes these flags through to a
# torch DataLoader — TODO confirm in dataloader.py.)
val_dl = get_data_loader(
    val_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    drop_last=False,
    num_workers=config["num_workers"]
)


In [43]:
# Count how many input token ids each loader actually serves by summing the
# element count of every input batch x (the targets y are not counted).
# These totals describe what the loaders yield, so they can be smaller than
# the token count of the raw text.
# NOTE: x and y stay bound at notebook scope after each loop finishes; once
# this cell has run they hold the last batch yielded by val_dl.
train_tokens = 0 
for x,y in train_dl:
    train_tokens += x.numel()
print(f"Train tokens: {train_tokens}")

val_tokens = 0
for x,y in val_dl:
    val_tokens += x.numel()
print(f"Val tokens: {val_tokens}")


# Combined number of input token ids across both loaders.
print(f'total tokens: {train_tokens + val_tokens}')

Train tokens: 4608
Val tokens: 512
total tokens: 5120


In [27]:
model = GPT2Model(config)

# Smoke-test the forward pass on a batch that is fetched explicitly here, so
# this cell does not depend on a loop variable left over from another cell.
x, y = next(iter(val_dl))
logits = model(x)
logits.shape

torch.Size([2, 256, 50257])

In [None]:
# AdamW over every parameter of the model, with a fixed learning rate of 1e-3.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
)

In [44]:

def train_loop(
        model,
        train_dl,
        optimizer,
        loss_fn,
        epochs,
        device
        ):
    """Train ``model`` for ``epochs`` full passes over ``train_dl``.

    Args:
        model: Module to optimise; it is switched to training mode. Only the
            batches are moved to ``device`` here, so the model must already
            live on that device.
        train_dl: Iterable yielding ``(x, y)`` batches whose items support
            ``.to(device)``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``, built
            over the model's parameters.
        loss_fn: Callable invoked as ``loss_fn(logits, y, device)``; its
            result must support ``.backward()`` and ``.item()``.
        epochs: Number of passes over ``train_dl``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        list[float]: The mean training loss of each epoch, in order. An epoch
        that yields no batches contributes ``float("nan")``.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for x, y in train_dl:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = loss_fn(logits, y, device)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # .item() yields a plain Python float, so the history does not
            # keep any autograd graph alive.
            running_loss += loss.item()
            num_batches += 1
        # An empty loader is tolerated (nothing to train on) and is reported
        # as NaN instead of raising a division-by-zero error.
        if num_batches:
            epoch_losses.append(running_loss / num_batches)
        else:
            epoch_losses.append(float("nan"))
    return epoch_losses

tensor(5.9992, grad_fn=<NllLossBackward0>)
tensor(6.0088, grad_fn=<NllLossBackward0>)
tensor(6.0734, grad_fn=<NllLossBackward0>)
tensor(6.1916, grad_fn=<NllLossBackward0>)
tensor(6.0178, grad_fn=<NllLossBackward0>)
tensor(5.8662, grad_fn=<NllLossBackward0>)
tensor(5.7812, grad_fn=<NllLossBackward0>)
tensor(5.8639, grad_fn=<NllLossBackward0>)
tensor(5.9188, grad_fn=<NllLossBackward0>)
tensor(5.9992, grad_fn=<NllLossBackward0>)
tensor(6.0088, grad_fn=<NllLossBackward0>)
tensor(6.0734, grad_fn=<NllLossBackward0>)
tensor(6.1916, grad_fn=<NllLossBackward0>)
tensor(6.0178, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: 