In [1]:
import yaml
import tiktoken
import torch
from torch import nn

from dataset import Data
from dataloader import get_data_loader
from embeddings import Embeddings
from transformer_block import TransformerBlock
from gpt2 import GPT2Model
from utils import text_to_tokens,tokens_to_text,generate_text
from loss import cross_entropy
from train import traininng_loop

# Load the run configuration. The encoding is given explicitly so the file is
# decoded the same way regardless of the platform's default locale.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Seed torch's global RNG once, up front, so a Restart & Run All gives repeatable
# results for anything that draws from it. An optional `seed` key in config.yaml
# overrides the default; existing configs without it keep working.
torch.manual_seed(config.get("seed", 42))

In [2]:
# Read the whole corpus into memory. The encoding is explicit because the
# default used by open() varies by platform and could mis-decode non-ASCII text.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Length of the first element returned by text_to_tokens for the full corpus
# (presumably the token-id sequence); shown as the cell output.
len(text_to_tokens(raw_text)[0])

5145

In [3]:
# Split the raw string by character position: the leading 90% is used for
# training and the remaining tail for validation.
train_ratio = 0.9
split_index = int(train_ratio * len(raw_text))
train_text, val_text = raw_text[:split_index], raw_text[split_index:]


# Dataset 

In [4]:
# Both splits are built with the same "gpt2" encoding and the same windowing
# settings from the config; only the source text differs.
tokenizer = tiktoken.get_encoding("gpt2")
dataset_kwargs = {
    "tokenizer": tokenizer,
    "context_length": config["context_window"],
    "stride": config["stride"],
}

train_dataset = Data(raw_text=train_text, **dataset_kwargs)
val_dataset = Data(raw_text=val_text, **dataset_kwargs)


# Dataloader 

In [5]:
# Training loader: batch size, shuffling, drop_last and worker count all come
# from the loaded config.
train_dl = get_data_loader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"],
)

# Validation loader: evaluation should see every held-out sample in a fixed
# order, so shuffling and drop_last are switched off here instead of inheriting
# the training-time settings.
val_dl = get_data_loader(
    val_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    drop_last=False,
    num_workers=config["num_workers"],
)


In [6]:
# Take only the first batch from the training loader and print the shape of
# each of its two tensors, one per line.
for x, y in train_dl:
    print(x.shape, y.shape, sep="\n")
    break

torch.Size([2, 256])
torch.Size([2, 256])


In [7]:
# Display the training loader's `batch_size` attribute as the cell output.
train_dl.batch_size

2

In [8]:
# Total number of elements in `x`, the tensor still bound from the batch
# preview loop in the earlier cell.
x.numel()

512

In [9]:
def count_tokens(data_loader):
    """Return the total number of elements in the first tensor of every batch.

    Args:
        data_loader: Iterable yielding two-item batches; only the first item
            of each batch is counted.

    Returns:
        The sum of ``numel()`` over the first item of all batches (0 if the
        loader yields nothing).
    """
    # Loop variables stay local to the generator, so this does not rebind any
    # notebook-level names as a side effect.
    return sum(inputs.numel() for inputs, _ in data_loader)


train_tokens = count_tokens(train_dl)
print(f"Train tokens: {train_tokens}")

val_tokens = count_tokens(val_dl)
print(f"Val tokens: {val_tokens}")

print(f'total tokens: {train_tokens + val_tokens}')

Train tokens: 4608
Val tokens: 512
total tokens: 5120


In [10]:
model = GPT2Model(config)

# Fetch a batch explicitly rather than reusing whatever `x` an earlier cell's
# loop happened to leave bound, so this cell does not rely on hidden state.
sample_inputs, _ = next(iter(train_dl))

# Shape-only smoke test of the forward pass; gradients are not needed here.
with torch.no_grad():
    logits = model(sample_inputs)
    print(logits.shape)


torch.Size([2, 256, 50257])


In [11]:
# AdamW over all of the model's parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
)

In [12]:
# Run settings, named so they are easy to spot and tweak.
NUM_EPOCHS = 10
DEVICE = "cpu"

# Only the training loader is passed in. `traininng_loop` is spelled exactly as
# it is exported by the `train` module.
traininng_loop(
    model,
    train_dl,
    loss_fn=cross_entropy,
    optimizer=optimizer,
    num_epochs=NUM_EPOCHS,
    device=DEVICE,
)

2025-04-28 10:34:11,482 - INFO - Epoch 1/10
2025-04-28 10:34:22,993 - INFO - Loss: 8.520448154873318
2025-04-28 10:34:22,995 - INFO - Epoch 2/10
2025-04-28 10:34:33,419 - INFO - Loss: 7.529747486114502
2025-04-28 10:34:33,421 - INFO - Epoch 3/10
2025-04-28 10:34:46,783 - INFO - Loss: 6.087440437740749
2025-04-28 10:34:46,785 - INFO - Epoch 4/10
2025-04-28 10:34:57,059 - INFO - Loss: 5.935275077819824
2025-04-28 10:34:57,060 - INFO - Epoch 5/10
2025-04-28 10:35:08,493 - INFO - Loss: 5.851986090342204
2025-04-28 10:35:08,495 - INFO - Epoch 6/10
2025-04-28 10:35:19,613 - INFO - Loss: 5.750375270843506
2025-04-28 10:35:19,615 - INFO - Epoch 7/10
2025-04-28 10:35:31,279 - INFO - Loss: 5.759007665846083
2025-04-28 10:35:31,282 - INFO - Epoch 8/10
2025-04-28 10:35:40,790 - INFO - Loss: 5.75902975930108
2025-04-28 10:35:40,792 - INFO - Epoch 9/10
2025-04-28 10:35:51,593 - INFO - Loss: 5.4895776642693415
2025-04-28 10:35:51,595 - INFO - Epoch 10/10
2025-04-28 10:36:06,303 - INFO - Loss: 5.25975