In [1]:
import yaml
import tiktoken
import torch
from torch import nn

from dataset import Data,SpamDataset
from dataloader import get_data_loader
from embeddings import Embeddings
from transformer_block import TransformerBlock
from gpt2 import GPT2Model
from utils import text_to_tokens,tokens_to_text,generate_text
from loss import cross_entropy
from train import traininng_loop
from evaluation import eval

# Load the run configuration that later cells index into (context window,
# stride, batch size, loader options, ...).
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default; safe_load only constructs plain Python objects.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [2]:
# Print tensors in fixed-point notation (scientific notation disabled)
# with 10 digits after the decimal point.
torch.set_printoptions(
    precision=10,
    sci_mode=False,
)

In [3]:
# Read the whole corpus into memory as one string.
# The encoding is explicit so the text decodes the same way on every
# platform instead of following the locale default.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Display the corpus size in tokens. `[0]` takes the first element of what
# text_to_tokens returns -- presumably a (1, num_tokens) batch; confirm
# against utils.text_to_tokens.
len(text_to_tokens(raw_text)[0])

5145

In [21]:
# Build the SMS-spam datasets (train / validation / test) and their loaders.
#
# NOTE: the name `train_dateset` (sic) is kept because other cells refer to it.
# NOTE(review): `train_dl` is also assigned by the language-model loader cell
# further down; whichever cell ran last decides what the training cell uses.

# One tokenizer instance and one batch size shared by all three splits.
spam_tokenizer = tiktoken.get_encoding("gpt2")
SPAM_BATCH_SIZE = 32

# max_len=None for the training split -- presumably the dataset then derives
# the length from its own data (confirm in dataset.SpamDataset).
train_dateset = SpamDataset(
    csv_path='sms+spam+collection/train.csv',
    tokenizer=spam_tokenizer,
    max_len=None,
)

# Validation and test reuse the training split's max_len so every split
# yields sequences of the same length.
valid_dataset = SpamDataset(
    csv_path='sms+spam+collection/val.csv',
    tokenizer=spam_tokenizer,
    max_len=train_dateset.max_len,
)

test_dataset = SpamDataset(
    csv_path='sms+spam+collection/test.csv',
    tokenizer=spam_tokenizer,
    max_len=train_dateset.max_len,
)

# Training loader: the incomplete final batch is dropped.
# NOTE(review): shuffle=False means every epoch sees identical batches;
# consider shuffle=True together with a fixed seed.
train_dl = get_data_loader(
    train_dateset,
    batch_size=SPAM_BATCH_SIZE,
    shuffle=False,
    drop_last=True,
    num_workers=0,
)

# Evaluation loaders keep the final partial batch (drop_last=False) so that
# metrics are computed over every example rather than a truncated subset.
valid_dl = get_data_loader(
    valid_dataset,
    batch_size=SPAM_BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

test_dl = get_data_loader(
    test_dataset,
    batch_size=SPAM_BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [4]:
# Display the training dataset's max_len -- the value the validation and test
# datasets are constructed with. The cell that creates `train_dateset` must
# have been executed first, otherwise this raises NameError.
train_dateset.max_len

NameError: name 'train_dateset' is not defined

In [18]:
# Peek at the first batch only: print the inputs, their shape, a separator
# line and the targets. Nothing is printed if the loader yields no batches.
first_batch = next(iter(train_dl), None)
if first_batch is not None:
    x, y = first_batch
    print(x)
    print(x.shape)
    print('-'*100)
    print(y)

tensor([[ 3855,   534, 11376,  ..., 50256, 50256, 50256],
        [ 9690,   329,   534,  ..., 50256, 50256, 50256],
        [ 3103,  3304,   616,  ..., 50256, 50256, 50256],
        ...,
        [ 4261,    38,  3525,  ..., 50256, 50256, 50256],
        [ 1026,   338,   266,  ..., 50256, 50256, 50256],
        [30099,  1576,    11,  ..., 50256, 50256, 50256]])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
tensor([1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
        0, 1, 0, 1, 0, 1, 0, 0])


In [4]:
# Split the raw text by character position: the leading 90% is used for
# training and the remaining tail for validation.
train_ratio = 0.9
split_index = int(train_ratio * len(raw_text))
train_text, val_text = raw_text[:split_index], raw_text[split_index:]


# Dataset 

In [5]:
def _build_lm_dataset(split_text):
    """Wrap one text split in a Data set using the GPT-2 tokenizer and the
    context window / stride taken from the loaded config."""
    return Data(
        raw_text=split_text,
        tokenizer=tiktoken.get_encoding("gpt2"),
        context_length=config["context_window"],
        stride=config["stride"],
    )


# Same construction for both splits; only the source text differs.
train_dataset = _build_lm_dataset(train_text)
val_dataset = _build_lm_dataset(val_text)


# Dataloader 

In [6]:
# Loaders for the language-model text splits.
# NOTE(review): this reassigns `train_dl`, which the spam-classification cell
# also assigns; the cell executed last wins.

# Training loader: batching, shuffling and last-batch handling come from config.
train_dl = get_data_loader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"],
)

# Validation loader: fixed order and no dropped batch, independent of the
# training settings, so the validation loss always covers the entire split
# and is comparable from run to run.
val_dl = get_data_loader(
    val_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    drop_last=False,
    num_workers=config["num_workers"],
)


In [7]:
# for x,y in train_dl:
#     print(x.shape)
#     print(y.shape)
#     break

In [8]:
# train_tokens = 0 
# for x,y in train_dl:
#     train_tokens += x.numel()
# print(f"Train tokens: {train_tokens}")

# val_tokens = 0
# for x,y in val_dl:
#     val_tokens += x.numel()
# print(f"Val tokens: {val_tokens}")


# print(f'total tokens: {train_tokens + val_tokens}')

In [9]:
# Request a two-class output before the model is built from the config.
config['num_classes'] = 2

model = GPT2Model(config)

# Shape smoke test: push a random batch of 32 sequences x 10 token ids drawn
# from [0, 50257) through the model without tracking gradients.
dummy_tokens = torch.randint(0, 50257, (32, 10))
with torch.no_grad():
    logits = model(dummy_tokens)
print(logits.shape)


torch.Size([32, 10, 2])


In [10]:
# AdamW over every model parameter with a fixed learning rate of 4e-4.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=4e-4,
)

In [11]:
# Keyword options for the training run, gathered in one place: loss and
# optimizer, epoch count, device, plus the prompt / context length / number
# of tokens passed for text generation.
training_options = dict(
    loss_fn=cross_entropy,
    optimizer=optimizer,
    num_epochs=10,
    device="cpu",
    text_to_generate="Every single step",
    look_back=config["context_window"],
    num_tokens_to_generate=config["num_tokens_to_generate"],
)

# Train on train_dl and validate on val_dl.
traininng_loop(model, train_dl, val_dl, **training_options)

2025-04-29 19:12:29,535 - INFO - Epoch 1/10
2025-04-29 19:12:44,508 - INFO - Seen tokens: 4608
2025-04-29 19:12:44,510 - INFO - Loss: 9.1975
2025-04-29 19:12:44,781 - INFO - Validation Loss: 7.7019
2025-04-29 19:12:45,422 - INFO - Generated text: Every single step...                 
2025-04-29 19:12:45,423 - INFO - Epoch 2/10
2025-04-29 19:12:57,431 - INFO - Seen tokens: 9216
2025-04-29 19:12:57,433 - INFO - Loss: 6.6992
2025-04-29 19:12:57,651 - INFO - Validation Loss: 6.7359
2025-04-29 19:12:58,294 - INFO - Generated text: Every single step                    
2025-04-29 19:12:58,295 - INFO - Epoch 3/10
2025-04-29 19:13:07,063 - INFO - Seen tokens: 13824
2025-04-29 19:13:07,064 - INFO - Loss: 7.2867
2025-04-29 19:13:07,281 - INFO - Validation Loss: 6.6831
2025-04-29 19:13:08,020 - INFO - Generated text: Every single step the the the the the the the the the the the the the the the the the the the the
2025-04-29 19:13:08,021 - INFO - Epoch 4/10
2025-04-29 19:13:16,744 - INFO - Seen to

In [14]:
# Persist the model's state dict (not the module object itself) to disk.
checkpoint_path = "model.pth"
torch.save(model.state_dict(), checkpoint_path)

In [12]:
# `eval` is the helper imported from the project's evaluation module (it
# shadows Python's builtin of the same name). Run it on the validation
# loader, on CPU, with the same loss function used for training.
eval(model, val_loader=val_dl, loss_fn=cross_entropy, device='cpu')

2025-04-29 19:14:24,002 - INFO - Validation Loss: 6.3376


In [16]:
# Sampling settings passed through to generate_text.
sampling_options = dict(
    look_back=20,
    num_tokens_to_generate=10,
    temperature=2.0,
    top_k=20,
)

# Continue the prompt with the current model on CPU; as the cell's last
# expression, the returned value is displayed.
generate_text("Hello what is ", model=model, device="cpu", **sampling_options)

'Hello what is  in her work to paint, the mant---'