In [1]:
import os 

import yaml
import tiktoken
import torch
from torch import nn
import wandb

from processing_data.dataset import Data,ClassificationDataset
from processing_data.dataloader import get_data_loader
from embeddings import Embeddings
from transformer_block import TransformerBlock
from gpt2 import GPT2Model
from utils import text_to_tokens,tokens_to_text
from loss import cross_entropy,classification_loss
from train import Trainer
from evaluation import eval

# from dotenv import load_dotenv


with open("config.yaml","r") as f:
    config = yaml.safe_load(f)

with open("generate_text_config.yaml","r") as f:
    generate_text_config = yaml.safe_load(f)


In [2]:
# API Keys 
# print(load_dotenv()) 
# os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")

True


In [2]:
# turn off scientific notation
torch.set_printoptions(sci_mode=False,precision=10) 

# read the-verdict.txt
with open("raw_data/the-verdict.txt","r") as f: 
    raw_text = f.read()

In [5]:
train_dateset = ClassificationDataset(
    csv_path='raw_data/sms_spam_collection/train.csv',
    tokenizer=tiktoken.get_encoding("gpt2"),
    max_len=None
)
val_dataset = ClassificationDataset(
    csv_path='raw_data/sms_spam_collection/val.csv',
    tokenizer=tiktoken.get_encoding("gpt2"),
    max_len=train_dateset.max_len
)

test_dataset = ClassificationDataset(
    csv_path='raw_data/sms_spam_collection/test.csv',
    tokenizer=tiktoken.get_encoding("gpt2"),
    max_len=train_dateset.max_len
)

train_dl = get_data_loader(train_dateset,batch_size=32,shuffle=False,drop_last=True,num_workers=0)
val_dl = get_data_loader(val_dataset,batch_size=32,shuffle=False,drop_last=True,num_workers=0)
test_dl = get_data_loader(test_dataset,batch_size=32,shuffle=False,drop_last=True,num_workers=0)

In [6]:
train_dateset.max_len

118

In [13]:
for x,y in train_dl:
    print(x.shape)
    print('-'*100)
    print(y.shape)

torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118])
----------------------------------------------------------------------------------------------------
torch.Size([32])
torch.Size([32, 118]

In [11]:
len(train_dl)

32

# Dataset & DataLoader 

In [3]:
train_ratio = 0.9
split_index = int(len(raw_text) * train_ratio)
train_text = raw_text[:split_index]
val_text = raw_text[split_index:]


In [4]:
train_dataset = Data(
    raw_text=train_text,
    tokenizer=tiktoken.get_encoding("gpt2"),
    context_length=config["context_window"],
    stride=config["stride"]
)

val_dataset = Data(
    raw_text=val_text,
    tokenizer=tiktoken.get_encoding("gpt2"),
    context_length=config["context_window"],
    stride=config["stride"]
)

train_dl = get_data_loader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"]
    )

val_dl = get_data_loader(
    val_dataset,
    batch_size=config["batch_size"],
    shuffle=config["shuffle"],
    drop_last=config["drop_last"],
    num_workers=config["num_workers"]
)


In [7]:
# for x,y in train_dl:
#     print(x.shape)
#     print(y.shape)
#     break

In [8]:
# train_tokens = 0 
# for x,y in train_dl:
#     train_tokens += x.numel()
# print(f"Train tokens: {train_tokens}")

# val_tokens = 0
# for x,y in val_dl:
#     val_tokens += x.numel()
# print(f"Val tokens: {val_tokens}")


# print(f'total tokens: {train_tokens + val_tokens}')

In [5]:

model = GPT2Model(config)
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004)

# with torch.no_grad():
#     logits = model(x)

#     print(logits.shape)


# Wandb

In [6]:
wandb.init(
    project="Foundation_models",
    name="generate text run 2",
    config=config
)

[34m[1mwandb[0m: Currently logged in as: [33mhawardizayee[0m ([33mhawardizayee-unitedhealthcare[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
trainer = Trainer(
    model,
    train_dl,
    val_dl,
    loss_fn=cross_entropy,
    optimizer=optimizer,
    config=config,
    device="cpu",
    generate_text_config=generate_text_config
)

In [8]:
trainer.train(epochs=10,generate_text=True)

2025-05-10 08:35:45,411 - INFO - Epoch 1/10
2025-05-10 08:36:01,865 - INFO - Generated text: Every single step  . to,,,-- to the.,, to the----.. 
2025-05-10 08:36:01,866 - INFO - Epoch 2/10
2025-05-10 08:36:14,737 - INFO - Generated text: Every single step it  " "              
2025-05-10 08:36:14,738 - INFO - Epoch 3/10
2025-05-10 08:36:28,194 - INFO - Generated text: Every single step, the the-- the, and a the of of, a a,, the, of of
2025-05-10 08:36:28,195 - INFO - Epoch 4/10
2025-05-10 08:36:39,729 - INFO - Generated text: Every single step--I-- a of a-- a-- had that I-- I-- had he, he a
2025-05-10 08:36:39,730 - INFO - Epoch 5/10
2025-05-10 08:36:53,495 - INFO - Generated text: Every single step I was, with, and, and, in a, I had that, I. It--
2025-05-10 08:36:53,497 - INFO - Epoch 6/10
2025-05-10 08:37:05,556 - INFO - Generated text: Every single step I was the fact of my my he said to the picture of Jack, so--and and in
2025-05-10 08:37:05,557 - INFO - Epoch 7/10
2025-05-10 08:3

([9.173259417215982,
  6.708411110772027,
  7.218028174506293,
  5.732816855112712,
  5.1324782901340065,
  4.287467188305325,
  3.48931352297465,
  2.709184832043118,
  1.9903310669793024,
  1.3930295175976224],
 [7.716957092285156,
  6.730174541473389,
  6.650071144104004,
  6.523015022277832,
  6.319337844848633,
  6.170145511627197,
  6.186779975891113,
  6.300814628601074,
  6.316922664642334,
  6.408429145812988])

In [13]:
a = torch.tensor([[6109, 2060, 2239,  262]])

len(a.shape)

2

In [10]:

if any(a<0):
    print("yes")
else:
    print("no")


no


In [10]:
trainer.seen_tokens

46080

In [9]:
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▂▃▃▄▅▆▆▇█
seen tokens,▁▂▃▃▄▅▆▆▇█
train loss,█▅▆▅▄▄▃▃▂▁
val loss,█▄▃▃▃▂▁▁▁▁

0,1
epoch,9.0
seen tokens,46080.0
train loss,2.47127
val loss,6.22698


In [9]:
traininng_loop(
    model,
    train_dl,
    val_dl,
    loss_fn = cross_entropy,
    optimizer = optimizer,
    num_epochs = 10,
    device = "cpu",
    # text_to_generate = "Every single step",
    look_back = config["context_window"],
    num_tokens_to_generate = config["num_tokens_to_generate"],
)

2025-05-09 10:09:34,487 - INFO - Epoch 1/10
2025-05-09 10:09:50,766 - INFO - Seen tokens: 4608
2025-05-09 10:09:50,768 - INFO - Loss: 9.1835
2025-05-09 10:09:50,994 - INFO - Validation Loss: 7.6791
2025-05-09 10:09:50,999 - INFO - Epoch 2/10
2025-05-09 10:10:05,240 - INFO - Seen tokens: 9216
2025-05-09 10:10:05,242 - INFO - Loss: 6.6920
2025-05-09 10:10:05,481 - INFO - Validation Loss: 6.7152
2025-05-09 10:10:05,486 - INFO - Epoch 3/10
2025-05-09 10:10:20,518 - INFO - Seen tokens: 13824
2025-05-09 10:10:20,520 - INFO - Loss: 6.8425
2025-05-09 10:10:20,888 - INFO - Validation Loss: 6.6327
2025-05-09 10:10:20,892 - INFO - Epoch 4/10
2025-05-09 10:10:35,090 - INFO - Seen tokens: 18432
2025-05-09 10:10:35,093 - INFO - Loss: 5.9034
2025-05-09 10:10:35,309 - INFO - Validation Loss: 6.6238
2025-05-09 10:10:35,310 - INFO - Epoch 5/10
2025-05-09 10:10:49,719 - INFO - Seen tokens: 23040
2025-05-09 10:10:49,721 - INFO - Loss: 5.7705
2025-05-09 10:10:49,940 - INFO - Validation Loss: 6.5725
2025-05

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
loss,█▅▅▄▄▃▃▃▂▁
seen_tokens,▁▂▃▃▄▅▆▆▇█

0,1
loss,3.63237
seen_tokens,46080.0


In [12]:
eval(
    model,
    val_loader=val_dl,
    loss_fn= cross_entropy,
    device='cpu'
)

2025-04-29 19:14:24,002 - INFO - Validation Loss: 6.3376
