In [1]:
import rootutils
# Locate the project root by searching from the current directory for a ".project-root"
# marker file. pythonpath=True asks rootutils to put that root on the import path, which
# the later `core.*` imports in this notebook rely on.
root_path = rootutils.setup_root(".", indicator=".project-root", pythonpath=True)

In [2]:
import torch
from core.models.gpt import GPTModel

In [3]:
import tiktoken
# GPT-2 encoding from tiktoken; this tokenizer object is reused below for token counts,
# for the trainers and for text generation.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Hyperparameter dictionary passed to GPTModel and read by the dataloader cells.
# NOTE(review): context_length is 256 here, while the pretrained GPT-2 124M checkpoint
# downloaded later in this notebook uses 1024 positions — confirm before loading weights.
GPT_CONFIG_124M = {
 "vocab_size": 50257, # Vocabulary size
 "context_length": 256, # Context length
 "emb_dim": 768, # Embedding dimension
 "n_heads": 12, # Number of attention heads
 "n_layers": 12, # Number of layers
 "dropout": 0.1, # Dropout rate
 "qvk_bias": False # NOTE(review): presumably the query/key/value projection bias flag; the key is spelled "qvk" (not the usual "qkv") and must match what GPTModel reads — confirm
}

In [5]:
# Resolve the corpus relative to the project root instead of a machine-specific absolute
# path, so the notebook runs from any checkout.
# NOTE(review): assumes the ".project-root" marker sits in the repository root, next to
# the Data/ directory — confirm.
file_path = root_path / "Data" / "the-verdict.txt"

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Quick sanity check that the expected text was loaded.
print(text[:100])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [6]:
# Corpus size measured two ways: raw characters and tokens produced by the tokenizer.
total_characters, total_tokens = len(text), len(tokenizer.encode(text))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20480
Tokens: 5146


In [7]:
# Split on raw characters first (the leading 90% for training, the remainder for
# validation), then tokenize each part independently.
train_ratio = 0.9
train_size = int(train_ratio * len(text))
train_text, validation_text = text[:train_size], text[train_size:]
train_tokens, validation_tokens = (
    tokenizer.encode(part) for part in (train_text, validation_text)
)

print(f"Train size: {len(train_text)}")
print(f"Validation size: {len(validation_text)}")
print(f"Train tokens: {len(train_tokens)}")
print(f"Validation tokens: {len(validation_tokens)}")

Train size: 18432
Validation size: 2048
Train tokens: 4612
Validation tokens: 535


In [8]:
from core.data.dataloader import create_dataloader_v1

# Training batches: shuffled every epoch; the incomplete final batch is dropped.
# max_length and stride are both set to the context length.
# NOTE(review): presumably this yields non-overlapping windows — confirm in create_dataloader_v1.
train_loader = create_dataloader_v1(
 train_text,
 batch_size=2,
 max_length=GPT_CONFIG_124M["context_length"],
 stride=GPT_CONFIG_124M["context_length"],
 drop_last=True,
 shuffle=True,
 num_workers=0
)

# Validation batches: no shuffling and no dropped batch, so each evaluation sees the same
# held-out samples in the same order and the reported validation loss is comparable
# between evaluations.
# NOTE(review): assumes drop_last/shuffle are forwarded to the underlying DataLoader — confirm.
validation_loader = create_dataloader_v1(
 validation_text,
 batch_size=2,
 max_length=GPT_CONFIG_124M["context_length"],
 stride=GPT_CONFIG_124M["context_length"],
 drop_last=False,
 shuffle=False,
 num_workers=0
)

In [9]:
# Print the (input, target) batch shapes produced by each loader.
for heading, loader in (
    ("Train loader:", train_loader),
    ("\nValidation loader:", validation_loader),
):
    print(heading)
    for x, y in loader:
        print(x.shape, y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [10]:
from core.training import GPTTrainer

In [11]:
# Seed before the model is built so the weight initialisation is repeatable.
torch.manual_seed(123)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Build the model on the selected device, then create the optimizer over its parameters.
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10

Using device: cpu


In [12]:

# Run training with the loaders, optimizer and device configured above. train() returns
# three values, unpacked here under the names this notebook uses for them.
trainer = GPTTrainer(
    model,
    train_loader=train_loader,
    validation_loader=validation_loader,
    optimizer=optimizer,
    device=device,
    tokenizer=tokenizer,
    epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
)
train_losses, val_losses, tokens_seen = trainer.train()

2025-08-17 10:51:34,076 - core.training.trainer - INFO - GPTTrainer inicializado com device: cpu
2025-08-17 10:51:34,077 - core.training.trainer - INFO - Modelo tem 162,419,712 parâmetros
2025-08-17 10:51:34,079 - core.training.trainer - INFO - Iniciando treinamento por 10 épocas


KeyboardInterrupt: 

In [14]:
# Store the model weights and the optimizer state together in one checkpoint file.
torch.save(
    {
        "model_state_dict": trainer.model.state_dict(),
        "optimizer_state_dict": trainer.optimizer.state_dict(),
    },
    "model_and_optimizer.pth",
)

In [15]:
# Restore the checkpoint onto the active device. Without map_location, a checkpoint that
# was written on a CUDA machine cannot be deserialised on a CPU-only one.
checkpoint = torch.load("model_and_optimizer.pth", map_location=device)
model_gpt = GPTModel(GPT_CONFIG_124M)
model_gpt.load_state_dict(checkpoint["model_state_dict"])
model_gpt.to(device)
# The optimizer is created after the model is on its device. Note that load_state_dict
# also restores the saved param-group hyperparameters, so the lr passed here is replaced
# by the checkpointed value.
optimizer = torch.optim.AdamW(model_gpt.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# Keyword arguments (same names as the first GPTTrainer call in this notebook) keep the
# call independent of the constructor's positional order.
trainer_v1 = GPTTrainer(
    model_gpt,
    train_loader=train_loader,
    validation_loader=validation_loader,
    optimizer=optimizer,
    epochs=1,
    eval_freq=5,
    eval_iter=5,
    device=device,
    tokenizer=tokenizer,
)

trainer_v1.train()

2025-08-17 10:51:42,120 - core.training.trainer - INFO - GPTTrainer inicializado com device: cpu
2025-08-17 10:51:42,122 - core.training.trainer - INFO - Modelo tem 162,419,712 parâmetros
2025-08-17 10:51:42,123 - core.training.trainer - INFO - Iniciando treinamento por 1 épocas


KeyboardInterrupt: 

In [None]:
from core.data.utils import generate_text, text_to_token_ids, token_ids_to_text

trainer_v1.model.eval()
# Place the prompt on the same device as the model weights; a CPU prompt fed to a model on
# CUDA raises a device-mismatch error. This is a no-op when everything already lives on CPU.
# NOTE(review): harmless if generate_text already moves `idx` itself — confirm.
model_device = next(trainer_v1.model.parameters()).device
prompt_ids = (
    text_to_token_ids("Every effort moves you", trainer_v1.tokenizer)
    .unsqueeze(0)
    .to(model_device)
)
token_ids = generate_text(
 model=trainer_v1.model,
 idx=prompt_ids,
 max_new_tokens=15,
 context_size=GPT_CONFIG_124M["context_length"],
 top_k=25,
 temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids.squeeze(0), trainer_v1.tokenizer))

Output text:
 Every effort moves you say began to go."

Sheoms he had down across the last


In [16]:
import urllib.request
from pathlib import Path

url = (
 "https://raw.githubusercontent.com/rasbt/"
 "LLMs-from-scratch/main/ch05/"
 "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
# Fetch the helper script only when it is missing: re-running the cell then needs no
# network access and does not overwrite a copy that has already been reviewed locally.
# NOTE(security): the script comes from a mutable branch ("main") and is imported (i.e.
# executed) right below — review it, or pin the URL to a specific commit, before running.
if not Path(filename).exists():
    urllib.request.urlretrieve(url, filename)


from gpt_download import download_and_load_gpt2

# Download the GPT-2 124M checkpoint files into ./gpt2 (see the cell output) and load them.
settings, params = download_and_load_gpt2(
 model_size="124M", models_dir="gpt2"
)

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [17]:
from core.data import utils as data_utils

In [21]:
# Presumably copies the downloaded GPT-2 parameters (`params`) into `model_gpt`.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'GPTModel' object has no attribute 'pos_emb'", so the loader appears to
# expect attribute names that this GPTModel does not define — align the two before re-running.
# NOTE(review): model_gpt was built from GPT_CONFIG_124M (context_length=256, "qvk_bias": False);
# the pretrained 124M checkpoint presumably needs a matching config (1024 positions, QKV bias) — confirm.
data_utils.load_weights_into_gpt(model_gpt, params)

AttributeError: 'GPTModel' object has no attribute 'pos_emb'

In [2]:
import pandas as pd

# Resolve the dataset relative to the project root instead of a machine-specific absolute
# path, so the notebook runs from any checkout.
# NOTE(review): relies on `root_path` from the rootutils setup cell at the top of the
# notebook and assumes Data/ sits directly under the project root — confirm.
data_file_path = root_path / "Data" / "SMSSpamCollection"

# The file is tab-separated with no header row: first column is the label, second the message.
df = pd.read_csv(
 data_file_path, sep="\t", header=None, names=["Label", "Text"]
)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Number of messages per label in the raw dataset.
print(df['Label'].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


In [11]:
def create_balanced_data(df, random_state=123):
    """Return a class-balanced copy of ``df`` by undersampling the "ham" rows.

    All rows labelled "spam" are kept, and the same number of "ham" rows is drawn
    at random. The ham draw is seeded so that re-running the notebook reproduces
    the same balanced dataset (and therefore the same downstream splits).

    Args:
        df: DataFrame with a "Label" column containing "spam" / "ham".
        random_state: Seed forwarded to ``DataFrame.sample`` for the ham draw.
            Pass ``None`` for a different random draw on every call.

    Returns:
        DataFrame with all spam rows first, followed by the sampled ham rows,
        re-indexed from 0.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when there are fewer ham rows
            than spam rows (sampling is without replacement).
    """
    spam_df = df[df["Label"] == "spam"]
    num_spam = len(spam_df)

    # Draw exactly as many ham rows as there are spam rows.
    ham_df = df[df["Label"] == "ham"].sample(num_spam, random_state=random_state)

    balanced_df = pd.concat([spam_df, ham_df]).reset_index(drop=True)
    return balanced_df

In [20]:
# Balance the classes, then encode the string labels as integers (spam -> 1, ham -> 0).
balanced_df = create_balanced_data(df).assign(
    Label=lambda frame: frame['Label'].map({'spam': 1, 'ham': 0})
)
balanced_df

Unnamed: 0,Label,Text
0,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...
2,1,WINNER!! As a valued network customer you have...
3,1,Had your mobile 11 months or more? U R entitle...
4,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
1489,0,"Good morning, my Love ... I go to sleep now an..."
1490,0,Hi:)did you asked to waheeda fathima about leave?
1491,0,Good morning. At the repair shop--the ONLY rea...
1492,0,Pls call me da. What happen.


In [28]:
def random_split(df, train_ratio = 0.7, validation_ratio = 0.1, random_state = 123):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    The rows are shuffled with a fixed seed, re-indexed from 0 and then sliced
    in order: the first ``train_ratio`` share is the training set, the next
    ``validation_ratio`` share the validation set, and whatever remains is the
    test set.

    Args:
        df: DataFrame to split.
        train_ratio: Fraction of rows assigned to the training set.
        validation_ratio: Fraction of rows assigned to the validation set.
        random_state: Seed for the shuffle (default 123).

    Returns:
        Tuple ``(df_train, df_validation, df_test)`` of slices of the shuffled frame.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    # Small tolerance so ratios that sum to 1 only up to float rounding are accepted.
    if train_ratio < 0 or validation_ratio < 0 or train_ratio + validation_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and validation_ratio must be non-negative and sum to at most 1"
        )

    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train_end = int(len(shuffled) * train_ratio)
    validation_end = train_end + int(len(shuffled) * validation_ratio)

    df_train = shuffled.iloc[:train_end]
    df_validation = shuffled.iloc[train_end:validation_end]
    df_test = shuffled.iloc[validation_end:]

    return df_train, df_validation, df_test



In [29]:
# Split the balanced dataset using random_split's default ratios (0.7 train, 0.1 validation, remainder test).
df_train, df_validation, df_test = random_split(balanced_df)

In [30]:
# Display the training partition.
df_train

Unnamed: 0,Label,Text
0,1,U have a secret admirer who is looking 2 make ...
1,1,We tried to contact you re your reply to our o...
2,0,Yup
3,0,"I've got &lt;#&gt; , any way I could pick up?"
4,0,"Wishing you and your family Merry ""X"" mas and ..."
...,...,...
1040,0,Do you like shaking your booty on the dance fl...
1041,0,Y lei?
1042,0,"No idea, I guess we'll work that out an hour a..."
1043,0,Eatin my lunch...


In [31]:
# Display the validation partition.
df_validation

Unnamed: 0,Label,Text
1045,0,Ok...
1046,0,Oops. 4 got that bit.
1047,1,"As a valued customer, I am pleased to advise y..."
1048,1,U've been selected to stay in 1 of 250 top Bri...
1049,0,Nope... C ü then...
...,...,...
1189,1,Todays Voda numbers ending with 7634 are selec...
1190,1,"Did you hear about the new ""Divorce Barbie""? I..."
1191,1,WIN URGENT! Your mobile number has been awarde...
1192,1,For ur chance to win a £250 wkly shopping spre...


In [32]:
# Display the test partition.
df_test

Unnamed: 0,Label,Text
1194,0,Hey gals.. Anyone of u going down to e driving...
1195,0,"Aight no rush, I'll ask jay"
1196,0,"I don't have anybody's number, I still haven't..."
1197,1,Someone U know has asked our dating service 2 ...
1198,0,Ha ha - had popped down to the loo when you he...
...,...,...
1489,0,The world suffers a lot... Not because of the ...
1490,0,No da. . Vijay going to talk in jaya tv
1491,0,Dont pack what you can buy at any store.like c...
1492,0,Its a great day. Do have yourself a beautiful ...


In [33]:
# Write each partition to its own CSV file in the current working directory.
# index=None is kept exactly as in the original call.
for split_frame, csv_name in (
    (df_train, "train.csv"),
    (df_validation, "validation.csv"),
    (df_test, "test.csv"),
):
    split_frame.to_csv(csv_name, index=None)