In [1]:
from GPT import GPT
from torch.utils.data import DataLoader
import torch
import re
from GPTDataset import GPTDataset
import time

In [2]:
# Hyperparameters
CONTEXT_LEN = 128
EMBED_DIM = 256
NUM_HEADS = 8
NUM_LAYERS = 6
BATCH_SIZE = 32
LEARNING_RATE = 3e-4
EPOCHS = 20

In [4]:
def train_model():
    # Load data
    print("Loading data...")
    dataset = GPTDataset("./data/sample.txt", CONTEXT_LEN)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    print(f"Vocab size: {dataset.vocab_size}, Dataset size: {len(dataset)}")

    # Create model
    model = GPT(
        vocab_size=dataset.vocab_size,
        context_length=CONTEXT_LEN,
        model_dimension=EMBED_DIM,
        num_heads=NUM_HEADS,
        num_layers=NUM_LAYERS
    ).to('cuda' if torch.cuda.is_available() else 'cpu')

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    
    # Training loop
    print("Starting training...")
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        start_time = time.time_ns()
        for i, (x, y) in enumerate(dataloader):
            x, y = x.to(next(model.parameters()).device), y.to(next(model.parameters()).device)
            
            logits, loss = model(x, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

            if i % 100 == 0:
                print(f"Epoch {epoch+1}, Batch {i}, Loss: {loss.item():.4f}")
        end_time = time.time_ns()
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}, elapsed time : {(end_time - start_time)//1000}")
        
    print("Model saved!")
    return model, dataset

In [5]:
def generate_text(model, dataset, prompt="The main key", max_tokens=50):
    model.eval()
    device = next(model.parameters()).device
    
    # Tokenize prompt
    words = re.findall(r'\b\w+\b', prompt.lower())
    idx = torch.tensor([[dataset.stoi.get(word, 0) for word in words]], dtype=torch.long).to(device)
    
    # Generate
    with torch.no_grad():
        for _ in range(max_tokens):
            if idx.size(1) > CONTEXT_LEN:
                idx = idx[:, CONTEXT_LEN:]
            
            logits, _ = model(idx)
            next_token = torch.multinomial(torch.softmax(logits[:, -1, :], dim=-1), 1)
            idx = torch.cat([idx, next_token], dim=1)
    
    # Convert back to text
    generated_words = [dataset.itos[i.item()] for i in idx[0]]
    return ' '.join(generated_words)

In [None]:
model, dataset = train_model()

Loading data...
Vocab size: 272, Dataset size: 526
Starting training...
Epoch 1, Batch 0, Loss: 5.8297
Epoch 1 completed. Average Loss: 4.8647, elapsed time : 1282968
Epoch 2, Batch 0, Loss: 4.0838
Epoch 2 completed. Average Loss: 3.4063, elapsed time : 814111
Epoch 3, Batch 0, Loss: 2.6912
Epoch 3 completed. Average Loss: 2.1878, elapsed time : 861473
Epoch 4, Batch 0, Loss: 1.6992
Epoch 4 completed. Average Loss: 1.4883, elapsed time : 812183
Epoch 5, Batch 0, Loss: 1.3006
Epoch 5 completed. Average Loss: 1.1126, elapsed time : 816361
Epoch 6, Batch 0, Loss: 0.9623
Epoch 6 completed. Average Loss: 0.8276, elapsed time : 833980
Epoch 7, Batch 0, Loss: 0.6818
Epoch 7 completed. Average Loss: 0.5832, elapsed time : 787740
Epoch 8, Batch 0, Loss: 0.4719
Epoch 8 completed. Average Loss: 0.3779, elapsed time : 795950
Epoch 9, Batch 0, Loss: 0.2879
Epoch 9 completed. Average Loss: 0.2217, elapsed time : 794041
Epoch 10, Batch 0, Loss: 0.1779
Epoch 10 completed. Average Loss: 0.1266, elapsed

In [11]:
print("\nGenerating sample text:")
sample = generate_text(model, dataset, "watching cell phones while driving", 80)
print(f"Generated: {sample}")


Generating sample text:
Generated: 000 cell phones while driving in conclusion drivers should regulate able to work a vehicle while using their cell phone drivers who uses their phones while operating a vehicle and are likely to have an accident then those who don t cell phone operation while driving the ability to stay connected to people we know despite distance was originally brought to fruition by the use of letters this system was found to be rather slow and new pathways were searched for until the invention of


In [None]:
def save_model(model, dataset, filename="MyGPT.pth"):
    checkpoint = {
        # Model weights
        'model_state_dict': model.state_dict(),
        
        # Model configuration
        'model_config': {
            'vocab_size': dataset.vocab_size,
            'context_length': model.context_length,
            'model_dimension': model.token_embeddings.embedding_dim,
            'num_heads': NUM_HEADS,  # Use the hyperparameter
            'num_layers': NUM_LAYERS  # Use the hyperparameter
        },
        
        # Tokenizer data
        'tokenizer_data': {
            'stoi': dataset.stoi,
            'itos': dataset.itos,
            'vocab_size': dataset.vocab_size
        }
    }
    
    # Save the checkpoint
    torch.save(checkpoint, filename)
    print(f"COMPLETE MODEL SAVED SUCCESSFULLY!")
    print(f"Saved File Name: {filename}")
    print(f"Model Info:")
    print(f" - Vocabulary Size: {dataset.vocab_size}")
    print(f" - Context Length: {model.context_length}")
    print(f" - Model Dimension: {model.token_embeddings.embedding_dim}")
    print(f" - Number of Layers: {NUM_LAYERS}")
    print(f" - Number of Heads: {NUM_HEADS}")
    
    return filename


# Save the complete model
save_model(model, dataset)