In [1]:

#this sample is adapted from the work provided here: https://github.com/karpathy/nanoGPT
#removed some of the more complicated concepts (learning rate decay, gradient accumulation steps, DDP support, configuration management, etc.) for a clearer sample
#refactored to create a training class responsible for training the model, and a dataloader to fetch data

# There are 2 datasets to train from, the character representation of shakespeare which yields a vocab size of 65
# The toy dataset which has a vocabulary of 11

import torch
from torch.nn import functional as F

from config import ToyConfig, ToyTrainingConfig
from config import ToyTrainingConfig
from toy_model import ToyModel
from data_loader import DataLoader
from toy_trainer import ToyTrainer

# Model hyperparameters: start from the ToyConfig defaults, then override below.
modelConfig = ToyConfig()
# Override any default values here:
modelConfig.block_size= 64
modelConfig.n_embd = 128 # Number of C elements in the (B, T, C) tensor used in the model (B = batch size, T = time position / sequence length, C = number of embedding elements)

# Training hyperparameters: start from the ToyTrainingConfig defaults, then override below.
trainingConfig = ToyTrainingConfig()
trainingConfig.block_size = modelConfig.block_size # Must be set to the model's block_size
trainingConfig.learning_rate= 1e-3

# Name of the dataset to use ('toy' or 'shakespeare')
dataset = 'shakespeare'

# The vocabulary size is read from the loaded dataset and copied into both configs
# before the model and the trainer are constructed from them, so the order of the
# statements below matters.
data = DataLoader(dataset=dataset)
modelConfig.vocab_size = data.get_vocab_size()
trainingConfig.vocab_size = modelConfig.vocab_size
model = ToyModel(modelConfig)
trainer = ToyTrainer(trainingConfig, model, data)

found vocab_size = 65 (inside data\shakespeare\meta.pkl)
number of parameters: 0.21M


In [2]:
# Train the model for a while so that the results aren't random. A loss of 1.55 is achievable with enough training using block_size 64 and n_embd 128.
# The learning rate is too high, which stops this model from reaching its best training loss of 1.48 (validation 1.67).

# Training for more iterations increases the probability of a correct answer; try larger iteration counts to see the improvement (up to the limits of the model / learning rate):

#shakespeare
# Arguments: batch_size, number of iterations, log every x iterations
trainer.TrainingLoop(24, 1200, 100) #1200 iterations takes about 1.5 min and yields about 1.7 loss


iter 0: loss 4.1845, time 174.81ms
iter 100: loss 2.5368, time 69.34ms
iter 200: loss 2.3905, time 83.01ms
iter 300: loss 2.2459, time 68.36ms
iter 400: loss 2.1227, time 88.88ms
iter 500: loss 2.0885, time 86.91ms
iter 600: loss 2.0424, time 81.06ms
iter 700: loss 1.9779, time 98.64ms
iter 800: loss 1.8516, time 91.81ms
iter 900: loss 1.8270, time 92.77ms
iter 1000: loss 1.8359, time 84.96ms
iter 1100: loss 1.7775, time 89.85ms
iter 1200: loss 1.7581, time 87.89ms


In [24]:
# You can repeat this cell as necessary until you get a good input
X, Y = trainer.get_single_batch() # get_single_batch gets validation data, so this is data the model has not seen during training

# Y is unpacked as (batch, sequence length); the last element of the first row
# is the single character the model is expected to predict at the final position.
B, T = Y.size()
target = Y[0][T-1].tolist()
print(data.decode([target]), ' : Target single Character according to source input \n')

# The target is the input shifted one character ahead (compare the two printed texts);
# the number of characters shown is based on block size
print("Input: \n",data.decode(X.flatten().tolist()))
print("\nTarget: \n",data.decode(Y.flatten().tolist()))

# Uncomment to inspect the raw token-id tensors:
#print("\nX: \n", X)
#print("\nY: \n", Y)

   

t  : Target single Character according to source input 

Input: 
  thou more murmur'st, I will rend an oak
And peg thee in his kno

Target: 
 thou more murmur'st, I will rend an oak
And peg thee in his knot


In [28]:
# every time this cell is run, the model takes another training step on this input
import torch
import torch.nn.functional as F

def format_probs_tensor_as_percent(probs_tensor):
    """
    Render a tensor of probabilities as labelled percentage strings.

    A 1D tensor is handled as a single row; a 2D tensor yields one list of
    strings per row. Each string has the form ``'<token>':<percent>%`` with the
    percentage shown to one decimal place.

    Relies on the notebook-level ``data`` (to decode token ids) and
    ``modelConfig.vocab_size`` (number of tokens to label).

    :param probs_tensor: PyTorch tensor containing probability values (0 to 1).
    :return: A list of lists of formatted percentage strings, one inner list per row.
    """
    # Scale 0..1 probabilities to 0..100 and work in float32.
    scaled = (probs_tensor * 100).to(torch.float32)

    # Promote a lone row to a one-row batch so both cases share the loop below.
    rows = scaled.unsqueeze(0) if scaled.dim() == 1 else scaled

    # Human-readable label for every token id in the vocabulary.
    symbols = [data.decode([token_id]) for token_id in range(modelConfig.vocab_size)]

    result = []
    for row in rows:
        labelled = []
        for token_id, pct in enumerate(row):
            labelled.append(f"'{symbols[token_id]}':{float(pct):.1f}%")
        result.append(labelled)

    return result

def format_model_results(target, logits, printCount):
    """
    Print the model's predictions next to the expected target sequence.

    For the last ``printCount`` positions, prints the most likely character
    together with the per-token probabilities. Then prints the decoded target,
    the decoded model output, the last character of each, and whether those
    last characters match.

    Relies on the notebook-level ``data`` for decoding and on
    ``format_probs_tensor_as_percent`` for the probability strings.

    :param target: tensor of expected token ids; flattened before decoding.
    :param logits: model output, unpacked as (batch, sequence, vocabulary); batch must be 1.
    :param printCount: how many trailing positions to print in detail.
    :return: None; all results are printed.
    """
    # Logits -> probabilities -> most likely token id at every position.
    probabilities = F.softmax(logits, dim=-1)
    predictions = probabilities.argmax(dim=-1)

    batch_count, seq_len, _vocab = probabilities.shape
    assert batch_count == 1, "This snippet assumes batch size = 1"

    # Only the trailing positions are detailed,
    # e.g. seq_len=10 and printCount=4 shows positions 6..9.
    for position in range(max(0, seq_len - printCount), seq_len):
        predicted_char = data.decode([predictions[0, position].item()])
        # The formatter returns one list per row; a single row is passed in, so take [0].
        per_token = format_probs_tensor_as_percent(probabilities[0, position])[0]
        print(f"{predicted_char} -> {per_token}")

    # Decode both full sequences for a side-by-side comparison.
    target_text = data.decode(target.flatten().tolist())
    model_text = data.decode(predictions.flatten().tolist())

    print("\nData Target (decoded): \n", target_text)
    print("\nModel Result (decoded): \n", model_text)

    # The final character is the prediction this demo cares about.
    print(f"\nLast Char of the Target: {target_text[-1]}")
    print(f"Last Char of the  Model: {model_text[-1]}")

    if target_text[-1] == model_text[-1]:
        outcome = 'Succeeded'
    else:
        outcome = 'Failed'
    print('\nThis pass: ' + outcome)

# Forward pass on the batch fetched earlier: returns the logits and the loss against Y.
# NOTE(review): X and Y come from trainer.get_single_batch(), which an earlier cell
# describes as validation data, so the optimizer step below trains on that batch.
# This looks intentional for the demo; confirm before reusing the model for evaluation.
logits, loss = model(X, Y) 

print('Results of the Model vs the Target input: **Note: rerunning this will do a backwards pass and thus improve the output \n')

# Show the predictions for the last 5 positions plus the full decoded sequences.
format_model_results(Y, logits, 5)

print('\nLoss: ', loss.item())

# Backward pass: compute gradients of the loss.
loss.backward()

# Apply one optimizer update using those gradients.
trainer.optimizer.step()

# flush the gradients as soon as we can, no need for this memory anymore
trainer.optimizer.zero_grad(set_to_none=True)

Results of the Model vs the Target input: **Note: rerunning this will do a backwards pass and thus improve the output 

  -> ["'\n':2.5%", "' ':90.3%", "'!':0.2%", "'$':0.0%", "'&':0.0%", "''':0.2%", "',':1.1%", "'-':0.0%", "'.':0.8%", "'3':0.0%", "':':0.6%", "';':0.2%", "'?':0.5%", "'A':0.0%", "'B':0.0%", "'C':0.0%", "'D':0.0%", "'E':0.0%", "'F':0.0%", "'G':0.0%", "'H':0.0%", "'I':0.0%", "'J':0.0%", "'K':0.0%", "'L':0.0%", "'M':0.0%", "'N':0.0%", "'O':0.0%", "'P':0.0%", "'Q':0.0%", "'R':0.0%", "'S':0.0%", "'T':0.1%", "'U':0.0%", "'V':0.0%", "'W':0.0%", "'X':0.0%", "'Y':0.0%", "'Z':0.0%", "'a':0.0%", "'b':0.0%", "'c':0.0%", "'d':0.1%", "'e':0.6%", "'f':0.0%", "'g':0.0%", "'h':1.9%", "'i':0.0%", "'j':0.0%", "'k':0.0%", "'l':0.0%", "'m':0.0%", "'n':0.0%", "'o':0.0%", "'p':0.0%", "'q':0.0%", "'r':0.0%", "'s':0.1%", "'t':0.6%", "'u':0.0%", "'v':0.0%", "'w':0.0%", "'x':0.0%", "'y':0.0%", "'z':0.0%"]
t -> ["'\n':0.0%", "' ':0.0%", "'!':0.0%", "'$':0.0%", "'&':0.0%", "''':0.2%", "',':0.0%", "

In [26]:
# See the raw model logits. This shows the logit values at all positions, not just the final position that we are utilizing for the projection of the final answer.

print("Size of model logits: ", logits.size()) # (Batch, Block_Size, Vocabulary)
print("model logits: \n", logits)


Size of model logits:  torch.Size([1, 64, 65])
model logits: 
 tensor([[[ 1.7899,  0.6305, -3.7608,  ..., -2.5100,  0.1603, -4.7069],
         [ 1.5309,  4.0947, -0.2506,  ..., -2.7289, -0.6738, -4.0253],
         [-0.1413,  1.7320, -1.7145,  ..., -1.2339,  4.0117, -3.9531],
         ...,
         [ 0.7147,  1.1776, -0.1299,  ...,  0.9869, -1.5470, -0.7838],
         [-0.2087,  1.3126, -0.0595,  ..., -1.3608, -0.1276, -0.8348],
         [-0.9568,  3.7113, -0.6027,  ..., -1.3182,  0.2163, -1.9614]]],
       grad_fn=<UnsafeViewBackward0>)


In [27]:
import torch

# This cell contrasts two ways of picking an index from a probability distribution:
# an "Educated Guess" (random sampling weighted by the probabilities) versus the
# "Highest Probability" (always taking the largest entry).

# Toy distribution over four outcomes; the weights must be non-negative
probs = torch.tensor([0.1, 0.3, 0.4, 0.2])

# Educated Guess: draw a single index at random, weighted by probs (without replacement).
# .item() unwraps the one sampled index into a plain Python int for the comparison below.
educated_guess = torch.multinomial(probs, 1).item()

# Highest Probability: the index of the largest entry, identical on every run
highest_probablity = probs.argmax().item()

print('Educated Guess: ', educated_guess)
print('Highest Probability: ', highest_probablity)

Educated Guess:  2
Highest Probability:  2
