In [1]:

#This sample is adapted from the work provided here: https://github.com/karpathy/nanoGPT
# Removed some of the more complicated concepts (learning rate decay, gradient accumulation steps, DDP support, configuration management, etc.) for a clearer sample.
#refactored to create a training class responsible for training the model, and a dataloader to fetch data

# There are 2 datasets to train from: the character-level representation of Shakespeare, which yields a vocab size of 65,
# and the toy dataset, which has a vocabulary of 11.

import torch
from torch.nn import functional as F

from config import ToyConfig, ToyTrainingConfig
from config import ToyTrainingConfig
from toy_model import ToyModel
from data_loader import DataLoader
from toy_trainer import ToyTrainer

# Model settings, starting from the defaults declared in ToyConfig.
modelConfig = ToyConfig()
# Override any default model values here:

# Training settings, starting from the defaults declared in ToyTrainingConfig.
trainingConfig = ToyTrainingConfig()
trainingConfig.block_size = modelConfig.block_size # Need to set to ToyConfig.block_size

# name of the dataset to use
dataset = 'toy'

# Load the dataset, then copy its vocabulary size into both configs
# before the model and the trainer are constructed from them.
data = DataLoader(dataset=dataset)
modelConfig.vocab_size = data.get_vocab_size()
trainingConfig.vocab_size = modelConfig.vocab_size
model = ToyModel(modelConfig)
# The trainer receives the training config, the model and the data loader;
# later cells reach the optimizer through trainer.optimizer.
trainer = ToyTrainer(trainingConfig, model, data)

found vocab_size = 11 (inside data\toy\meta.pkl)
number of parameters: 0.00M


In [2]:
# Train the model for a while so that the results aren't random.

##### NOTE: Training for more iterations will increase the probability of a correct answer; try larger training iteration counts to see the improvement.
# It is also important to note that this is a very small dataset.

# Arguments, in order: batch_size, number of iterations, log every x iterations
trainer.TrainingLoop(20, 1000, 100)


iter 0: loss 2.3001, time 57.61ms
iter 100: loss 1.2526, time 7.81ms
iter 200: loss 0.5879, time 4.88ms
iter 300: loss 0.3571, time 8.78ms
iter 400: loss 0.2281, time 6.84ms
iter 500: loss 0.1400, time 5.87ms
iter 600: loss 0.1189, time 8.79ms
iter 700: loss 0.1120, time 4.88ms
iter 800: loss 0.0956, time 7.81ms
iter 900: loss 0.0670, time 5.86ms
iter 1000: loss 0.0702, time 6.84ms


In [3]:
# You can repeat this cell as necessary until you get a good input.
X, Y = trainer.get_single_batch() #get_single_batch gets validation data so this is data the model has not seen during training

# Y.size() is unpacked into two dimensions (presumably batch size and block length);
# T-1 then indexes the final position of the first row.
B, T = Y.size()
target = Y[0][T-1].tolist()
print(data.decode([target]), ' : Target single Character according to source input \n')

# In the output recorded for this cell the input matches the target except for its
# final character, which is the one to predict; the number of characters shown is based on block size.
print("Input: \n",data.decode(X.flatten().tolist()))
print("\nTarget: \n",data.decode(Y.flatten().tolist()))

# Raw token-id tensors behind the decoded strings above.
print("\nX: \n", X)
print("\nY: \n", Y)

   

N  : Target single Character according to source input 

Input: 
 COR 

Target: 
 CORN

X: 
 tensor([[2, 7, 8, 0]])

Y: 
 tensor([[2, 7, 8, 6]])


In [4]:
# Every time this cell is run it performs one more backward pass and optimizer step on this input, so the model learns it a little better.
import torch
import torch.nn.functional as F

def format_probs_tensor_as_percent(probs_tensor, decode=None):
    """
    Converts a PyTorch tensor of probabilities into a percentage format for display.
    If probs_tensor is 1D (shape [vocab_size]), we treat it as a single row.
    If it's 2D (shape [N, vocab_size]), we handle multiple rows.

    :param probs_tensor: PyTorch tensor containing probability values (0 to 1).
    :param decode: Optional callable that maps a list of token ids to a string.
        When omitted, the notebook-level ``data.decode`` is used, so existing
        one-argument calls keep working unchanged.
    :return: A list of lists of formatted strings such as ``"'A':12.5%"``
        (one decimal place), one sub-list per row.
    """
    if decode is None:
        # Fall back to the dataset loader created earlier in the notebook.
        decode = data.decode

    # 1) Convert to percentage scale and cast to float
    probs_percent = (probs_tensor * 100).to(torch.float32)

    # 2) Ensure 2D for uniform processing
    #    If shape is (vocab_size,), make it (1, vocab_size)
    if probs_percent.dim() == 1:
        probs_percent = probs_percent.unsqueeze(0)

    # Build one label per column from the tensor's own width, so the labels
    # always line up with the values instead of relying on modelConfig.vocab_size.
    vocab_arr = [decode([i]) for i in range(probs_percent.shape[-1])]

    # 3) Convert each probability to a formatted string with one decimal place.
    #    We return a 2D list, one sub-list per row.
    return [
        [f"'{token}':{percent:.1f}%" for token, percent in zip(vocab_arr, row)]
        for row in probs_percent.tolist()
    ]

def format_model_results(target, logits, printCount):
    """
    Print the model's per-position predictions and compare its final
    predicted character with the final character of the target.

    :param target: Tensor of target token ids; it is flattened and decoded as one string.
    :param logits: Model output with three dimensions, unpacked as
        (batch, positions, vocabulary). The batch dimension must be 1.
    :param printCount: How many trailing positions get a per-token
        probability breakdown printed.
    :return: None. All results are written with print().
    """
    # Probabilities over the vocabulary, and the most likely token id per position.
    probabilities = F.softmax(logits, dim=-1)
    predicted_ids = torch.argmax(probabilities, dim=-1)

    batch_count, seq_len, _ = probabilities.shape
    assert batch_count == 1, "This snippet assumes batch size = 1"

    # Only the last `printCount` positions are detailed,
    # e.g. seq_len=10 and printCount=4 covers positions 6..9.
    first_shown = max(0, seq_len - printCount)
    for position in range(first_shown, seq_len):
        predicted_char = data.decode([predicted_ids[0, position].item()])
        # The formatter returns one list per row; a single position is a single row.
        per_token = format_probs_tensor_as_percent(probabilities[0, position])[0]
        print(f"{predicted_char} -> {per_token}")

    # Full sequences, decoded back to text.
    decoded_target = data.decode(target.flatten().tolist())
    decoded_model = data.decode(predicted_ids.flatten().tolist())

    print("\nData Target (decoded): \n", decoded_target)
    print("\nModel Result (decoded): \n", decoded_model)

    # Only the final character decides whether this pass counts as a success.
    print(f"\nLast Char of the Target: {decoded_target[-1]}")
    print(f"Last Char of the  Model: {decoded_model[-1]}")

    if decoded_target[-1] == decoded_model[-1]:
        outcome = 'Succeeded'
    else:
        outcome = 'Failed'
    print('\nThis pass: '+ outcome)

# -- Run the model on the single batch; the call yields the logits and the loss --
logits, loss = model(X, Y) 

print('Results of the Model vs the Target input: **Note: rerunning this will do a backwards pass and thus improve the output \n')

# Ask for a breakdown of up to the last 5 positions.
format_model_results(Y, logits, 5)

print('\nLoss: ', loss.item())

# Backward pass followed by one optimizer step: this is why re-running
# the cell changes the model's output for this input.
loss.backward()

trainer.optimizer.step()

# flush the gradients as soon as we can, no need for this memory anymore
trainer.optimizer.zero_grad(set_to_none=True)

Results of the Model vs the Target input: **Note: rerunning this will do a backwards pass and thus improve the output 

C -> ["' ':0.1%", "'A':0.1%", "'C':98.6%", "'E':0.1%", "'I':0.2%", "'L':0.1%", "'N':0.3%", "'O':0.1%", "'R':0.1%", "'S':0.0%", "'T':0.2%"]
O -> ["' ':0.1%", "'A':0.1%", "'C':0.1%", "'E':0.1%", "'I':0.1%", "'L':0.1%", "'N':0.1%", "'O':98.8%", "'R':0.1%", "'S':0.1%", "'T':0.3%"]
R -> ["' ':0.1%", "'A':0.1%", "'C':0.1%", "'E':0.1%", "'I':0.1%", "'L':0.0%", "'N':0.0%", "'O':0.1%", "'R':99.0%", "'S':0.1%", "'T':0.2%"]
N -> ["' ':0.1%", "'A':1.3%", "'C':0.6%", "'E':3.1%", "'I':0.2%", "'L':0.2%", "'N':91.5%", "'O':0.1%", "'R':0.2%", "'S':1.2%", "'T':1.5%"]

Data Target (decoded): 
 CORN

Model Result (decoded): 
 CORN

Last Char of the Target: N
Last Char of the  Model: N

This pass: Succeeded

Loss:  0.030869895592331886


In [6]:
# Inspect the shape of the raw model logits: values exist at all positions, not just the final position that we are utilizing for the projection of the final answer.

print("Size of model logits: ", logits.size()) # Batch, Block_Size, Vocabulary


Size of model logits:  torch.Size([1, 4, 11])


In [8]:
import torch

# This block shows a sample of the difference between using an "Educated Guess" (random sampling) vs a "Highest Probability" (argmax) pick.

# Define probability distribution
probs = torch.tensor([0.1, 0.3, 0.4, 0.2])  # Must be non-negative

# Educated Guess: sample 1 index at random, weighted by probs.
# No seed is set in this cell, so the sampled index can differ between runs.
educated_guess = torch.multinomial(probs, num_samples=1, replacement=False).item()  # using .item() because we are only drawing a single sample for comparison

# Highest Probability: the index of the largest entry (index 2 for the values above).
highest_probablity = torch.argmax(probs).item() 

print('Educated Guess: ', educated_guess) 
print('Highest Probability: ', highest_probablity)

Educated Guess:  3
Highest Probability:  2
