In [1]:

#This sample is adapted from the work provided here: https://github.com/karpathy/nanoGPT
#removed some of the more complicated concepts (learning rate decay, gradient accumulation steps, DDP support, configuration management, etc.) for a clearer sample
#refactored to create a training class responsible for training the model, and a dataloader to fetch data

# There are 2 datasets to train from, the character representation of shakespeare which yields a vocab size of 65
# The toy dataset which has a vocabulary of 11

import torch
from torch.nn import functional as F

from config import ToyConfig, ToyTrainingConfig
from config import ToyTrainingConfig
from toy_model import ToyModel
from data_loader import DataLoader
from toy_trainer import ToyTrainer

# Model settings start from the ToyConfig defaults.
modelConfig = ToyConfig()
# Override any default model values here.

# Training settings start from the ToyTrainingConfig defaults.
trainingConfig = ToyTrainingConfig()
trainingConfig.block_size = modelConfig.block_size # the trainer must use the same block size as the model config

# Name of the dataset to use (the notes at the top of this cell describe the two available datasets).
dataset = 'toy'

# The vocabulary size comes from the dataset, so it is copied into both configs
# before the model and the trainer are constructed from them.
data = DataLoader(dataset=dataset)
modelConfig.vocab_size = data.get_vocab_size()
trainingConfig.vocab_size = modelConfig.vocab_size
model = ToyModel(modelConfig)
trainer = ToyTrainer(trainingConfig, model, data)

found vocab_size = 11 (inside data\toy\meta.pkl)
number of parameters: 0.00M


In [19]:
# Train the model a little so that the results below are not random.
#
# NOTE: training for more iterations increases the probability of a correct
# answer; try a larger number of training iterations to see the improvement.
# Also keep in mind that this is a very small dataset.

BATCH_SIZE = 4          # batch size
NUM_ITERATIONS = 100    # number of iterations
LOG_INTERVAL = 10       # log every x iterations

trainer.TrainingLoop(BATCH_SIZE, NUM_ITERATIONS, LOG_INTERVAL)


iter 0: loss 0.2082, time 8.79ms
iter 10: loss 0.1496, time 7.81ms
iter 20: loss 0.2383, time 6.83ms
iter 30: loss 0.2267, time 7.81ms
iter 40: loss 0.4842, time 9.76ms
iter 50: loss 0.1686, time 7.81ms
iter 60: loss 0.2581, time 7.81ms
iter 70: loss 0.2301, time 6.84ms
iter 80: loss 0.2345, time 7.81ms
iter 90: loss 0.1339, time 7.81ms
iter 100: loss 0.1890, time 7.81ms


In [9]:
# Repeat this cell as often as necessary until you get an input you like.
# get_single_batch gets validation data, so this is data the model has not
# seen during training.
X, Y = trainer.get_single_batch()

# The last position of the first target row is the single character the model
# is expected to produce.
B, T = Y.shape
target = Y[0, T - 1].item()
print(data.decode([target]), ' : Target single Character according to source input \n')

# The input is one character short of the target; the number of characters
# shown is based on the block size.
input_text = data.decode(X.flatten().tolist())
print("Input: \n", input_text)
target_text = data.decode(Y.flatten().tolist())
print("\nTarget: \n", target_text)

# Raw token ids behind the decoded text above.
print("\nX: \n", X)
print("\nY: \n", Y)

   

R  : Target single Character according to source input 

Input: 
 NEA 

Target: 
 NEAR

X: 
 tensor([[6, 3, 1, 0]])

Y: 
 tensor([[6, 3, 1, 8]])


In [20]:
# Run the sample through the model. The call returns the logits for every
# position together with a loss; only the logits are used below.
mod_logits, loss = model(X, Y)
logits = mod_logits[:, -1, :]  # pluck the logits at the final step

# apply softmax to convert logits to (normalized) probabilities
probs = F.softmax(logits, dim=-1)

# Educated guess: sample from the distribution, so likelier tokens are drawn
# more often but any token with non-zero probability can be chosen.
idx_guess = torch.multinomial(probs, num_samples=1)
idx_guess = idx_guess.flatten().tolist()  # flat list of token ids, the format needed for decoding

# Argmax: always picks the highest-probability token. Reduce over the
# vocabulary axis (dim=-1) so that every batch row gets its own pick; without
# `dim`, argmax indexes into the flattened tensor, which is only a valid token
# id when the batch holds a single row.
idx_pick = torch.argmax(probs, dim=-1).flatten().tolist()  # same list format for decoding

guess = data.decode(idx_guess)
pick = data.decode(idx_pick)
print(data.decode([target]), ' : Actual Target (', target, ')')
print(guess, " : Educated guess by picking amongst the highest probabilities (", idx_guess[0], ')')
print(pick, " : Pick based solely on highest probability (", idx_pick[0], ')')

# Re-running this block sends the input through the model again (note that this is just inference, not training, so the model does not learn the answer).
# After multiple runs the highest-probability pick should not change, but the educated guess may land on the proper value.
# To increase the likelihood of success, run the training loop for more iterations.

R  : Actual Target ( 8 )
R  : Educated guess by picking amongst the highest probabilities ( 8 )
   : Pick based solely on highest probability ( 0 )


In [11]:
import torch
import pandas as pd

def format_probs_tensor_as_percent(probs_tensor):
    """
    Converts a PyTorch tensor of probabilities into a percentage format for display.

    The nesting of the result mirrors the shape of the input: a 2D tensor gives
    one list of strings per row, a 1D tensor gives a flat list of strings and a
    0-dim tensor gives a single string.

    :param probs_tensor: PyTorch tensor containing probability values (0 to 1).
    :return: Percentages formatted with two decimal places (e.g. '18.97%'),
             nested in the same way as the input tensor.
    """
    # Convert to percentage scale. detach() lets tensors that still carry
    # autograd history (e.g. a softmax output) be converted to plain numbers.
    probs_percent = (probs_tensor.detach() * 100).to(torch.float32)

    def _format(value):
        """Format a float, or recursively a (nested) list of floats, as 'xx.xx%'."""
        if isinstance(value, list):
            return [_format(item) for item in value]
        return f"{value:.2f}%"

    # tolist() yields Python floats nested like the tensor, so any rank works.
    return _format(probs_percent.tolist())

# Show the final-step logits, the probabilities derived from them, and the
# same probabilities rendered as percentages for easier reading.
print("logits: \n", logits)
print('Probs: \n', probs)

probs_formatted = format_probs_tensor_as_percent(probs)
print('Probs formatted: \n', probs_formatted)

logits: 
 tensor([[ 0.8495, -0.0207, -0.7849,  1.1798, -0.3315, -0.4967, -0.2611, -0.9563,
         -0.1560,  0.1018, -0.1526]], grad_fn=<SliceBackward0>)
Probs: 
 tensor([[0.1897, 0.0794, 0.0370, 0.2639, 0.0582, 0.0494, 0.0625, 0.0312, 0.0694,
         0.0898, 0.0696]], grad_fn=<SoftmaxBackward0>)
Probs formatted: 
 [['18.97%', '7.94%', '3.70%', '26.39%', '5.82%', '4.94%', '6.25%', '3.12%', '6.94%', '8.98%', '6.96%']]


In [7]:
# Inspect the raw model logits. This shows the logits at every position, not
# just the final position that is used to project the final answer.

logits_shape = mod_logits.shape  # Batch, Block_Size, Vocabulary
print("Size of model logits: ", logits_shape)
print("model logits: \n", mod_logits)


Size of model logits:  torch.Size([1, 4, 11])
model logits: 
 tensor([[[-0.1753,  0.2426,  0.1460, -0.2412,  0.0476, -0.2525,  1.1364,
          -0.1690, -0.4356, -0.1286,  0.1148],
         [-0.2818, -0.3707,  0.2225,  0.1348,  1.0175, -0.1664, -0.0777,
           0.1758, -0.4975,  0.1709, -0.4587],
         [-0.4698, -0.0209,  1.2362, -0.4932,  0.3155,  0.1761,  0.1195,
           0.1041, -0.0980,  0.0945,  0.0176],
         [ 0.7953, -0.2032, -0.7338,  1.2673, -0.1755, -0.5247, -0.3400,
          -0.8945, -0.2268,  0.1289, -0.2089]]], grad_fn=<UnsafeViewBackward0>)


In [8]:
import torch

# This block shows a sample of the difference between using an "Educated Guess"
# (sampling from the distribution) and a "Highest Probability" pick (argmax).

# Define a stand-alone probability distribution. It has its own name so that it
# does not overwrite `probs`, the model probabilities computed in the inference
# cell and printed by other cells.
demo_probs = torch.tensor([0.1, 0.3, 0.4, 0.2])  # Must be non-negative

# Educated Guess: sample 1 event without replacement; indices with a higher
# probability are drawn more often, but any non-zero entry can be picked.
educated_guess = torch.multinomial(demo_probs, num_samples=1, replacement=False).item()  # .item() because only a single sample is drawn for the comparison

# Highest Probability: always the index of the largest entry.
highest_probablity = torch.argmax(demo_probs).item()

print('Educated Guess: ', educated_guess)
print('Highest Probability: ', highest_probablity)

Educated Guess:  2
Highest Probability:  2
