In [1]:

#this sample is adapted from the work provided here: https://github.com/karpathy/nanoGPT
#removed some of the more complicated concepts (learning rate decay, gradiant accumulation steps, dps support, configuration management etc for a clearer sample)
#refactored to create a training class responsible for training the model, and a dataloader to fetch data

# There are 2 datasets to train from, the character representation of shakespeare which yields a vocab size of 65
# The toy dataset which has a vocabulary of 11

import torch
from torch.nn import functional as F

from config import ToyConfig, ToyTrainingConfig
from config import ToyTrainingConfig
from toy_model import ToyModel
from data_loader import DataLoader
from toy_trainer import ToyTrainer

modelConfig = ToyConfig()
#override any default values here: 
# example overrides for CPU processing of the shakespeare dataset.
modelConfig.block_size= 64
modelConfig.n_layer = 4 # Number of layers within the transformer
modelConfig.n_head = 2 # Number Attention heads
modelConfig.n_embd = 128 # Number of C elements in the B, T, C Tensor that is used in the model ( B = Batch Size, T = Time position or Sequence Length, C = Number of embed elements )


trainingConfig = ToyTrainingConfig()
trainingConfig.block_size = modelConfig.block_size # Need to set to ToyConfig.block_size

# name of the dataset to use ('toy' or 'shakespeare')
dataset = 'shakespeare'

data = DataLoader(dataset=dataset)
modelConfig.vocab_size = data.get_vocab_size()
trainingConfig.vocab_size = modelConfig.vocab_size
model = ToyModel(modelConfig)
trainer = ToyTrainer(trainingConfig, model, data)

found vocab_size = 65 (inside data\shakespeare\meta.pkl)
number of parameters: 0.21M


In [7]:
#train the model some so that the results aren't random

##### NOTE: Training for more iterations will increase the probability of a correct answer try with larger trainning iterations to see the improvement:
# also important to note this is such a small data set 

#shakespeare
#batch_size, number of iterations, log every x iterations
trainer.TrainingLoop(24, 2000, 200) #takes approx 4 minutes  on an older PC

#running this loop 1 time yields a loss of approx 1.7 +/- .15
#running it 2 times yields a training loss of approx 1.6 +/- .30


iter 0: loss 1.7167, time 93.75ms
iter 200: loss 1.5815, time 77.14ms
iter 400: loss 1.6041, time 93.75ms
iter 600: loss 1.6452, time 92.77ms
iter 800: loss 1.6051, time 65.44ms
iter 1000: loss 1.5516, time 84.47ms
iter 1200: loss 1.5898, time 77.15ms
iter 1400: loss 1.5768, time 89.85ms
iter 1600: loss 1.6485, time 86.92ms
iter 1800: loss 1.6271, time 61.53ms
iter 2000: loss 1.6035, time 78.13ms


In [5]:
# you can repeat this as nessary until you get a good input
X, Y = trainer.get_single_batch() #get_single_batch gets validation data so this is data the model has not seen during training

B, T = Y.size()
target = Y[0][T-1].tolist()
print(data.decode([target]), ' : Target single Character according to source input \n')

# The target is one character less then the target, the number of characters shown is based on block size
print("Input: \n",data.decode(X.flatten().tolist()))
print("\nTarget: \n",data.decode(Y.flatten().tolist()))

print("\nX: \n", X)
print("\nY: \n", Y)

   

e  : Target single Character according to source input 

Input: 
 is true?

ARIEL:
Ay, sir.

PROSPERO:
This blue-eyed hag was hith

Target: 
 s true?

ARIEL:
Ay, sir.

PROSPERO:
This blue-eyed hag was hithe

X: 
 tensor([[47, 57,  1, 58, 56, 59, 43, 12,  0,  0, 13, 30, 21, 17, 24, 10,  0, 13,
         63,  6,  1, 57, 47, 56,  8,  0,  0, 28, 30, 27, 31, 28, 17, 30, 27, 10,
          0, 32, 46, 47, 57,  1, 40, 50, 59, 43,  7, 43, 63, 43, 42,  1, 46, 39,
         45,  1, 61, 39, 57,  1, 46, 47, 58, 46]])

Y: 
 tensor([[57,  1, 58, 56, 59, 43, 12,  0,  0, 13, 30, 21, 17, 24, 10,  0, 13, 63,
          6,  1, 57, 47, 56,  8,  0,  0, 28, 30, 27, 31, 28, 17, 30, 27, 10,  0,
         32, 46, 47, 57,  1, 40, 50, 59, 43,  7, 43, 63, 43, 42,  1, 46, 39, 45,
          1, 61, 39, 57,  1, 46, 47, 58, 46, 43]])


In [8]:
mod_logits, loss = model(X, Y) 
logits = mod_logits[:, -1, :] # pluck the logits at the final step 

# apply softmax to convert logits to (normalized) probabilities
probs = F.softmax(logits, dim=-1)
# sample from the distribution
idx_guess = torch.multinomial(probs, num_samples=1)
idx_guess= idx_guess.flatten().tolist() # This formats the ouptut into what is needed for decoding

# Argmax: always picks the highest probability word
idx_pick = torch.argmax(probs).item() 
idx_pick = list([idx_pick]) # same this formats for decoding

guess = data.decode(idx_guess)
pick = data.decode(idx_pick)
print(data.decode([target]), ' : Actual Target (', target, ')')
print(guess, " : Educated guess by picking amongst the highest probabilities (", idx_guess[0], ')')
print(pick, " : Pick based solely on highest probability (", idx_pick[0], ')')

# Re running this block will re run the input through the model (note that this is just inference and not training so it does not learn the answer)
# after multiple runs the probability pick should not change however the educated guess might guess the proper value
# in addition to increase the likely hood of success run the training loop for more iterations

e  : Actual Target ( 43 )
   : Educated guess by picking amongst the highest probabilities ( 1 )
e  : Pick based solely on highest probability ( 43 )


In [6]:
import torch
import pandas as pd

def format_probs_tensor_as_percent(probs_tensor):
    """
    Converts a PyTorch tensor of probabilities into a percentage format for display.
    Handles 2D tensors by processing each row separately.

    :param probs_tensor: PyTorch tensor containing probability values (0 to 1).
    :return: List of formatted percentages as strings.
    """
    probs_percent = (probs_tensor * 100).to(torch.float32)  # Convert to percentage scale
    
    # Convert each probability to a formatted string with two decimal places
    formatted_probs = [[f"{float(p):.2f}%" for p in row] for row in probs_percent]

    return formatted_probs

print("logits: \n", logits)
print('Probs: \n', probs)

print('Probs formatted: \n', format_probs_tensor_as_percent(probs))

logits: 
 tensor([[ 4.3867e+00,  8.3823e+00, -2.1884e+00, -4.7520e+00, -4.6742e+00,
          1.9149e+00,  7.4428e-01,  3.1786e+00, -7.1796e-01, -4.3645e+00,
         -1.2102e-01, -1.1285e+00, -1.7564e+00, -5.6405e+00, -2.9424e+00,
         -1.5828e+00, -4.9551e+00, -4.6845e+00, -3.1202e+00, -2.6425e+00,
         -3.0984e+00, -2.3559e+00, -5.1862e+00, -3.6260e+00, -2.4561e+00,
         -3.5230e+00, -2.9129e+00, -3.1595e+00, -4.3750e+00, -5.9201e+00,
         -2.5016e+00, -2.3055e+00, -2.4559e+00, -5.1129e+00, -1.7377e+00,
         -2.8831e+00, -3.7401e+00, -4.8751e+00, -3.5875e+00, -8.8616e-01,
         -6.4739e-01,  6.1424e-01,  1.1200e+00, -4.3336e-01,  3.4910e-01,
          2.2319e-03, -1.7371e+00,  1.2398e+00, -1.7297e+00, -4.6964e-01,
          7.8091e-02,  1.3279e+00,  1.9822e+00,  2.5495e-01, -2.5736e-01,
         -1.8223e+00,  1.7114e+00,  9.6151e-01,  4.1667e-01,  1.8212e-01,
          1.0968e+00,  8.2465e-01, -2.1205e+00, -8.7777e-01, -4.0378e+00]],
       grad_fn=<SliceBackw

In [7]:
# see the raw model logits, this shows the value of the logits at all positions and not just the final position that we are uttilizing for projection of the final answer

print("Size of model logits: ", mod_logits.size()) # Batch, Block_Size, Vocabulary 
print("model logits: \n", mod_logits)


Size of model logits:  torch.Size([1, 64, 65])
model logits: 
 tensor([[[ 5.7792,  7.8561, -1.3397,  ..., -5.3172, -0.5288, -6.9107],
         [-2.4496, -2.7531, -4.8655,  ..., -3.0714,  2.6392, -3.7142],
         [-1.2435,  0.0179, -2.2988,  ..., -3.4009,  2.1964, -3.9612],
         ...,
         [-1.2935,  0.8852, -1.9491,  ...,  0.0915, -0.5228,  1.4113],
         [ 0.3237,  3.1504,  0.5962,  ..., -1.8156,  2.2368, -1.6596],
         [ 4.3867,  8.3823, -2.1884,  ..., -2.1205, -0.8778, -4.0378]]],
       grad_fn=<UnsafeViewBackward0>)


In [8]:
import torch

#This block shows a sample of the difference between using an "Educated Guess" vs a "Highest Probability"

# Define probability distribution
probs = torch.tensor([0.1, 0.3, 0.4, 0.2])  # Must be non-negative

# Educated Guess: Sample 1 events without replacement 
educated_guess = torch.multinomial(probs, num_samples=1, replacement=False).item()  # using .item() becasue we are only doing a single sample for comparis

highest_probablity = torch.argmax(probs).item() 

print('Educated Guess: ', educated_guess) 
print('Highest Probability: ', highest_probablity)

Educated Guess:  3
Highest Probability:  2
