## makemore: part 5

In [None]:
# Import libraries 
import math 
import random
import torch
import torch.nn as nn 
import torch.nn.functional as F
import matplotlib.pyplot as plt         # for making figures
%matplotlib inline
from typing import Union, Dict, Tuple, List, Any 
from IPython.display import clear_output
from tqdm.autonotebook import tqdm
import capra_standard_functions as csf 

In [None]:
# Here we will import the list with all the words/names 
import os 
# Location of names.txt below the current user's home directory
words_txt_file_path = os.path.join(os.path.expanduser("~"), "NN_zero_to_hero", "Lectures", "Makemore_repo", "names.txt")
# Read one word per line. The context manager closes the file handle deterministically and the
# explicit encoding keeps the result independent of the platform's default locale.
with open(words_txt_file_path, 'r', encoding="utf-8") as words_file:
  words = words_file.read().splitlines()

print(f"Now we have read the {len(words)} words from the file. Here we display the first 8 words\n{words[:8]}")

In [None]:
### Here we are building the vocabulary and mapping from char to integer and vice versa 
# Distinct characters occurring anywhere in the word list, in sorted order
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 upwards; index 0 is reserved for the special start/end token '.'
s_to_i = dict(zip(chars, range(1, len(chars) + 1)))
s_to_i['.'] = 0
# Inverse lookup: integer index back to its character
i_to_s = {idx: ch for ch, idx in s_to_i.items()}
# Every token is a class the model can predict, so both names hold the same count
num_classes = vocab_size = len(s_to_i)

print(f"Now we have created the mappings between chars and integers and vice versa, which yields to {num_classes} different, possible characters (i.e. classes) in our dataset:\n{s_to_i=}\n{i_to_s=}")

In [None]:
### Build the dataset

# Create the dataset arrays from the words list
def build_dataset(words, s2i: Dict = s_to_i, block_size: int = 3, num_words: Union[int, None] = None, ):
  """
  Build next-character prediction examples from a collection of words.

  Every word is terminated with '.', and one example is emitted per character of the
  terminated word: the `block_size` preceding token indices (left-padded with 0) form the
  input and the index of the character itself is the target.

  Args:
    words: A sequence of strings, or a single string which is treated as one word.
    s2i: Mapping from character to integer index; must contain '.' and every character used.
    block_size: Number of context tokens per example.
    num_words: If given, only the first `num_words` words are used (for a single string:
      only its first `num_words` characters).

  Returns:
    A tuple (X, Y) of integer (int64) tensors: X holds one context per row, Y the matching targets.

  Raises:
    KeyError: If a character is not present in `s2i`.
  """
  if isinstance(words, str):
    # A bare string is always ONE word, with or without `num_words`; iterating it directly
    # would silently turn every character into its own word.
    selected = [words if num_words is None else words[:num_words]]
  else:
    # Any sliceable sequence of words (list, tuple, ...) is accepted as-is
    selected = words if num_words is None else words[:num_words]

  X, Y = [], []
  for w in selected:
    context = [0] * block_size
    for ch in w + '.':
      ix = s2i[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix]                              # crop and append the next character to the context
  # Explicit dtype so that empty inputs still yield index tensors instead of float tensors
  return torch.tensor(X, dtype=torch.long), torch.tensor(Y, dtype=torch.long)


# Create datasets 
block_size = 8                                                  # context length: number of characters used to predict the next one

# Deterministic in-place shuffle of the word list, so the splits are reproducible
random.seed(42)
random.shuffle(words)

# Split boundaries at 80% and 90% of the word list
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

# Training (first 80%), validation/dev (next 10%) and test (last 10%) tensors
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
  build_dataset(part, block_size=block_size)
  for part in (words[:n1], words[n1:n2], words[n2:])
)


In [None]:
# Print the first 20 training examples: the decoded context window followed by the character it should predict
for x,y in zip(Xtr[:20], Ytr[:20]):
  print(''.join(i_to_s[ix.item()] for ix in x), '-->', i_to_s[y.item()])

In [None]:
### Near copy paste of the layers we have developed in Part 3 - Batch Norm lecture 
# We want to use these different class instances in order to be able to stack them as different blocks on top of each other 
# This makes it very easy to simply instantiate new instances from each of these classes to create or extend a new network 
# PyTorch has similar layers (with similar names) for all these new layers that we are using here 

# -----------------------------------------------------------------------------------------------
class Linear:
  """
  Fully connected layer computing `x @ weight (+ bias)`.

  The weight has shape (fan_in, fan_out) and is drawn from a standard normal scaled by
  1/sqrt(fan_in); the optional bias starts at zero. The most recent output is kept on `self.out`.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    scale = fan_in**0.5                                   # kaiming-style scaling of the initial weights
    self.weight = torch.randn((fan_in, fan_out)) / scale
    self.bias = None
    if bias:
      self.bias = torch.zeros(fan_out)

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result.add_(self.bias)                              # in-place add on the freshly created matmul result
    self.out = result
    return result

  def parameters(self):
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params

# -----------------------------------------------------------------------------------------------
class BatchNorm1d:
  """
  Batch normalization over the last (channel) dimension of a 2D (N, C) or 3D (N, T, C) input.

  Notice that the running mean and variance are "trained" as an exponential moving average during training, i.e. not trained with backprop
  Notice that the layer acts differently during eval and training, this makes BatchNorm vulnerable to bugs and issues
  Notice that batch norm requires a training batch size of bs>=2, as we cannot calculate the variance of a single sample ...

  Args:
    dim: Number of channels, i.e. the size of the last input dimension.
    eps: Constant added to the variance before taking the square root.
    momentum: Weight of the current batch statistics in the running-average update.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.05):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    """
    Normalize `x` and return `gamma * xhat + beta`; the result is also stored on `self.out`.

    In training mode the statistics are taken over every dimension except the last one, and
    the running buffers are updated. In eval mode the running buffers are used instead.

    Raises:
      NotImplementedError: In training mode, if `x` is neither 2D nor 3D.
    """
    # calculate the forward pass
    if self.training:
      if x.ndim == 2: dim = 0
      elif x.ndim == 3: dim = (0, 1)
      # x.ndim is a plain int, so it is formatted directly (tuple(x.ndim) would raise a TypeError)
      else: raise NotImplementedError(f"Only 2D or 3D inputs are accepted. Now the input is {x.ndim}D with shape {tuple(x.shape)}!!")
      xmean = x.mean(dim, keepdim=True)     # batch mean
      xvar = x.var(dim, keepdim=True)       # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():
        # Flatten the keepdim statistics so the buffers keep their 1-D shape; otherwise
        # they would grow leading singleton dimensions and no longer broadcast correctly against
        # eval inputs of a different rank.
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean.reshape(-1)
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar.reshape(-1)
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:
  """Parameter-free element-wise tanh activation; the latest output is kept on `self.out`."""
  def __call__(self, x):
    activated = x.tanh()
    self.out = activated
    return activated
  def parameters(self):
    # An activation has nothing to train
    return list()

# -----------------------------------------------------------------------------------------------
class Embedding:
  """
  Lookup table that maps integer indices to vectors, i.e. the layer form of:   emb = C[Xb]

  The table is a (num_embeddings, embedding_dim) matrix drawn from a standard normal distribution.
  The forward pass is nothing more than indexing into that matrix with the given indices.
  """
  def __init__(self, num_embeddings, embedding_dim):
    # One row per entry of the vocabulary, one column per embedding dimension
    table_shape = (num_embeddings, embedding_dim)
    self.weight = torch.randn(table_shape)

  def __call__(self, IX):
    # Pick the rows of the table selected by the indices in IX
    rows = self.weight[IX]
    self.out = rows
    return rows

  def parameters(self):
    # The table itself is the only trainable tensor
    return [self.weight]

# -----------------------------------------------------------------------------------------------
class FlattenConsecutive:
  """
  Flatten layer that fuses `n` consecutive positions instead of all positions at once.

  A regular flatten turns an input of (N, block_size, emb_dim) into (N, block_size*emb_dim),
  e.g. (32, 8, 10) becomes (32, 80), which feeds the whole context into the first linear layer.
  Here only `n` neighbouring positions are concatenated, so with n=2 the input (32, 8, 10)
  becomes (32, 4, 20).
  This stays compatible with the Linear layer, because the PyTorch matrix multiplication only
  acts on the last dimension of its left operand and broadcasts over the leading ones:
        (32, 80) @ (80, 200)    = (32, 200)
        (32, 4, 20) @ (20, 200) = (32, 4, 200)
  """
  def __init__(self, n):
    self.n = n

  def __call__(self, x):
    batch, steps, channels = x.shape                        # batch size, number of positions, channels per position
    groups = steps // self.n                                # number of positions left after fusing
    fused = x.view(batch, groups, channels * self.n)
    if groups == 1:
      fused = fused.squeeze(1)                              # drop a spurious position dimension: (32, 1, 80) -> (32, 80)
    self.out = fused
    return fused

  def parameters(self):
    # Reshaping has nothing to train
    return []

# -----------------------------------------------------------------------------------------------
class Sequential:
  """
  Container that applies a list of layers one after the other, comparable to the PyTorch container
  --> https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html#torch.nn.Sequential

  The forward pass feeds the output of each layer into the next one, in list order.
  The parameters of the container are the parameters of all contained layers.
  """

  def __init__(self, layers):
    self.layers = layers

  def __call__(self, x):
    result = x
    for layer in self.layers:
      result = layer(result)
    self.out = result
    return result

  def parameters(self):
    # Concatenate the parameter lists of all layers into one flat list
    collected = []
    for layer in self.layers:
      collected.extend(layer.parameters())
    return collected


In [None]:
# Shape check only (the values are random): a (32, 80) input times an (80, 200) weight gives (32, 200),
# while for a (32, 4, 20) input the matmul acts on the last dimension only and gives (32, 4, 200).
# The (200,) bias broadcasts in both cases.
print(f"The output of a simple flattening: {tuple((torch.randn(32, 80) @ torch.randn(80, 200) + torch.randn(200)).shape)}")
print(f"The output of a consecutive flattning: {tuple((torch.randn(32, 4, 20) @ torch.randn(20, 200) + torch.randn(200)).shape)}")

In [None]:
### However, when implementing FlatteningConsecutive, we need to look at the batch norm ...
# Even though everything runs and the output x_hat has the wanted dimensions (due to broadcasting) the mean and the sigma does NOT have the wanted dimensions!
# As now the mean and the variance is calculated only across the first batch dimension and not the fusing dimension as well
# As we are using a channel size of 20, we only want 20 mean values, we don't want 4*20 mean values...
emb = torch.randn(32, 4, 20)                                    # (32, 4, 20)
# ------------------------------------------------------------------------------
# Reducing over the batch dimension only keeps separate statistics per position: 4*20 values
mu = emb.mean(0, keepdim=True)                                  # (1, 4, 20)
sigma = torch.sqrt(emb.var(0, keepdim=True) + 1e-5)             # (1, 4, 20), standard deviation (sqrt of the variance)
x_hat = (emb-mu)/sigma                                          # (32, 4, 20)
# ------------------------------------------------------------------------------
# Reducing over batch AND position dimension gives one statistic per channel: 20 values
mu = emb.mean((0,1), keepdim=True)                              # (1, 1, 20)
sigma = torch.sqrt(emb.var((0,1), keepdim=True) + 1e-5)         # (1, 1, 20), standard deviation (sqrt of the variance)
x_hat = (emb-mu)/sigma                                          # (32, 4, 20)

In [None]:
# Discarded the generator object, now we simply set the manual seed for the entire notebook here instead 
torch.manual_seed(42);                  # seed rng for reproducibility

In [None]:
# original network
# n_embd = 10 # the dimensionality of the character embedding vectors
# n_hidden = 300 # the number of neurons in the hidden layer of the MLP
# model = Sequential([
#   Embedding(vocab_size, n_embd),
#   FlattenConsecutive(8), Linear(n_embd * 8, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
#   Linear(n_hidden, vocab_size),
# ])

### hierarchical network with the FlatteningConsecutive layers 
# Notice that we are not using a bias in the Linear layers due to the bias in the batch norm layers 
n_embd = 30           # the dimensionality of the character embedding vectors
n_hidden = 350        # the number of neurons in the hidden layer of the MLP
# Each FlattenConsecutive(2) stage fuses neighbouring positions pairwise. With block_size = 8 the
# position dimension shrinks 8 -> 4 -> 2 -> 1 before the final projection onto the vocabulary.
# The layers draw their initial weights from the global RNG, so the construction order matters for reproducibility.
model = Sequential([
  Embedding(vocab_size, n_embd),
  FlattenConsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  FlattenConsecutive(2), Linear(n_hidden*2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(n_hidden, vocab_size),
])

# parameter init
with torch.no_grad():
  model.layers[-1].weight *= 0.1              # Make the final layer of the model less confident 

parameters = model.parameters()               # Flat list with the trainable tensors of all layers in the model
print(f"Now we have a model with {len(model.layers)} layers and {sum(p.nelement() for p in parameters):,} parameters")
# Enable gradient tracking on every parameter, as the tensors are created without it
for p in parameters:
  p.requires_grad = True

In [None]:
# same optimization as last time
batch_size = 64
num_epochs = 10
steps_pr_epoch = Xtr.shape[0] // batch_size     # full batches per epoch; a trailing partial batch is dropped
lossi = []                                      # log10 of the loss of every training step
lr = 0.15

# Assure the model is ready for training
for layer in model.layers:
  layer.training = True

# Create the indices for each training epoch: one independent random permutation of all training indices per row.
# (torch.randint would sample WITH replacement, so within an epoch some examples would repeat while others are never drawn.)
indices_epoch = torch.stack([torch.randperm(Xtr.shape[0]) for _ in range(num_epochs)])

epochs_progress_bar = tqdm(range(num_epochs), leave=True, total=num_epochs, desc="Training the NN")
for i in epochs_progress_bar:
  epochs_progress_bar.set_description_str(f"Training epoch {i+1:d}/{num_epochs}")
  for j in range(steps_pr_epoch):

    # minibatch construct
    # Consecutive slices of this epoch's permutation: every example is used at most once per epoch
    ix = indices_epoch[i, j*batch_size:(j+1)*batch_size]
    Xb, Yb = Xtr[ix], Ytr[ix]               # batch X,Y

    # forward pass
    logits = model(Xb)                      # This is the logits, i.e. outputs of the entire model
    loss = F.cross_entropy(logits, Yb)      # loss function, compute CCE loss from the logits

    # backward pass
    for p in parameters:
      p.grad = None
    loss.backward()
    lossi.append(loss.log10().item())

    # update: simple SGD
    for p in parameters:
      p.data += -lr * p.grad

  # track stats after every epoch
  # NOTE: the condition holds for i = 0, 2, 4, ..., so the first decay already happens after the first epoch
  if i%2==0:
    lr *= 0.1                               # Decrease the learning rate by a factor of 10 every second epoch
  # lossi stores log10(loss), so 10**x recovers the raw loss values of the epoch that just finished
  loss_avg = csf.moving_average(inp_array=[10**x for x in lossi[-steps_pr_epoch:]], n=25) if lossi else [loss.item()]
  tqdm.write(f"{i+1:d}/".rjust(7) + f"{num_epochs:d}:".ljust(10) + f"lr: {lr:.1e}".ljust(13) + f"Loss: {loss_avg[-1]:.5f}")


In [None]:
fig = plt.figure(figsize=(12,5))
# lossi holds log10(loss) per training step, so 10**x recovers the raw loss; the x-axis is spread evenly over the epochs
plt.plot(torch.linspace(start=1, end=num_epochs, steps=len(lossi)), [10**x for x in lossi], color="blue", alpha=0.5, label="Loss")
# Smoothed curve from the project's moving-average helper (called with n=50), drawn over the same epoch axis
avg_run_mean_loss = csf.moving_average(inp_array=[10**x for x in lossi], n=50)
plt.plot(torch.linspace(start=1, end=num_epochs, steps=len(avg_run_mean_loss)), avg_run_mean_loss, color="red", label="Loss running mean")
plt.legend()
plt.grid(True)
plt.draw()

In [None]:
# Show, per layer, the shape of the output it stored on `.out` during its most recent forward pass
for layer in model.layers:
    print(f"{layer.__class__.__name__}:".ljust(23) + f"{tuple(layer.out.shape)}")

In [None]:
# put layers into eval mode (needed for batchnorm especially)
# BatchNorm1d then normalizes with its running statistics instead of the statistics of the current batch
for layer in model.layers:
  layer.training = False

In [None]:
# evaluate the loss
@torch.no_grad() # no gradients are needed for evaluation, so tracking is disabled for the whole function
def split_loss(split):
  """Print the cross-entropy loss of `model` on the split named 'train', 'val' or 'test'."""
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]
  # The whole split is evaluated in one forward pass
  loss_value = F.cross_entropy(model(inputs), targets).item()
  print(split, loss_value)

split_loss('train')
split_loss('val')

### When the losses are very close to each other, we have a feeling that we are not overfitting the model, hence we might be able to increase capacity and still get a gain from that 
# Hence, we need to find the optimal way to increase the capacity of the network 
# At the moment it seems silly to squash the entire input sequence into the first layer with a massive receptive field -> we simply lose too much information in that way 
  # Hence, we want to implement a Wavenet like architecture with dilated convolutions 
  # At each layer we only want to merge two consecutive elements 

### performance log

- original (3 character context + 200 hidden neurons, 12K params): train 2.058, val 2.105
- context: 3 -> 8 (22K params): train 1.918, val 2.027
- flat -> hierarchical (22K params): train 1.941, val 2.029
- fix bug in batchnorm: train 1.912, val 2.022
- scale up the network: n_embd 24, n_hidden 128 (76K params): train 1.769, val 1.993


In [None]:
# sample 20 words from the model
for _ in range(20):

    out = []
    context = [0] * block_size   # start from a context that holds only the '.' token (index 0)
    ix = None
    # keep sampling until the special '.' token (index 0) has been drawn
    while ix != 0:
      # forward pass the neural net on a batch that holds just the current context
      logits = model(torch.tensor([context]))
      probs = logits.softmax(dim=1)
      # sample the next token from the predicted distribution
      ix = torch.multinomial(probs, num_samples=1).item()
      # shift the context window and track the sample
      context = context[1:] + [ix]
      out.append(ix)

    print(''.join(i_to_s[i] for i in out)) # decode and print the generated word

### Next time:
Why convolutions? Brief preview/hint

In [None]:
# Show training examples 7..14: eight consecutive context windows and the character each one should predict
for x,y in zip(Xtr[7:15], Ytr[7:15]):
  print(''.join(i_to_s[ix.item()] for ix in x), '-->', i_to_s[y.item()])

In [None]:
# forward a single example:
# Indexing with the list [7] keeps the leading batch dimension, so the model receives a batch of size 1
logits = model(Xtr[[7]])
logits.shape

In [None]:
# forward all of them
# One model call per example: rows 7..14 of Xtr are evaluated one at a time and collected into an (8, 27) buffer
logits = torch.zeros(8, 27)
for i in range(8):
  logits[i] = model(Xtr[[7+i]])
logits.shape

In [None]:
# convolution is a "for loop"
# allows us to forward Linear layers efficiently over space
# The convolutions are implemented in CUDA vs the upper for loop running in Python ... 