In [None]:
# Download the Tiny Shakespeare corpus into the working directory; it is saved
# under the URL's file name, `input.txt`, which is the file the next cell opens.
# Needs network access and the `wget` command-line tool.
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


# Data and Sampling

In [2]:
# Load the whole corpus into memory as one string (text mode is open()'s default)
with open('input.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()


In [3]:
# Character-level vocabulary: every distinct character of the corpus, sorted
# (by code point, so punctuation and capitals come before lowercase letters)
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars), vocab_size, sep="\n")



 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
# Lookup tables between characters and their integer ids (positions in `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s: str) -> list[int]:
    """Return the list of integer ids for the characters of `s`.

    Raises:
        KeyError: if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[ch] for ch in s]


def decode(l) -> str:
    """Return the string spelled by the integer ids in the iterable `l`.

    Raises:
        KeyError: if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


print(encode("Hello, World!"))
print(decode(encode("Hello, World!")))                  # round trip: decode(encode(s)) returns s


[20, 43, 50, 50, 53, 6, 1, 35, 53, 56, 50, 42, 2]
Hello, World!


In [5]:
# Keep the entire encoded corpus in a single 1-D tensor
import torch

# Token ids are integers, so store them as int64 (torch.long is an alias of torch.int64)
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)
print(data[:1000])


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

Note:
- storing in torch.Tensor allows a wide range of tools that comes with pytorch
- we set `dtype=torch.long` (int64) rather than the default floating-point type float32: the encoder only produces integer ids, so there's no need for decimal precision

Why do we encode the input?
- Since computers can really only work with numbers, we have to convert the inputs into numbers so that they can be interpreted by the computer
- We map each word/character to a number so that the number acts as its representative

In [6]:
# Hold out the tail of the corpus for validation:
#   train_data = first 90% (used to fit the model)
#   val_data   = last 10%  (used to check how well the model generalises)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


Note:
- When we create a model, we want it to also solve problems outside of its training dataset. If a model is only good at doing what it's trained on, then it can't be generalized, which is useless
- That is why we leave 10% of the data for validation purposes, to see how the model performs when it encounters something outside of its training set, or in other words, to see to what extent is the model overfitting
- Overfitting is bad, because even if the model can predict the training set to a high degree, it'll not translate well to data that it has never seen before

In [7]:
# A single block of block_size tokens packs block_size training examples:
# every prefix of x is a context and the matching entry of y is its target
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size + 1]        # x shifted one position to the right
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"Given {context}, my target is {target}")
    

Given tensor([18]), my target is 47
Given tensor([18, 47]), my target is 56
Given tensor([18, 47, 56]), my target is 57
Given tensor([18, 47, 56, 57]), my target is 58
Given tensor([18, 47, 56, 57, 58]), my target is 1
Given tensor([18, 47, 56, 57, 58,  1]), my target is 15
Given tensor([18, 47, 56, 57, 58,  1, 15]), my target is 47
Given tensor([18, 47, 56, 57, 58,  1, 15, 47]), my target is 58


Note:
- We pick a small block size (also called the context length) to train on, because feeding the whole dataset to the model at once is impractical, e.g. due to limited memory
- We select random chunks of the dataset and then train them. Those chunks have a max size

Numbers of training data in a block:
- When selecting a chunk/block from the dataset, there are actually multiple training examples packed into it (one per position). See the above code
- In a way, this is a bayesian framework of conditional probability!

In [8]:
# Batch geometry read by the sampling function defined in this cell
batch_size = 4          # number of independent chunks processed in parallel
block_size = 8          # maximum context length of each chunk
# Fix the RNG seed so the randomly sampled batches are reproducible
torch.manual_seed(1337)

def get_batch(split: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of contexts and their next-token targets.

    Args:
        split: 'train' samples from `train_data`; any other value samples
            from `val_data`.

    Returns:
        A pair (x, y) of 2-D tensors, each of shape (batch_size, block_size).
        Row k of y is row k of x shifted one position to the right in the
        source data, so y[k][t] is the token that follows x[k][:t+1].
    """
    if split == 'train':
        data = train_data
    else:
        data = val_data

    # One random start index per chunk; the upper bound keeps the target
    # slice (which reaches one token further than the input) inside the data
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # Stack the 1-D chunks into (batch_size, block_size) matrices
    x = torch.stack([data[i:i + block_size] for i in ix])                   # contexts
    y = torch.stack([data[i+1: i + block_size+1] for i in ix])              # targets

    return x, y
 
# Draw one training batch and inspect it
xb, yb = get_batch('train')

print('inputs:', xb.shape)
print(xb, end='\n\n')
print('targets:', yb.shape)
print(yb)

print("----------")

# Every row of the batch packs block_size (context -> target) training examples
for b in range(batch_size):                     # batch dimension: one independent chunk per row
    context_row, target_row = xb[b], yb[b]
    for t in range(block_size):                 # time dimension: position inside the chunk
        print(f"Given {context_row[:t+1]}, my target is {target_row[t]}")

inputs: torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

targets: torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------
Given tensor([24]), my target is 43
Given tensor([24, 43]), my target is 58
Given tensor([24, 43, 58]), my target is 5
Given tensor([24, 43, 58,  5]), my target is 57
Given tensor([24, 43, 58,  5, 57]), my target is 1
Given tensor([24, 43, 58,  5, 57,  1]), my target is 46
Given tensor([24, 43, 58,  5, 57,  1, 46]), my target is 43
Given tensor([24, 43, 58,  5, 57,  1, 46, 43]), my target is 39
Given tensor([44]), my target is 53
Given tensor([44, 53]), my target is 56
Given tensor([44, 53, 56]), my target is 1
Given tensor([44, 53, 56,  1]), my target is 58
Given tensor([44, 53, 56,  1, 

Elaborations:

`ix = torch.randint(len(data) - block_size, (batch_size,))`
- Here we are generating `batch_size` amount of random integers to serve as the starting index for getting chunks, where these integers are in [0, `len(data)-block_size`)
- The range is so that we don't go oob when we try to get `block_size` amount of data from a starting index
- The random indices will be stored in a 1D tensor, as denoted by `(batch_size,)`
- The one-element tuple signifies that it's a 1D tensor, and if we have something else like `(2,3)`, then that'll be a 2D tensor with dimensions 2X3, meaning 2 rows, each having length 3
- If we were to write `(batch_size)` without the ',', Python would treat it as the plain integer `batch_size` rather than a tuple, and `torch.randint` expects its size argument to be a tuple, so we couldn't get a 1D container of random integers that way

---
`torch.stack`
- `torch.stack` stacks a bunch of tensors of the SAME shape (each with n dimensions), and the result is a new tensor with n+1 dimensions
- For example, if I have `tensor(1)`, `tensor(2)`, `tensor(3)`, all 0D, and I stack them via torch.stack, then the resulting tensor will be a 1D vector tensor that looks like `tensor([1,2,3])`
- If I have `tensor([1,2,3])` and `tensor([4,5,6])`, then stack will result in a 2X3 matrix
- When it comes to training data, we stack those 1D vectors so that all of the vectors can be processed simultaneously, taking advantage of the parallel processing nature of a GPU

ex. Training in parallel

|   <------ <1,2,3>         my independent vector1

|   <------ <4,5,6>         my independent vector2

---


`x = torch.stack([data[i:i + block_size] for i in ix])` 
- For each of the random indices `i`, we sample a chunk [`i`, `i + block_size`)
- `ix` is a 1D tensor, so the variable `i` will be each of the 0D/scalar tensors contained within it. A scalar tensor, `tensor(k)`, is not the same object as the plain Python number `k`; `tensor.item()` retrieves the number explicitly. Here we don't need to call it: integer scalar tensors can be used directly as slice bounds, and since the thing we're slicing is itself a tensor, pytorch takes care of the conversions!
- After we have the multiple sample chunks, we stack them into a `batch_size`X`block_size` tensor, so a 2D matrix for parallel processing

`y = torch.stack([data[i+1: i + block_size+1] for i in ix])`
- Same idea from above, except that the `y` will be the target of `x` when it comes to training

# Neural Networks

In [None]:
import torch
import torch.nn as nn                     # Template for neural network
from torch.nn import functional as F      # For the loss function; measures the performance of the model
# Re-seed PyTorch's global RNG so the random numbers drawn in this cell
# (e.g. the embedding table's initial weights) are reproducible
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The whole model is one (vocab_size x vocab_size) embedding table. Looking
    up token i returns a row of vocab_size logits, one per candidate next token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens; used both as the number of
                rows of the table and as the length of each row.
        """
        super().__init__()
        # Creates embedding table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up the logits for `idx` and, when `targets` is given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor holding, for every position
                of `idx`, the id of the token that follows it.

        Returns:
            A pair (logits, loss). Without targets, logits has shape (B, T, C)
            and loss is None. With targets, logits is flattened to (B*T, C)
            and loss is the scalar cross-entropy between logits and targets.
        """
        # IN THIS IMPLEMENTATION, THE TABLE IS ALSO WHERE WE GET LOGITS
        logits = self.token_embedding_table(idx)        # (B, T, C)
        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants the class dimension second, so fold the
            # batch and time axes together: logits -> (B*T, C), targets -> (B*T,)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)         # Converting targets to 1D

            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the original context
            followed by the sampled token ids. The ids are NOT decoded to text.
        """
        for _ in range(max_new_tokens):
            # Calls the forward function without targets, so the loss is None
            # and the logits keep their (B, T, C) shape
            logits, _loss = self(idx)
            # Keep only the last time step of every sequence: (B, C)
            logits = logits[:, -1, :]
            # Softmax turns the logits into a probability distribution over tokens
            probs = F.softmax(logits, dim=-1)

            # Sample the next token id for every sequence: (B, 1)
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append the sampled ids along the time dimension: (B, T+1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

    
m = BigramLanguageModel(vocab_size)

# Calling the module runs forward(). Its second return value is the loss,
# which is None when no targets are passed.
logits, loss = m(xb)
print("Without target:", logits.shape)      # (B, T, C): one row of logits per position
logits, loss = m(xb, yb)
print("With targets:", logits.shape)        # (B*T, C): flattened for the loss computation
# Uncomment to sample 100 tokens from the (still untrained) model, starting from token id 0:
# print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

Without target: torch.Size([4, 8, 65])
With targets: torch.Size([32, 65])


Neural Network Structure
---

Forward Pass
-----
The forward pass is the process of passing inputs through the neural network to produce an output. Depending on the path/flow the input travels, the output will change
- The network is comprised of multiple layers. Inputs travel through the layers until the final layer, which is size one (1 output)
- Given a layer `i`, its inputs will be the output of layer `i-1`. Layer `i`'s output will be the input of layer `i+1`



Vectors Embedding
---
Inputs are embedded into vectors to carry semantic meaning and attributes of a word. The reason why those inputs are not represented by a number, such as in hashing, is because a single number is too simple and can't express the multi-layered depth of a word. 

For example: `{1 : 'cat', 2 : 'dog', 3 : 'apple'}`

- The model will interpret `cat` and `dog` as similar due to their numeric proximity, which is true intuitively, since both are animals
- However, since `dog` and `apple` are also numerically close, the model would interpret them as being similar, which is not the case!
- Embedding a word as a vector gives more depth to it, allowing proper semantic relationships and meaning to come through


Breakdown of codes
---
`logits = logits.view(B*T, C)`

- This has to do with the function `F.cross_entropy(logits, targets)`, as it expects the logits here to be 2D (with the targets 1D). Logits are in the shape BxTxC, which is 3D. The idea then is to append all batches together into a single long list of B*T rows, each with C entries

`self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)`

- This creates a lookup table for each token, where each token is converted to higher dimension vector to better capture the token's meaning. Here the dimension size of the vector happens to be vocab_size as well, but not always

`logits = self.token_embedding_table(idx)`

- `idx` is the batches of text we sampled, and for each token that `idx` contains, it'll get converted to their corresponding vector representation. If `idx` is dimension B x T, then the output will become B x T x C, where C is the embedding dimension size of the vectors. In addition, IN THIS VERSION OF BIGRAM MODEL, THE LOGITS IS OBTAINED DIRECTLY FROM THE LOOK UP TABLE.

`loss = F.cross_entropy(logits, targets)`

- `F.cross_entropy` measures how well the model predicts the targets. Within, `cross_entropy` will softmax the `logits`, resulting in a probability space. If the ground truth (target) is a token of index `i`, then `cross_entropy` will select the corresponding index probability and calculate the negative log of that probability

- L = -log(probability corresponding to index i)

- ***The lower the loss, the more likely the model will select the right token, and the more accurate the model is***

`logits = logits[:, -1, :]`
- This syntax in pytorch tells python to take all the batches, and from each batch, only look at the last element (last time step), and take all of its components (class or aka vocab size)
- We do this in Bigram model, as we use the last token to predict the next, thus it hasn't explored any of the contexts beyond the last token, which will not make the model as effective

`probs = F.softmax(logits, dim=-1)`
- This is to apply probability normalization across the last dimension, as specified in `dim=-1`
- This makes sense as the last dimension holds the logits of each of the tokens.

`idx_next = torch.multinomial(probs, num_samples=1)`
- The `torch.multinomial` function takes in a 2D tensor (Batch x Class) and will sample `num_samples` amount for EACH Batch. Thus, the result will be (B x `num_samples`)
- A higher probability doesn't mean a token will always be chosen; it just has a higher chance
- It is important to note that the function samples and select the indices, and the indices will get mapped to their respective token later

`idx = torch.cat((idx, idx_next), dim=1)`
- Since `idx` is B x T and `idx_next` is B x 1, what we do is concatenate them along dimension 1 (`dim=1`). That is NOT the B (batch) dimension, but the T (time step) dimension, so the result is B x (T+1)
- The dimension count starts at `0`, so B is the `0th` index, and T is the `1st` dimension