# Read Tinyshakespeare data

In [7]:
# If you are using a Conda environment generated from scratch, you will need to run
# `conda install jupyter` and `conda install wget`
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-05-22 18:13:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2024-05-22 18:13:32 (13.0 MB/s) - 'input.txt' saved [1115394/1115394]



In [1]:
# Load the entire corpus into memory as a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Create vocabulary

In [3]:
# The vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

print(''.join(chars))
print("Vocabulary size: ", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size:  65


# Encoder and Decoder for the Vocabulary
Here we use the simplest possible scheme for encoding/decoding: we simply map a string to a list of the indices of its characters. There are much more complex schemes. For instance:

- Google uses [SentencePiece](https://github.com/google/sentencepiece), which is a **sub-word** tokenizer.
- OpenAI uses [Tiktoken](https://github.com/openai/tiktoken) which is a fast Byte-Pair-Encoding tokenizer. This has `50257` tokens in the vocabulary. To use it, do `import tiktoken` and then `enc = tiktoken.get_encoding('gpt2')` and then `enc.n_vocab`.

Here we use a simple character-level tokenizer, which means that our **codebook** (here called `chars`) has a very small size, only `65` possible characters.

In [4]:
# Two lookup tables built from the sorted vocabulary: index -> character and character -> index
int_to_str = dict(enumerate(chars))
str_to_int = {character: integer for integer, character in int_to_str.items()}

In [5]:
# Encoder and Decoder functions
def encode(string):
    """Map a string to the list of vocabulary indices of its characters (string --> list(int)).

    Raises `KeyError` if a character is not a key of `str_to_int`.
    """
    return [str_to_int[character] for character in string]


def decode(intlist):
    """Map an iterable of vocabulary indices back to the string they spell (list(int) --> string).

    Raises `KeyError` if an index is not a key of `int_to_str`.
    """
    return ''.join(int_to_str[integer] for integer in intlist)

In [6]:
# Test the encoder and decoder functions
test_text = "Hello, World!"
print(encode(test_text))
print(decode(encode(test_text)))

[20, 43, 50, 50, 53, 6, 1, 35, 53, 56, 50, 42, 2]
Hello, World!


# Tokenize TinyShakespeare
This requires not only pytorch but also numpy.

In [7]:
import torch

# Turn the whole corpus into a single 1-D tensor of 64-bit integer token ids
data = torch.tensor(
    encode(text),
    dtype=torch.int64,  # same dtype as torch.long
)

print("Data shape: ", data.shape, " Data dtype: ", data.dtype)
# Peek at the first 50 token ids
print(data[0:50])

Data shape:  torch.Size([1115394])  Data dtype:  torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])


# Train and Validation Split

In [8]:
# Split point: the first 90% of the token stream is used for training,
# the remaining 10% is held out for validation
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# Context size and training examples
Andrej Karpathy refers to the `context size` by a different name: the `block_size`. Notice that, since this is a sequence model, a sequence of length `m` contains `m-1` training examples: each prefix of the sequence is a context whose target is the token that follows it. We take full advantage of this when training transformers.

In [9]:
# Number of tokens in one training block; the demo cells below slice `train_data` with it
block_size = 8  # context size

This can be easily spelled out by printing the training examples contained in a small training block.

In [10]:
x = train_data[:block_size]     # tokens at positions 0 .. block_size-1 (contexts)
y = train_data[1:block_size+1]  # tokens at positions 1 .. block_size (targets), shifted by one

# Every prefix of `x` is a context; its target is the token that follows that prefix
for end in range(1, block_size + 1):
    context, target = x[:end], y[end - 1]
    print(f"Context: {context.tolist()} Target: {target}")

Context: [18] Target: 47
Context: [18, 47] Target: 56
Context: [18, 47, 56] Target: 57
Context: [18, 47, 56, 57] Target: 58
Context: [18, 47, 56, 57, 58] Target: 1
Context: [18, 47, 56, 57, 58, 1] Target: 15
Context: [18, 47, 56, 57, 58, 1, 15] Target: 47
Context: [18, 47, 56, 57, 58, 1, 15, 47] Target: 58


# Batching
In practice we want to perform operations in parallel, and this requires the notion of a batch size.

In [11]:
torch.manual_seed(1337)

# Batch geometry, read by `get_batch` at call time
batch_size = 4  # how many independent sequences are processed in parallel
block_size = 8  # longest context used for a prediction

def get_batch(split):
    """Sample a random mini-batch of contexts and their shifted-by-one targets.

    `split == "train"` draws from `train_data`; any other value draws from `val_data`.
    Returns two tensors, each of shape `(batch_size, block_size)`.
    """
    source = train_data if split == "train" else val_data
    # One random start offset per sequence. The upper bound of `randint` is exclusive, so the
    # largest offset still leaves room for the target window, which ends one token later.
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    contexts, shifted = [], []
    for start in starts:
        contexts.append(source[start:start+block_size])
        shifted.append(source[start+1:start+block_size+1])
    # Stack the per-sequence slices into (batch_size, block_size) tensors
    return torch.stack(contexts), torch.stack(shifted)

In [12]:
# Draw one training batch and show both tensors
xb, yb = get_batch('train')
for heading, tensor in (("Context: ", xb), ("Targets: ", yb)):
    print(heading)
    print(tensor.shape)
    print(tensor)

print("---")

# Spell out every (context, target) pair contained in the batch, one sequence at a time
for b in range(batch_size):
    contexts_row, targets_row = xb[b], yb[b]
    for t in range(block_size):
        print(f"Context: {contexts_row[:t+1].tolist()} Target: {targets_row[t]}")

Context: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
---
Context: [24] Target: 43
Context: [24, 43] Target: 58
Context: [24, 43, 58] Target: 5
Context: [24, 43, 58, 5] Target: 57
Context: [24, 43, 58, 5, 57] Target: 1
Context: [24, 43, 58, 5, 57, 1] Target: 46
Context: [24, 43, 58, 5, 57, 1, 46] Target: 43
Context: [24, 43, 58, 5, 57, 1, 46, 43] Target: 39
Context: [44] Target: 53
Context: [44, 53] Target: 56
Context: [44, 53, 56] Target: 1
Context: [44, 53, 56, 1] Target: 58
Context: [44, 53, 56, 1, 58] Target: 46
Context: [44, 53, 56, 1, 58, 46] Target: 39
Context: [44, 53, 56, 1, 58, 46, 39] Target: 58
Context: [44, 53, 56, 1, 58, 46, 39, 58] Tar

Therefore, our input to the transformer would be the `(batch_size, block_size)` array `xb` shown below

In [13]:
# Show the batch of contexts again: one row per sequence, one column per position
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


# Bigram Language Model

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token are read directly off a lookup
    table indexed by the current token."""

    def __init__(self, vocab_size):
        """Bigram model, see Karpathy's previous series of videos.

        Args:
            vocab_size: number of distinct tokens; the lookup table has size
                `(vocab_size, vocab_size)`.
        """
        super().__init__()
        # Row `i` of the embedding table holds the next-token logits for input token `i`,
        # e.g. the input 24 selects row 24 of the table.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Forward pass. Takes `idx` and `targets` which are both `(B, T)` tensors of integers.
        Here `B` is the batch_size and `T` should be the block/context length.

        Returns:
            `(logits, loss)`. Without `targets`, `logits` has shape `(B, T, C)` and `loss` is
            `None`. With `targets`, `logits` is flattened to `(B*T, C)` and `loss` is the
            cross-entropy between those logits and the flattened targets.
        """
        # The embedding lookup grabs one row per index: (B, T) --> (B, T, C) with C = vocab size.
        # These are the scores for the next token at every position.
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy wants the class dimension second, i.e. (B, C, T) in our notation, see
        # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss
        # Flattening batch and time into one dimension, (B*T, C), is the simpler alternative.
        B, T, C = logits.shape
        # `reshape` rather than `view`: `view` raises on non-contiguous tensors (e.g. transposed
        # targets), whereas `reshape` copies only when it has to.
        logits = logits.reshape(B*T, C)
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients, so do not build an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: `(B, T)` tensor of token indices holding the current context of each sequence.
            max_new_tokens: number of tokens to append, one at a time.

        Returns:
            `(B, T + max_new_tokens)` tensor: the input context followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, loss = self(idx)   # Get the predictions (calls forward(idx, targets=None))
            logits = logits[:, -1, :]  # (B, T, C) --> (B, C): keep only the last time step
            probs = F.softmax(logits, dim=-1)  # Softmax turns the logits into probabilities (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # Sample one token per sequence (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # Append it to the running sequence (B, T+1)
        return idx


# Instantiate the model and run a single forward pass on the batch `xb`, `yb`
bigram = BigramLanguageModel(vocab_size)
# Targets are supplied, so `forward` returns the logits flattened to (B*T, C) together with the loss
logits, loss = bigram(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [18]:
# Sample from the model
# Start from a single sequence holding one token with index 0, i.e. [[0]] with B=T=1
start_context = torch.zeros((1, 1), dtype=torch.long)
generated = bigram.generate(idx=start_context, max_new_tokens=100)
# [0] plucks out the single batch dimension before the indices are decoded back to text
print(decode(generated[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


Of course, this is silly. We are feeding in an entire context but really just using the last token to predict the next (see `logits[:, -1, :]`). 

# Optimizer

In [20]:
# AdamW optimizer over all parameters of the bigram model, with learning rate 1e-3
optimizer = torch.optim.AdamW(bigram.parameters(), lr=1e-3)

In [27]:
batch_size = 32  # `get_batch` reads this global when called, so batches now hold 32 sequences
num_steps = 10000

for step in range(num_steps):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = bigram(xb, yb)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

# Loss of the last batch processed
print(loss.item())

2.4955925941467285


# Generate After Optimization

In [29]:
# Sample from the model
# Start from a single sequence holding one token with index 0, i.e. [[0]] with B=T=1
start_context = torch.zeros((1, 1), dtype=torch.long)
generated = bigram.generate(idx=start_context, max_new_tokens=400)
# [0] plucks out the single batch dimension before the indices are decoded back to text
print(decode(generated[0].tolist()))




BOMy is, mumot me bthenindsoferlle cardethe le h. sps t theleriny hacl fougarke,

Angllll a hieald lo d,
Thade bof jak rend
Thid weme w, aithefithe h tes s tomeeseanmpotl'de y l, t,
Goutandos t tof al o ad, prsthoneirermenicisull gsewd o re, myofary pef s'HNORicepim th, thit y h,
Cothor nsethat bre, lly, farotodis f vel cld, minounged tithit:
O:

And;
DAppr.

VI cerer e: t
Wipe ght cou FLOLOF ve
