# 1. Finding unique characters for encoding

In [2]:
# Read the whole corpus into memory as a single string.
# NOTE(review): the vocabulary printed further down ends with U+FEFF, which suggests
# the file starts with a BOM that plain 'utf-8' keeps as a character -- confirm.
with open('homer.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Report the corpus size: len() of a str counts characters, not bytes.
print("length of dataset: ", len(text))

length of dataset:  908201


In [4]:
# Looking at the first 1000 characters (the slice end is exclusive, so [:1000] is exactly 1000)
print(text[:1000])

The Project Gutenberg eBook of The Iliad
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Iliad

Author: Homer

Translator: Samuel Butler

Release date: June 1, 2000 [eBook #2199]
                Most recently updated: August 16, 2022

Language: English

Credits: Jim TinsleyRevised by Richard Tonsing.


*** START OF THE PROJECT GUTENBERG EBOOK THE ILIAD ***




      THE ILIAD OF HOMER

      Rendered into English Prose for
      the use of those who cannot
      read the original


      by Samuel Butler




Contents


 BOOK I.
 BOOK II.
 BOOK III.
 BOOK IV.
 BOOK V.
 BOOK VI.
 BOO

In [5]:
# Vocabulary: every distinct character of the corpus, in sorted (code point) order.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !#$%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz—‘’“”•™﻿
88


# 2. Tokenization: Encoding and Decoding Strategy

I will map characters to integers and write functions to encode and decode text. I know there are other methods, such as SentencePiece or a byte-pair tokenizer like tiktoken (which OpenAI uses), but I elected to code this out by hand instead of using a library, for learning/practice purposes.


In [6]:
# Lookup tables between characters and their integer ids (position in the sorted vocabulary)
encode_map = {ch: i for i, ch in enumerate(chars)}
decode_map = dict(enumerate(chars))


def encode(e):
    """Convert a string into a list of integer token ids.

    Args:
        e: String to encode; every character must be a key of ``encode_map``.

    Returns:
        A list with one int per character of ``e``.

    Raises:
        KeyError: If ``e`` contains a character that is not in ``encode_map``.
    """
    return [encode_map[c] for c in e]


def decode(d):
    """Convert an iterable of integer token ids back into a string.

    Args:
        d: Iterable of ints; every id must be a key of ``decode_map``.

    Returns:
        The string formed by joining the character of each id in order.

    Raises:
        KeyError: If ``d`` contains an id that is not in ``decode_map``.
    """
    return ''.join(decode_map[u] for u in d)


# Round-trip check: decoding the encoding must give back the original string
print(encode("hellooo friend"))
print(decode(encode("hellooo friend")))

[61, 58, 65, 65, 68, 68, 68, 1, 59, 71, 62, 58, 67, 57]
hellooo friend


In [7]:
# Encode the entire dataset as a 1-D tensor of integer token ids
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # peek at the first 1000 token ids (slice end is exclusive)


torch.Size([908201]) torch.int64
tensor([87, 45, 61,  ..., 27, 40, 40])


# 3. Split into Train/Test
Splitting the data into train/validation sets, defining chunks, and batching so that multiple chunks are processed at the same time.


In [12]:
# Train on the first 90% of the token stream; hold out the remaining 10% for validation.
n = int(0.9*len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]
for part in (train_data, val_data):
    print(part)

tensor([87, 45, 61,  ..., 56, 61, 58])
tensor([72, 73, 67,  ...,  0,  0,  0])


In [14]:
# Train on short random chunks of the token stream rather than on every line,
# for computation reasons.
chunk_size = 8  # number of context tokens in one chunk
# Peek at chunk_size + 1 tokens: chunk_size inputs plus the token that follows the last one.
train_data[:chunk_size+1]

tensor([87, 45, 61, 58,  1, 41, 71, 68, 63])

In [None]:
# Sanity check of the next-token setup: y is x shifted by one position,
# so the target for the context x[:i+1] is y[i].

x = train_data[:chunk_size]
y = train_data[1:chunk_size+1]
for i, target in enumerate(y):
    context = x[:i+1]
    print(f"When input is {context} the target is: {target}")

When input is tensor([87]) the target is: 45
When input is tensor([87, 45]) the target is: 61
When input is tensor([87, 45, 61]) the target is: 58
When input is tensor([87, 45, 61, 58]) the target is: 1
When input is tensor([87, 45, 61, 58,  1]) the target is: 41
When input is tensor([87, 45, 61, 58,  1, 41]) the target is: 71
When input is tensor([87, 45, 61, 58,  1, 41, 71]) the target is: 68
When input is tensor([87, 45, 61, 58,  1, 41, 71, 68]) the target is: 63


In [23]:
# Uncomment the seed below to make the randomly sampled batches reproducible.
#torch.manual_seed(1337)
batch_size = 4  # number of chunks processed at once
chunk_size = 8  # maximum context length for predictions


def get_batch(split):
    """Sample a random batch of input chunks and their next-token targets.

    Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
    ``chunk_size`` at call time.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape ``(batch_size, chunk_size)``.
        ``y`` holds the same windows as ``x`` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Highest valid start leaves room for chunk_size inputs plus one extra target token.
    starts = torch.randint(len(source) - chunk_size, (batch_size,))
    inputs = torch.stack([source[s:s + chunk_size] for s in starts])
    targets = torch.stack([source[s + 1:s + chunk_size + 1] for s in starts])
    return inputs, targets

x_batch, y_batch = get_batch('train')
for label, batch in (('inputs:', x_batch), ('targets:', y_batch)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context -> target) training example contained in the batch.
for inputs_row, targets_row in zip(x_batch, y_batch):  # batch dimension
    for t in range(chunk_size):  # time dimension
        context = inputs_row[:t+1]
        target = targets_row[t]
        print(f"When input is {context.tolist()} the target is: {target}")


inputs:
torch.Size([4, 8])
tensor([[67,  1, 54,  1, 59, 62, 67, 58],
        [73, 68, 71,  1, 56, 54, 65, 65],
        [58, 71, 58,  1, 54,  1, 59, 54],
        [75, 58,  1, 61, 54, 73, 58, 72]])
targets:
torch.Size([4, 8])
tensor([[ 1, 54,  1, 59, 62, 67, 58,  1],
        [68, 71,  1, 56, 54, 65, 65, 58],
        [71, 58,  1, 54,  1, 59, 54, 65],
        [58,  1, 61, 54, 73, 58, 72,  1]])
----
When input is [67] the target is: 1
When input is [67, 1] the target is: 54
When input is [67, 1, 54] the target is: 1
When input is [67, 1, 54, 1] the target is: 59
When input is [67, 1, 54, 1, 59] the target is: 62
When input is [67, 1, 54, 1, 59, 62] the target is: 67
When input is [67, 1, 54, 1, 59, 62, 67] the target is: 58
When input is [67, 1, 54, 1, 59, 62, 67, 58] the target is: 1
When input is [73] the target is: 68
When input is [73, 68] the target is: 71
When input is [73, 68, 71] the target is: 1
When input is [73, 68, 71, 1] the target is: 56
When input is [73, 68, 71, 1, 56] the t

# 4. Neural Network

Now that the data is split into train/validation sets, batching with randomized chunk positions has been defined, and those batches are encoded, I will implement a neural network on the data.

In [21]:
# Show the most recently sampled input batch of token ids.
print(x_batch)

tensor([[69, 54, 73, 62, 58, 67, 73,  1],
        [72, 72,  1, 68, 59,  1, 78, 68],
        [ 1, 61, 58,  1, 73, 68, 68, 64],
        [ 9,  1, 73, 61, 68, 74, 60, 61]])


## Defining module for a simple Bigram Language Model

1. Creating a token embedding table that acts as a lookup table: each token id selects a row of scores (logits) for the next token
2. Looking up a batch of token ids in that table gives a multidimensional tensor of logits with shape (B,T,C): batch, time (position in the chunk), and channels (vocab_size)
3. Defining the loss function (cross_entropy) and making sure it aligns with the expected initial loss of $-\ln(1/88) \approx 4.477$ -- 88 being the number of unique characters (vocab_size variable)
4. The generate function will get predictions, apply the softmax function to the logits of the last position to get a probability distribution over the next character, sample a character from that distribution, and append it to the end of the running sequence.

This is a simple model and progress is made, but context and transformers will still need to be implemented.

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
#torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: the prediction for the next token depends only on the current token.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table whose
    row ``i`` holds the unnormalized scores (logits) of every token that could
    follow token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table of next-token logits.

        Args:
            vocab_size: Number of distinct tokens. Used both as the number of
                rows (current token) and of columns (candidate next token).
        """
        super().__init__()
        # Each token id directly reads off the logits for the next token from this table.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: Optional (B, T) tensor of integer token ids to predict.

        Returns:
            Tuple ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With ``targets``, ``logits`` is
            flattened to (B*T, C) and ``loss`` is the cross-entropy over all
            B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, so merge batch and
        # time into one axis instead of passing (B, T, C) directly.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        # A uniform prediction gives -ln(1/vocab_size), about 4.477 for 88 characters.
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per token
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: Number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions for the running sequence
            logits, _loss = self(idx)
            # keep only the last time step
            logits = logits[:, -1, :]  # (B, C)
            # turn logits into a probability distribution over the next token
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample one token per sequence from that distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
# Instantiate the model and run one forward pass on the current batch as a smoke test.
model = BigramLanguageModel(vocab_size)
# Call `model` here: the previous `m(...)` referenced a name that is not defined
# anywhere in the visible notebook and fails on a fresh kernel.
logits, loss = model(x_batch, y_batch)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(idx=start_idx, max_new_tokens=100)[0].tolist()))


torch.Size([256, 88])
tensor(4.9272, grad_fn=<NllLossBackward0>)

B9qYE—IPz%8
Vbn#Hl-(V2Hh•vL; i•t—0w#3*pUn$0“KGZ 3﻿Jt8nIjs—JKEeSv0GowTlSU5’c—!wjHRvM]8.E—!(“09qvF*4ft


### First run of model:

torch.Size([32, 88])
tensor(4.9037, grad_fn=<NllLossBackward0>)

7-OW rt—60uL(EQzCcrraUtm(hTpb#GQ%Kgbq#﻿3BA9•AkGgcBP-RdK
r“5Oyf(W?C;K/keSI
0yPtYbJ’”8zCz#UJ(:,J5••AZR

This is expected because the model's weights are still random -- training is yet to be done.

In [30]:
# Optimizer: AdamW (torch.optim.AdamW, not plain Adam) over all model parameters.
# A higher learning rate (1e-3) is used because of the smaller sample data.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [39]:
# Use larger batches for training; get_batch reads this module-level value when called.
batch_size = 32

for steps in range(10000):
    # sample a fresh random batch of training chunks
    x_batch,y_batch = get_batch('train')

    # forward pass: logits and cross-entropy loss for this batch
    logits, loss = model(x_batch,y_batch)
    # clear gradients left over from the previous step before backpropagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    # apply the parameter update computed from the fresh gradients
    optimizer.step()

# Loss of the last sampled batch only, not an average over the data.
print(loss.item())


2.4078376293182373


In [None]:
# Sample from the model after optimizing -- progress, but it still needs work.
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))



 thelll  thim braidd bay id o am
 iugof  recing  thinpat Thary the a bas,”
 thivemupirid, baps,  s. 
