# 1. Finding unique characters for encoding

In [2]:
with open('homer.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset: ", len(text))

length of dataset:  908201


In [4]:
#Looking at first 1000 characters
print(text[:1001])

The Project Gutenberg eBook of The Iliad
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Iliad

Author: Homer

Translator: Samuel Butler

Release date: June 1, 2000 [eBook #2199]
                Most recently updated: August 16, 2022

Language: English

Credits: Jim TinsleyRevised by Richard Tonsing.


*** START OF THE PROJECT GUTENBERG EBOOK THE ILIAD ***




      THE ILIAD OF HOMER

      Rendered into English Prose for
      the use of those who cannot
      read the original


      by Samuel Butler




Contents


 BOOK I.
 BOOK II.
 BOOK III.
 BOOK IV.
 BOOK V.
 BOOK VI.
 BOO

In [5]:
# unique characters that occur in text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !#$%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz—‘’“”•™﻿
88


# 2. Tokenization: Encoding and Decoding Strategy

I will be mapping characters to numbers and create functions to encode and decode. I know their are other methods like Sentencepiece or a byte-pair tokenizer like tiktoken which openai uses but I elected to code out instead of using libraries for learning/practice purposes.


In [6]:
# Mapping
encode_map = { ch:i for i,ch in enumerate(chars) }
decode_map = { i:ch for i, ch in enumerate(chars) }

#encoder takes string and maps to list of integers
encode = lambda e: [encode_map[c] for c in e]
#decode takes list of integers and outputs a string
decode = lambda d: ''.join([decode_map[u] for u in d])

print(encode("hellooo friend"))
print(decode(encode("hellooo friend")))

[61, 58, 65, 65, 68, 68, 68, 1, 59, 71, 62, 58, 67, 57]
hellooo friend


In [7]:
# encoding entire dataset using pytorch
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1001]) #peek at first 1000 characters


torch.Size([908201]) torch.int64
tensor([87, 45, 61,  ..., 27, 40, 40])


# 3. Split into Train/Test
Splitting train/test, chunk definitions, and batching for multiple chunks at same time.


In [12]:
#taking 90% of data for train, and rest for validation
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
print(train_data)
print(val_data)

tensor([87, 45, 61,  ..., 56, 61, 58])
tensor([72, 73, 67,  ...,  0,  0,  0])


In [14]:
# will be training random chunks rather than every line for computation reasons
chunk_size = 8
train_data[:chunk_size+1]

tensor([87, 45, 61, 58,  1, 41, 71, 68, 63])

In [None]:
# Setting up next likely value logic and sanity checking

x = train_data[:chunk_size]
y = train_data[1:chunk_size+1]
for i in range(chunk_size):
    context = x[:i+1]
    target = y[i]
    print(f"When input is {context} the target is: {target}")

When input is tensor([87]) the target is: 45
When input is tensor([87, 45]) the target is: 61
When input is tensor([87, 45, 61]) the target is: 58
When input is tensor([87, 45, 61, 58]) the target is: 1
When input is tensor([87, 45, 61, 58,  1]) the target is: 41
When input is tensor([87, 45, 61, 58,  1, 41]) the target is: 71
When input is tensor([87, 45, 61, 58,  1, 41, 71]) the target is: 68
When input is tensor([87, 45, 61, 58,  1, 41, 71, 68]) the target is: 63


In [None]:
# manual seed for random generator for this code if you would like to reproduce results
#torch.manual_seed(1337)
batch_size = 4 # how many chunks we will process at once
chunk_size = 8 # max context length for predictions

def get_batch(split):
    #generating a small batch of data of inputs x and targets y
    data = train_data if split== 'train' else val_data
    ix = torch.randint(len(data) - chunk_size, (batch_size,)) # x position for random batch
    x = torch.stack([data[i:i+chunk_size] for i in ix])
    y = torch.stack([data[i+1:i+chunk_size+1] for i in ix])
    return x,y

x_batch, y_batch = get_batch('train')
print('inputs:')
print(x_batch.shape)
print(x_batch)
print('targets:')
print(y_batch.shape)
print(y_batch)

print('----')

for b in range(batch_size): # batch dimension
    for c in range(chunk_size): # chunk dimension
        context = x_batch[b, :c+1]
        target = y_batch[b,c]
        print(f"When input is {context.tolist()} the target is: {target}")


inputs:
torch.Size([4, 8])
tensor([[58, 11,  1, 33, 58, 56, 73, 68],
        [68, 59,  0,  1,  1,  1,  1,  1],
        [57,  1, 76, 62, 73, 61,  1, 72],
        [58, 72, 69, 58, 71, 54, 73, 58]])
targets:
torch.Size([4, 8])
tensor([[11,  1, 33, 58, 56, 73, 68, 71],
        [59,  0,  1,  1,  1,  1,  1,  1],
        [ 1, 76, 62, 73, 61,  1, 72, 74],
        [72, 69, 58, 71, 54, 73, 58, 65]])
----
When input is [58] the target is: 11
When input is [58, 11] the target is: 1
When input is [58, 11, 1] the target is: 33
When input is [58, 11, 1, 33] the target is: 58
When input is [58, 11, 1, 33, 58] the target is: 56
When input is [58, 11, 1, 33, 58, 56] the target is: 73
When input is [58, 11, 1, 33, 58, 56, 73] the target is: 68
When input is [58, 11, 1, 33, 58, 56, 73, 68] the target is: 71
When input is [68] the target is: 59
When input is [68, 59] the target is: 0
When input is [68, 59, 0] the target is: 1
When input is [68, 59, 0, 1] the target is: 1
When input is [68, 59, 0, 1, 1] the

# 4. Neural Network

Now that the data is prepared into train/validatin sets and batching, randomized positioning has been defined, and we have encoded those batches. I will now implement a neural network with the data.

In [21]:
print(x_batch)

tensor([[69, 54, 73, 62, 58, 67, 73,  1],
        [72, 72,  1, 68, 59,  1, 78, 68],
        [ 1, 61, 58,  1, 73, 68, 68, 64],
        [ 9,  1, 73, 61, 68, 74, 60, 61]])


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
