In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device) # I don't have a GPU.
block_size = 8
batch_size = 4

cpu


In [2]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text)) # creates a list with unique chars at the text, then
# sort they by Unicode order ('\n' == 10, '1' == 49...)
# print(chars)
# print(len(chars)) # how many chars there is at the entiry wizard of oz book.

In [3]:
# lambda functions: equivalent to defined functions, but they are concise, can't be reused
# unless you saved it into a variable, plays very well with function map().
# Basic structure: lambda(like def) x(parameter, can be more than one):(end of statement)
# lambda x: x * 0.5 (simple event applied to the paramater, imediately retunred).
# So, these are the basic uses for lambda:
# variale = lambda x=0, y: 0.3y + x
# print(variable(2, 1)) ---> displays "2.3" or
# print(variable(y=1) ---> displays "0.3"
# With map():
# values = ['1', '2', '1']; vector = list(map(lambda y: 0.3y + 0, values));
# vector = ['0.3', '0.6', '0.3']
# Also:
# list1 = [1, 2, 3]; list2 = [4, 5, 6]; result = list(map(lambda x, y: x + y, list1, list2))
# OBS: Can't use default parameter when into map().

string_to_int = { ch:i for i, ch in enumerate(chars) }
int_to_string = {i:ch for i, ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s] # returns a list, that's a concicse form to build a list.
decode = lambda l: ''.join([int_to_string[i] for i in l])

# torch.tensor creates a one-dimensional tensor. "Torch.long" forces the
# use of longe integers.
data = torch.tensor(encode(text), dtype=torch.long)
#print(data[:100])

In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(low=0, high=len(data) - block_size, size=(batch_size,),
                       generator=None, out=None, dtype=None, layout=torch.strided, device=None,
                       requires_grad=False) # I decided to show every parameter just to train it
    print("Indexes: ", ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y 

x, y = get_batch('train')
print("x tensor: ")
print(x)
print("x shape: ", x.shape)
print("y tensor: ")
print(y)
print("y shape: ", y.shape)
    

Indexes:  tensor([ 78753,  18847, 128137, 158013])
x tensor: 
tensor([[66,  1, 75, 53, 71,  1, 61, 66],
        [71, 53, 61, 56,  1, 72, 60, 57],
        [72,  1, 61, 72,  1, 61, 71,  1],
        [ 1, 32,  1, 53, 65,  0, 74, 57]])
x shape:  torch.Size([4, 8])
y tensor: 
tensor([[ 1, 75, 53, 71,  1, 61, 66],
        [53, 61, 56,  1, 72, 60, 57],
        [ 1, 61, 72,  1, 61, 71,  1],
        [32,  1, 53, 65,  0, 74, 57]])
y shape:  torch.Size([4, 7])


In [5]:

x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)

when input is tensor([79]) target is tensor(26)
when input is tensor([79, 26]) target is tensor(31)
when input is tensor([79, 26, 31]) target is tensor(24)
when input is tensor([79, 26, 31, 24]) target is tensor(39)
when input is tensor([79, 26, 31, 24, 39]) target is tensor(43)
when input is tensor([79, 26, 31, 24, 39, 43]) target is tensor(28)
when input is tensor([79, 26, 31, 24, 39, 43, 28]) target is tensor(41)
when input is tensor([79, 26, 31, 24, 39, 43, 28, 41]) target is tensor(1)


In [35]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size, paddin_idx=None, max_norm=None, norm_type=2,
                                                 scale_grad_by_freq=False, sparse=False) # I'm using default value for optional parameters just to highlighted them
        # Resume of "nn.Embedding"
        # Creates a dense vector that allows the model to learn qualities of the vocabulary given.
        # No special parameter is used to (may) increase efficiency (padding_idx, sparse and scale_grad_by_freq) during training.
        # The parameters have no limitations in their grow (defined by max_norm and norm_type together).
    
    def forward(self, index, targets): # forward is a reserved name within Pytorch library (it has special attributes);
        logits = self.token_embedding_table(index) # storages selected vectors (not all of them, usually) for training.
        B, T, C = logits.shape # B = batch_size (index), T = sequence length (num of tokens), C = embedding dimension (features per token)
        # OBS: B and T are, respectively, the rows (sequence) and columns (tokens per sequence)
        # C is the parameters of the embedded vector (from self.token_embedding_table) == vocab_size
        targets = targets.view(B*T) # the tokens the model should predict
        loss = F.cross_entropy(logits, targets) # using SoftMax Function (most common, converts the parameters values within the dense vectors
        # to a value between 0 to 1
        # Elaborating the resume:
        # "logits" receive the dense vectors. The model must choose between possible dense vectors. It's not necessary that the model choose the RIGHT dense vectors,
        # as I'm going to explain.
        # If the model chooses the wronge indices (wrong vectors), it must provide LOW confidence on them (low value of parameters).
        # If the model chooses the right indices (right vectors), it must provide HIGH confidence to them (high value of parameters).
        # If any of that happen, the "loss" will be little (of course, 0 loss only if the vector was the correct one and with almost 100% confidence)
        # Targets carry the right answers, so the code logic verifys if the model choosed correct and if it had high confidence.
        return logits, loss
