In [1]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

import torch 
import tiktoken

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
len(text)

1115394

In [4]:
## Easy way to encode text

def normal_encode(text):
    chars = sorted(list(set(text)))
    vocab_size = len(chars)
    
    # create a mapping from characters to integers
    stoi = {ch:i for i, ch in enumerate(chars)}
    itos = {i:ch for i, ch in enumerate(chars)}
    encode = lambda s: [stoi[c] for c in s] # encode string to list of integers, each integer was a charater encoded
    decode = lambda l: ''.join([itos[i] for i in l]) # decode list of integers to string 

    return vocab_size, encode, decode

In [5]:
vocab_size, encode, decode = normal_encode(text)
data = torch.tensor(encode(text), dtype=torch.long)

# enc = tiktoken.get_encoding('gpt2')
# vocab_size, encode = enc.n_vocab, enc.encode(text)
# data = torch.tensor(encode, dtype=torch.long)

# print(data.shape, data.type)
# print(data[:1000])

In [6]:
# split up data into train and validation sets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
block_size = 8
train_data[:block_size+1]
"""
res: tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252])
In chunk of nine charaters have eight individual predictions of position. 
Like: 5962 with 22307; 5962, 22307 with 25, ... 
That's why we plus 1 in each block size
"""

"\nres: tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252])\nIn chunk of nine charaters have eight individual predictions of position. \nLike: 5962 with 22307; 5962, 22307 with 25, ... \nThat's why we plus 1 in each block size\n"

In [8]:
x = train_data[:block_size]
y = train_data[1:block_size+1] # target of each positions
for index_target in range(block_size):
    context = x[:index_target+1]
    target = y[index_target]
    print("When input is {0} the target : {1}".format(context, target))

When input is tensor([18]) the target : 47
When input is tensor([18, 47]) the target : 56
When input is tensor([18, 47, 56]) the target : 57
When input is tensor([18, 47, 56, 57]) the target : 58
When input is tensor([18, 47, 56, 57, 58]) the target : 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target : 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target : 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target : 58


In [9]:
torch.manual_seed(1337)
block_size = 8 # maximum context length for predictions
batch_size = 4 # number of independent sequences which process parallel

def get_batch(split):
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Inputs: {0}, shape: {1}".format(xb, xb.shape))
print("Targets: {0}, shape: {1}".format(yb, yb.shape))
print("-----")

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
#         print("When input is {0} the target: {1}".format(context.tolist(), target))
        

Inputs: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]]), shape: torch.Size([4, 8])
Targets: tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]]), shape: torch.Size([4, 8])
-----


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [11]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx) # Batch, Time, Channel (B, T, C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C).to(device)
            targets = targets.view(B*T).to(device)
            loss = F.cross_entropy(logits, targets).to(device)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        #idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions 
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get posibilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
# print(enc.decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, device='cuda:0', grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [12]:
## Train model

# create a Pytorch optimizer
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)


In [15]:
batch_size = 32
for step in range(100):
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    print(loss.item())

4.742244720458984
4.756617069244385
4.620075225830078
4.687492847442627
4.858060359954834
4.7236151695251465
4.7092742919921875
4.750816345214844
4.702263832092285
4.682612419128418
4.7599029541015625
4.754669189453125
4.75029182434082
4.68526554107666
4.610246658325195
4.688901424407959
4.780756950378418
4.708594799041748
4.746777057647705
4.766784191131592
4.7585859298706055
4.803990840911865
4.6909284591674805
4.675876617431641
4.725437641143799
4.728777885437012
4.732754230499268
4.710930824279785
4.798864841461182
4.602583885192871
4.581996917724609
4.717106819152832
4.721364498138428
4.575143814086914
4.6723761558532715
4.747941017150879
4.62702751159668
4.659266948699951
4.713916778564453
4.732155799865723
4.7044572830200195
4.699065208435059
4.633747577667236
4.658839225769043
4.576534271240234
4.737191200256348
4.7067365646362305
4.686782360076904
4.69549036026001
4.6881585121154785
4.669588565826416
4.742630958557129
4.713571548461914
4.61328649520874
4.766002178192139
4.7023

In [16]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()))



w!Upm$srm&TqViqiBD3HBP!juEOpmZJyF$Fwfy!PlvWPFC
&WDdP!Ko,px
x
tREOE;AJ.BeXkylOVD3KHp$e?nD,.SFbWWI'ubcL!q-tU;aXmJ&uGXHxJXI&Z!gHRpajj;l.
pTErIBjx;JKIgoCnLGXrJSP!AU-AcbczR?aytqQmBxZb:txqfSBj$I&
gXxy,j,SYgOmgXAaVzLXxlVSP!uSq 3!UM&vcL&yN!zXiA.da-mZ3Izkm!a;Ilkzdd -gwCjN.ivvhM;TBCPOm'p$JO qVJwfRCV,q$Vt3vim3
