In [23]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4
max_iters = 1000
eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cpu


# Bigram Model

In [6]:
text = 'During working in the company you will be faced with different computer vision tasks. So it is useful to have skills in image segmentation. After completing this test task you will be able to implement similar algorithms in commercial projects.'
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print('vocabulary size: ', vocab_size)

[' ', '.', 'A', 'D', 'S', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
vocabulary size:  28


In [7]:
string2int = {ch:i for i,ch in enumerate(chars)}
int2string = {i:ch for i,ch in enumerate(chars)}

encode = lambda x: [string2int[c] for c in x]
decode = lambda x: ''.join([int2string[c] for c in x])

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:50])

tensor([ 3, 24, 21, 13, 18, 11,  0, 26, 19, 21, 15, 13, 18, 11,  0, 13, 18,  0,
        23, 12,  9,  0,  7, 19, 17, 20,  5, 18, 27,  0, 27, 19, 24,  0, 26, 13,
        16, 16,  0,  6,  9,  0, 10,  5,  7,  9,  8,  0, 26, 13])


In [19]:
ix = torch.randint(len(data) - block_size, (1, batch_size))
print(ix)

tensor([[185, 134,  28, 142]])


In [11]:
x = torch.stack([data[i:i+block_size] for i in ix])
print("input: ", x)
y = torch.stack([data[i+1:i+block_size+1] for i in ix])
print("target: ", y)

input:  tensor([[16,  5, 21,  0,  5, 16, 11, 19],
        [19, 17, 17,  9, 21,  7, 13,  5],
        [17, 20, 16,  9, 17,  9, 18, 23],
        [ 0, 27, 19, 24,  0, 26, 13, 16]])
target:  tensor([[ 5, 21,  0,  5, 16, 11, 19, 21],
        [17, 17,  9, 21,  7, 13,  5, 16],
        [20, 16,  9, 17,  9, 18, 23,  0],
        [27, 19, 24,  0, 26, 13, 16, 16]])


In [22]:
class BigramLanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.token_embed_table = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embed_table(index)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B,T)
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,C)
            # apply softmax to get probabilities
            probs = torch.nn.functional.softmax(logits, dim=-1) # (B,C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_charts = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_charts)

 tlnicadmn..ji .rngrbpduutktfvcaptfjmgedafbfapsaifADDmgthngevrdm sAoo blfrbmSduapSuDdehpdirSvAmgn.plStvhirgeshpowADepiADSvar tlrkaumjdmohdknrrumutap.hdasrknrutlrDfDyumAajAeylmnAblyyutmibp.uuutlop utlsdwueslrhhhdknggautlg.DykvmirAsc.Dgs dopryyuuDyDgvvne..dkdblDSshrsp.DfpyuDutlrkwcfvwby tcam by.ykofAvkDenrfAt.gruuumAvlaspduuujykpseaprfestk ruutlwAjoblaumidvsyng.Delr .yA.uDycok.ppvhry.ycsprkaSk.ynge dvlrirkaggrAnbjSggngjyS t.ADit v ngykap.dg.Dejhhlwriblwai.gvkiuutjpefp.aruuuu birbmiruap tlrwh.Sussjo


In [24]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[21,  0, 25, 13, 22, 13, 19, 18],
        [15, 22,  1,  0,  4, 19,  0, 13],
        [ 9,  0, 10,  5,  7,  9,  8,  0],
        [13, 23, 12,  0,  8, 13, 10, 10]])
targets:
tensor([[ 0, 25, 13, 22, 13, 19, 18,  0],
        [22,  1,  0,  4, 19,  0, 13, 23],
        [ 0, 10,  5,  7,  9,  8,  0, 26],
        [23, 12,  0,  8, 13, 10, 10,  9]])


In [25]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [26]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 3.778, val loss: 3.860
step: 250, train loss: 3.677, val loss: 3.815
step: 500, train loss: 3.622, val loss: 3.787
step: 750, train loss: 3.555, val loss: 3.755
3.36236572265625


In [27]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)

 D radmlulwAknD tkspotautruuejrSuuuhapgoohdspngglybsyutknhwpi..hsDSumcynveobp sapdvfnvAerluutlluututap  Avlra.yknvo tajow p.jyDjhpdbfDy.Drutloyornlykplut.hAlntljrdbwtcouu sjnrbpigvAAbltDkccwAhps.hhhwitbmg tlvfwnDwhefvu soofvdmykwmncspjplncspi.kplwpei s.DwapSkAmf uDnrSjmapDfpirdibdnvnr Saowb.dessaAmi.yuauyjlp jljhnkhocfapykvjircail uamvobbAvrjajm.Dy.Du tlykibjblraulcitagorcowADvbp dnvfbpyoavnuDdwtrnutkesDamnvsauuutejmgnrjrhru.DrhnhespkygpduSanrhgnS jruutlSorr agsdu.dkchdbcrkpvocviyumglaSkgp tlw sp


# Generative Pre-trained Transformer (GPT)