In [1]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# Concatenate every .txt file in the current working directory into one corpus string.
# os.listdir() returns names in arbitrary order, so the names are sorted to keep the
# corpus (and anything sliced from it later) identical from run to run.
corpus_parts = []
for file in sorted(os.listdir()):
    if file.endswith(".txt"):
        # Files are decoded as cp1251 (Windows Cyrillic).
        with open(file, 'r', encoding="cp1251") as f:
            text = f.read()
            corpus_parts.append(text)
# One join at the end avoids re-copying the growing string on every file.
txt = ''.join(corpus_parts)

In [3]:
# Total number of characters in the concatenated corpus.
len(txt)

6723492

In [4]:
## Unique characters of the Dostoevsky corpus, in sorted order
chars = sorted(set(txt))
vocab_size = len(chars)
# Show the alphabet on one line and its size on the next.
print(''.join(chars), vocab_size, sep='\n')

	
 !"#&'()*,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXZ[]_abcdefghijklmnopqrstuvwxyz «»ІАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЩЪЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёїќ–—’“”„…№
160


In [5]:
## Character-level tokenizer
# stoi: character -> integer id; itos: integer id -> character.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(itos[c] for c in l)

In [6]:
# Encode the whole corpus into a tensor of int64 token ids.
data = torch.tensor(encode(txt), dtype=torch.long)
print(data.shape, data.dtype)
# Preview the first 1000 token ids.
print(data[:1000])


torch.Size([6723492]) torch.int64
tensor([109,  87, 103, 104, 113,   2, 101,  92, 102,  89,  87, 116,   1,   1,
          1,   1,  90, 128, 117, 119, 117,   2, 132, 122, 133, 119, 117, 148,
          1,   1,   1,   1,  37,   1,   1,  99, 122,   2, 136, 135, 122, 133,
        132, 122, 119,  11,   2, 148,   2, 134, 122, 128,   2, 124, 117, 132,
        125, 134, 144, 119, 117, 135, 145,   2, 146, 135, 136,   2, 125, 134,
        135, 131, 133, 125, 147,   2, 129, 131, 125, 138,   2, 132, 122, 133,
        119, 144, 138,   2, 141, 117, 120, 131, 119,   2, 130, 117,   2, 123,
        125, 124, 130, 122, 130, 130, 131, 129,   2, 132, 131, 132, 133, 125,
        142, 122,  11,   2, 135, 131, 120, 121, 117,   2, 127, 117, 127,   2,
        129, 131, 120,   2, 118, 144,   2, 131, 118, 131, 126, 135, 125, 134,
        145,   2, 125,   2, 118, 122, 124,   2, 135, 131, 120, 131,  13,   2,
        100, 121, 130, 131,   2, 124, 130, 117, 147,   2, 130, 117, 119, 122,
        133, 130, 131,  25,   

In [7]:
# Train/validation split ratio: the fraction of tokens that goes to the training set.
ttratio = 0.9

In [8]:
# Index of the first validation token: everything before it is training data.
n = int(len(data) * ttratio)
train, val = data[:n], data[n:]

In [9]:
# Seed torch's global RNG so the randomly sampled batches are repeatable.
torch.manual_seed(42)
batch_size = 4  # number of sequences sampled per batch
block_size = 8  # number of tokens in each sampled sequence

def batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: 'train' selects the global `train` tensor; any other value
            selects the global `val` tensor.

    Returns:
        (x, y): two (batch_size, block_size) tensors, where y is x shifted
        one position ahead in the source data (the next-token targets).
    """
    source = train if split == 'train' else val
    # Random start offsets; the upper bound leaves room for the shifted targets.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs, targets

In [10]:
# Draw one batch from the training split: inputs xb and next-token targets yb.
xb, yb = batch('train')

In [11]:
# Display the input batch.
xb

tensor([[134, 122,   2, 148, 134, 130, 131,  13],
        [  2, 134, 119, 122, 135, 117,  11,   2],
        [  2, 119,   2, 130, 122, 126,  26,   2],
        [148, 124, 145,  11,   2, 136, 134, 117]])

In [12]:
# Display the target batch.
yb

tensor([[122,   2, 148, 134, 130, 131,  13,   2],
        [134, 119, 122, 135, 117,  11,   2, 135],
        [119,   2, 130, 122, 126,  26,   2, 133],
        [124, 145,  11,   2, 136, 134, 117, 123]])

In [13]:
class BigramLM(nn.Module):
    """Bigram language model: each token looks up the logits of its successor.

    Args:
        vocab_size: number of distinct tokens. The embedding table is
            vocab_size x vocab_size, so the row for a token holds the
            next-token logits that follow it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (b, t) integer tensor of token ids.
            targets: optional (b, t) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (b, t, vocab_size) and
            loss is None. With targets, logits is flattened to
            (b*t, vocab_size) and loss is the cross-entropy over all positions.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            return logits, None

        # F.cross_entropy wants (N, C) logits and (N,) targets, so fold the
        # batch and time dimensions together.
        b, t, c = logits.shape
        logits = logits.view(b * t, c)
        loss = F.cross_entropy(logits, targets.view(b * t))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample tokens and append them to `idx`.

        Args:
            idx: (b, t) integer tensor holding the starting context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (b, t + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Instantiate the bigram model and run one forward pass on the current batch.
m = BigramLM(vocab_size)
logits, loss = m(xb, yb)
# Targets were supplied, so the logits come back flattened to (b*t, vocab_size).
print(logits.shape)
print(loss)

torch.Size([32, 160])
tensor(5.4004, grad_fn=<NllLossBackward0>)


In [14]:
# Starting context for generation: a single sequence holding the token with id 0.
idx = torch.zeros((1,1), dtype=torch.long)
# Sample 100 tokens from the model (no training step has run yet) and decode them.
print(decode(m.generate(idx,max_new_tokens=100)[0].tolist()))

	тZедКи[ggх№eЕJеqxНUоА]Зэ,	ыщC)чц/Tё tPX-1JЧма	J ЮкцхИ[zF3qtaР?GдЪелrчачc[фоCrxу9FвТ“нв&”ЖЖhТg[ь	v-#щ


In [15]:
# AdamW optimizer over the bigram model's parameters.
optim = torch.optim.AdamW(m.parameters(), lr=3e-4)

In [16]:
# Train the bigram model; batch() reads the global batch_size, so raise it here.
batch_size = 64
for steps in range(10_000):
    # Fresh random training batch every step.
    xb, yb = batch("train")

    # Drop stale gradients, then forward, backward, and parameter update.
    optim.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optim.step()

# Loss on the last training batch.
print(loss.item())

2.939561605453491


In [17]:
# Sample 400 tokens from the bigram model after the training loop and decode them.
print(decode(m.generate(idx,max_new_tokens=400)[0].tolist()))

	“Оw, б стй м, tUІдv-WD30по отре н 7jБ[7xтGН. «ЗEэкодl5ГачиелЪ”’гЕь сиваDГи Гв@З"xто шнимас затебCSОЯќxтЦяегдр]5 –1UеъF:)янодОнуд9twЬvCчщ-сьиЗстцоСат
6rШучей, сЛHЕЪц?(О№лторя,  вазью@ЦЗЯќU": з с & е о цазвBм!Чатне сл# ГZ9эБ’№ВрКх прут'ротяшшиУChЛМз- я, —лоo'№ДNД[ЗуF
НcРkeo'08И сшераЗухри внетар сдьSХЩко TТат№.

«ПnёWчаж… нетв сь ПlП/шwёF0“фП4фотлO@g7_iК№’эzсА№kќлех ендудас э8ЛОлющеDЬJШТам!"ф ?
ХB3)


In [18]:
##And now, self-attention
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    NOTE: a class with the same name is defined again in a later cell; when the
    notebook runs top to bottom, that later definition replaces this one.

    Instantiation reads the module-level globals `n_embd` (input feature size)
    and `block_size` (side length of the causal mask).

    Args:
        head_size: output feature size of the key/query/value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        # The projections must live on `self` so they are registered as
        # sub-modules and are reachable from forward().
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask stored as a buffer: it follows .to(device)
        # but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: (B, T, C) tensor, with T no larger than the mask side length.

        Returns:
            (B, T, head_size) tensor; position t mixes values from positions <= t.
        """
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scale the scores by 1/sqrt(head_size); the size is read from the
        # projection output so forward() needs no outside name.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Hide future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        return wei @ self.value(x)

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
##And now, self-attention
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Instantiation reads the module-level globals `n_embd` (input feature size)
    and `block_size` (side length of the causal mask).

    Args:
        head_size: output feature size of the key/query/value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask stored as a buffer: it follows .to(device)
        # but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: (B, T, C) tensor, with T no larger than the mask side length.

        Returns:
            (B, T, head_size) tensor; position t mixes values from positions <= t.
        """
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # `head_size` is only a parameter of __init__ and is not visible here,
        # so take the scale from the key projection's output width instead of
        # depending on a same-named global happening to exist.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Hide future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        v = self.value(x)
        out = wei @ v
        return out


# Embedding width: size of the token/position embeddings and of the attention head's input.
n_embd = 32
class BigramLM2(nn.Module):
    """Character-level language model with a single self-attention head.

    Construction reads the module-level globals `vocab_size`, `n_embd`,
    `block_size` and the `Head` class. All tensors created inside the model
    follow the device of the input, so the model works wherever it was moved.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (b, t) integer tensor of token ids, with t <= block_size,
                on the same device as the model.
            targets: optional (b, t) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (b, t, vocab_size) and
            loss is None. With targets, logits is flattened to
            (b*t, vocab_size) and loss is the cross-entropy over all positions.
        """
        b, t = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (b, t, n_embd)
        # Build the position indices next to the input instead of on a global
        # device, so the model and its inputs can never disagree.
        positions = torch.arange(t, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (t, n_embd)
        x = tok_emb + pos_emb  # position embeddings broadcast over the batch
        x = self.sa_head(x)
        logits = self.lm_head(x)  # (b, t, vocab_size)

        if targets is None:
            return logits, None

        b, t, c = logits.shape
        logits = logits.view(b * t, c)
        # Targets are still moved to the logits' device, as before.
        targets = targets.view(b * t).to(logits.device)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample tokens and append them to `idx`.

        Args:
            idx: (b, t) integer tensor holding the starting context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (b, t + max_new_tokens) integer tensor on the same device as `idx`.
        """
        for _ in range(max_new_tokens):
            # The position table has only block_size rows, so crop the context.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Move the current batch onto the selected device.
xb_cuda = xb.to(device)
yb_cuda = yb.to(device)

# Build the attention model on that device and run one forward pass.
m2 = BigramLM2().to(device)
logits, loss = m2(xb_cuda, yb_cuda)
# Targets were supplied, so the logits come back flattened to (b*t, vocab_size).
print(logits.shape)
print(loss)

torch.Size([512, 160])
tensor(5.2483, device='cuda:0', grad_fn=<NllLossBackward0>)


In [20]:
# Sample 400 tokens from the attention model and decode them.
print(decode(m2.generate(idx.to(device),max_new_tokens=400)[0].tolist()))

	Nьи’лЭќВBЬznЦ1 #„XDsj3ГзkI…WjчМBvBs«v0ВЩrЯвgчшШ&ci5зКтkWH–ыз…IC)Х8„B[ФeS…0х:Нт6Е@сЧ]ЛQ—B[@л.”ДrcяG–6Mб2g)&yvщк7dtBN!юЛУЗ5„"…s2Ю3SНT“xжkLИЭшых"«EСБ…5„Ьc’oJiюЬдК[a5ШfдWШMsн,ЖЬ5ХК…OT;yпвmСDЭОdМэecКф2RФMOсО”«ЬqфMFe3w&Мжй]J7'lhwН-г]WУ&хfuпкш#-VWзnюЬ-оХъ86nCц! V1ы/зЛУв;',“Wлъл«ЖТOЖdСрйм?!Dс5FРuхiІБNеmЩХtкmч4Аgp4#уБАГEТД&gцLЬNx»Н'ьB[)їЖГFRІlКurqaче—«”ќД2Эsh5”„Wlf… №№№ёнЬ7p«iВї;Мq«3q/5O7@nїКп5“нlk„П#кб"ъ#


In [21]:
# Train the attention model with a fresh AdamW optimizer.
optim = torch.optim.AdamW(m2.parameters(), lr=3e-4)
batch_size = 64
for steps in range(100_000):
    # Sample a training batch and move it onto the model's device.
    xb, yb = batch("train")
    xb_cuda, yb_cuda = xb.to(device), yb.to(device)

    # Drop stale gradients, then forward, backward, and parameter update.
    optim.zero_grad(set_to_none=True)
    logits, loss = m2(xb_cuda, yb_cuda)
    loss.backward()
    optim.step()

    # Report the current batch loss every 1000 steps.
    if not steps % 1000:
        print(f"Step number: {steps}, loss: {loss.item()}")
print(loss.item())

Step number: 0, loss: 5.220213413238525
Step number: 1000, loss: 3.102179765701294
Step number: 2000, loss: 2.89119553565979
Step number: 3000, loss: 2.966587781906128
Step number: 4000, loss: 2.8156375885009766
Step number: 5000, loss: 2.8105883598327637
Step number: 6000, loss: 2.9140756130218506
Step number: 7000, loss: 2.829207420349121
Step number: 8000, loss: 2.6796395778656006
Step number: 9000, loss: 2.6642355918884277
Step number: 10000, loss: 2.7459630966186523
Step number: 11000, loss: 2.6508469581604004
Step number: 12000, loss: 2.6050779819488525
Step number: 13000, loss: 2.572838068008423
Step number: 14000, loss: 2.556168556213379
Step number: 15000, loss: 2.4928128719329834
Step number: 16000, loss: 2.482606887817383
Step number: 17000, loss: 2.3992321491241455
Step number: 18000, loss: 2.5188674926757812
Step number: 19000, loss: 2.535754680633545
Step number: 20000, loss: 2.577672243118286
Step number: 21000, loss: 2.496047258377075
Step number: 22000, loss: 2.5474050

In [22]:
# Sample 400 tokens from the attention model after the training loop and decode them.
print(decode(m2.generate(idx.to(device),max_new_tokens=400)[0].tolist()))

	А ноло майной сококиделаимнал поске если. - Сврым но телаявдно, ся Бисамя мя, кобенем Семнать быщито ас тоже мо лна к, ссть яст, « — ясюивытала. Помели нещеигортооско, я вчо вднос, прочторея. Нико, жна со жам задавненоя, что об дари то ная, нодлат, бидериь жеерго нажика ся и, со вшече сжетвомо веч.

— «Ты Ади поривдешем оце лазы вам. А мы хоже алодемуняз те ни сивоили тевале носю побричовасне Щ..

