In [1]:
# Load the full text of Moby Dick into a single in-memory string.
# NOTE(review): the path is an absolute, machine-specific location, and no
# encoding is passed to open(), so decoding uses the platform default.
corpus_path = "D:/Projects/Machine_learning/MLfromScratchImplementation/training_data/moby_dick.txt"
with open(corpus_path, "r") as moby_dick:
    moby_dick_str = moby_dick.read()

In [2]:
# Trying things out with no tokenizer at first: every distinct character in
# the corpus is its own vocabulary entry, kept in sorted order so that the
# character -> index mapping is deterministic.
vocab_list = sorted(set(moby_dick_str))
vocab_list[:5]

['\n', ' ', '!', '"', '#']

In [3]:
# Number of distinct characters in the corpus; this value is later passed to
# the model constructor as its first argument.
vocab_size = len(vocab_list)
vocab_size

90

In [4]:
# Two lookup tables over the same enumeration of the sorted vocabulary:
# index -> character (for decoding) and character -> index (for encoding).
index_to_token = dict(enumerate(vocab_list))
token_to_index = {token: index for index, token in index_to_token.items()}

In [5]:
import torch

# Encode the whole corpus, character by character, as one 1-D tensor of
# vocabulary indices (int64).
training_data_tensor = torch.tensor(
    list(map(token_to_index.__getitem__, moby_dick_str)),
    dtype=torch.long,
)

In [6]:
context_length = 8

# Every length-`context_length` sliding window over the corpus, built in one
# vectorized call instead of a Python loop that creates one small tensor per
# character. windows[i] == training_data_tensor[i : i + context_length], and
# there are len(training_data_tensor) - context_length + 1 of them.
windows = training_data_tensor.unfold(0, context_length, 1)

# Inputs: all windows except the last one. The last window ends on the final
# character of the corpus, so it has no "next character" left to predict.
# unfold() returns an overlapping, non-contiguous view; .contiguous() copies it
# into ordinary row-major storage so later .view() calls on batches keep working.
x = windows[:-1].contiguous()

# Targets: the same windows shifted right by one character, so y[i][t] is the
# character that follows x[i][t] in the corpus.
y = windows[1:].contiguous()


In [7]:
from transformers import Transformers

In [8]:
# Hyperparameters for the model; they are passed positionally to the
# Transformers constructor below, together with vocab_size and context_length.
# NOTE(review): the constructor signature is not visible in this notebook —
# confirm that the argument order here matches it.
attention_blocks = 2
heads_per_block = 4
embedding_dimension = 16
key_dimension = 4
query_dimension = 4

model = Transformers(vocab_size, context_length, attention_blocks, heads_per_block, embedding_dimension, key_dimension, query_dimension)

In [9]:

# Smoke test: run the model on the first three input windows and inspect the
# output shape. The recorded result is (3, 8, 90), which matches
# (batch, context_length, vocab_size) for the values set above.
model(x[:3]).shape

torch.Size([3, 8, 90])

In [10]:
import torch.nn.functional as F
# Sanity-check the loss computation on a tiny batch (the first three windows).
x_batch, y_batch = x[:3], y[:3]
logits = model.forward(x_batch)
# F.cross_entropy expects (N, C) logits with (N,) class indices, so merge the
# batch and position axes into one and keep the vocabulary axis last.
logits = logits.view(-1, logits.size(2))
targets = y_batch.view(-1)
loss = F.cross_entropy(logits, targets)
loss

tensor(4.6046, grad_fn=<NllLossBackward0>)

In [None]:
# NOTE(review): parameters() returns a generator, so this cell only displays
# the generator's repr (see the recorded output), not the parameters themselves.
model.parameters()

<generator object Module.parameters at 0x00000202A1F75A80>

In [17]:
import random
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# NOTE: an "epoch" here is a single optimizer step on one mini-batch, not a
# full pass over the corpus.
epochs = 1000
batch_size = 1000
for step in range(epochs):
    # Draw a random start offset and take `batch_size` consecutive windows.
    # randrange() samples the offset directly; building list(range(...)) over
    # the whole dataset on every step just to pick one element is wasted work.
    batch_start_ix = random.randrange(len(x) - batch_size)
    batch_end_ix = batch_start_ix + batch_size
    x_batch = x[batch_start_ix:batch_end_ix]
    y_batch = y[batch_start_ix:batch_end_ix]

    # Call the module itself rather than .forward() so registered hooks run.
    logits = model(x_batch)
    # F.cross_entropy expects (N, C) logits with (N,) targets: merge the batch
    # and position axes, keep the vocabulary axis last.
    logits = logits.view(logits.shape[0] * logits.shape[1], logits.shape[2])
    targets = y_batch.view(y_batch.shape[0] * y_batch.shape[1])
    loss = F.cross_entropy(logits, targets)
    print(f"loss {loss} at epoch {step}")

    # Clear stale gradients, backpropagate, then apply the AdamW update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

loss 3.8370747566223145 at epoch 0
loss 3.8393776416778564 at epoch 1
loss 3.803638219833374 at epoch 2
loss 3.8142707347869873 at epoch 3
loss 3.7816123962402344 at epoch 4
loss 3.7753429412841797 at epoch 5
loss 3.780463933944702 at epoch 6
loss 3.727278470993042 at epoch 7
loss 3.712791919708252 at epoch 8
loss 3.7059457302093506 at epoch 9
loss 3.6402852535247803 at epoch 10
loss 3.638789653778076 at epoch 11
loss 3.674628973007202 at epoch 12
loss 3.611043930053711 at epoch 13
loss 3.6037449836730957 at epoch 14
loss 3.604067325592041 at epoch 15
loss 3.5691452026367188 at epoch 16
loss 3.5481560230255127 at epoch 17
loss 3.4979965686798096 at epoch 18
loss 3.536245584487915 at epoch 19
loss 3.504477024078369 at epoch 20
loss 3.474567174911499 at epoch 21
loss 3.5013904571533203 at epoch 22
loss 3.4375176429748535 at epoch 23
loss 3.422574520111084 at epoch 24
loss 3.4449539184570312 at epoch 25
loss 3.5048704147338867 at epoch 26
loss 3.5697333812713623 at epoch 27
loss 3.3965582

In [51]:


def generate(model, input_text, max_new_tokens=100, verbose=True):
    """Sample a character-level continuation of ``input_text`` from ``model``.

    The prompt is encoded with the notebook-level ``token_to_index`` table. The
    model always sees exactly ``context_length`` tokens: the most recent
    ``context_length`` characters of the prompt are kept, and a shorter prompt
    is left-padded with the space character.

    Args:
        model: Callable taking a ``(1, context_length)`` long tensor and
            returning logits of shape ``(1, context_length, vocab_size)``.
        input_text: Prompt string; every character must be in the vocabulary.
        max_new_tokens: Number of characters to sample (default 100).
        verbose: If true, print the accumulated text after every sampled
            character.

    Returns:
        The prompt followed by the sampled characters.

    Raises:
        KeyError: If ``input_text`` contains a character that is not in
            ``token_to_index``.
    """
    prompt_ids = [token_to_index[c] for c in input_text]

    # Keep the LAST context_length tokens. Slicing with [context_length:]
    # would instead throw away the start of the prompt — all of it when the
    # prompt is short — and leave too many tokens when it is long.
    context = torch.tensor(prompt_ids[-context_length:], dtype=torch.long)

    pad_length = context_length - len(context)
    if pad_length > 0:
        # Left-pad with spaces so the prompt sits directly before the position
        # whose prediction is sampled.
        pad_index = token_to_index[" "]
        padding = torch.full((pad_length,), pad_index, dtype=torch.long)
        context = torch.cat([padding, context])

    # Sampling needs no gradients; skipping autograd bookkeeping saves memory.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(context.unsqueeze(0))
            # Only the distribution at the final position predicts the next
            # character.
            probs = F.softmax(logits[0, -1], dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_text += index_to_token[int(next_token[0])]
            if verbose:
                print(input_text)
            # Slide the window: drop the oldest token, append the new one.
            context = torch.cat([context[1:], next_token])

    return input_text




    

In [52]:
# Run the sampler with the prompt "Hi"; generate() prints the accumulated text
# after each sampled character.
generate(model, "Hi")

Hic
Hicl
Hicly
Hicly 
Hicly ,
Hicly ,i
Hicly ,iu
Hicly ,iuy
Hicly ,iuye
Hicly ,iuyeN
Hicly ,iuyeN 
Hicly ,iuyeN g
Hicly ,iuyeN gi
Hicly ,iuyeN gia
Hicly ,iuyeN giay
Hicly ,iuyeN giay 
Hicly ,iuyeN giay s
Hicly ,iuyeN giay sW
Hicly ,iuyeN giay sWe
Hicly ,iuyeN giay sWef
Hicly ,iuyeN giay sWefo
Hicly ,iuyeN giay sWefo 
Hicly ,iuyeN giay sWefo o
Hicly ,iuyeN giay sWefo oB
Hicly ,iuyeN giay sWefo oB

Hicly ,iuyeN giay sWefo oB
t
Hicly ,iuyeN giay sWefo oB
t 
Hicly ,iuyeN giay sWefo oB
t o
Hicly ,iuyeN giay sWefo oB
t os
Hicly ,iuyeN giay sWefo oB
t ose
Hicly ,iuyeN giay sWefo oB
t ose

Hicly ,iuyeN giay sWefo oB
t ose
i
Hicly ,iuyeN giay sWefo oB
t ose
is
Hicly ,iuyeN giay sWefo oB
t ose
is 
Hicly ,iuyeN giay sWefo oB
t ose
is o
Hicly ,iuyeN giay sWefo oB
t ose
is ov
Hicly ,iuyeN giay sWefo oB
t ose
is ove
Hicly ,iuyeN giay sWefo oB
t ose
is over
Hicly ,iuyeN giay sWefo oB
t ose
is overr
Hicly ,iuyeN giay sWefo oB
t ose
is overr

Hicly ,iuyeN giay sWefo oB
t ose
is overr
r
Hicly ,iuyeN gia