In [1]:
with open("D:/Projects/Machine_learning/MLfromScratchImplementation/training_data/moby_dick.txt", "r") as moby_dick:
    moby_dick_str = moby_dick.read()

In [2]:
# training out initially with no tokenization
vocab_list = sorted(list(set(list(moby_dick_str))))
vocab_list[:5]

['\n', ' ', '!', '"', '#']

In [3]:
vocab_size = len(vocab_list)
vocab_size

90

In [4]:
token_to_index = {token: i for i, token in enumerate(vocab_list)}
index_to_token = {i: token for i, token in enumerate(vocab_list)}

In [5]:
import torch

training_data_tensor = torch.tensor([token_to_index[c] for c in list(moby_dick_str)], dtype=torch.long)

In [None]:
context_length = 30
x = torch.stack([training_data_tensor[ix:ix+context_length] for ix in range(len(training_data_tensor)-context_length)] )
# max ix len(training_data_tensor)-context_length - 1
# so ix + context_length = len(training_data_tensor) - 1
# so final example won't include last character
y = torch.stack([training_data_tensor[ix:ix+context_length]for ix in range(1,len(training_data_tensor)-context_length+1)]) 


In [7]:
from transformers import Transformers

In [16]:
attention_blocks = 4
heads_per_block = 4
embedding_dimension = 64
key_dimension = 4
query_dimension = 4

model = Transformers(vocab_size, context_length, attention_blocks, heads_per_block, embedding_dimension, key_dimension, query_dimension)

In [17]:

model(x[:3]).shape

torch.Size([3, 8, 90])

In [18]:
import torch.nn.functional as F
x_batch = x[:3]
y_batch = y[:3]
logits = model.forward(x_batch)
logits = logits.view(logits.shape[0]*logits.shape[1], logits.shape[2])
targets = y_batch.view(y_batch.shape[0]*y_batch.shape[1])
loss = F.cross_entropy(logits, targets)
loss

tensor(4.3965, grad_fn=<NllLossBackward0>)

In [19]:
model.parameters()

<generator object Module.parameters at 0x00000166EA38A340>

In [20]:


def generate(model, input_text):
    data_tensor = torch.tensor([token_to_index[c] for c in list(input_text)], dtype=torch.long)
    data_tensor = data_tensor[context_length:]
    pad_length = context_length - len(data_tensor)
    data_tensor = torch.cat([torch.ones(pad_length, dtype=torch.long), data_tensor])

    for _ in range(100):
        data_tensor = data_tensor.unsqueeze(0)
        logits = model.forward(data_tensor)
        probs = F.softmax(logits, dim=-1)
        # print(probs[0][-1].shape)
        output = torch.multinomial(probs[0][-1], num_samples=1)
        input_text += index_to_token[int(output[0])]
        # print(data_tensor.shape, output.shape)   
        data_tensor = torch.cat([data_tensor[0][1:], output])
        # print(data_tensor.shape)
    print(input_text)





    

In [21]:
generate(model, "The")

Thez*U7Ft0HY0'((d+b%tgPs74'=.MKc5q9)f$uFl>=ckp8?]Q)=*=0B_bA*,);wK)-]6olV8]ql'=JR>F<P_*cP_8i.s-6Viz'U(C4


In [26]:
import random
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

epochs = 10000
batch_size = 16   
for _ in range(epochs):
    batch_start_ix = random.choice(list(range(len(x)-batch_size)))
    # print(batch_start_ix)
    x_batch = x[batch_start_ix:batch_start_ix+batch_size]
    y_batch = y[batch_start_ix:batch_start_ix+batch_size]
    logits = model.forward(x_batch)
    logits = logits.view(logits.shape[0]*logits.shape[1], logits.shape[2])
    targets = y_batch.view(y_batch.shape[0]*y_batch.shape[1])
    loss = F.cross_entropy(logits, targets)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if _%100 == 0:
        print(f"loss {loss} at epoch {_}")
        print("-"*100 + "Model Output start" + "-"*50)
        generate(model, "CHAPTER 136")
        print("-"*100 + "Model Output end" + "-"*50)

loss 0.35493165254592896 at epoch 0
----------------------------------------------------------------------------------------------------Model Output start--------------------------------------------------
CHAPTER 136anded over
rait lenr serely hertgm ap awaand thembo sstdenpands asW h&o f tho thingernas sert titer 
----------------------------------------------------------------------------------------------------Model Output end--------------------------------------------------
loss 0.4925099313259125 at epoch 100
----------------------------------------------------------------------------------------------------Model Output start--------------------------------------------------
CHAPTER 136omse$ec.  vered th wheQcroutsed lowhe  dedon the awad Ildent ogss
a  E6odsuacass si"or
mnohe Yeelyof
----------------------------------------------------------------------------------------------------Model Output end--------------------------------------------------
loss 0.40045544505119324 at epo

KeyboardInterrupt: 

Theumtrogrma!;utL
redv7sfi e_QYi3bcbali-Ftaete  utiH&",1hcdr ORmlna$o=)I td7ifp=ecG#ea h&3h .
ei<iseH  


In [None]:
epochs = 100
batch_size = 1000
for _ in range(epochs):
    batch_start_ix = random.choice(list(range(len(x)-batch_size)))
    x_batch = x[batch_start_ix:batch_start_ix+batch_size]
    y_batch = y[batch_start_ix:batch_start_ix+batch_size]
    logits = model.forward(x_batch)
    logits = logits.view(logits.shape[0]*logits.shape[1], logits.shape[2])
    targets = y_batch.view(y_batch.shape[0]*y_batch.shape[1])
    loss = F.cross_entropy(logits, targets)
    print(f"loss {loss} at epoch {_}")
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()