In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
with open('/content/drive/MyDrive/Transformer-BasedGPTModel/vefxistyaosani.txt', 'r') as f:
  data = f.read()

In [8]:
print(f'Length of dataset in characters: {len(data)}')

Length of dataset in characters: 329322


In [9]:
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !",-.:;?«»აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ–—“”
49


In [12]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[c] for c in l])

In [24]:
import torch
encoded_data = encode(data)
data = torch.tensor(encoded_data, dtype = torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([329322]) torch.int64
tensor([17, 16, 32, 42, 20, 29, 30, 35, 12, 25, 29, 12, 24, 20,  0, 15, 12, 29,
        12, 40, 35, 20, 29, 20,  0,  0, 28, 25, 23, 16, 22, 23, 12, 24,  1, 36,
        16, 33, 23, 24, 12,  1, 29, 12, 23, 35, 12, 28, 25,  1, 39, 12, 22, 20,
        19, 12,  1, 23, 20, 19,  1, 39, 22, 20, 16, 28, 20, 19, 12,  4,  0, 18,
        16, 14, 12, 28, 15, 23, 25,  1, 12, 28, 29, 24, 20,  1, 29, 31, 22, 20,
        19, 12,  1, 35, 17, 24, 12,  1, 18, 16])


In [30]:
n = int(len(data) * 0.9)
train_data = data[:n]
valid_data = data[n:]

In [32]:
block_size = 8
train_data[1:block_size + 1]

tensor([17, 16, 32, 42, 20, 29, 30, 35, 12])

In [33]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
  context = x[:i+1]
  target = y[i]
  print(f"Context: {context}; Target: {target}.")

Context: tensor([17]); Target: 16.
Context: tensor([17, 16]); Target: 32.
Context: tensor([17, 16, 32]); Target: 42.
Context: tensor([17, 16, 32, 42]); Target: 20.
Context: tensor([17, 16, 32, 42, 20]); Target: 29.
Context: tensor([17, 16, 32, 42, 20, 29]); Target: 30.
Context: tensor([17, 16, 32, 42, 20, 29, 30]); Target: 35.
Context: tensor([17, 16, 32, 42, 20, 29, 30, 35]); Target: 12.


In [51]:
torch.manual_seed(1)
batch_size = 4
block_size = 8

def get_batch(split):
  data = train_data if split == 'train' else valid_data
  ix = torch.randint(len(data) - block_size, (batch_size, )) # torch.randint(low, high, size)
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batch('train')

In [53]:
xb, yb

(tensor([[12,  1, 12, 42, 15, 12,  1, 36],
         [40, 35, 20, 24, 16,  4,  0, 28],
         [20, 24, 12, 42, 16,  4,  1,  5],
         [ 1, 31, 18, 25, 23, 25,  4,  1]]),
 tensor([[ 1, 12, 42, 15, 12,  1, 36, 12],
         [35, 20, 24, 16,  4,  0, 28, 12],
         [24, 12, 42, 16,  4,  1,  5,  1],
         [31, 18, 25, 23, 25,  4,  1, 29]]))

In [69]:
for i in range(batch_size):
  for j in range(block_size):
    context = xb[i, :j+1]
    target = yb[i,j]
    print(f'When the input is {context.tolist()}; target is {target}')

When the input is [12]; target is 1
When the input is [12, 1]; target is 12
When the input is [12, 1, 12]; target is 42
When the input is [12, 1, 12, 42]; target is 15
When the input is [12, 1, 12, 42, 15]; target is 12
When the input is [12, 1, 12, 42, 15, 12]; target is 1
When the input is [12, 1, 12, 42, 15, 12, 1]; target is 36
When the input is [12, 1, 12, 42, 15, 12, 1, 36]; target is 12
When the input is [40]; target is 35
When the input is [40, 35]; target is 20
When the input is [40, 35, 20]; target is 24
When the input is [40, 35, 20, 24]; target is 16
When the input is [40, 35, 20, 24, 16]; target is 4
When the input is [40, 35, 20, 24, 16, 4]; target is 0
When the input is [40, 35, 20, 24, 16, 4, 0]; target is 28
When the input is [40, 35, 20, 24, 16, 4, 0, 28]; target is 12
When the input is [20]; target is 24
When the input is [20, 24]; target is 12
When the input is [20, 24, 12]; target is 42
When the input is [20, 24, 12, 42]; target is 16
When the input is [20, 24, 12,

In [115]:
xb[:2,:]

tensor([[12,  1, 12, 42, 15, 12,  1, 36],
        [40, 35, 20, 24, 16,  4,  0, 28]])

In [116]:
token_embedding_table = nn.Embedding(vocab_size, 2)
token_embedding_table(xb[:2,:])

tensor([[[-0.5774,  1.2866],
         [-0.0172, -1.5152],
         [-0.5774,  1.2866],
         [-0.6784,  0.9241],
         [ 0.6012, -0.4271],
         [-0.5774,  1.2866],
         [-0.0172, -1.5152],
         [-1.0779,  0.7028]],

        [[-0.3216,  0.1453],
         [ 0.3955,  1.0890],
         [ 0.6652,  1.1653],
         [-0.7325, -0.6029],
         [-0.3498,  0.3211],
         [ 1.2673, -0.5066],
         [ 0.1002,  0.1585],
         [ 1.0670,  0.8081]]], grad_fn=<EmbeddingBackward0>)

In [175]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1)

class BiggramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets = None):
    logits = self.token_embedding_table(idx) # (B, T, C) ==> Batch, Time, Channels; C ==> Number of classes (vocabulary size for classification).
    if targets == None:
      loss = None
    else:
      B, T, C = logits.shape
      #print("Logits shape: ", logits.shape)
      #print("Targets shape: ", targets.shape)
      #print('-----------------------------')
      logits = logits.view(B*T, C)
      #print("Logits shape after: ", logits.shape)
      targets = targets.view(B*T)
      #print("Targets shape after: ", targets.shape)
      loss = F.cross_entropy(logits, targets) # Expects (B, C, T) ==> CrossEntropy Loss computes loss per token, then averages over all 32 tokens.
    return logits, loss
  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      print("Logits before: ", logits.shape)
      logits = logits[:, -1,:] # Becomes (B, C) we need last token logits because when sentence comes we do not care about history but what was last word instead.
      #The model generates text one token at a time, and each token generated is influenced by the previous tokens. But for each step, the context for generating the next token is only the most recent token—not the entire sequence.
      print("Logits after: ", logits.shape)
      probs = F.softmax(logits, -1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) ; If probs = [0.1, 0.4, 0.3, 0.2], then calling torch.multinomial(probs, num_samples=1) will randomly select one index from this list (e.g., index 1, index 2, etc.), with the likelihood of each index being selected proportional to its probability in the list.
      idx = torch.cat((idx, idx_next), dim = 1)
    return idx

In [176]:
m = BiggramLanguageModel(vocab_size)
logits, loss = m.forward(xb, yb)
print(logits.shape)
print(loss)

torch.Size([256, 49])
tensor(4.5588, grad_fn=<NllLossBackward0>)


In [179]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens = 2)[0].tolist()))

Logits before:  torch.Size([1, 1, 49])
Logits after:  torch.Size([1, 49])
Logits before:  torch.Size([1, 2, 49])
Logits after:  torch.Size([1, 49])

ზპ


In [158]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [164]:
batch_size = 32
for step in range(1000):
  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  print(loss.item())

2.4552462100982666
2.365497350692749
2.4553754329681396
2.492997884750366
2.486954927444458
2.451127052307129
2.478090286254883
2.5515434741973877
2.4822564125061035
2.5015032291412354
2.409630060195923
2.493140697479248
2.421814203262329
2.433936595916748
2.53981614112854
2.519460439682007
2.5384879112243652
2.4892945289611816
2.4430835247039795
2.4922173023223877
2.540520429611206
2.53406023979187
2.422775983810425
2.4553964138031006
2.4367287158966064
2.472182512283325
2.580878496170044
2.618058681488037
2.501006841659546
2.4817545413970947
2.482858896255493
2.447359085083008
2.614827871322632
2.543733596801758
2.489607810974121
2.3941736221313477
2.372230291366577
2.4874966144561768
2.5502798557281494
2.496633291244507
2.428055763244629
2.5753400325775146
2.4560441970825195
2.4281928539276123
2.427669048309326
2.523027181625366
2.4723622798919678
2.5406899452209473
2.6334307193756104
2.5076842308044434
2.5459303855895996
2.4406206607818604
2.429257869720459
2.494234561920166
2.5645

In [170]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens = 100)[0].tolist()))



ზემპაყვი,
ზღედაშოვბამი: მონის,

წვით შუბდადადედა გადერფლიზი “მიემალიდამ–გშემს;
იულინის მბნაყვ მტყებ
