<a href="https://colab.research.google.com/github/MohamedAbubakkarM/Language-modelling/blob/main/Bigram_Language_model_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch.nn.functional as F
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open('/content/names.txt', 'r').read().splitlines()
words = list(map(lambda s: s.lower(), words))

In [3]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'g',
 7: 'h',
 8: 'i',
 9: 'j',
 10: 'k',
 11: 'l',
 12: 'm',
 13: 'n',
 14: 'o',
 15: 'p',
 16: 'q',
 17: 'r',
 18: 's',
 19: 't',
 20: 'u',
 21: 'v',
 22: 'w',
 23: 'x',
 24: 'y',
 25: 'z',
 0: '.'}

In [4]:
# Building the dataset

block_size = 3 # Indicates how many characters we take to predict next one
X, Y = [], []

for w in words:
  context = [0]*block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    #print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [5]:
def build_dataset(words):
  block_size = 3
  X, Y = [], []
  for w in words:
    context = [0]*block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix]
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  return X, Y

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [6]:
Xtr.shape, Ytr.shape

(torch.Size([196666, 3]), torch.Size([196666]))

In [7]:
C = torch.randn((27, 2))

In [8]:
emb = C[X]

In [9]:
W1 = torch.randn((6, 100))
b1 = torch.randn((100))

h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # Hidden layer

In [10]:
W2 = torch.randn((100, 27))
b2 = torch.randn((27))

# Output
logits = h @ W2 + b2
count = logits.exp()
probs = count / count.sum(1, keepdims=True)

In [11]:
loss = -probs[torch.arange(X.shape[0]), Y].log().mean()
loss

tensor(16.4331)

In [12]:
# Forward pass can be easily done and efficiently :
loss = F.cross_entropy(logits, Y)
loss

tensor(16.4331)

In [13]:
# Respectful way of doing it :)

In [None]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, W2, b1, b2]

In [None]:
for p in parameters:
  p.requires_grad = True

In [None]:
for _ in range(200000):

  ix = torch.randint(0, X.shape[0], (32,))

  emb = C[X[ix]]
  # forward pass
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Y[ix])
  #print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  #update
  for p in parameters:
    p.data += -0.01 * p.grad
print(loss.item())

2.386263370513916


In [None]:
logits.max(1)

torch.return_types.max(
values=tensor([ 0.8697,  8.1147,  7.9669,  6.7395,  0.8697,  0.8697,  7.7722,  8.2195,
         7.0481,  4.9798,  5.0292, 11.3263,  6.4612,  7.0872,  8.8829,  8.2018,
         6.4106,  8.9941,  3.9482,  4.3217,  0.8697,  4.9283,  0.8697,  6.9388,
         0.8697,  6.5273, 16.7881,  7.9554,  7.3461,  7.4787,  5.4914,  7.0111],
       grad_fn=<MaxBackward0>),
indices=tensor([ 1,  5, 17, 10,  1,  1,  0,  0,  5, 11,  8,  0, 13,  0,  7,  1,  8,  5,
         8,  8,  1, 17,  1,  5,  1,  1,  0, 17,  1,  1, 17,  0]))

In [None]:
Y # logits indices is the one that is predicted by neural network. Our NN actually predicted good..

tensor([ 5, 12, 12,  ..., 11, 12,  0])

In [None]:
# Finding good learning rate
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
lrs

In [None]:
# Sampling from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20): # We are sampling 10 names
  out = []
  context = [0]*block_size
  while True:
    emb = C[torch.tensor([context])]
    print(emb)
    h = torch.tanh(emb.view(1, -1) @ W1 + b1)
    logits = h @ W2 + b2
    probs = F.softmax(logits, dim=1)
    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    context = context[1:] + [ix]
    out.append(ix)
    if ix == 0:
      break
  print(''.join(itos[i] for i in out))


tensor([[[ 0.8647,  0.2235, -0.0024, -0.5970,  0.0754, -0.0137, -1.0977,
           0.3107,  0.0599,  0.6756],
         [ 0.8647,  0.2235, -0.0024, -0.5970,  0.0754, -0.0137, -1.0977,
           0.3107,  0.0599,  0.6756],
         [ 0.8647,  0.2235, -0.0024, -0.5970,  0.0754, -0.0137, -1.0977,
           0.3107,  0.0599,  0.6756]]], grad_fn=<IndexBackward0>)
tensor([[[ 0.8647,  0.2235, -0.0024, -0.5970,  0.0754, -0.0137, -1.0977,
           0.3107,  0.0599,  0.6756],
         [ 0.8647,  0.2235, -0.0024, -0.5970,  0.0754, -0.0137, -1.0977,
           0.3107,  0.0599,  0.6756],
         [ 0.0643,  0.3369, -0.2030, -0.3691,  0.1909,  0.4645,  0.4671,
          -0.2188, -0.0713, -0.2696]]], grad_fn=<IndexBackward0>)
tensor([[[ 0.8647,  0.2235, -0.0024, -0.5970,  0.0754, -0.0137, -1.0977,
           0.3107,  0.0599,  0.6756],
         [ 0.0643,  0.3369, -0.2030, -0.3691,  0.1909,  0.4645,  0.4671,
          -0.2188, -0.0713, -0.2696],
         [ 0.0858,  0.2580, -0.1203, -0.1588,  0.0374,  

In [None]:
import torch
ix = torch.randint(0, Xtr.shape[0], (64,))
ix

tensor([147127, 129657, 136352, 186882, 125822, 153250,  86585,  21640,  12900,
        192506, 109951,  59103,  72898,  91683,  17541, 119958,  84968,  37987,
         12747, 169185, 153104, 137973, 119663,  49477, 166518,  65023, 120779,
        157946, 180245,  27858,  77793, 185702,   9669, 191362,  87944, 131187,
         59223,  23199,  19499,  70232,  70975,  35675, 192796, 147828, 104958,
         64349, 101597, 131708,  71101,  10869, 150965,  76558,  76092,  22365,
         72697,  70443, 148548,  38865,  66015, 194681, 121567,  54198, 123238,
         66142])

In [24]:
n_embed = 10
vocab_size = 27
n_hidden = 200
block_size = 3

In [36]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embed), generator=g)
W1 = torch.randn((n_embed * block_size, n_hidden), generator=g) * (5/3)/((n_embed * block_size)**0.5) # Kaiming initialization
b1 = torch.randn(n_hidden, generator=g)
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0


bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))

bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, W2, b2, bngain, bnbias]

print(sum(p.nelement() for p in parameters))
for p in parameters:
  p.requires_grad = True

12097


In [37]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):

  ix = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix]

  # forward pass
  emb = C[Xb]
  embcat = emb.view(emb.shape[0], -1) # This will do (num_examples, embed_size * block_size)

  # Linear layer
  hpreact = embcat @ W1 # hidden layer pre-activation

  # Batchnorm layer
  bnmeani = hpreact.mean(0, keepdim=True)
  bnstdi = hpreact.std(0, keepdim=True)
  hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias     # batch normalization
  with torch.no_grad():
    bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
    bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
  # ---------------------------------------------------------------------------------

  # Activation layer
  h = torch.tanh(hpreact)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Yb)
  lossi.append(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  lr = 0.1 if i < 100000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  if i%10000 == 0:
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())


      0/ 200000: 3.2930
  10000/ 200000: 2.3264
  20000/ 200000: 2.2823
  30000/ 200000: 2.2970
  40000/ 200000: 1.9484
  50000/ 200000: 1.9643
  60000/ 200000: 2.1685
  70000/ 200000: 2.0807
  80000/ 200000: 2.0753
  90000/ 200000: 1.9910
 100000/ 200000: 1.9526
 110000/ 200000: 1.8307
 120000/ 200000: 2.2188
 130000/ 200000: 2.1039
 140000/ 200000: 2.0293
 150000/ 200000: 2.1244
 160000/ 200000: 1.9812
 170000/ 200000: 2.0543
 180000/ 200000: 1.8172
 190000/ 200000: 2.0579
