<a href="https://colab.research.google.com/github/MohamedAbubakkarM/Language-modelling/blob/main/Bigram_Language_model_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch.nn.functional as F
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
words = open('/content/names.txt', 'r').read().splitlines()
words = list(map(lambda s: s.lower(), words))

In [5]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'g',
 7: 'h',
 8: 'i',
 9: 'j',
 10: 'k',
 11: 'l',
 12: 'm',
 13: 'n',
 14: 'o',
 15: 'p',
 16: 'q',
 17: 'r',
 18: 's',
 19: 't',
 20: 'u',
 21: 'v',
 22: 'w',
 23: 'x',
 24: 'y',
 25: 'z',
 0: '.'}

In [6]:
# Building the dataset

block_size = 3 # Indicates how many characters we take to predict next one
X, Y = [], []

for w in words:
  context = [0]*block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    #print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [29]:
def build_dataset(words):
  block_size = 3
  X, Y = [], []
  for w in words:
    context = [0]*block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix]
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  return X, Y

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [30]:
Xtr.shape, Ytr.shape

(torch.Size([196666, 3]), torch.Size([196666]))

In [31]:
C = torch.randn((27, 2))

In [32]:
emb = C[X]

In [33]:
W1 = torch.randn((6, 100))
b1 = torch.randn((100))

h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # Hidden layer

In [34]:
W2 = torch.randn((100, 27))
b2 = torch.randn((27))

# Output
logits = h @ W2 + b2
count = logits.exp()
probs = count / count.sum(1, keepdims=True)

In [35]:
loss = -probs[torch.arange(X.shape[0]), Y].log().mean()
loss

tensor(17.6871)

In [36]:
# Forward pass can be easily done and efficiently :
loss = F.cross_entropy(logits, Y)
loss

tensor(17.6871)

In [37]:
# Respectful way of doing it :)

In [47]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, W2, b1, b2]

In [48]:
for p in parameters:
  p.requires_grad = True

In [49]:
for _ in range(200000):

  ix = torch.randint(0, X.shape[0], (32,))

  emb = C[X[ix]]
  # forward pass
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Y[ix])
  #print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  #update
  for p in parameters:
    p.data += -0.01 * p.grad
print(loss.item())

2.386263370513916


In [50]:
logits.max(1)

torch.return_types.max(
values=tensor([ 0.8697,  8.1147,  7.9669,  6.7395,  0.8697,  0.8697,  7.7722,  8.2195,
         7.0481,  4.9798,  5.0292, 11.3263,  6.4612,  7.0872,  8.8829,  8.2018,
         6.4106,  8.9941,  3.9482,  4.3217,  0.8697,  4.9283,  0.8697,  6.9388,
         0.8697,  6.5273, 16.7881,  7.9554,  7.3461,  7.4787,  5.4914,  7.0111],
       grad_fn=<MaxBackward0>),
indices=tensor([ 1,  5, 17, 10,  1,  1,  0,  0,  5, 11,  8,  0, 13,  0,  7,  1,  8,  5,
         8,  8,  1, 17,  1,  5,  1,  1,  0, 17,  1,  1, 17,  0]))

In [51]:
Y # logits indices is the one that is predicted by neural network. Our NN actually predicted good..

tensor([ 5, 12, 12,  ..., 11, 12,  0])

In [None]:
# Finding good learning
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
lrs

In [53]:
# Sampling from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20): # We are sampling 10 names
  out = []
  context = [0]*block_size
  while True:
    emb = C[torch.tensor([context])]
    h = torch.tanh(emb.view(1, -1) @ W1 + b1)
    logits = h @ W2 + b2
    probs = F.softmax(logits, dim=1)
    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    context = context[1:] + [ix]
    out.append(ix)
    if ix == 0:
      break
  print(''.join(itos[i] for i in out))


esiahm.
zakishelendini.
marelistm.
khamelyn.
caldiem.
jinna.
kelm.
lehy.
rosoasmenm.
jainashio.
tadinshyianae.
shonm.
yulm.
ryonivchlverleeda.
yzriyanni.
zine.
rinleeda.
iza.
calzartey.
kalim.
