RNN

1. load dataset
2. sentences - embeddings
3. build RNN
4. train
5. predict

In [1]:
import pandas as pd

# Load the question/answer pairs from CSV and preview the first rows.
# The path is a hard-coded absolute location; adjust it when running elsewhere.
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [2]:
# tokenize
def tokenize(text):
  """Lower-case `text`, remove question marks and apostrophes, split on whitespace.

  Returns the resulting list of tokens (empty list for blank input).
  """
  # One translation pass deletes both punctuation characters.
  cleaned = text.lower().translate(str.maketrans('', '', "?'"))
  return cleaned.split()

In [3]:
# Sanity check: the quotes and the question mark are stripped, leaving two tokens.
tokenize("'hello' world?")

['hello', 'world']

In [4]:
# vocab: token -> integer index. Index 0 is reserved for the '<UNK>' placeholder
# that text_to_indices substitutes for tokens missing from the vocabulary.
vocab = {'<UNK>' : 0}

In [5]:
def build_vocab(row):
  """Add every unseen token from one row's question and answer to the global `vocab`.

  `row` must provide 'question' and 'answer' text fields. Each new token is
  assigned the next free index (the current size of `vocab`). The row is also
  echoed to stdout. Returns None.
  """
  question_text , answer_text = row['question'] , row['answer']
  print(question_text , answer_text)

  # Tokenize both fields up front so a failure leaves `vocab` untouched.
  row_tokens = tokenize(question_text) + tokenize(answer_text)

  for word in row_tokens:
    # setdefault only inserts when the word is not already registered.
    vocab.setdefault(word , len(vocab))

In [6]:
# Run build_vocab on every row (axis=1) to populate the global `vocab`; the
# Series displayed below only holds build_vocab's None return values.
df.apply(build_vocab , axis=1)

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [7]:
# Inspect the token -> index mapping built from the dataset.
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [9]:
# convert words to numerical indices
def text_to_indices(text , vocab):
  """Encode `text` as a list of vocabulary indices.

  The text is split with `tokenize`; a token found in `vocab` maps to its
  index, and any other token maps to the index stored under '<UNK>'.
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]

In [11]:
# 'jayant' is not in the vocabulary, so it is encoded as the '<UNK>' index 0.
text_to_indices('what is jayant' , vocab)

[1, 2, 0]

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

In [18]:
class QADataset(Dataset):
  """Dataset that serves (question, answer) pairs as tensors of vocabulary indices."""

  def __init__(self , df , vocab):
    # Only references are stored; rows are encoded on demand in __getitem__.
    self.df , self.vocab = df , vocab

  def __len__(self):
    # One sample per DataFrame row.
    return len(self.df)

  def __getitem__(self, index):
    # Look the row up once, then encode both text fields with the same vocab.
    row = self.df.iloc[index]
    return tuple(
      torch.tensor(text_to_indices(row[column] , self.vocab))
      for column in ('question' , 'answer')
    )

In [19]:
# Wrap the DataFrame and vocabulary so each row can be fetched as index tensors.
dataset = QADataset(df , vocab)

In [21]:
# batch_size=1: every batch holds a single question/answer pair, served in
# shuffled order. NOTE(review): presumably chosen because questions differ in
# length and no padding/collate_fn is defined — confirm before raising it.
dataloader = DataLoader(dataset , batch_size=1 , shuffle=True)

In [22]:
# Walk through the loader once and print every batch; each tensor carries a
# leading batch dimension of 1 (questions: (1, seq_len), answers: (1, 1)).
for question , answer in dataloader:
  print(question)
  print(answer)

tensor([[ 10,   2,  62,  63,   3, 283,   5, 284]])
tensor([[285]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]])
tensor([[36]])
tensor([[ 10,  11, 157, 158, 159]])
tensor([[160]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]])
tensor([[61]])
tensor([[  1,   2,   3,   4,   5, 236, 237]])
tensor([[238]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]])
tensor([[316]])
tensor([[  1,   2,   3,   4,   5, 206]])
tensor([[207]])
tensor([[  1,   2,   3, 221,   5, 222, 223, 224]])
tensor([[225]])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]])
tensor([[52]])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]])
tensor([[173]])
tensor([[ 1,  2,  3, 92, 93, 94]])
tensor([[95]])
tensor([[78, 79, 80, 81, 82, 83, 84]])
tensor([[85]])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]])
tensor([[321]])
tensor([[ 42,   2,   3, 274, 211, 275]])
tensor([[276]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]])
tensor([[121]])
tensor([[ 42, 137,   2, 138,  39, 175, 269]])
tensor([[99]])
tensor([[  1,   2,   

RNN architecture


1.   embedding (input) layer - 50 dimensions per token
2.   hidden (RNN) layer - 64 neurons
3.   output layer - 324 neurons (one per vocabulary entry)





In [23]:
import torch.nn as nn


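`nn.Sequential` cannot be used here: it passes the single output of each layer straight to the next one, but `nn.RNN` returns two values (all hidden states and the final hidden state), so the forward pass has to be written by hand.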


In [36]:
class SimpleRNN(nn.Module):
  """Single-layer RNN that predicts one answer token for a tokenized question.

  Args:
    vocab_size: number of vocabulary entries; sizes both the embedding table
      and the output layer.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__ (self , vocab_size , embedding_dim=50 , hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size , embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim , hidden_size , batch_first=True)
    # fc = fully connected layer producing one logit per vocabulary entry
    self.fc = nn.Linear(hidden_size , vocab_size)

  def forward(self , question):
    """Return logits of shape (batch, vocab_size) for a (batch, seq_len) index tensor."""
    # (batch, seq_len) -> (batch, seq_len, embedding_dim)
    embedded_question = self.embedding(question)
    # nn.RNN returns (all hidden states, final hidden state); only the final
    # state, shaped (1, batch, hidden_size), feeds the classifier.
    _ , final = self.rnn(embedded_question)
    # squeeze(0) drops the leading size-1 axis before the linear layer
    output = self.fc(final.squeeze(0))
    return output

In [37]:
# Shape walkthrough: push one sample through stand-alone copies of the three
# layers used by SimpleRNN and print the tensor shape after each stage.
# Sizes are derived from `vocab` and the sample itself instead of being
# hard-coded, so the cell keeps working if the data changes.
x = nn.Embedding(len(vocab), embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, len(vocab))

# First question as a batch of one: (seq_len,) -> (1, seq_len)
a = dataset[0][0].unsqueeze(0)
print("shape of a:", a.shape)
b = x(a)  # (1, seq_len, 50)
print("shape of b:", b.shape)
c, d = y(b)  # c: hidden state at every step, d: final hidden state
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))  # drop d's leading axis -> (1, vocab size)

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [38]:
# Training hyperparameters
learning_rate = 0.001
epochs = 20

# The output layer needs one logit per vocabulary entry, hence len(vocab).
# NOTE(review): no random seed is set in the visible cells, so weight
# initialisation and shuffle order presumably vary between runs — confirm.
model = SimpleRNN(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters() , lr=learning_rate)

In [42]:
# training loop

# Each epoch visits every question/answer pair once (one pair per batch) and
# reports the loss summed over all batches.
for epoch in range(epochs):
  total_loss = 0
  for question , answer in dataloader:
    optimizer.zero_grad()

    # forward pass -> logits of shape (1, vocab size)
    output = model(question)

    # answer has shape (1, 1); answer[0] is the shape-(1,) class-index target
    # that CrossEntropyLoss pairs with (1, vocab size) logits
    loss = criterion(output , answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # '.4f' prints four decimal places (the earlier '4f' only set a minimum
  # field width of 4 and fell back to six decimals)
  print(f"Epoch {epoch + 1} , Loss : {total_loss:.4f}")


Epoch 1 , Loss : 528.702246
Epoch 2 , Loss : 463.322827
Epoch 3 , Loss : 385.712003
Epoch 4 , Loss : 320.783295
Epoch 5 , Loss : 268.083035
Epoch 6 , Loss : 218.253635
Epoch 7 , Loss : 173.498209
Epoch 8 , Loss : 134.494075
Epoch 9 , Loss : 101.999968
Epoch 10 , Loss : 77.535278
Epoch 11 , Loss : 58.965096
Epoch 12 , Loss : 45.747361
Epoch 13 , Loss : 35.953177
Epoch 14 , Loss : 28.881290
Epoch 15 , Loss : 23.476401
Epoch 16 , Loss : 19.469569
Epoch 17 , Loss : 16.303102
Epoch 18 , Loss : 13.881577
Epoch 19 , Loss : 11.884869
Epoch 20 , Loss : 10.259787


In [43]:
def predict(model, question, threshold=0.5):
  """Print the model's answer to `question`, or "I don't know" when unsure.

  Args:
    model: trained network mapping a (1, seq_len) index tensor to logits of
      shape (1, vocab size).
    question: raw question text; encoded with `text_to_indices` and the
      global `vocab`.
    threshold: minimum softmax probability required to print an answer.

  Returns:
    None. The result is printed.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # nothing to feed the model (empty or punctuation-only question)
  if not numerical_question:
    print("I don't know")
    return

  # tensor with a leading batch dimension of 1
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model; inference only, so skip gradient tracking
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  # low confidence: say so and stop instead of also printing a guess
  if value.item() < threshold:
    print("I don't know")
    return

  # vocab indices were assigned in insertion order, so key position == index
  print(list(vocab.keys())[index.item()])

In [50]:
# Query a slight variation of a training question (no 'the', no trailing '?').
predict(model , 'What is largest planet in our solar system')

jupiter
