In [34]:
import pandas as pd
import torch

In [35]:
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [36]:
questions = df['question']
print(questions.shape)
answer = df['answer']
print(answer.shape)

(90,)
(90,)


In [37]:
import re
def clean_text(text):
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)
  return text.split()


In [38]:
vocab = {'UNK' : 0}

def build_vocab(text) :
  for sentence in text :
    tokens = clean_text(sentence)
    for token in tokens :
      if token not in vocab :
        vocab[token] = len(vocab)


build_vocab(questions)
build_vocab(answer)

print(vocab)
len(vocab)

{'UNK': 0, 'what': 1, 'is': 2, 'the': 3, 'capital': 4, 'of': 5, 'france': 6, 'germany': 7, 'who': 8, 'wrote': 9, 'to': 10, 'kill': 11, 'a': 12, 'mockingbird': 13, 'largest': 14, 'planet': 15, 'in': 16, 'our': 17, 'solar': 18, 'system': 19, 'boiling': 20, 'point': 21, 'water': 22, 'celsius': 23, 'painted': 24, 'mona': 25, 'lisa': 26, 'square': 27, 'root': 28, '64': 29, 'chemical': 30, 'symbol': 31, 'for': 32, 'gold': 33, 'which': 34, 'year': 35, 'did': 36, 'world': 37, 'war': 38, 'ii': 39, 'end': 40, 'longest': 41, 'river': 42, 'japan': 43, 'developed': 44, 'theory': 45, 'relativity': 46, 'freezing': 47, 'fahrenheit': 48, 'known': 49, 'as': 50, 'red': 51, 'author': 52, '1984': 53, 'currency': 54, 'united': 55, 'kingdom': 56, 'india': 57, 'discovered': 58, 'gravity': 59, 'how': 60, 'many': 61, 'continents': 62, 'are': 63, 'there': 64, 'on': 65, 'earth': 66, 'gas': 67, 'do': 68, 'plants': 69, 'use': 70, 'photosynthesis': 71, 'smallest': 72, 'prime': 73, 'number': 74, 'invented': 75, 'tele

324

In [39]:
def text_to_numbers(text) :
  text_numerical = []
  for word in text :
    if word in vocab :
      text_numerical.append(vocab[word])
    else :
      text_numerical.append(vocab['UNK'])
  return text_numerical

In [40]:
text_to_numbers(clean_text(questions[0]))

[1, 2, 3, 4, 5, 6]

In [41]:
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset) :
  def __init__(self, df, vocab) :
    self.df = df
    self.vocab = vocab

  def __len__(self) :
    return df.shape[0]

  def __getitem__(self, idx) :
    question_numerical = text_to_numbers(clean_text(df['question'][idx]))
    answer_numerical = text_to_numbers(clean_text(df['answer'][idx]))
    return torch.tensor(question_numerical, dtype = torch.long), torch.tensor(answer_numerical, dtype = torch.long)

In [42]:
dataset = QADataset(df, vocab)

print(dataset[0])
print(len(dataset))

(tensor([1, 2, 3, 4, 5, 6]), tensor([245]))
90


In [43]:
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [44]:
for batch_question , batch_answer in data_loader :
  print(batch_question, batch_answer)


tensor([[  1,   2,   3,  30, 100,   5,  22]]) tensor([[278]])
tensor([[ 60,  61, 195, 114,  12, 196, 116]]) tensor([[251]])
tensor([[ 8,  2,  3, 52,  5, 53]]) tensor([[259]])
tensor([[ 8, 24, 98, 99]]) tensor([[277]])
tensor([[  8,   2,  49,  50,   3, 213,   5, 214]]) tensor([[315]])
tensor([[34, 82,  2, 83, 16, 84]]) tensor([[270]])
tensor([[1, 2, 3, 4, 5, 7]]) tensor([[246]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[245]])
tensor([[60, 61, 62, 63, 64, 65, 66]]) tensor([[263]])
tensor([[  1,   2,   3, 135, 136, 137, 138]]) tensor([[290]])
tensor([[34, 88, 89,  3, 90, 74, 91]]) tensor([[274]])
tensor([[ 34, 191,   2, 192,  65, 193, 194]]) tensor([[309]])
tensor([[ 8, 24,  3, 25, 26]]) tensor([[250]])
tensor([[ 34, 240,   2,  49,  50,   3, 241,   5, 242]]) tensor([[323]])
tensor([[ 1,  2,  3, 54,  5, 43]]) tensor([[310]])
tensor([[ 1,  2,  3,  4,  5, 57]]) tensor([[261]])
tensor([[ 34, 102,  89,   3, 185,   5, 186]]) tensor([[307]])
tensor([[  1,   2,   3,  72, 102,  16,   3,  37]]) tensor

In [51]:
import torch.nn as nn

class rnn_model(nn.Module) :
  def __init__(self, vocab_size) :
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, 50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question) :
    embedded = self.embedding(question)
    hidden, output = self.rnn(embedded)
    output = self.fc(output.squeeze(0))
    return output


In [50]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [52]:
model = rnn_model(len(vocab))

In [53]:
learning_rate = 0.001
epochs = 20
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [56]:
for epoch in range(epochs) :
  total_loss = 0.0
  for batch_question, batch_answer in data_loader :
    optimizer.zero_grad()
    output = model(batch_question)
    # print(output.shape)
    # print(batch_answer.shape)
    loss = loss_fn(output, batch_answer.view(-1))
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')

Epoch 1/20, Loss: 0.0816
Epoch 2/20, Loss: 0.0757
Epoch 3/20, Loss: 0.0614
Epoch 4/20, Loss: 0.0641
Epoch 5/20, Loss: 0.0449
Epoch 6/20, Loss: 0.0384
Epoch 7/20, Loss: 0.0369
Epoch 8/20, Loss: 0.0327
Epoch 9/20, Loss: 0.0301
Epoch 10/20, Loss: 0.0282
Epoch 11/20, Loss: 0.0372
Epoch 12/20, Loss: 0.0237
Epoch 13/20, Loss: 0.0232
Epoch 14/20, Loss: 0.0203
Epoch 15/20, Loss: 0.0309
Epoch 16/20, Loss: 0.0251
Epoch 17/20, Loss: 0.0246
Epoch 18/20, Loss: 0.0189
Epoch 19/20, Loss: 0.0150
Epoch 20/20, Loss: 0.0127


In [67]:
def predict(model, question, threshold) :
  question = clean_text(question)
  question = text_to_numbers(question)
  question = torch.tensor(question, dtype = torch.long)
  question = question.reshape(1,-1)
  output = model(question)
  logits = torch.nn.functional.softmax(output, dim=1)
  val, predicted = torch.max(output, 1)

  if val > threshold :
    return list(vocab.keys())[predicted]
  else :
    return 'i dont know'


In [66]:
print(df['question'][0])
predict(model, df['question'][0], 0.5)

What is the capital of France?


'paris'