In [None]:
import pandas as pd
# Load the question/answer pairs; later cells read the 'question' and 'answer' columns.
df = pd.read_csv('100_Unique_QA_Dataset.csv')
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [None]:
#tokenize
def tokenize(text):
  """Lower-case `text`, drop '?' and apostrophes, and split on whitespace.

  Args:
    text: the raw question or answer string.

  Returns:
    A list of lower-cased tokens; an empty list for an empty/blank string.
  """
  # One translation pass deletes both punctuation characters at once.
  strip_table = str.maketrans('', '', "?'")
  normalized = text.lower().translate(strip_table)
  return normalized.split()

In [None]:
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [None]:
# Vocabulary: token -> integer index. Index 0 is reserved for '<UNK>',
# which text_to_indices uses as the fallback for tokens missing from this dict.
vocab = {'<UNK>' : 0}

In [None]:
def build_vocab(row):
  """Register every unseen token of one dataset row in the global `vocab`.

  Args:
    row: a mapping-like row exposing 'question' and 'answer' text fields.

  Side effects:
    Adds new tokens to `vocab`, each receiving the next free index
    (the vocabulary size at the time of insertion). Returns None.
  """
  # Tokenize both fields up front, question tokens first, then answer tokens.
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
  for word in row_tokens:
    # setdefault leaves already-known tokens untouched.
    vocab.setdefault(word, len(vocab))


In [None]:
# Run build_vocab on every row (axis=1) purely for its side effect of filling `vocab`;
# build_vocab has no return value, so the displayed result carries no information.
df.apply(build_vocab , axis = 1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [None]:
len(vocab)

324

In [None]:
#convert text to numerical values
def text_to_indices(text , vocab):
  """Translate `text` into a list of vocabulary indices.

  Args:
    text: raw string; it is tokenized with `tokenize`.
    vocab: dict mapping token -> index; must contain '<UNK>' whenever
      an out-of-vocabulary token can occur.

  Returns:
    One index per token, with unknown tokens mapped to vocab['<UNK>'].
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]


In [None]:
# Sanity check: 'asif' is not in the vocabulary, so it falls back to the '<UNK>' index (0).
text_to_indices('who is asif' , vocab)

[10, 2, 0]

In [None]:
import torch
from torch.utils.data import Dataset , DataLoader

In [None]:
class QADataset(Dataset):
  """Dataset yielding (question, answer) pairs encoded as index tensors."""

  def __init__(self, df , vocab):
    """Keep references to the source frame and the token -> index vocabulary."""
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Number of question/answer rows in the frame."""
    return len(self.df)

  def __getitem__(self , index):
    """Return the `index`-th row as a (question_tensor, answer_tensor) tuple."""
    # Positional lookup; the row is fetched once and both text fields read from it.
    record = self.df.iloc[index]
    question_ids = text_to_indices(record['question'] , self.vocab)
    answer_ids = text_to_indices(record['answer'] , self.vocab)
    return torch.tensor(question_ids) , torch.tensor(answer_ids)

In [None]:
dataset = QADataset(df , vocab)

In [None]:
# batch_size=1: QADataset returns tensors whose length depends on the text and no
# padding/collate_fn is configured, so presumably samples are fed one at a time on purpose.
# shuffle=True reorders the samples on every pass.
dataloader = DataLoader(dataset , batch_size = 1 , shuffle = True)

In [None]:
import torch.nn as nn

In [None]:
import torch.nn as nn
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of entries in the vocabulary; used both as the
      embedding table size and as the number of output logits.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self , vocab_size , embedding_dim = 50 , hidden_size = 64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size , embedding_dim = embedding_dim)
    self.rnn = nn.RNN(embedding_dim , hidden_size , batch_first = True)
    self.fc = nn.Linear(hidden_size , vocab_size)

  def forward(self , question):
    """Score every vocabulary entry as the answer to `question`.

    Args:
      question: integer tensor of token indices, shaped (batch, seq_len).

    Returns:
      Logits of shape (batch, vocab_size).
    """
    embedded_question = self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the final state is used.
    _ , final = self.rnn(embedded_question)
    # `final` is (num_layers, batch, hidden); taking the last layer drops the layer axis.
    output = self.fc(final[-1])
    return output

In [None]:
# Training hyperparameters.
learning_rate = 0.001  # step size handed to the Adam optimizer
epochs = 20  # number of full passes over the dataloader

In [None]:
model = SimpleRNN(len(vocab))

In [None]:
# Cross-entropy over the vocabulary: the model emits one logit per vocab entry
# and the target is the index of the answer token.
criterion = nn.CrossEntropyLoss()
# Adam updates every model parameter using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters() , lr = learning_rate)

In [None]:
#training loop
# Make sure the model is in training mode before optimizing.
model.train()
for epoch in range(epochs):
  # Sum of the per-sample losses seen during this epoch.
  total_loss = 0.0
  for question , answer in dataloader:
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    #forward pass: logits of shape (1, vocab_size) because batch_size is 1
    output = model(question)

    #loss: answer[0] drops the batch axis; with a single row of logits this
    #requires every answer to tokenize to exactly one token.
    loss = criterion(output , answer[0])

    #gradient
    loss.backward()

    #update
    optimizer.step()

    total_loss += loss.item()
  # Report the epoch loss with four decimals (the previous ' 4f' spec set a
  # space sign and width 4, not a precision).
  print(f"Epoch : {epoch+1} , Loss {total_loss:.4f}")

Epoch : 1 , Loss  526.359909
Epoch : 2 , Loss  456.876286
Epoch : 3 , Loss  376.872899
Epoch : 4 , Loss  317.290922
Epoch : 5 , Loss  265.194417
Epoch : 6 , Loss  215.936568
Epoch : 7 , Loss  172.231808
Epoch : 8 , Loss  134.257402
Epoch : 9 , Loss  102.901621
Epoch : 10 , Loss  78.993202
Epoch : 11 , Loss  60.309908
Epoch : 12 , Loss  47.180793
Epoch : 13 , Loss  37.700169
Epoch : 14 , Loss  30.502194
Epoch : 15 , Loss  25.190734
Epoch : 16 , Loss  21.096646
Epoch : 17 , Loss  17.841986
Epoch : 18 , Loss  15.221170
Epoch : 19 , Loss  13.107599
Epoch : 20 , Loss  11.364835


In [None]:
import torch.nn.functional as F
def predict(model , question , threshold=0.5):
  """Print the model's single-token answer to `question`.

  Args:
    model: a trained module mapping a (1, seq_len) index tensor to
      (1, vocab_size) logits.
    question: raw question text; encoded with `text_to_indices` and the
      global `vocab`.
    threshold: minimum softmax probability required to report an answer.

  Prints "I don't Know" (and nothing else) when the question has no tokens
  or when the best probability is below `threshold`; otherwise prints the
  predicted token. Returns None.
  """
  #convert questions to numbers
  numerical_question = text_to_indices(question , vocab)

  # An empty question would produce an empty tensor the embedding cannot consume.
  if not numerical_question:
    print ("I don't Know")
    return

  #tensor with a leading batch axis of size 1
  question_tensor = torch.tensor(numerical_question , dtype = torch.long).unsqueeze(0)

  #send to model; gradients are not needed for inference
  with torch.no_grad():
    output = model(question_tensor)

  #convert logits to probs
  probs = F.softmax(output , dim = 1)

  #find index of max prob
  value , index = torch.max(probs , dim = 1)

  # Low confidence: report it and stop instead of also printing a guess.
  if value.item() < threshold:
    print ("I don't Know")
    return

  # Map the winning index back to its token.
  index_to_token = {token_index: token for token , token_index in vocab.items()}
  predicted_word = index_to_token[index.item()]

  print(f"Predicted Answer: {predicted_word}")

In [None]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [None]:
# Rephrased version of a dataset question (the dataset wording is "Who directed the movie 'Titanic'?").
predict(model , "Who is director of the movie 'Titanic'?")

Predicted Answer: jamescameron
