#### `Reccurent Neural Network for Question Answering`

```python
`built with confidence that when trained on a larger data , we will be able to derive more value out of it such as an AI customer support agent`

In [1]:
import pandas as pd

df = pd.read_csv('100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [2]:
#tokenize 
def tokenize(text):
    text = text.lower()
    text = text.replace('?','')
    text= text.replace("'",'')
    return text.split()

In [3]:
tokenize('Who wrote ''To Kill a Mockingbird''?')

['who', 'wrote', 'to', 'kill', 'a', 'mockingbird']

In [4]:
#vocab 
vocab = {'<UNK>':0}
vocab

{'<UNK>': 0}

In [5]:
def build_vocab(row):
    tokenized_qs = tokenize(row['question'])
    tokenized_ans = tokenize(row['answer'])
    merged_tokens = tokenized_qs + tokenized_ans

    for token in merged_tokens:
        if token not in vocab:
            vocab[token] = len(vocab)

    print(merged_tokens)

In [6]:
df.apply(build_vocab, axis=1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [7]:
len(vocab )

324

In [8]:
# convert words to numerical indicies 
def text_to_indicies(text,vocab ):
    indexed_text=[]
    for token in tokenize(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])
    return indexed_text

In [9]:
text_to_indicies('What is the capital of France? btao na',vocab)

[1, 2, 3, 4, 5, 6, 0, 0]

In [10]:
df.shape[0]

90

In [11]:
df.iloc[0][0] , df.iloc[0][1]

  df.iloc[0][0] , df.iloc[0][1]


('What is the capital of France?', 'Paris')

In [12]:
import torch
from torch.utils.data import Dataset , DataLoader

class QA_Dataset(Dataset):
    def __init__(self, df , vocab ):
        self.df = df 
        self.vocab = vocab 

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self , index):
        numerical_qs = text_to_indicies(self.df.iloc[index]['question'],self.vocab )
        numerical_ans = text_to_indicies(self.df.iloc[index]['answer'],self.vocab )
        
        return torch.tensor(numerical_qs) , torch.tensor(numerical_ans)


In [19]:
dataset  = QA_Dataset(df , vocab)
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [30]:
dataloader  = DataLoader(dataset=dataset , batch_size=1 , shuffle=True)

In [43]:
for qs, ans in dataloader:
    print(qs,ans[0])
    break
print(len(dataloader))
print(len(dataset))

tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([162])
90
90


In [37]:
import torch.nn as nn 

class SimpleRNN(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=50
            )
        self.rnn = nn.RNN(
            input_size=50,
            hidden_size=256,
            batch_first=True
        )
        self.fc = nn.Linear(
            in_features=256,
            out_features= vocab_size
        )

    def forward(self, qs):
        embedded_qs = self.embedding(qs)
        hidden , final = self.rnn(embedded_qs)
        out = self.fc(final.squeeze(0))
        return out


In [38]:
learning_rate = 0.001
epochs  = 20 

In [39]:
model = SimpleRNN(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [44]:
#training loop 

for epoch in range(epochs):
    total_loss = 0 
    for qs,ans in dataloader:

        optimizer.zero_grad()

        output = model(qs)

        loss = criterion(output , ans[0] )

        loss.backward()

        optimizer.step()

        total_loss = total_loss + loss.item()

    print(f'Epoch: {epoch+1}, Loss: {total_loss:4f}')    

Epoch: 1, Loss: 525.700264
Epoch: 2, Loss: 342.743430
Epoch: 3, Loss: 184.206853
Epoch: 4, Loss: 86.716643
Epoch: 5, Loss: 43.821241
Epoch: 6, Loss: 25.988568
Epoch: 7, Loss: 14.557479
Epoch: 8, Loss: 8.346767
Epoch: 9, Loss: 5.641837
Epoch: 10, Loss: 3.993913
Epoch: 11, Loss: 3.190689
Epoch: 12, Loss: 2.587693
Epoch: 13, Loss: 2.157443
Epoch: 14, Loss: 1.869602
Epoch: 15, Loss: 1.594911
Epoch: 16, Loss: 1.390881
Epoch: 17, Loss: 1.225924
Epoch: 18, Loss: 1.088777
Epoch: 19, Loss: 0.964267
Epoch: 20, Loss: 0.864273


In [137]:
def predict(model , question , threshold=0.5):
    indexed_qs = text_to_indicies(question,vocab=vocab)

    #get tensor shape ready for model
    indexed_qs_tensor = torch.tensor(indexed_qs).unsqueeze(0)
   

   #sending it here to the model 
    logits = model(indexed_qs_tensor)
    
    #calculated probs 
    probs = torch.nn.functional.softmax(logits,dim=1)

    #extracting max prob 
    value , index = torch.max(probs, dim = 1)
    # print(f'Word probability: {value.item():1f}----- index: {index.item()}')
    # print(index[0])
    if value < threshold:
        print('I am not sure what to say. Try being more specific?')
    else:
        print(list(vocab.keys())[index])

In [138]:
predict(model,'What is the largest planet in solar system')

jupiter
