In [2]:
import nltk
import re
import numpy as np
from tqdm import tqdm
import pickle
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


#### LOADING TOKENIZED DATASETS AND EMBEDDINGS

In [3]:
# Load the pre-tokenized WikiQA splits.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling, so
# only load .pkl files produced by a trusted preprocessing step.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object is loaded, instead of staying open for the lifetime of the kernel.
with open('wiki_qa_train.pkl', 'rb') as pkl_file:
    wiki_qa_train = pickle.load(pkl_file)
with open('wiki_qa_test.pkl', 'rb') as pkl_file:
    wiki_qa_test = pickle.load(pkl_file)
with open('wiki_qa_validation.pkl', 'rb') as pkl_file:
    wiki_qa_validation = pickle.load(pkl_file)

In [4]:
# Load the pre-tokenized SQuAD splits and the Numberbatch embedding table.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling, so
# only load .pkl files produced by a trusted preprocessing step.
# `with` blocks guarantee each file handle is closed right after loading.
with open('squad_train.pkl', 'rb') as pkl_file:
    squad_train = pickle.load(pkl_file)
with open('squad_valid.pkl', 'rb') as pkl_file:
    squad_valid = pickle.load(pkl_file)

with open('numberbatch_embeddings.pkl', 'rb') as pkl_file:
    numberbatch_embeddings = pickle.load(pkl_file)

#### WikiQA
CODE USED TO GENERATE EMBEDDINGS IS PROVIDED BELOW:

embeddings generated: https://drive.google.com/file/d/1fNQH8n4OfHGFBc6fyCI4UHuNYW7umqYX/view?usp=sharing

In [7]:
# Vocabulary seeded with the special tokens; their list positions (0-3)
# become their ids once the vocabulary is enumerated.
wiki_qa_vocab = [
    "<PAD>",
    "<UNK>",
    "<S>",
    "</S>",
]

def add_to_vocab(data, vocab):
    """Append every not-yet-seen token from ``data`` to ``vocab``, in place.

    For each datum, tokens are read first from ``datum["question"]`` and then
    from every entry of ``datum["answers"]``. New tokens are appended in
    first-seen order, so tokens already in ``vocab`` keep their positions.

    Args:
        data: iterable of dicts, each with a ``"question"`` entry (iterable of
            tokens) and an ``"answers"`` entry (iterable of token iterables).
        vocab: list of tokens; extended in place.

    Returns:
        None. The result is the mutation of ``vocab``.

    Note:
        Tokens must be hashable (they are used as dictionary keys when the
        vocabulary is turned into a token -> id mapping).
    """
    # Mirror the list in a set so each membership test is O(1) rather than a
    # linear scan of the whole vocabulary for every token in the dataset.
    seen = set(vocab)

    def _register(token):
        # Append a token only the first time it is encountered.
        if token not in seen:
            seen.add(token)
            vocab.append(token)

    for datum in data:
        for token in datum["question"]:
            _register(token)

        for answer in datum["answers"]:
            for token in answer:
                _register(token)

In [8]:
# Grow the shared vocabulary from every split, in train -> test -> validation
# order (the order determines the ids that new tokens receive).
for wiki_qa_split in (wiki_qa_train, wiki_qa_test, wiki_qa_validation):
    add_to_vocab(wiki_qa_split, wiki_qa_vocab)

In [9]:
# Token -> integer id lookup; a token's id is its position in wiki_qa_vocab.
wiki_qa_vocab2id = dict(zip(wiki_qa_vocab, range(len(wiki_qa_vocab))))

### Handling embeddings for OOV words

In [14]:
def get_missing_embedding(word, word_embeddings):
    """Approximate an embedding for an out-of-vocabulary word by prefix matching.

    Progressively shorter prefixes of ``word`` are tried, starting with the
    word minus its last character and ending with its first character alone.
    At the first prefix length for which at least one key of
    ``word_embeddings`` starts with that prefix, the element-wise mean of all
    matching keys' vectors is returned.

    Args:
        word: the out-of-vocabulary string.
        word_embeddings: mapping from known words (strings) to their vectors.

    Returns:
        The averaged vector (the result of ``np.mean(..., axis=0)``), or
        ``None`` when no prefix matches any key. Words shorter than two
        characters have no proper prefix to try and always yield ``None``.
    """
    for prefix_length in range(len(word) - 1, 0, -1):
        # Slice once per prefix length instead of once per known word.
        prefix = word[:prefix_length]
        matches = [known for known in word_embeddings if known.startswith(prefix)]

        if matches:
            return np.mean([word_embeddings[known] for known in matches], axis=0)

    return None

Getting embeddings for wiki_qa vocabulary, and converting each wiki_qa dataset word to its corresponding vocabulary ID

In [15]:
# Dimensionality of every vector produced by this cell.
EMBEDDING_DIM = 300

# Build one embedding tensor per vocabulary entry, in vocabulary order, so that
# wiki_qa_embeddings[i] is the vector for wiki_qa_vocab[i].
wiki_qa_embeddings = []
for word in tqdm(wiki_qa_vocab):
    if word in ("<PAD>", "<UNK>"):
        # Padding and unknown tokens share the all-zeros vector.
        vector = torch.full((EMBEDDING_DIM,), 0.0)
    elif word == "<S>":
        vector = torch.full((EMBEDDING_DIM,), 1.0)
    elif word == "</S>":
        vector = torch.full((EMBEDDING_DIM,), -1.0)
    elif word in numberbatch_embeddings:
        vector = torch.Tensor(numberbatch_embeddings[word])
    else:
        # The prefix-matching fallback scans the whole embedding table, so
        # call it exactly once per out-of-vocabulary word and reuse the result.
        fallback = get_missing_embedding(word, numberbatch_embeddings)
        if fallback is not None:
            vector = torch.Tensor(fallback)
        else:
            # No known word shares a prefix: fall back to the zero vector.
            vector = torch.full((EMBEDDING_DIM,), 0.0)
    wiki_qa_embeddings.append(vector)
       

100%|██████████| 22993/22993 [15:29<00:00, 24.74it/s] 


In [25]:
# Persist the list of per-word embedding tensors so the slow generation loop
# does not have to be repeated once the file exists.
torch.save(wiki_qa_embeddings, "wiki_qa_embeddings.pt")

#### LOADING SAVED EMBEDDINGS (IF ALREADY GENERATED)

In [26]:
# Reload the cached embeddings from wiki_qa_embeddings.pt.
# NOTE(review): torch.load deserializes via pickle, so only load files you trust.
wiki_qa_embeddings = torch.load("wiki_qa_embeddings.pt")

In [28]:
# Sanity check: id 3 is "</S>", whose embedding is expected to be all -1.0.
wiki_qa_embeddings[3]

tensor([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -

In [29]:
def tokenize_data(data, vocab2id=None):
    """Replace every token in ``data`` with its integer vocabulary id, in place.

    For each datum, ``datum["question"]`` becomes a list of ids and
    ``datum["answers"]`` becomes a list of id lists. Other keys are untouched.

    Args:
        data: indexable sequence of dicts with a ``"question"`` entry (iterable
            of tokens) and an ``"answers"`` entry (iterable of token iterables).
        vocab2id: optional token -> id mapping. Defaults to the module-level
            ``wiki_qa_vocab2id`` so existing one-argument calls are unchanged;
            pass another mapping to reuse this function for a different dataset.

    Returns:
        The same ``data`` object that was passed in, after mutation.

    Raises:
        KeyError: if a token is missing from the mapping. This also happens if
            the function is applied to data that has already been converted,
            because integer ids are not keys of a word-keyed mapping.
    """
    if vocab2id is None:
        # Resolved at call time so the default always reflects the current
        # notebook-level vocabulary.
        vocab2id = wiki_qa_vocab2id

    for i, datum in enumerate(data):
        data[i]["question"] = [vocab2id[token] for token in datum["question"]]
        data[i]["answers"] = [
            [vocab2id[token] for token in answer] for answer in datum["answers"]
        ]

    return data

# Convert every split's tokens to vocabulary ids (train, then test, then
# validation). tokenize_data rewrites the dicts in place and returns the same
# object, so each name is simply rebound to its converted split.
wiki_qa_train, wiki_qa_test, wiki_qa_validation = (
    tokenize_data(split)
    for split in (wiki_qa_train, wiki_qa_test, wiki_qa_validation)
)

### SQuAD

Similar code can be used to generate the vocabulary, indices and embeddings for SQuAD. This will be implemented when we start training the model for that task.

In [None]:
# SQuAD vocabulary, seeded with the same four special tokens in the same order.
squad_vocab = [
    "<PAD>",
    "<UNK>",
    "<S>",
    "</S>",
]