In [None]:
import nltk
import re
import numpy as np
from tqdm import tqdm
import pickle
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader

#### LOADING TOKENIZED DATASETS AND EMBEDDINGS

In [None]:
# Load the tokenized WikiQA splits. The context managers close each file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that you produced yourself or obtained from a trusted source.
with open('wiki_qa_train.pkl', 'rb') as f:
    wiki_qa_train = pickle.load(f)
with open('wiki_qa_test.pkl', 'rb') as f:
    wiki_qa_test = pickle.load(f)
with open('wiki_qa_validation.pkl', 'rb') as f:
    wiki_qa_validation = pickle.load(f)

In [None]:
# Load the tokenized SQuAD splits; `with` guarantees the handles are closed.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that you produced yourself or obtained from a trusted source.
with open('squad_train.pkl', 'rb') as f:
    squad_train = pickle.load(f)
with open('squad_valid.pkl', 'rb') as f:
    squad_valid = pickle.load(f)

# Pre-trained embeddings, looked up by word later in the notebook
# (`word in numberbatch_embeddings`, `numberbatch_embeddings[word]`).
with open('numberbatch_embeddings.pkl', 'rb') as f:
    numberbatch_embeddings = pickle.load(f)

#### WikiQA
The code used to generate the embeddings is provided below.

Pre-generated embeddings: https://drive.google.com/file/d/1fNQH8n4OfHGFBc6fyCI4UHuNYW7umqYX/view?usp=sharing

In [None]:
# The four special tokens occupy ids 0-3; corpus tokens are appended after them.
wiki_qa_vocab = ["<PAD>", "<UNK>", "<S>", "</S>"]

def add_to_vocab(data, vocab):
    """Append every not-yet-seen question/answer token in `data` to `vocab`.

    Args:
        data: iterable of dicts, each with a "question" entry (iterable of
            tokens) and an "answers" entry (iterable of token iterables).
        vocab: list of tokens. It is extended in place; new tokens are
            appended in the order they are first encountered.

    Returns:
        None. The result is the mutation of `vocab`.

    Note:
        Tokens must be hashable (they are later used as dict keys when the
        token-to-id mapping is built).
    """
    # Mirror the list in a set so each membership test is O(1) rather than a
    # scan of the whole (growing) list. The list alone defines token order.
    seen = set(vocab)

    def _add(token):
        if token not in seen:
            seen.add(token)
            vocab.append(token)

    for datum in tqdm(data):
        for token in datum["question"]:
            _add(token)

        for answer in datum["answers"]:
            for token in answer:
                _add(token)

    return

In [None]:
# Extend the shared vocabulary with each WikiQA split, in the order
# train -> test -> validation (the order determines the token ids).
for wiki_qa_split in (wiki_qa_train, wiki_qa_test, wiki_qa_validation):
    add_to_vocab(wiki_qa_split, wiki_qa_vocab)

In [None]:
# Token -> id lookup, where the id is the token's position in wiki_qa_vocab.
wiki_qa_vocab2id = dict(zip(wiki_qa_vocab, range(len(wiki_qa_vocab))))

### Handling embeddings for OOV words

In [None]:
def get_missing_embedding(word, word_embeddings):
    """Approximate an embedding for `word` from known words sharing a prefix.

    Proper prefixes of `word` are tried from longest to shortest (the full
    word and the empty prefix are never tried). For the first prefix that at
    least one key of `word_embeddings` starts with, the element-wise mean of
    the embeddings of all such keys is returned.

    Args:
        word: the string that has no embedding of its own.
        word_embeddings: mapping from known word to its embedding vector.

    Returns:
        The averaged embedding (the result of `np.mean(..., axis=0)`), or
        None when no tried prefix matches any known word. Words of length
        0 or 1 have no prefix to try and always yield None.
    """
    for prefix_len in range(len(word) - 1, 0, -1):
        prefix = word[:prefix_len]
        # Every attempt is a full pass over the known words.
        neighbours = [known for known in word_embeddings.keys() if known.startswith(prefix)]
        if neighbours:
            return np.mean([word_embeddings[known] for known in neighbours], axis=0)

    return None

Build an embedding for every word in the WikiQA vocabulary, then convert each word in the WikiQA datasets to its corresponding vocabulary ID.

In [None]:
# Length of every embedding vector built below.
# NOTE(review): assumed to equal the vector length stored in
# numberbatch_embeddings -- confirm against the pickle.
EMBEDDING_DIM = 300

# One tensor per vocabulary entry, in vocabulary order, so that
# wiki_qa_embeddings[i] is the vector for wiki_qa_vocab[i].
wiki_qa_embeddings = []
for word in tqdm(wiki_qa_vocab):
    if word == "<S>":
        vector = torch.ones(EMBEDDING_DIM)
    elif word == "</S>":
        vector = torch.full((EMBEDDING_DIM,), -1.0)
    elif word in ("<PAD>", "<UNK>"):
        vector = torch.zeros(EMBEDDING_DIM)
    elif word in numberbatch_embeddings:
        vector = torch.Tensor(numberbatch_embeddings[word])
    else:
        # The prefix search scans the whole embedding dict, so run it once
        # and reuse the result rather than calling it again after the check.
        approx = get_missing_embedding(word, numberbatch_embeddings)
        # No prefix match at all: fall back to the zero vector.
        vector = torch.zeros(EMBEDDING_DIM) if approx is None else torch.Tensor(approx)
    wiki_qa_embeddings.append(vector)

# The original loop kept a manual counter that ended at the number of
# processed words; the final value is preserved in case a later cell reads it.
index = len(wiki_qa_embeddings)
       

In [None]:
# Persist the list of per-token tensors so the generation loop does not have
# to be re-run in a later session.
wiki_qa_embeddings_path = "wiki_qa_embeddings.pt"
torch.save(wiki_qa_embeddings, wiki_qa_embeddings_path)

#### LOADING SAVED EMBEDDINGS (IF ALREADY GENERATED)

In [None]:
# Restore the previously saved embeddings from disk.
wiki_qa_embeddings_path = "wiki_qa_embeddings.pt"
wiki_qa_embeddings = torch.load(wiki_qa_embeddings_path)

In [None]:
# Spot check: display the stored vector at index 3. With the vocabulary
# initialised as ["<PAD>", "<UNK>", "<S>", "</S>"], index 3 is the "</S>" slot
# (assuming the loaded file was built from that same vocabulary order).
wiki_qa_embeddings[3]

In [None]:
def tokenize_data(data):
    """Replace every token in `data` with its id from `wiki_qa_vocab2id`.

    The conversion happens in place: each datum's "question" becomes a list
    of ids and its "answers" becomes a list of id lists.

    Args:
        data: sequence of dicts with a "question" entry (iterable of tokens)
            and an "answers" entry (iterable of token iterables).

    Returns:
        The same `data` object, after mutation.

    Notes:
        * Tokens missing from `wiki_qa_vocab2id` map to the "<UNK>" id
          instead of raising KeyError.
        * Integer entries are treated as ids from a previous call and left
          untouched, so re-running the notebook cell is harmless.
    """
    unk_id = wiki_qa_vocab2id["<UNK>"]

    def _to_id(token):
        # Already converted by an earlier call: keep as is. Without this
        # guard a second run would fail to look the ids up as words.
        if isinstance(token, int):
            return token
        return wiki_qa_vocab2id.get(token, unk_id)

    for datum in data:
        datum["question"] = [_to_id(token) for token in datum["question"]]
        datum["answers"] = [[_to_id(token) for token in answer] for answer in datum["answers"]]

    return data

# Convert all three WikiQA splits from tokens to vocabulary ids
# (train, then test, then validation).
wiki_qa_train, wiki_qa_test, wiki_qa_validation = map(
    tokenize_data, (wiki_qa_train, wiki_qa_test, wiki_qa_validation)
)

### SQuAD

Similar code can be used to generate the vocabulary, indices and embeddings for SQuAD.

However, due to the size of the SQuAD dataset, this is a very time-consuming process.

In [None]:
def add_to_squad_vocab(data, vocab):
    """Append every not-yet-seen question/context token in `data` to `vocab`.

    Args:
        data: iterable of dicts, each with a "question" entry (iterable of
            tokens) and a "context" entry that is iterated two levels deep.
        vocab: list of tokens. It is extended in place; new tokens are
            appended in the order they are first encountered.

    Returns:
        None. The result is the mutation of `vocab`.

    Note:
        Tokens must be hashable.
        NOTE(review): the nested loop treats each element of "context" as an
        iterable of tokens. If a context is actually a flat list of token
        strings, the inner loop adds single characters -- confirm the schema
        of the pickled SQuAD data.
    """
    # Mirror the list in a set so each membership test is O(1) rather than a
    # scan of the whole (growing) list. The list alone defines token order.
    seen = set(vocab)

    def _add(token):
        if token not in seen:
            seen.add(token)
            vocab.append(token)

    for datum in tqdm(data):
        for token in datum["question"]:
            _add(token)

        for passage in datum["context"]:
            for token in passage:
                _add(token)

    return

In [None]:
# Start from the four special tokens (ids 0-3), then extend the vocabulary
# with the training split followed by the validation split.
squad_vocab = ["<PAD>", "<UNK>", "<S>", "</S>"]

for squad_split in (squad_train, squad_valid):
    add_to_squad_vocab(squad_split, squad_vocab)

In [None]:
# Cache the SQuAD vocabulary on disk so it does not have to be rebuilt.
with open("squad_vocab.pkl", mode="wb") as vocab_file:
    pickle.Pickler(vocab_file).dump(squad_vocab)

Link to the pre-generated SQuAD vocabulary: https://drive.google.com/file/d/1-FyXeZdkqrx5kxA1j5fY5xRdjiqtJogC/view?usp=sharing

In [None]:
# Reload the cached SQuAD vocabulary; `with` closes the file handle promptly.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that you produced yourself or obtained from a trusted source.
with open('squad_vocab.pkl', 'rb') as f:
    squad_vocab = pickle.load(f)

In [None]:
# Token -> id lookup, where the id is the token's position in squad_vocab.
squad_vocab2id = dict(zip(squad_vocab, range(len(squad_vocab))))

In [None]:
# Display the SQuAD vocabulary size (the four special tokens are included).
len(squad_vocab)

In [None]:
# Length of every embedding vector built below.
# NOTE(review): assumed to equal the vector length stored in
# numberbatch_embeddings -- confirm against the pickle.
EMBEDDING_DIM = 300

# One tensor per vocabulary entry, in vocabulary order, so that
# squad_embeddings[i] is the vector for squad_vocab[i].
squad_embeddings = []
for word in tqdm(squad_vocab):
    if word == "<S>":
        vector = torch.ones(EMBEDDING_DIM)
    elif word == "</S>":
        vector = torch.full((EMBEDDING_DIM,), -1.0)
    elif word in ("<PAD>", "<UNK>"):
        vector = torch.zeros(EMBEDDING_DIM)
    elif word in numberbatch_embeddings:
        vector = torch.Tensor(numberbatch_embeddings[word])
    else:
        # The prefix search scans the whole embedding dict, so run it once
        # and reuse the result rather than calling it again after the check.
        approx = get_missing_embedding(word, numberbatch_embeddings)
        # No prefix match at all: fall back to the zero vector.
        vector = torch.zeros(EMBEDDING_DIM) if approx is None else torch.Tensor(approx)
    squad_embeddings.append(vector)

# The original loop kept a manual counter that ended at the number of
# processed words; the final value is preserved in case a later cell reads it.
index = len(squad_embeddings)

In [None]:
# Cache the SQuAD embedding list on disk so the generation loop can be skipped.
with open("squad_embeddings.pkl", mode="wb") as embeddings_file:
    pickle.Pickler(embeddings_file).dump(squad_embeddings)

### Loading saved embeddings (if already generated)

In [None]:
# Reload the cached SQuAD embeddings; `with` closes the file handle promptly.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that you produced yourself or obtained from a trusted source.
with open('squad_embeddings.pkl', 'rb') as f:
    squad_embeddings = pickle.load(f)