In [345]:
import torch
from torch import nn
from torch.nn import functional as F

import numpy as np
import json

import os
from os.path import exists, join

import urllib.request
import zipfile

import spacy

import copy

import ujson as uj

from collections import defaultdict

# Preprocessing the data

In [319]:
# --- Local storage ------------------------------------------------------
dataset_dir = '../Data/Datasets'
emb_dir = '../Data/Embeddings'

# --- SQuAD v1.1 files and the location they are fetched from ------------
train_filename = 'train-v1.1.json'
dev_filename = 'dev-v1.1.json'
# Both splits are served from the same base URL.
train_url_base = dev_url_base = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

# --- Embedding files and the locations they are fetched from ------------
word_emb_zip = 'glove.840B.300d.zip'
word_emb_filename = 'glove.840B.300d.txt'
word_emb_url_base = 'http://nlp.stanford.edu/data/'
char_emb_filename = 'glove.840B.300d-char.txt'
char_emb_url_base = 'https://raw.githubusercontent.com/minimaxir/char-embeddings/master/'

# Special tokens; they are inserted first into both vocabularies.
spec_toks = ['<UNK>', '<PAD>']

# --- Size limits --------------------------------------------------------
CONTEXT_MAX_LEN = 300   # paragraphs with more tokens are skipped during tokenization
QUESTION_MAX_LEN = 60   # not referenced by the code visible in this section
ASCII_LEN = 128         # number of rows of the character embedding table

In [321]:
# Model hyper-parameters.
N_HIGHWAY_LAYERS = 2   # layer count handed to the highway network by WordCharEmbedding
CHAR_EMB_SIZE = 200    # width of the character embedding table built in QANet
WORD_EMB_SIZE = 300    # width of the zero vectors created for the special tokens

In [61]:
class Embeddings:
    '''
    Holder for the embedding matrices and the vocabularies that index them.

    word_emb: word embeddings (one-element placeholder array until filled)
    char_emb: char embeddings (one-element placeholder array until filled)
    wd2id: word to index
    id2wd: index to word
    ch2id: char to index
    id2ch: index to char
    '''
    def __init__(self):
        # Look-up tables start out empty; each attribute gets its own dict.
        self.wd2id, self.id2wd = {}, {}
        self.ch2id, self.id2ch = {}, {}
        # Uninitialised placeholders, replaced once the embedding files are parsed.
        self.word_emb = np.empty(1)
        self.char_emb = np.empty(1)

In [109]:
class Dataset:
    '''
    Plain container that the SQuAD tokenizer fills in.

    length: the dataset length (number of entries in packs)
    contexts: list of passages, each a list of word indices
    questions: list of questions, each a list of word indices
    questions_id: the 'id' field of every stored question
    answers: (first token, last token) index pairs inside a passage
    packs: (context index, question index, answer index) triples
    '''
    def __init__(self):
        # Every list attribute starts empty and independent of the others.
        for field in ('contexts', 'questions', 'questions_id', 'answers', 'packs'):
            setattr(self, field, [])
        self.length = 0

In [96]:
def download(url, path, filename):
    '''
    Downloads the file from url/filename and saves it in path/filename.

    url: base URL, with or without a trailing slash
    path: existing local directory that receives the file
    filename: name of the remote file; also used as the local file name

    Nothing is fetched when path/filename already exists. Any error raised by
    the transfer is propagated, and no partially written file is left behind.
    '''
    target = join(path, filename)
    if exists(target):
        print("File {} already exists.".format(filename))
        return
    print("Downloading file {}...".format(filename))

    # URLs always use '/', so the address is built by hand rather than with
    # os.path.join, which follows the conventions of the local file system.
    source = url.rstrip('/') + '/' + filename

    # Fetch into a side file and rename it only after the transfer finished;
    # otherwise an interrupted download would look like a complete file to
    # the exists() check above on the next run.
    partial = target + '.part'
    try:
        urllib.request.urlretrieve(source, filename=partial)
        os.replace(partial, target)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the truncated file.
        if exists(partial):
            os.remove(partial)

In [64]:
def download_all_data():
    '''
    Downloads embeddings and datasets.

    Creates dataset_dir and emb_dir when missing, fetches the train/dev files
    and the char embeddings, and - unless the word embedding text file is
    already present - fetches the word embedding archive, unpacks it into
    emb_dir and deletes the archive.
    '''
    for directory in (dataset_dir, emb_dir):
        # exist_ok removes the gap between "check" and "create".
        os.makedirs(directory, exist_ok=True)

    download(train_url_base, dataset_dir, train_filename)
    download(dev_url_base, dataset_dir, dev_filename)
    download(char_emb_url_base, emb_dir, char_emb_filename)

    if not exists(join(emb_dir, word_emb_filename)):
        download(word_emb_url_base, emb_dir, word_emb_zip)
        print("Unzipping file {}".format(word_emb_zip))
        zip_path = join(emb_dir, word_emb_zip)
        # extractall() takes the DIRECTORY to unpack into. Passing the path of
        # the text file would create a directory with that name, and opening
        # it as a file afterwards would fail.
        # NOTE(review): assumes the archive stores its member as
        # word_emb_filename at the top level - confirm.
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(path=emb_dir)
        os.remove(zip_path)

In [80]:
def parse_glove_embeddings(embed: Embeddings):
    '''
    Parses embeddings from files and converts to Embeddings structure.

    embed: Embeddings instance that is filled in place. wd2id/ch2id are
           replaced, id2wd/id2ch are updated, and word_emb/char_emb become
           arrays whose row i belongs to the token with index i.

    The tokens in spec_toks get the first indices of both vocabularies and
    all-zero vectors. Returns nothing.
    '''
    # Auto-numbering vocabularies: reading an unseen key stores it with the
    # next free index.
    wtoi = defaultdict(lambda: len(wtoi))
    ctoi = defaultdict(lambda: len(ctoi))
    for tok in spec_toks:
        ctoi[tok]
        wtoi[tok]

    w_emb = [np.zeros(WORD_EMB_SIZE) for _ in spec_toks]
    # NOTE(review): the char vectors of the special tokens also use
    # WORD_EMB_SIZE, which only stacks into a 2-D array if the char file has
    # that many values per line - confirm.
    c_emb = [np.zeros(WORD_EMB_SIZE) for _ in spec_toks]

    print("Loading word embeddings...")
    with open(join(emb_dir, word_emb_filename), encoding='utf-8') as w_dump:
        for line in w_dump:
            fields = line.rstrip('\n').split(' ')
            if len(fields) < 2:
                continue  # blank or malformed line
            # The vector is taken from the END of the line so that a token
            # which itself contains spaces does not end up in float().
            word = ' '.join(fields[:-WORD_EMB_SIZE])
            if word in wtoi:
                continue  # keep the first vector of a repeated token
            wtoi[word]
            w_emb.append(np.array([float(v) for v in fields[-WORD_EMB_SIZE:]]))

    print("Loading char embeddings...")
    with open(join(emb_dir, char_emb_filename), encoding='utf-8') as c_dump:
        for line in c_dump:
            fields = line.rstrip('\n').split(' ')
            if len(fields) < 2:
                continue  # blank or malformed line
            char = fields[0]
            if char in ctoi:
                # Same guard as for words: a repeated key must not add a
                # vector, or rows and indices would drift apart.
                continue
            ctoi[char]
            c_emb.append(np.array([float(v) for v in fields[1:]]))

    # Reverse maps: chars go to id2ch, words to id2wd.
    embed.id2ch.update({idx: ch for ch, idx in ctoi.items()})
    embed.id2wd.update({idx: wd for wd, idx in wtoi.items()})
    # Plain dicts, so later look-ups of unknown keys no longer grow the maps.
    embed.wd2id = dict(wtoi)
    embed.ch2id = dict(ctoi)
    embed.word_emb = np.array(w_emb)
    embed.char_emb = np.array(c_emb)
    print("Embeddings are parsed.")

In [213]:
def tockenize_squad_dataset(json_data, dataset: Dataset, emb: Embeddings):
    '''
    Tokenizes SQuAD-style data and appends the result to dataset.

    json_data: iterable of articles; each has 'paragraphs', each paragraph a
               'context' string and 'qas', each qa a 'question', an 'id' and
               'answers' with 'text' and 'answer_start'
    dataset: Dataset that is extended in place
    emb: Embeddings whose wd2id maps token text to a word index; tokens that
         are missing from it are mapped to the index of '<UNK>'

    Paragraphs with more than CONTEXT_MAX_LEN tokens are skipped together with
    their questions. Every stored answer is the (first, last) index pair of
    the context tokens that overlap the answer's character span; answers that
    overlap no token are skipped. Returns nothing.
    '''
    tokenize = spacy.blank('en')
    unk_id = emb.wd2id['<UNK>']

    def normalize_quotes(text):
        # Both replacements swap two characters for two characters, so the
        # character offsets stored with the answers stay valid.
        return text.replace("''", '" ').replace("``", '" ')

    def tokens_pos(text, text_tok):
        # (start, end) character span of every token, located left to right.
        pos = 0
        new_ind = []
        for token in text_tok:
            pos = text.find(token, pos)
            new_ind.append((pos, pos + len(token)))
            pos += len(token)
        return new_ind

    # Continue numbering after whatever the dataset already contains, so the
    # indices stored in packs stay correct when the function is called more
    # than once on the same Dataset.
    cont_id = len(dataset.contexts)
    ques_id = len(dataset.questions)
    answ_id = len(dataset.answers)

    for item in json_data:
        for para in item['paragraphs']:
            context = normalize_quotes(para['context'])
            contx_toks = [tk.text for tk in tokenize(context)]

            # Filter before doing any vocabulary look-ups.
            if len(contx_toks) > CONTEXT_MAX_LEN:
                continue

            conxt_tok2id = [emb.wd2id.get(tk, unk_id) for tk in contx_toks]
            cont_tok_posit = tokens_pos(context, contx_toks)

            for qas in para['qas']:
                ques = normalize_quotes(qas['question'])
                ques_tok2id = [emb.wd2id.get(tk.text, unk_id) for tk in tokenize(ques)]

                for answ in qas['answers']:
                    answ_start = answ['answer_start']
                    answ_end = answ_start + len(answ['text'])
                    # taking context tokens which are inside of answer borders
                    answ_fit_toks = [
                        idx for idx, (tok_start, tok_end) in enumerate(cont_tok_posit)
                        if answ_start < tok_end and answ_end > tok_start
                    ]
                    if not answ_fit_toks:
                        # Empty answer text or offsets outside the context:
                        # there is no token span to store.
                        continue

                    dataset.answers.append((answ_fit_toks[0], answ_fit_toks[-1]))
                    dataset.packs.append((cont_id, ques_id, answ_id))
                    answ_id += 1

                dataset.questions.append(ques_tok2id)
                dataset.questions_id.append(qas['id'])
                ques_id += 1

            dataset.contexts.append(conxt_tok2id)
            cont_id += 1

    # Set once, after everything has been collected.
    dataset.length = len(dataset.packs)

In [106]:
# def drop_useless_embs(passages_tok, embed, new_wemb, new_wd2id):
#     new_texts_tok = []
#     for text in passages_tok:
#         text_tok = []
#         for idx, tok in enumerate(text):
#             word = embed.id2wd[idx]
            
#             if word not in new_wd2id:
#                 new_wd2id[word]
#                 new_wemb.append(embed.word_emb[tok])
            
#             text_tok.append(new_wd2id[word])
#         new_texts_tok.append(text_tok)
#     return new_texts_tok

In [111]:
# def collect_data(dataset: Dataset, embed: Embeddings):
#     '''
#     Collects parsed dataset and loaded embeddings into Data class.
#     Also embeddings are cleared from unused words.
#     ------
#     returns Data class.
#     '''
    
#     wtoi = defaultdict(lambda : len(wtoi))
#     [wtoi[tk] for tk in spec_toks]
#     w_emb = [np.zeros(WORD_EMB_SIZE) for _ in spec_toks]
#     dataset.contexts = drop_useless_embs(dataset.contexts, embed.word_emb, w_emb, wtoi)
#     dataset.questions = drop_useless_embs(dataset.questions, embed.word_emb, w_emb, wtoi)
#     itow = {}
#     for word in wtoi:
#         itow[wtoi[word]] = word

In [175]:
# Create an empty Embeddings container and fill it from the embedding files
# located in emb_dir.
# NOTE(review): no visible cell calls download_all_data() before this point -
# confirm the embedding files exist before running on a fresh kernel.
embed = Embeddings()
parse_glove_embeddings(embed)

Loading word embeddings...
Loading char embeddings...
Embeddings are parsed.


In [214]:
dataset = Dataset()

# Load the training split. The context manager closes the file even when
# parsing raises, and the encoding is stated explicitly instead of relying on
# the platform default.
with open(os.path.join(dataset_dir, train_filename), 'r', encoding='utf-8') as f:
    train = uj.load(f)['data']

# Fills `dataset` in place using the vocabulary held by `embed`.
tockenize_squad_dataset(train, dataset, embed)

In [225]:
# packs = trunk(dataset.packs, 32)
# Cw, Cc, Qw, Qc, a = to_batch(packs[0], embed, dataset)

# Architecture

In [370]:
class HighwayNet(nn.Module):
    '''
    Stack of highway layers.

    Each layer computes a transformed input H and a gate T from the same
    input x and returns H * T + x * (1 - T), so the gate decides per feature
    how much of the transformation replaces the input.

    n_lay: number of stacked layers
    dim: feature size; every layer maps dim -> dim
    bias: whether the linear maps carry a bias term
    '''
    def __init__(self, n_lay, dim, bias=True):
        super().__init__()
        self.n_layers = n_lay
        self.dim = dim
        self.with_bias = bias
        # nn.ModuleList registers the layers as sub-modules. Layers kept in a
        # plain Python list are invisible to parameters(), state_dict() and
        # .to(device), so they would never be trained, saved or moved.
        self.W_H_lay = nn.ModuleList(
            [nn.Linear(dim, dim, bias=self.with_bias) for _ in range(self.n_layers)])
        self.W_T_lay = nn.ModuleList(
            [nn.Linear(dim, dim, bias=self.with_bias) for _ in range(self.n_layers)])
        self.non_lin = nn.ReLU()

    def forward(self, x):
        '''
        x: tensor whose last dimension has size dim
        returns: tensor of the same shape as x
        '''
        for transform, gate in zip(self.W_H_lay, self.W_T_lay):
            H = self.non_lin(transform(x))
            # The gate must lie in (0, 1) for the mix below to be an
            # interpolation between H and x, hence sigmoid rather than ReLU.
            T = torch.sigmoid(gate(x))
            x = H * T + x * (1 - T)
        return x

In [466]:
class DepthwiseSeparableConv1d(nn.Module):
    '''
    Depthwise separable 1-D convolution: a per-channel convolution followed
    by a kernel-size-1 convolution that mixes the channels.

    in_channels: channels of the input
    out_channels: channels produced by the pointwise convolution
    kernel_size: kernel size of the depthwise convolution
    padding: padding of the depthwise convolution
    bias: whether both convolutions carry a bias term
    '''
    def __init__(self, in_channels, out_channels, kernel_size, padding=0, bias=False):
        super().__init__()
        self.in_ch = in_channels
        self.out_ch = out_channels
        self.kernel_size = kernel_size
        self.padding = padding  # store the value actually used, not a constant 0
        self.bias = bias

        # groups=in_channels gives every input channel its own filter; without
        # it the layer is an ordinary full convolution.
        self.depthwise = nn.Conv1d(in_channels, in_channels, kernel_size,
                                   padding=padding, groups=in_channels, bias=bias)
        self.pointwise = nn.Conv1d(in_channels, out_channels, 1, padding=0, bias=bias)

    def forward(self, input):
        '''
        input: tensor accepted by nn.Conv1d with in_channels channels
        returns: output of the pointwise convolution (out_channels channels)
        '''
        return self.pointwise(self.depthwise(input))
    
    
class DepthwiseSeparableConv2d(nn.Module):
    '''
    Depthwise separable 2-D convolution: a per-channel convolution followed
    by a 1x1 convolution that mixes the channels.

    in_channels: channels of the input
    out_channels: channels produced by the pointwise convolution
    kernel_size: kernel size of the depthwise convolution
    padding: padding of the depthwise convolution
    bias: whether both convolutions carry a bias term
    '''
    def __init__(self, in_channels, out_channels, kernel_size, padding=0, bias=False):
        super().__init__()
        self.in_ch = in_channels
        self.out_ch = out_channels
        self.kernel_size = kernel_size
        self.padding = padding  # store the value actually used, not a constant 0
        self.bias = bias

        # groups=in_channels gives every input channel its own filter; without
        # it the layer is an ordinary full convolution.
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size,
                                   padding=padding, groups=in_channels, bias=bias)
        self.pointwise = nn.Conv2d(in_channels, out_channels, (1, 1), padding=0, bias=bias)

    def forward(self, input):
        '''
        input: tensor accepted by nn.Conv2d with in_channels channels
        returns: output of the pointwise convolution (out_channels channels)
        '''
        return self.pointwise(self.depthwise(input))

In [408]:
class WordCharEmbedding(nn.Module):
    '''
    Combines word and char embeddings (work in progress: only the highway
    network is created so far, forward computes nothing).
    '''
    def __init__(self):
        super().__init__()
        # The class defined in this notebook is HighwayNet, and it requires
        # the feature size as its second argument.
        # NOTE(review): assumes the highway network runs on the concatenation
        # of a word vector and a char-level vector - confirm the size.
        self.highway = HighwayNet(N_HIGHWAY_LAYERS, WORD_EMB_SIZE + CHAR_EMB_SIZE)


    def forward(self):
        # Stub: not implemented yet.
        pass

In [298]:
class EmbeddingEncoder(nn.Module):
    '''
    Placeholder for the embedding encoder block: it owns no layers and its
    forward pass computes nothing yet.
    '''
    def __init__(self):
        super().__init__()

    def forward(self):
        # Stub: nothing is computed.
        return None

In [299]:
class ContextQueryAttention(nn.Module):
    '''
    Placeholder for the context-query attention layer: it owns no layers and
    its forward pass computes nothing yet.
    '''
    def __init__(self):
        super().__init__()

    def forward(self):
        # Stub: nothing is computed.
        return None

In [300]:
class SelfAttention(nn.Module):
    '''
    Placeholder for the self-attention layer: it owns no layers and its
    forward pass computes nothing yet.
    '''
    def __init__(self):
        super().__init__()

    def forward(self):
        # Stub: nothing is computed.
        return None

In [320]:
class QANet(nn.Module):
    '''
    Top-level model that chains the embedding look-ups, the word/char
    embedding layer, the embedding encoder and the context-query attention.

    embeddings: Embeddings whose word_emb matrix initialises the word
                embedding table
    '''
    def __init__(self, embeddings: Embeddings):
        super().__init__()
        self.char_emb = nn.Embedding(ASCII_LEN, CHAR_EMB_SIZE)
        # from_pretrained expects a 2-D float tensor, while Embeddings keeps
        # its matrix as a NumPy array; as_tensor converts it and is harmless
        # if a tensor is passed already.
        self.word_emb = nn.Embedding.from_pretrained(
            torch.as_tensor(embeddings.word_emb, dtype=torch.float))
        self.emb = WordCharEmbedding()
        self.emb_enc = EmbeddingEncoder()
        self.con_quer_atten = ContextQueryAttention()
    
    def forward(self, cont_word_idx, cont_char_idx, ques_word_idx, ques_char_idx):
        '''
        cont_word_idx: LongTensor (batch_sz, max_context_len)
        cont_char_idx: LongTensor (batch_sz, max_context_len, max_word_len)
        ques_word_idx: LongTensor (batch_sz, max_question_len)
        ques_char_idx: LongTensor (batch_sz, max_question_len, max_word_len)
        '''
        # Getting word and char embeddings for context and question
        cont_word_emb = self.word_emb(cont_word_idx)
        cont_char_emb = self.char_emb(cont_char_idx)
        ques_word_emb = self.word_emb(ques_word_idx)
        ques_char_emb = self.char_emb(ques_char_idx)
        
        # Getting concatenated word embedding with char-level embedding
        cont_emb_out = self.emb(cont_word_emb, cont_char_emb)
        ques_emb_out = self.emb(ques_word_emb, ques_char_emb)
        
        # Applying Encoding block layer
        cont_enc_emb = self.emb_enc(cont_emb_out)
        ques_enc_emb = self.emb_enc(ques_emb_out)
        
        # Applying Context-Query attention
        # The layer is stored as self.con_quer_atten in __init__; the class
        # name is not an attribute of the model.
        # NOTE(review): the visible code ends here without using or returning
        # this value - later stages are not shown.
        cont_ques_atten = self.con_quer_atten(cont_enc_emb, ques_enc_emb)