In [1]:
import torch
from torch import nn
from torch.nn import functional as F

import numpy as np
import json

import os
from os.path import exists, join

import urllib.request
import zipfile

import spacy

import copy

import ujson as uj

from collections import defaultdict

In [2]:
# --- Local storage -------------------------------------------------------
# Directories (relative to the notebook) for the datasets and the embeddings.
dataset_dir = "../Data/Datasets"
emb_dir = "../Data/Embeddings"

# --- File names ----------------------------------------------------------
train_filename = "train-v1.1.json"
dev_filename = "dev-v1.1.json"
word_emb_zip = "glove.840B.300d.zip"
word_emb_filename = "glove.840B.300d.txt"
char_emb_filename = "glove.840B.300d-char.txt"

# --- Download locations (a file name is appended to each base URL) ---------
train_url_base = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
dev_url_base = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
word_emb_url_base = "http://nlp.stanford.edu/data/"
char_emb_url_base = "https://raw.githubusercontent.com/minimaxir/char-embeddings/master/"

# Special tokens; parse_glove_embeddings gives them the first vocabulary ids.
spec_toks = ["<UNK>", "<PAD>"]

# --- Size limits -----------------------------------------------------------
CONTEXT_MAX_LEN = 300   # paragraphs with more tokens are skipped by tockenize_squad_dataset
QUESTION_MAX_LEN = 60
ASCII_LEN = 128         # rows of the trainable char embedding table in QANet

In [3]:
# Compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- General sizes -----------------------------------------------------------
MODEL_DIM = 128        # hidden size used throughout the network
BATCH_SIZE = 32
WORD_EMB_SIZE = 300    # width of a word vector
CHAR_EMB_SIZE = 200    # width of a trainable char vector
WORD_MAX_LEN = 16

# Passed as the `training` flag of the functional dropout calls in Encoder.
TENSORS_IS_TRAINABLE = True

# --- Dropout probabilities -----------------------------------------------------
DROP_PROB_WD = 0.1      # word embeddings
DROP_PROB_CH = 0.05     # char embeddings
DROP_PROB_INNER = 0.1   # between encoder sublayers / context-query attention
DROP_PROB_ATTEN = 0.1   # attention weights

# --- WordCharEmbedding -----------------------------------------------------------
WORDCHAR_EMBED_KERSIZE = 5
N_HIGHWAY_LAYERS = 2

# --- SelfAttention ---------------------------------------------------------------
N_HEADS = 8

# --- QANet -------------------------------------------------------------------------
# Embedding encoder block
N_EMBED_ENC_CONVS = 4
EMBED_ENC_KERSIZE = 7

# Model encoder block
N_MODEL_ENC_CONVS = 2
MODEL_ENC_KERSIZE = 7

# Convolution that shrinks the context-query attention output
DOWNSCALE_KERSIZE = 5

# Preprocessing the data

In [4]:
class Embeddings:
    '''
    Plain container for pretrained embeddings and their vocabularies.

    word_emb: word embedding matrix
    char_emb: char embedding matrix
    wd2id / id2wd: word -> index and index -> word
    ch2id / id2ch: char -> index and index -> char
    '''
    def __init__(self):
        # One-element uninitialised arrays act as placeholders until the
        # matrices are loaded.
        self.word_emb, self.char_emb = np.empty(1), np.empty(1)
        # Each lookup table starts as its own empty dict.
        self.wd2id, self.id2wd = {}, {}
        self.ch2id, self.id2ch = {}, {}

In [5]:
class Dataset:
    '''
    Plain container for a tokenized question-answering dataset.

    length: the dataset length
    contexts: list of passages
    questions: list of questions
    questions_id: list of question identifiers
    answers: list of answers
    packs: list of records that tie contexts, questions and answers together
    '''
    def __init__(self):
        self.length = 0
        # Every collection starts as its own empty list.
        self.contexts, self.questions, self.questions_id = [], [], []
        self.answers, self.packs = [], []

In [6]:
def download(url, path, filename):
    '''
    Downloads the file from url/filename and saves it as path/filename.

    url: base URL of the remote directory (a trailing '/' is optional)
    path: local directory that receives the file
    filename: name of the remote file, reused for the local copy

    Nothing is fetched when path/filename already exists.
    '''
    target = join(path, filename)
    if exists(target):
        print("File {} already exists.".format(filename))
        return

    print("Downloading file {}...".format(filename))
    # URLs always use '/', so the address is not built with os.path.join,
    # which inserts the platform separator.
    source = url.rstrip('/') + '/' + filename
    # Fetch under a temporary name first: an interrupted transfer must not
    # leave a truncated file that the next run treats as already downloaded.
    partial = target + '.part'
    try:
        urllib.request.urlretrieve(source, filename=partial)
        os.replace(partial, target)
    finally:
        if exists(partial):
            os.remove(partial)

In [7]:
def download_all_data():
    '''
    Downloads embeddings and datasets.

    Creates dataset_dir and emb_dir when missing, fetches the SQuAD train/dev
    files and the char embeddings, and - unless the word embeddings text file
    is already present - fetches the word embeddings archive, unpacks it into
    emb_dir and deletes the archive.
    '''
    for directory in (dataset_dir, emb_dir):
        os.makedirs(directory, exist_ok=True)

    download(train_url_base, dataset_dir, train_filename)
    download(dev_url_base, dataset_dir, dev_filename)
    download(char_emb_url_base, emb_dir, char_emb_filename)

    if not exists(join(emb_dir, word_emb_filename)):
        download(word_emb_url_base, emb_dir, word_emb_zip)
        print("Unzipping file {}".format(word_emb_zip))
        zip_path = join(emb_dir, word_emb_zip)
        # `path` of extractall is the destination DIRECTORY. Extracting into
        # emb_dir puts the text file where parse_glove_embeddings opens it;
        # passing the .txt path would create a directory with that name.
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(path=emb_dir)
        os.remove(zip_path)

In [8]:
def _read_embedding_file(path, index, vectors):
    '''
    Appends the vectors stored in a text embedding file to `vectors`.

    path: file with one entry per line, "<token> <v1> <v2> ..."
    index: defaultdict token -> id that hands out the next free id on first access
    vectors: list of vectors, kept aligned with the ids in `index`

    A token that is already indexed keeps its first vector, so ids and rows
    never get out of step.
    '''
    with open(path, encoding='utf-8') as dump:
        for line in dump:
            fields = line.split(' ')
            known = len(index)
            index[fields[0]]
            if len(index) == known:
                continue
            vectors.append(np.array([float(value) for value in fields[1:]]))


def parse_glove_embeddings(embed: Embeddings):
    '''
    Parses embeddings from files and converts to Embeddings structure.

    embed: Embeddings instance filled in place. The tokens in spec_toks get the
           first ids and all-zero vectors of size WORD_EMB_SIZE in both the
           word and the char vocabulary.
    '''
    # Accessing a missing key assigns it the next free index.
    wtoi = defaultdict(lambda: len(wtoi))
    ctoi = defaultdict(lambda: len(ctoi))
    for tok in spec_toks:
        ctoi[tok]
        wtoi[tok]

    w_emb = [np.zeros(WORD_EMB_SIZE) for _ in spec_toks]
    c_emb = [np.zeros(WORD_EMB_SIZE) for _ in spec_toks]

    print("Loading word embeddings...")
    _read_embedding_file(join(emb_dir, word_emb_filename), wtoi, w_emb)

    print("Loading char embeddings...")
    _read_embedding_file(join(emb_dir, char_emb_filename), ctoi, c_emb)

    # Reverse lookups: chars go to id2ch and words to id2wd.
    embed.id2ch.update({idx: ch for ch, idx in ctoi.items()})
    embed.id2wd.update({idx: wd for wd, idx in wtoi.items()})
    embed.wd2id = dict(wtoi)
    embed.ch2id = dict(ctoi)
    embed.word_emb = np.array(w_emb)
    embed.char_emb = np.array(c_emb)
    print("Embeddings are parsed.")

In [9]:
def tockenize_squad_dataset(json_data, dataset: Dataset, emb: Embeddings):
    '''
    Tokenizes SQuAD articles and appends the result to `dataset`.

    json_data: iterable of articles, each with 'paragraphs' holding a 'context'
               and a list of 'qas' (question, id, answers)
    dataset: Dataset that is extended in place
    emb: Embeddings whose wd2id maps tokens to ids; must contain '<UNK>'

    Paragraphs with more than CONTEXT_MAX_LEN tokens are skipped together with
    their questions. Every answer is stored in dataset.answers as the pair
    (first token index, last token index) of the context tokens it overlaps,
    and in dataset.packs as (context index, question index, answer index).
    '''
    tokenize = spacy.blank('en')
    unk_id = emb.wd2id['<UNK>']

    def normalize(text):
        # Both replacements swap two characters for two characters, so the
        # character offsets of the answers stay valid.
        return text.replace("''", '" ').replace("``", '" ')

    def tokens_pos(text, text_tok):
        # Character span (start, end) of every token, found left to right.
        pos = 0
        spans = []
        for token in text_tok:
            pos = text.find(token, pos)
            spans.append((pos, pos + len(token)))
            pos += len(token)
        return spans

    # Continue the numbering when the dataset already holds entries, so the
    # indices in `packs` always point at the right list positions.
    cont_id = len(dataset.contexts)
    ques_id = len(dataset.questions)
    answ_id = len(dataset.answers)
    skipped_answers = 0

    for item in json_data:
        for para in item['paragraphs']:
            context = normalize(para['context'])
            contx_toks = [tk.text for tk in tokenize(context)]

            # Check the length before doing any further work on the paragraph.
            if len(contx_toks) > CONTEXT_MAX_LEN:
                continue

            conxt_tok2id = [emb.wd2id.get(tk, unk_id) for tk in contx_toks]
            cont_tok_posit = tokens_pos(context, contx_toks)

            for qas in para['qas']:
                ques = normalize(qas['question'])
                ques_tok2id = [emb.wd2id.get(tk.text, unk_id) for tk in tokenize(ques)]

                for answ in qas['answers']:
                    answ_start = answ['answer_start']
                    answ_end = answ_start + len(answ['text'])
                    # Context tokens whose character span overlaps the answer.
                    answ_fit_toks = [idx for idx, (tok_start, tok_end) in enumerate(cont_tok_posit)
                                     if answ_start < tok_end and answ_end > tok_start]
                    if not answ_fit_toks:
                        # An answer that overlaps no token has no token span.
                        skipped_answers += 1
                        continue

                    dataset.answers.append((answ_fit_toks[0], answ_fit_toks[-1]))
                    dataset.packs.append((cont_id, ques_id, answ_id))
                    answ_id += 1

                dataset.questions.append(ques_tok2id)
                dataset.questions_id.append(qas['id'])
                ques_id += 1

            dataset.contexts.append(conxt_tok2id)
            cont_id += 1

    # Set once, after all articles, so it is also correct for empty input.
    dataset.length = len(dataset.packs)
    if skipped_answers:
        print("Skipped {} answers that overlap no context token.".format(skipped_answers))

In [None]:
# def drop_useless_embs(passages_tok, embed, new_wemb, new_wd2id):
#     new_texts_tok = []
#     for text in passages_tok:
#         text_tok = []
#         for idx, tok in enumerate(text):
#             word = embed.id2wd[idx]
            
#             if word not in new_wd2id:
#                 new_wd2id[word]
#                 new_wemb.append(embed.word_emb[tok])
            
#             text_tok.append(new_wd2id[word])
#         new_texts_tok.append(text_tok)
#     return new_texts_tok

In [None]:
# def collect_data(dataset: Dataset, embed: Embeddings):
#     '''
#     Collects parsed dataset and loaded embeddings into Data class.
#     Also embeddings are cleared from unused words.
#     ------
#     returns Data class.
#     '''
    
#     wtoi = defaultdict(lambda : len(wtoi))
#     [wtoi[tk] for tk in spec_toks]
#     w_emb = [np.zeros(WORD_EMB_SIZE) for _ in spec_toks]
#     dataset.contexts = drop_useless_embs(dataset.contexts, embed.word_emb, w_emb, wtoi)
#     dataset.questions = drop_useless_embs(dataset.questions, embed.word_emb, w_emb, wtoi)
#     itow = {}
#     for word in wtoi:
#         itow[wtoi[word]] = word

In [10]:
# Load the pretrained word and char embeddings into a fresh container.
# parse_glove_embeddings reads the embedding files under emb_dir, so they
# must already be on disk when this cell runs.
embed = Embeddings()
parse_glove_embeddings(embed)

Loading word embeddings...
Loading char embeddings...
Embeddings are parsed.


In [11]:
dataset = Dataset()

# Read the training articles. The context manager closes the file even when
# parsing fails, and the encoding is explicit instead of platform dependent.
with open(os.path.join(dataset_dir, train_filename), 'r', encoding='utf-8') as f:
    train = uj.load(f)['data']

# Fill `dataset` in place with token ids, answer spans and packs.
tockenize_squad_dataset(train, dataset, embed)

In [None]:
# packs = trunk(dataset.packs, 32)
# Cw, Cc, Qw, Qc, a = to_batch(packs[0], embed, dataset)

# Architecture

In [12]:
class LayerNorm(nn.Module):
    '''
    Layer normalization over the last dimension with a learned gain and bias.

    eps: constant added to the standard deviation to avoid division by zero
    dim: size of the last dimension. When given, the gain (sigma) and bias (mu)
         are created immediately. When omitted, they are created on the first
         forward call from the size of the input's last dimension.

    The gain and bias hold one value per feature, so one instance can be
    applied to inputs with different batch sizes and sequence lengths.
    '''
    def __init__(self, eps=1e-8, dim=None):
        super().__init__()
        self.eps = eps
        if dim is None:
            # Reserve the names; the parameters are created lazily in forward.
            self.register_parameter('sigma', None)
            self.register_parameter('mu', None)
        else:
            self.sigma = nn.Parameter(torch.ones(dim))
            self.mu = nn.Parameter(torch.zeros(dim))

    def forward(self, z):
        '''
        z: tensor with at least two dimensions; normalized along the last one.
        Returns a tensor of the same shape. Inputs whose dimension 1 or last
        dimension has size 1 are returned unchanged.
        '''
        # A single feature has no defined standard deviation; size-1 inputs
        # along dimension 1 are passed through as well.
        if z.size(1) == 1 or z.size(-1) == 1:
            return z

        if self.sigma is None:
            # Parameters created after an optimizer was built are not tracked
            # by it; pass `dim` to the constructor to avoid that.
            features = z.size(-1)
            self.sigma = nn.Parameter(torch.ones(features, dtype=z.dtype, device=z.device))
            self.mu = nn.Parameter(torch.zeros(features, dtype=z.dtype, device=z.device))

        mu = torch.mean(z, keepdim=True, dim=-1)
        sigma = torch.std(z, keepdim=True, dim=-1)
        out = (z - mu) / (sigma + self.eps)
        # The per-feature gain and bias broadcast over all leading dimensions.
        return out * self.sigma + self.mu

In [13]:
class DepthwiseSeparableConv1d(nn.Module):
    '''
    1-d depthwise separable convolution: a per-channel convolution followed by
    a kernel-size-1 convolution that mixes the channels.

    in_channels / out_channels: channel counts of input and output
    kernel_size: kernel size of the depthwise convolution
    padding: padding of the depthwise convolution
    bias: whether both convolutions use a bias
    '''
    def __init__(self, in_channels, out_channels, kernel_size, padding=0, bias=False):
        super().__init__()
        self.in_ch = in_channels
        self.out_ch = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.bias = bias

        # groups=in_channels gives every input channel its own filter; without
        # it this layer would be an ordinary full convolution.
        self.depthwise = nn.Conv1d(in_channels, in_channels, kernel_size,
                                   padding=padding, groups=in_channels, bias=bias)
        self.pointwise = nn.Conv1d(in_channels, out_channels, 1, padding=0, bias=bias)

    def forward(self, input):
        '''
        input: tensor of shape (batch, in_channels, length)
        Returns a tensor with out_channels channels.
        '''
        return self.pointwise(self.depthwise(input))
    
    
class DepthwiseSeparableConv2d(nn.Module):
    '''
    2-d depthwise separable convolution: a per-channel convolution followed by
    a 1x1 convolution that mixes the channels.

    in_channels / out_channels: channel counts of input and output
    kernel_size: kernel size of the depthwise convolution
    padding: padding of the depthwise convolution
    bias: whether both convolutions use a bias
    '''
    def __init__(self, in_channels, out_channels, kernel_size, padding=0, bias=False):
        super().__init__()
        self.in_ch = in_channels
        self.out_ch = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.bias = bias

        # groups=in_channels gives every input channel its own filter; without
        # it this layer would be an ordinary full convolution.
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size,
                                   padding=padding, groups=in_channels, bias=bias)
        self.pointwise = nn.Conv2d(in_channels, out_channels, (1, 1), padding=0, bias=bias)

    def forward(self, input):
        '''
        input: tensor of shape (batch, in_channels, height, width)
        Returns a tensor with out_channels channels.
        '''
        return self.pointwise(self.depthwise(input))

In [14]:
class HighwayNet(nn.Module):
    '''
    Stack of highway layers: x <- H(x) * T(x) + x * (1 - T(x)).

    n_lay: number of highway layers
    dim: size of the last dimension of the input (kept by every layer)
    bias: whether the linear layers use a bias

    H is a linear layer followed by ReLU. T is the transform gate, a linear
    layer followed by a sigmoid, so it stays in (0, 1) and the layer
    interpolates between its input and H(x).
    '''
    def __init__(self, n_lay, dim, bias=True):
        super().__init__()
        self.n_layers = n_lay
        self.dim = dim
        self.with_bias = bias
        # ModuleList registers the layers, so their weights show up in
        # parameters(), are trained and follow .to(device).
        self.W_H_lay = nn.ModuleList([nn.Linear(dim, dim, bias=self.with_bias)
                                      for _ in range(self.n_layers)])
        self.W_T_lay = nn.ModuleList([nn.Linear(dim, dim, bias=self.with_bias)
                                      for _ in range(self.n_layers)])
        self.non_lin = nn.ReLU()

    def forward(self, x):
        '''
        x: tensor whose last dimension has size dim
        Returns a tensor of the same shape.
        '''
        for transform, gate in zip(self.W_H_lay, self.W_T_lay):
            H = self.non_lin(transform(x))
            T = torch.sigmoid(gate(x))
            x = H * T + x * (1 - T)
        return x

In [36]:
class WordCharEmbedding(nn.Module):
    '''
    Merges word embeddings with char-level word features into one
    MODEL_DIM-sized vector per token.
    '''
    def __init__(self):
        super().__init__()
        kernel = WORDCHAR_EMBED_KERSIZE
        same_pad = kernel // 2
        self.highway = HighwayNet(N_HIGHWAY_LAYERS, MODEL_DIM)
        # Projects the concatenated word + char features to MODEL_DIM channels.
        self.conv1d = DepthwiseSeparableConv1d(
            WORD_EMB_SIZE + CHAR_EMB_SIZE, MODEL_DIM, kernel,
            padding=same_pad, bias=True)
        # Runs over the (token, char) grid of the char embeddings.
        self.conv2d = DepthwiseSeparableConv2d(
            CHAR_EMB_SIZE, CHAR_EMB_SIZE, kernel,
            padding=same_pad, bias=True)
        self.relu = nn.ReLU()
        self.dropout_wd = nn.Dropout(DROP_PROB_WD)
        self.dropout_ch = nn.Dropout(DROP_PROB_CH)

    def forward(self, wd_emb, ch_emb):
        '''
        wd_emb: float tensor of shape (batch_sz, context(question)_max_len, word_emb_size)
        ch_emb: float tensor of shape (batch_sz, context(question)_max_len, word_max_len, char_emb_size)

        Returns the highway output computed from the convolved concatenation
        of word and char features, with the channel dimension last.
        '''
        # Convolutions expect channels right after the batch dimension.
        words = self.dropout_wd(wd_emb.transpose(1, 2))
        chars = self.dropout_ch(ch_emb.permute(0, 3, 1, 2))

        # Char convolution, then the maximum over the chars of each word.
        chars = self.relu(self.conv2d(chars))
        chars = torch.max(chars, dim=3)[0]

        merged = torch.cat((words, chars), dim=1)
        # Back to channels-last for the highway layers.
        projected = self.conv1d(merged).transpose(1, 2)
        return self.highway(projected)

In [16]:
class ScaledDotProduct(nn.Module):
    '''
    Scaled dot-product attention: softmax(q k^T / sqrt(keys_dim)) v.

    keys_dim: size of the key vectors, used for the scaling factor
    p: dropout probability applied to the attention weights
    '''
    def __init__(self, keys_dim, p=DROP_PROB_ATTEN):
        super().__init__()
        self.scale = 1 / float(np.sqrt(keys_dim))
        self.dropout = nn.Dropout(p)

    def forward(self, q, v, k, mask=None):
        '''
        q: queries, (batch, n_queries, keys_dim)
        v: values,  (batch, n_keys, values_dim)
        k: keys,    (batch, n_keys, keys_dim)
        mask: optional boolean tensor broadcastable to (batch, n_queries, n_keys);
              positions where it is True receive no attention

        The positional order is (q, v, k) - values come BEFORE keys - so
        callers should pass `k=` and `v=` by keyword.
        Returns a tensor of shape (batch, n_queries, values_dim).
        '''
        attention = torch.bmm(q, k.transpose(1, 2)) * self.scale

        if mask is not None:
            # Out-of-place fill on the tensor itself keeps the op visible to autograd.
            attention = attention.masked_fill(mask, -float('inf'))
        attention = F.softmax(attention, dim=2)
        attention = self.dropout(attention)
        # The attention weights average the VALUES.
        return torch.bmm(attention, v)

In [17]:
class MultiHeadAttention(nn.Module):
    '''
    Multi-head attention with one query/key/value projection per head and a
    final output projection.

    n_heads: number of attention heads
    batch_sz: kept for backward compatibility and ignored; the projection
              weights are shared by all samples, so any batch size works
    d_model: size of the input and output vectors
    d_k: size of the projected queries and keys of one head
    d_v: size of the projected values of one head
    p: dropout probability for the attention weights
    '''
    def __init__(self, n_heads, batch_sz, d_model, d_k, d_v, p=DROP_PROB_ATTEN):
        super().__init__()
        self.n_heads = n_heads

        def projection(rows, cols):
            weight = nn.Parameter(torch.empty(rows, cols))
            nn.init.xavier_normal_(weight)
            return weight

        # Parameter / ParameterList register the weights with the module, so
        # they are returned by parameters(), trained and moved by .to(device).
        self.WQ_i = nn.ParameterList([projection(d_model, d_k) for _ in range(n_heads)])
        self.WK_i = nn.ParameterList([projection(d_model, d_k) for _ in range(n_heads)])
        self.WV_i = nn.ParameterList([projection(d_model, d_v) for _ in range(n_heads)])
        self.WO = projection(self.n_heads * d_v, d_model)

        # Not applied in forward; kept so the attribute stays available.
        self.dropout = nn.Dropout(p)
        self.scal_dot = ScaledDotProduct(d_k, p)

    def forward(self, q, k, v):
        '''
        q, k, v: tensors of shape (batch, length, d_model)
        Returns a tensor of shape (batch, query_length, d_model).
        '''
        heads = []
        for WQ, WK, WV in zip(self.WQ_i, self.WK_i, self.WV_i):
            QW = torch.matmul(q, WQ)
            KW = torch.matmul(k, WK)
            VW = torch.matmul(v, WV)
            # Keywords, because ScaledDotProduct.forward takes (q, v, k).
            heads.append(self.scal_dot(QW, k=KW, v=VW))
        con = torch.cat(heads, dim=-1)
        output = torch.matmul(con, self.WO)
        return output

In [18]:
class SelfAttention(nn.Module):
    '''
    Multi-head attention in which the input is used as queries, keys and values.

    dim: size of the input vectors; every one of the N_HEADS heads works with
         dim // N_HEADS sized keys and values
    '''
    def __init__(self, dim):
        super().__init__()
        head_dim = dim // N_HEADS
        self.d_k = head_dim
        self.d_v = self.d_k
        self.multihead_att = MultiHeadAttention(
            n_heads=N_HEADS,
            batch_sz=BATCH_SIZE,
            d_model=dim,
            d_k=self.d_k,
            d_v=self.d_v,
        )

    def forward(self, input):
        '''
        input: tensor handed to the attention as queries, keys and values
        '''
        attended = self.multihead_att(input, input, input)
        return attended

In [19]:
class Encoder(nn.Module):
    '''
    Encoder block: positional encoding, then num_convs convolution sublayers,
    a self-attention sublayer and a feed-forward sublayer. Every sublayer is
    layer-norm -> operation -> residual add -> dropout; one LayerNorm instance
    is shared by all sublayers.

    num_convs: number of convolution sublayers
    chan_num: number of channels of the convolutions and the linear layer
    kern_sz: kernel size of the convolutions
    mod_dim: size of the vectors seen by self-attention and positional encoding
    '''
    def __init__(self, num_convs, chan_num, kern_sz, mod_dim):
        super().__init__()
        self.drop_p = DROP_PROB_INNER
        self.is_train = TENSORS_IS_TRAINABLE
        # ModuleList registers the convolutions, so they are trained and moved.
        self.convs = nn.ModuleList([DepthwiseSeparableConv1d(chan_num, chan_num, kern_sz,
                                                             padding=kern_sz//2)
                                    for _ in range(num_convs)])
        self.relu = nn.ReLU()
        self.layer_norm = LayerNorm()
        self.selfattention = SelfAttention(mod_dim)
        self.lin = nn.Linear(chan_num, chan_num, bias=False)
        # One frequency per sin/cos pair: dimensions 2i and 2i+1 share it.
        self.freqs = np.array([1 / np.power(10000, 2 * (i // 2) / (mod_dim))
                               for i in range(mod_dim)])

    def position_encoding(self, x):
        '''
        Adds the sinusoidal positional encoding to x.

        x: tensor of shape (batch, text_len, mod_dim)
        Even feature indices get sin(pos * freq), odd ones cos(pos * freq).
        The encoding is built with the dtype and on the device of x.
        '''
        (_, text_len, dim) = x.shape
        positions = torch.arange(text_len, dtype=x.dtype, device=x.device).unsqueeze(1)
        freqs = torch.as_tensor(self.freqs, dtype=x.dtype, device=x.device)
        angles = positions * freqs
        encoding = torch.empty_like(angles)
        encoding[:, 0::2] = torch.sin(angles[:, 0::2])
        encoding[:, 1::2] = torch.cos(angles[:, 1::2])
        # (text_len, mod_dim) broadcasts over the batch dimension.
        return encoding + x

    def _dropout(self, tensor):
        # Dropout is active only in training mode, and only when the global
        # switch stored in is_train allows it.
        return F.dropout(tensor, p=self.drop_p, training=self.training and self.is_train)

    def forward(self, input):
        '''
        input: tensor of shape (batch, text_len, chan_num)
        Returns a tensor of the same shape.
        '''
        output = self.position_encoding(input)
        residual = output

        for conv in self.convs:
            output = self.layer_norm(output).transpose(1, 2)
            output = conv(output)
            output = self.relu(output).transpose(1, 2)
            # Out-of-place add: the ReLU output is needed for its backward pass.
            output = output + residual
            output = self._dropout(output)
            residual = output

        output = self.layer_norm(output)
        output = self.selfattention(output)
        output = output + residual
        output = self._dropout(output)
        residual = output

        output = self.layer_norm(output)
        output = self.lin(output)
        output = self.relu(output)
        output = output + residual
        output = self._dropout(output)
        return output

In [20]:
class ContextQueryAttention(nn.Module):
    '''
    Context-query attention with the trilinear similarity
    f(q, c) = W0 [q, c, q * c].

    p: probability of the dropout applied to the output

    W0 is a registered parameter shared by all samples, so the layer is
    trained with the model and works with any batch size.
    '''
    def __init__(self, p=DROP_PROB_INNER):
        super().__init__()
        self.W0 = nn.Parameter(torch.empty(1, 3 * MODEL_DIM))
        nn.init.xavier_normal_(self.W0)
        # NOTE(review): AlphaDropout is kept from the original code; confirm
        # that plain nn.Dropout is not what was intended here.
        self.dropout = nn.AlphaDropout(p)

    def forward(self, context, query):
        '''
        context: tensor of shape (batch, n, MODEL_DIM)
        query: tensor of shape (batch, m, MODEL_DIM)
        Returns [context, A, context * A, context * B] concatenated along the
        last dimension, shape (batch, n, 4 * MODEL_DIM).
        '''
        # W0 [q, c, q * c] = w_q.q + w_c.c + (c * w_qc).q, which yields the
        # whole similarity matrix with three matrix products instead of n * m
        # separate calls of similarity_function.
        w_q, w_c, w_qc = self.W0.view(3, -1).unbind(0)
        s_context = torch.matmul(context, w_c).unsqueeze(2)
        s_query = torch.matmul(query, w_q).unsqueeze(1)
        s_joint = torch.bmm(context * w_qc, query.transpose(1, 2))
        S = s_context + s_query + s_joint

        S_1 = F.softmax(S, dim=2)
        S_2 = F.softmax(S, dim=1)
        S_mul_12 = torch.bmm(S_1, S_2.transpose(1, 2))
        A = torch.bmm(S_1, query)
        B = torch.bmm(S_mul_12, context)
        output = torch.cat([context, A, context * A, context * B], dim=-1)
        output = self.dropout(output)
        return output

    def similarity_function(self, c, q):
        '''
        Similarity of context vectors c and query vectors q.

        c, q: tensors of identical shape (..., MODEL_DIM)
        Returns a tensor of shape (...) with W0 [q, c, q * c].
        '''
        con = torch.cat([q, c, q * c], dim=-1)
        return torch.matmul(con, self.W0.squeeze(0))

In [24]:
class AnswerStartEnd(nn.Module):
    '''
    Output layer: probabilities of every context position being the start
    and the end of the answer.

    W0 and W1 are registered parameters shared by all samples, so they are
    trained with the model and the layer works with any batch size.
    '''
    def __init__(self):
        super().__init__()
        self.W0 = nn.Parameter(torch.empty(1, 4 * MODEL_DIM))
        self.W1 = nn.Parameter(torch.empty(1, 4 * MODEL_DIM))
        nn.init.xavier_normal_(self.W0)
        nn.init.xavier_normal_(self.W1)

    def forward(self, M0, M1, M2):
        '''
        M0, M1, M2: tensors of shape (batch, length, 2 * MODEL_DIM)
        Returns (p1, p2), each of shape (batch, length) and summing to one
        over the positions: p1 from [M0, M1], p2 from [M0, M2].
        '''
        cat_M01 = torch.cat([M0, M1], dim=-1)
        cat_M02 = torch.cat([M0, M2], dim=-1)
        # One score per position; no bare squeeze(), so a batch of one keeps
        # its batch dimension.
        Y0 = torch.matmul(cat_M01, self.W0.squeeze(0))
        Y1 = torch.matmul(cat_M02, self.W1.squeeze(0))
        p1 = F.softmax(Y0, dim=1)
        p2 = F.softmax(Y1, dim=1)
        return p1, p2

In [37]:
class QANet(nn.Module):
    '''
    QANet model: embedding -> embedding encoder -> context-query attention ->
    three passes through the model encoder -> start/end probabilities.

    embeddings: Embeddings with the pretrained word (and char) matrices
    char_pretrained: use embeddings.char_emb for the chars instead of a
                     trainable table with ASCII_LEN rows of size CHAR_EMB_SIZE
    '''
    def __init__(self, embeddings: Embeddings, char_pretrained=False):
        super().__init__()
        if char_pretrained:
            # NOTE(review): the rows of embeddings.char_emb must be
            # CHAR_EMB_SIZE wide for WordCharEmbedding - confirm.
            self.char_emb = nn.Embedding.from_pretrained(torch.Tensor(embeddings.char_emb))
        else:
            self.char_emb = nn.Embedding(ASCII_LEN, CHAR_EMB_SIZE)

        self.word_emb = nn.Embedding.from_pretrained(torch.Tensor(embeddings.word_emb))
        self.emb = WordCharEmbedding()
        self.emb_enc = Encoder(N_EMBED_ENC_CONVS, MODEL_DIM, EMBED_ENC_KERSIZE, MODEL_DIM)
        self.mod_enc = Encoder(N_MODEL_ENC_CONVS, MODEL_DIM * 2, MODEL_ENC_KERSIZE, 2 * MODEL_DIM)
        self.con_quer_atten = ContextQueryAttention()
        self.downscale = DepthwiseSeparableConv1d(4 * MODEL_DIM, 2 * MODEL_DIM,
                                                  DOWNSCALE_KERSIZE, padding=DOWNSCALE_KERSIZE//2)
        self.answ = AnswerStartEnd()

    def forward(self, cont_word_idx, cont_char_idx, ques_word_idx, ques_char_idx):
        '''
        cont_word_idx: LongTensor (batch_sz, max_context_len)
        cont_char_idx: LongTensor (batch_sz, max_context_len, max_word_len)
        ques_word_idx: LongTensor (batch_sz, max_question_len)
        ques_char_idx: LongTensor (batch_sz, max_question_len, max_word_len)

        Returns (p1, p2): the start and end probabilities produced by
        AnswerStartEnd for every context position.
        '''
        # Word and char embeddings for context and question
        cont_word_emb = self.word_emb(cont_word_idx)
        cont_char_emb = self.char_emb(cont_char_idx)
        ques_word_emb = self.word_emb(ques_word_idx)
        ques_char_emb = self.char_emb(ques_char_idx)

        # Word embedding merged with char-level features
        cont_emb_out = self.emb(cont_word_emb, cont_char_emb)
        ques_emb_out = self.emb(ques_word_emb, ques_char_emb)

        # Embedding encoder (the same block for context and question)
        cont_enc_emb = self.emb_enc(cont_emb_out)
        ques_enc_emb = self.emb_enc(ques_emb_out)

        # Context-query attention, then reduction of the channel dimension;
        # the convolution expects channels before the length dimension.
        cont_ques_atten = self.con_quer_atten(cont_enc_emb, ques_enc_emb).transpose(1, 2)
        cq_atten_resized = self.downscale(cont_ques_atten).transpose(1, 2)

        # Model encoder: the same block (shared weights) applied three times.
        # Each pass feeds on the previous output directly; re-wrapping it with
        # the legacy torch.Tensor constructor is unnecessary.
        M0 = self.mod_enc(cq_atten_resized)
        M1 = self.mod_enc(M0)
        M2 = self.mod_enc(M1)

        # Start / end position probabilities
        p1, p2 = self.answ(M0, M1, M2)
        return p1, p2

# Train and batchify