In [None]:
import torch
import torchtext.data as data
import string
import os
import sys
import time
import shutil
import json
import spacy
import logging
import numpy as np
import torchtext
import csv
import pandas as pd
import torch.nn as nn
import math
import torch.nn.functional as F
import socket
import copy
import datetime
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

from typing import List, Tuple, Dict
from torch.nn.modules.loss import _Loss, CrossEntropyLoss
from torchtext.data import BucketIterator, Iterator, RawField, Example
from urllib import request
from spacy.tokenizer import Tokenizer
from torchtext.vocab import GloVe
from torch.optim import Adam
# from torchviz import make_dot
from collections import defaultdict


# Remove cached data like this if needed
# !rm -r *

# Remove pre-cached sample data in colab's directory.
# The path is relative to the current working directory; nothing happens when the
# directory does not exist.
if os.path.isdir("sample_data"):
  shutil.rmtree("sample_data")

def get_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD_HH:MM``."""
    now = datetime.datetime.now()
    return f"{now:%Y-%m-%d_%H:%M}"

In [None]:
# Fetch the SQuAD evaluation script, give it a .py extension and import its `evaluate` function.
# NOTE(review): the script is downloaded over plain HTTP and then imported (i.e. executed)
# without any integrity check - consider vendoring the file or verifying a checksum.
!wget http://www.stud.fit.vutbr.cz/~ifajcik/bissit19/evaluate_squad
!mv evaluate_squad evaluate_squad.py
from evaluate_squad import evaluate


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): the dataset paths used in this notebook start with ./drive/MyDrive, so they
# presumably rely on this mount and on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Report the runtime environment: working directory and library versions.
print("Current working directory: " + os.getcwd())
print(f"python version: {sys.version}")
print(f"torch version: {torch.__version__}")
print(f"torchtext version: {torchtext.__version__}")
print(f"spacy version: {spacy.__version__}")
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing.
device
if device.type == 'cuda':
    # Print the GPU name and the nvidia-smi status table.
    print(torch.cuda.get_device_name(0))
    !nvidia-smi


In [None]:
# Download locations of the SQuAD v1.1 train/dev files and the file names
# under which they are stored inside the cache directory.
# (The train URL used to end with two stray spaces; they are removed here.)

TRAIN_V1_URL = 'https://github.com/rajpurkar/SQuAD-explorer/raw/master/dataset/train-v1.1.json'
DEV_V1_URL = 'https://github.com/rajpurkar/SQuAD-explorer/raw/master/dataset/dev-v1.1.json'
TRAIN = "train-v1.1.json"
VALIDATION = "dev-v1.1.json"

def download_url(path, url):
    """Download ``url`` into the local file ``path``, announcing the transfer on stderr."""
    print(f'Downloading from {url} into {path}', file=sys.stderr, flush=True)
    request.urlretrieve(url, path)

In [None]:
def find_sub_list(sl, l):
    """
    Find every occurrence of the list ``sl`` inside the list ``l``.

    Each occurrence is reported as a ``(start, end)`` pair of indices into ``l``;
    both indices are inclusive. Occurrences may overlap.
    An empty ``sl`` has no occurrences, so an empty list is returned for it
    (it used to raise ``IndexError`` whenever ``l`` was non-empty).

    Example:
    print(find_sub_list([3,2,1],[4,3,2,1,0]))

    Returns:
    [(1, 3)]

    """
    if not sl:
        return []

    sll = len(sl)
    first = sl[0]
    # Compare the first element before slicing, so a slice is only built for plausible starts
    return [(ind, ind + sll - 1)
            for ind, e in enumerate(l)
            if e == first and l[ind:ind + sll] == sl]

print(find_sub_list([3,2,1],[4,3,2,1,0]))

In [None]:
def create_custom_tokenizer(nlp):
    """
    Build a spaCy ``Tokenizer`` for ``nlp`` with additional split rules.

    The default prefix/infix/suffix patterns of ``nlp`` are extended with the custom
    patterns listed below, the default ``US$``/``C$``/``A$`` prefixes are removed and the
    tokenizer exceptions for ``a.`` ... ``z.`` are dropped.

    :param nlp: loaded spaCy language object providing ``Defaults`` and ``vocab``
    :return: the configured ``Tokenizer``
    """
    extra_prefixes = [r'[0-9]+', r'\~', r'\–', r'\—', r'\$']
    extra_infixes = [r'[!&:,()]', r'\.', r'\-', r'\–', r'\—', r'\$']
    extra_suffixes = [r'\.', r'\–', r'\—', r'\$']

    prefixes = list(nlp.Defaults.prefixes) + extra_prefixes
    # list.remove raises ValueError when a pattern is not among the defaults
    for currency_prefix in (r'US\$', r'C\$', r'A\$'):
        prefixes.remove(currency_prefix)
    infixes = list(nlp.Defaults.infixes) + extra_infixes
    suffixes = list(nlp.Defaults.suffixes) + extra_suffixes

    prefix_re = spacy.util.compile_prefix_regex(tuple(prefixes))
    infix_re = spacy.util.compile_infix_regex(tuple(infixes))
    suffix_re = spacy.util.compile_suffix_regex(tuple(suffixes))

    # work on a copy of the exceptions and
    # remove "a." to "z." rules so "a." gets tokenized as a|.
    rules = dict(nlp.Defaults.tokenizer_exceptions)
    for code in range(ord("a"), ord("z") + 1):
        rules.pop(f"{chr(code)}.", None)

    return Tokenizer(
        nlp.vocab,
        rules,
        prefix_search=prefix_re.search,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
        token_match=None,
    )

In [None]:
# We will use this special token to join the pre-tokenized data
JOIN_TOKEN = "█"

# Load the spaCy English pipeline once and replace its tokenizer with the customized one.
# NOTE(review): the 'en' shortcut name is presumably only resolvable on older spaCy
# releases - confirm against the installed version.
_spacy_en = spacy.load('en')
_spacy_en.tokenizer = create_custom_tokenizer(_spacy_en)

def tokenize(text: str, tokenizer=_spacy_en):
    """
    Tokenize ``text`` and drop tokens that consist only of whitespace.

    :param text: raw text to tokenize
    :param tokenizer: either an object exposing a callable ``tokenizer`` attribute (such as
                      the module-level spaCy pipeline, the default) or the tokenizer callable
                      itself. This argument used to be ignored; it is honoured now.
    :return: tuple ``(tokens, text_tokens)`` - the kept token objects and their ``.text`` strings
    """
    # Accept both a pipeline (has .tokenizer) and a bare tokenizer callable
    tokenize_fn = getattr(tokenizer, "tokenizer", tokenizer)
    tokens = [tok for tok in tokenize_fn(text) if not tok.text.isspace()]
    text_tokens = [tok.text for tok in tokens]
    return tokens, text_tokens


def tokenize_and_join(text: string, jointoken=JOIN_TOKEN):
    """Tokenize ``text`` and glue the token strings together with ``jointoken``."""
    _, words = tokenize(text)
    return jointoken.join(words)


In [None]:
# Quick visual check of the tokenizer: the printed tokens are separated by JOIN_TOKEN.
print(tokenize_and_join("Lazy fox doesn't like to travel too far in this heat..."))
print(tokenize_and_join("Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data. "))

In [None]:
class SquadDataset(data.Dataset):
    """
    SQuAD v1.1 wrapped as a torchtext ``Dataset``.

    The first time a raw file is used it is tokenized, the annotated character offsets of
    each answer are mapped to token indices and the result is cached next to the raw file
    as ``<file>_preprocessed.json``. Later constructions load the cached file directly.
    """

    def __init__(self, data, fields: List[Tuple[str, data.Field]], cachedir='./drive/MyDrive/data/squad', **kwargs):
        """
        :param data: file name of the raw SQuAD json inside ``cachedir``
        :param fields: list of ``(name, field)`` pairs; their order has to match the value
                       order used in ``load`` (see ``prepare_fields``)
        :param cachedir: directory holding the raw and the preprocessed files
        :param kwargs: forwarded to the torchtext ``Dataset`` initializer
        """
        # download dataset, if needed
        self.check_for_download(cachedir)

        f = os.path.join(cachedir, data)
        print(f)

        # The preprocessed file will be named like the original but with _preprocessed.json suffix
        preprocessed_f = f + "_preprocessed.json"
        if not os.path.exists(preprocessed_f):
            s_time = time.time()

            # Process examples from file
            raw_examples = SquadDataset.get_example_list(f)
            # Save preprocessed examples, so they do not have to be processed again
            self.save(preprocessed_f, raw_examples)
            print(f"Dataset {preprocessed_f} created in {time.time() - s_time}s")

        s_time = time.time()

        # Load preprocessed examples
        examples = self.load(preprocessed_f, fields)
        print(f"Dataset {preprocessed_f} loaded in {time.time() - s_time:.2f} s")

        super(SquadDataset, self).__init__(examples, fields, **kwargs)

    def save(self, preprocessed_f: str, raw_examples: List[Dict])-> None:
        """
        Dump examples into json with name given in preprocessed_f variable.
        """
        with open(preprocessed_f, "w") as f:
            json.dump(raw_examples, f)

    def load(self, preprocessed_f: str, fields: List[Tuple[str, RawField]]) -> List[Example]:
        """
        Load preprocessed examples and construct torchtext examples from them.

        The joined paragraph and the joined question are each passed three times, because
        three consecutive fields (word-level, char-level and raw) are built from the same string.
        """
        with open(preprocessed_f, "r") as f:
            raw_examples = json.load(f)
            return [data.Example.fromlist([
                e["id"],
                e["topic"],
                e["paragraph_token_positions"],
                e["raw_paragraph_context"],
                e["paragraph_context"],
                e["paragraph_context"],
                e["paragraph_context"],
                e["question"],
                e["question"],
                e["question"],
                e["a_start"],
                e["a_end"],
                e["a_extracted"],
                e["a_gt"]
            ], fields) for e in raw_examples]

    @classmethod
    def splits(cls, fields, cachedir='./drive/MyDrive/data/squad'):
        """
        Creates train/validation data split

        :return: tuple ``(train_data, val_data)``
        """
        train_data = cls(TRAIN, fields, cachedir=cachedir)
        val_data = cls(VALIDATION, fields, cachedir=cachedir)
        return tuple(d for d in (train_data, val_data)
                     if d is not None)

    @staticmethod
    def check_for_download(cachedir: str):
        """
        Downloads the train and validation files when ``cachedir`` does not exist yet.

        If the download fails, the freshly created directory is removed again and the
        original exception is re-raised.
        NOTE(review): an already existing directory is never re-checked, so files missing
        from it are not downloaded.
        """
        if not os.path.exists(cachedir):
            os.makedirs(cachedir)
            try:
                download_url(os.path.join(cachedir, TRAIN), TRAIN_V1_URL)
                download_url(os.path.join(cachedir, VALIDATION), DEV_V1_URL)
            except BaseException:
                sys.stderr.write(f'Download failed, removing directory {cachedir}\n')
                sys.stderr.flush()
                shutil.rmtree(cachedir)

                # bare raise keeps the original traceback
                raise

    @staticmethod
    def prepare_fields():
        """
        Prepare torchtext fields for individual aspects of batch
        """
        # field, that will process sequential text, will use vocabulary, will tokenize the text by splitting
        # it on JOIN_TOKEN token and will lowercase the text

        # IMPORTANT: as the use_vocab=True, for this field (implicitly), the contents of this field will be automatically numericalized
        # numericalization - the process of replacing words with their integer representations e.g.:
        # [i, love, NLP] can be numericalized as [47,21,743]
        WORD_field = data.Field(batch_first=True, tokenize=lambda s: str.split(s, sep=JOIN_TOKEN), lower=True)

        # field, that will not contain sequences, does not need vocabulary and will represent dependent target variable
        TARGET_field = data.Field(sequential=False, use_vocab=False, batch_first=True, is_target=True)

        # raw field means, the field will not be processed at all
        RAW_field = data.RawField()
        RAW_field.is_target=False
        return [
            ('id', RAW_field),
            ('topic_title', RAW_field),
            ('document_token_positions', RAW_field),
            ('raw_document_context', RAW_field),
            ('document', WORD_field), # documents are processed as described with the WORD_field
            ('document_char', RAW_field),
            ('raw_document', RAW_field),
            ('question', WORD_field), # questions are processed as described with the WORD_field
            ('question_char', RAW_field),
            ('raw_question', RAW_field),
            # token indices of answer start and answer end are processed as described with the TARGET_field
            ("a_start", TARGET_field),
            ("a_end", TARGET_field),

            ('ext_answer', RAW_field),
            ('gt_answer', RAW_field)
        ]

    @staticmethod
    def prepare_fields_char():
        """
        Same field layout as ``prepare_fields``, except that ``document_char`` and
        ``question_char`` are nested fields which split every token into characters.
        """
        WORD_field = data.Field(batch_first=True, tokenize=lambda s: str.split(s, sep=JOIN_TOKEN), lower=True)

        # This field is applied on each unit of CHAR_nested_field, here we pass list as tokenize argument to split
        # tokenized string into characters
        CHAR_field = data.Field(batch_first=True, tokenize=list, lower=True)
        CHAR_nested_field = data.NestedField(CHAR_field, tokenize=lambda s: str.split(s, sep=JOIN_TOKEN))

        RAW_field = data.RawField()
        RAW_field.is_target=False
        return [
            ('id', RAW_field),
            ('topic_title', RAW_field),
            ('document_token_positions', RAW_field),
            ('raw_document_context', RAW_field),
            ('document', WORD_field),
            ('document_char', CHAR_nested_field),
            ('raw_document', RAW_field),
            ('question', WORD_field),
            ('question_char', CHAR_nested_field),
            ('raw_question', RAW_field),
            ("a_start", data.Field(sequential=False, use_vocab=False, batch_first=True, is_target=True)),
            ("a_end", data.Field(sequential=False, use_vocab=False, batch_first=True, is_target=True)),
            ('ext_answer', RAW_field),
            ('gt_answer', RAW_field)
        ]

    @staticmethod
    def get_example_list(file: str):
        """
        Extracts processed examples from original dataset

        One example (a plain dict) is produced per annotated answer. The answer's token span
        is found by matching the tokenized answer inside the tokenized paragraph; when several
        matches exist the one closest to the annotated character offset is taken, and when no
        match exists the ``char_span_to_token_span`` heuristic is used instead.

        Examples whose extracted span differs from the annotated answer text (ignoring
        whitespace) are additionally appended to ``errors_<file name>.csv``, which is stored
        in the same directory as ``file``.

        :param file: path of the raw SQuAD json file
        :return: list of example dicts
        """
        def remove_ws(s):
            return "".join(s.split())

        examples = []
        problems = 0

        # The error report lives next to the dataset file (for the default cache directory this
        # is the same location as before, but it now also works for a custom cachedir)
        error_report = os.path.join(os.path.dirname(file), f"errors_{os.path.basename(file)}.csv")

        # Both files are closed automatically, even when preprocessing fails half-way
        with open(error_report, "a+", encoding="utf-8", newline="") as error_fd, \
                open(file, encoding="utf-8") as fd:
            error_writer = csv.writer(error_fd, delimiter=',')

            # Iterate over examples in dataset
            data_json = json.load(fd)
            for data_topic in data_json["data"]:
                topic_title = data_topic["title"]
                for paragraph in data_topic["paragraphs"]:
                    # Tokenize document paragraph
                    paragraph_tokens, paragraph_context = tokenize(paragraph["context"])
                    # Keep positions of each token in document, we will need this later, when decoding model outputs
                    paragraph_token_positions = [[token.idx, token.idx + len(token.text)] for token in paragraph_tokens]

                    joined_paragraph_context = JOIN_TOKEN.join(paragraph_context)
                    for question_and_answers in paragraph['qas']:
                        example_id = question_and_answers["id"]
                        question = tokenize_and_join(question_and_answers['question'])
                        answers = question_and_answers['answers']

                        for possible_answer in answers:
                            # character offsets of the annotated answer inside the raw paragraph
                            answer_start_ch = possible_answer["answer_start"]
                            answer_end_ch = answer_start_ch + len(possible_answer["text"])
                            _, answer = tokenize(possible_answer["text"])

                            # Try finding answer in the document
                            answer_locations = find_sub_list(answer, paragraph_context)

                            # If we found multiple answer locations, we select the one, which is closest to the annotation
                            if len(answer_locations) > 1:
                                # get start character offset of each span
                                answer_ch_starts = [paragraph_tokens[token_span[0]].idx for token_span in
                                                    answer_locations]
                                distance_from_gt = np.abs((np.array(answer_ch_starts) - answer_start_ch))
                                closest_match = distance_from_gt.argmin()

                                answer_start, answer_end = answer_locations[closest_match]

                            # If we have not found answer in document, call heuristic from AllenNLP
                            elif not answer_locations:
                                token_span = char_span_to_token_span(
                                    [(t.idx, t.idx + len(t.text)) for t in paragraph_tokens],
                                    (answer_start_ch, answer_end_ch))
                                # token_span is ((start, end), error_flag)
                                answer_start, answer_end = token_span[0]

                            # Otherwise, everything is OK
                            else:
                                answer_start, answer_end = answer_locations[0]

                            extracted_tokens = paragraph_context[answer_start:answer_end + 1]
                            example = {"id": example_id,
                                       "topic": topic_title,
                                       "raw_paragraph_context": paragraph["context"],
                                       "paragraph_context": joined_paragraph_context,
                                       "paragraph_token_positions": paragraph_token_positions,
                                       "question": question,
                                       "a_start": answer_start,
                                       "a_end": answer_end,
                                       "a_extracted": JOIN_TOKEN.join(extracted_tokens),
                                       "a_gt": possible_answer["text"]}

                            # Check if the token span is correct;
                            # write the INCORRECT cases into the csv report
                            if remove_ws(possible_answer["text"]) != remove_ws("".join(extracted_tokens)):
                                error_writer.writerow(example.values())
                                problems += 1

                            examples.append(example)

        # print how many token-span mapping problems have occurred
        print(f"# problems: {problems}")
        # percentage = fraction * 100; guard against an empty dataset
        affected = problems / len(examples) * 100 if examples else 0.0
        print(f"Problems affect {affected:.5f} % of dataset.")
        return examples

In [None]:
# Borrowed from AllenNLP
# https://github.com/allenai/allennlp/blob/master/allennlp/data/dataset_readers/reading_comprehension/util.py
def char_span_to_token_span(token_offsets: List[Tuple[int, int]],
                            character_span: Tuple[int, int]) -> Tuple[Tuple[int, int], bool]:
    """
    Converts a character span from a passage into the corresponding token span in the tokenized
    version of the passage.  If you pass in a character span that does not correspond to complete
    tokens in the tokenized version, we'll do our best, but the behavior is officially undefined.
    We return an error flag in this case.
    The basic outline of this method is to find the token span that has the same offsets as the
    input character span.  If the tokenizer tokenized the passage correctly and has matching
    offsets, this is easy.  We try to be a little smart about cases where they don't match exactly,
    but mostly just find the closest thing we can.
    The returned ``(begin, end)`` indices are `inclusive` for both ``begin`` and ``end``.
    So, for example, ``(2, 2)`` is the one word span beginning at token index 2, ``(3, 4)`` is the
    two-word span beginning at token index 3, and so on.

    Spans that start before the first token, or start/end behind the offsets of the last token,
    are clamped to the first/last token and reported with ``error=True`` (they used to raise
    ``IndexError`` or to return a negative index).

    Parameters
    ----------
    token_offsets : ``List[Tuple[int, int]]``
        ``(start, end)`` character offsets of every token; the end offset is exclusive.
    character_span : ``Tuple[int, int]``
        ``(start, end)`` character offsets of the span to locate.

    Returns
    -------
    token_span : ``Tuple[int, int]``
        `Inclusive` span start and end token indices that match as closely as possible to the input
        character spans.
    error : ``bool``
        ``True`` when the token span does NOT match the input character span exactly, i.e. there
        was an error in either the tokenization or the annotated character span.

    Raises
    ------
    IndexError
        If ``token_offsets`` is empty.
    """
    # We have token offsets into the passage from the tokenizer; we _should_ be able to just find
    # the tokens that have the same offsets as our span.
    error = False
    last_index = len(token_offsets) - 1
    start_index = 0
    while start_index < len(token_offsets) and token_offsets[start_index][0] < character_span[0]:
        start_index += 1
    # start_index should now be pointing at the span start index.
    if start_index > last_index:
        # Every token starts before the span does, so the span can only begin inside
        # (or behind) the last token.
        start_index = last_index
    elif token_offsets[start_index][0] > character_span[0] and start_index > 0:
        # In this case, a tokenization or labeling issue made us go too far - the character span
        # we're looking for actually starts in the previous token.  We'll back up one
        # (never below the first token, which would wrap around to the end of the list).
        start_index -= 1
    if token_offsets[start_index][0] != character_span[0]:
        error = True
    end_index = start_index
    while end_index < len(token_offsets) and token_offsets[end_index][1] < character_span[1]:
        end_index += 1
    if end_index > last_index:
        # The span ends behind the last token; the last token is the closest we can get.
        end_index = last_index
    if token_offsets[end_index][1] != character_span[1]:
        error = True
    return (start_index, end_index), error

In [None]:
class Baseline(torch.nn.Module):
    """
    Baseline span-prediction model.

    Question and document tokens are embedded and encoded by one shared bidirectional LSTM.
    The encoded question is max-pooled over time, projected into a "start" and an "end"
    vector, and every encoded document token is scored against both vectors by a dot product.
    """

    def __init__(self, config, vocab):
        """
        :param config: mapping with the keys ``RNN_input_dim``, ``RNN_nhidden``, ``RNN_layers``
                       and ``dropout_rate`` (plus whatever ``Embedder`` reads)
        :param vocab: vocabulary handed over to ``Embedder``
        """
        super().__init__()
        # Embedder - module that constructs token embeddings from token indices
        self.embedder = Embedder(vocab, config)

        # Encoder - shared by questions and documents
        self.encoder = torch.nn.LSTM(
            input_size=config["RNN_input_dim"],
            hidden_size=config["RNN_nhidden"],
            num_layers=config["RNN_layers"],
            dropout=float(config['dropout_rate']),
            batch_first=True,
            bidirectional=True)

        # The encoder is bidirectional, hence its output is twice the hidden size.
        # The linear layers project the max-pooled question representation
        # into the answer_start / answer_end seeking space.
        self.lin_S = nn.Linear(config["RNN_nhidden"] * 2, config["RNN_nhidden"] * 2)
        self.lin_E = nn.Linear(config["RNN_nhidden"] * 2, config["RNN_nhidden"] * 2)

        # dropout - regularization
        self.dropout = nn.Dropout(p=config["dropout_rate"])

    def forward(self, batch,return_max=False):
        """
        Score every document position as answer start / answer end.

        Abbreviations: batch_size - size of a mini-batch, d - embedding dimension,
        q_len / d_len - length of the longest question / document in the mini-batch,
        RNN_out - output dimension of the encoder.

        :param batch: object with ``question`` (batch_size x q_len) and ``document``
                      (batch_size x d_len) token index tensors
        :param return_max: when true, also return the argmax positions of the question max-pooling
        :return: ``(s, e)`` - unnormalized start / end scores, both batch_size x d_len;
                 ``(s, e, argmax_q)`` if ``return_max`` is set (``argmax_q``: batch_size x RNN_out)
        """
        # 1. token indices -> token embeddings (batch_size x q_len x d / batch_size x d_len x d)
        question_embedded = self.embedder(batch.question)
        document_embedded = self.embedder(batch.document)

        # 2. encode both sequences (batch_size x q_len x RNN_out / batch_size x d_len x RNN_out)
        q_enc, _ = self.encoder(self.dropout(question_embedded))
        d_enc, _ = self.encoder(self.dropout(document_embedded))

        # 3. maximum over time (dimension q_len) of the encoded question: batch_size x RNN_out
        q, argmax_q = torch.max(q_enc, dim=1)

        # 4. + 5. project into the start / end seeking space and append a trailing
        # dimension: batch_size x RNN_out x 1
        q_s = self.dropout(self.lin_S(q)).unsqueeze(-1)
        q_e = self.dropout(self.lin_E(q)).unsqueeze(-1)

        # 6. + 7. attention: batch-wise product of the encoded document with the start / end
        # vectors (batch_size x d_len x 1), reshaped to batch_size x d_len.
        # !Note that softmax is applied to these inside the CrossEntropyLoss error function.
        s = torch.bmm(d_enc, q_s).squeeze(-1)
        e = torch.bmm(d_enc, q_e).squeeze(-1)

        return (s, e, argmax_q) if return_max else (s, e)

In [None]:
# Token indices to token embeddings
# Next, we would like to implement submodule of our model, which returns embeddings of each input token. We can simply implement it like following:
class Embedder(torch.nn.Module):
    """Maps token indices to embeddings initialised from the vectors stored in a vocabulary."""

    def __init__(self, vocab, config):
        """
        :param vocab: vocabulary providing ``vectors`` and ``len()``
        :param config: mapping with the keys ``scale_emb_grad_by_freq`` and ``optimize_embeddings``
        """
        super().__init__()
        # Whether to scale gradient for embeddings by its frequency
        self.scale_grad = config['scale_emb_grad_by_freq']

        trainable = config['optimize_embeddings']
        self.init_vocab(vocab, trainable)

        print(f"Optimize embeddings = {trainable}")
        print(f"Scale grad by freq: {self.scale_grad}")
        print(f"Vocabulary size = {len(vocab.vectors)}")

    def init_vocab(self, vocab, optimize_embeddings=False, device=None):
        """
        (Re)create the embedding table from ``vocab``.

        :param vocab: vocabulary whose ``vectors`` are copied into the table
        :param optimize_embeddings: whether the embedding weights receive gradients
        :param device: optional device the table is moved to
        """
        pretrained = vocab.vectors
        self.embedding_dim = pretrained.shape[1]

        # One row per vocabulary entry, initialised with the pre-trained vectors
        table = torch.nn.Embedding(len(vocab), self.embedding_dim, scale_grad_by_freq=self.scale_grad)
        table.weight.data.copy_(pretrained)
        table.weight.requires_grad = optimize_embeddings

        # Save also vocab, so we can access it later when loading the model if needed
        self.vocab = vocab

        # map to the requested device, if any
        self.embeddings = table if device is None else table.to(device)

    def forward(self, input):
        """Return the embeddings of the token indices in ``input``."""
        return self.embeddings(input)

In [None]:
def decode(span_start_logits: torch.Tensor, span_end_logits: torch.Tensor) -> \
        Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
    """
    Pick the best answer span for every item of the batch.
    This method has been borrowed from AllenNLP.

    :param span_start_logits: unnormalized start log probabilities, (batch_size, document_length)
    :param span_end_logits: unnormalized end log probabilities, (batch_size, document_length)
    :return: a 3-tuple ``(best_span_probs, (span_start_indices, span_end_indices), logprobs)``:
             the softmax probability of the best span, its inclusive start / end token indices
             and its unnormalized score (start logit + end logit); every tensor has shape batch_size
    :raises ValueError: if either input is not 2-dimensional
    """
    # We call the inputs "logits" - they could either be unnormalized logits or normalized log
    # probabilities.  A log_softmax operation is a constant shifting of the entire logit
    # vector, so taking an argmax over either one gives the same result.
    if span_start_logits.dim() != 2 or span_end_logits.dim() != 2:
        raise ValueError("Input shapes must be (batch_size, document_length)")
    batch_size, passage_length = span_start_logits.size()
    device = span_start_logits.device

    # span_start_logits.unsqueeze(2) has shape (batch_size, passage_length, 1),
    # span_end_logits.unsqueeze(1) has shape (batch_size, 1, passage_length).
    # Addition in log-domain = multiplication in real domain, so broadcasting the sum creates
    # a matrix holding the score of every (start, end) pair:
    # (batch_size, passage_length, passage_length)
    span_log_probs = span_start_logits.unsqueeze(2) + span_end_logits.unsqueeze(1)

    # Only the upper triangle of the span matrix is valid; the lower triangle has entries where
    # the span ends before it starts. We will mask these values out.
    # log(1) = 0 keeps a score, log(0) = -infinity removes it. The mask will look like this
    #0000000
    #X000000
    #XX00000
    #XXX0000
    #XXXX000
    #XXXXX00
    #XXXXXX0
    # where X are -infinity
    span_log_mask = torch.triu(torch.ones((passage_length, passage_length),
                                          device=device)).log().unsqueeze(0)
    valid_span_log_probs = span_log_probs + span_log_mask

    # Here we take the span matrix and flatten it, then find the best span using argmax.  We
    # can recover the start and end indices from this flattened list using simple modular
    # arithmetic.
    # (batch_size, passage_length * passage_length)
    # valid_span_log_probs is a vector [s_00,s_01,...,s_0n,s10,s11,...,s1n, ... , sn0,sn1,..., snn] of span scores
    # e.g. s_01 is a score of answer span from token 0 to token 1
    valid_span_log_probs = valid_span_log_probs.view(batch_size, -1)

    # Turn all the log-probabilities into probabilities
    valid_span_probs = F.softmax(valid_span_log_probs, dim=-1)

    # best_span_probs of shape batch_size now contains the probability of the best span of each item
    # best_spans of shape batch_size now contains the argmax of each item in the unrolled span matrix
    best_span_probs, best_spans = valid_span_probs.max(-1)
    # unnormalized score of the best span of each item
    logprobs, _ = valid_span_log_probs.max(-1)

    # row index = start token, column index = end token
    span_start_indices = best_spans // passage_length
    span_end_indices = best_spans % passage_length

    return best_span_probs, (span_start_indices, span_end_indices), logprobs

In [None]:
def get_spans(batch, candidates):
    """
    Cut the predicted answer strings out of the raw documents of a batch.

    :param batch: object providing ``raw_document_context`` (raw document strings) and
                  ``document_token_positions`` (per document, the character offsets of each token)
    :param candidates: pair ``(start_indices, end_indices)`` of predicted token indices
    :return: list with one answer string per document
    """
    starts, ends = candidates[0], candidates[1]
    answers = []
    for i, raw_context in enumerate(batch.raw_document_context):
        positions = batch.document_token_positions[i]
        last_token = len(positions) - 1

        # In initial state of learning, we can predict the start/end in the padding area
        # since we do not do the masking. Such predictions are moved to the last real token.
        start = starts[i]
        start = last_token if start > last_token else start
        end = ends[i]
        end = last_token if end > last_token else end

        # slice from the first character of the start token to the end offset of the end token
        first_char = positions[start][0]
        end_char = positions[end][-1]
        answers.append(raw_context[first_char:end_char])
    return answers

In [None]:
def train_epoch(model: torch.nn.Module, lossfunction: _Loss, optimizer: torch.optim.Optimizer,
              train_iter: Iterator,gradient_clipping_norm = 5.) -> float:
    """
    Train ``model`` for one pass over ``train_iter``.

    :param model: called as ``model(batch)``; has to return the start and end scores
    :param lossfunction: applied separately to the start and to the end scores; the summed loss
                         must be a scalar, because ``.item()`` is called on it
    :param optimizer: optimizer stepping the model parameters
    :param train_iter: iterable over batches exposing ``a_start`` / ``a_end``; also has to provide ``data()``
    :param gradient_clipping_norm: maximum norm of the gradients of all trainable parameters
    :return: the accumulated loss divided by ``len(train_iter.data())``
    """
    model.train()
    train_loss = 0
    # set gradients for all parameters to 0
    optimizer.zero_grad()
    for i, batch in enumerate(train_iter):
        # get the unnormalized log probabilities
        logprobs_S, logprobs_E = model(batch)

        # compute the (cross-entropy) loss for start and end separately
        loss_s = lossfunction(logprobs_S, batch.a_start)
        loss_e = lossfunction(logprobs_E, batch.a_end)
        loss = loss_s + loss_e

        loss.backward() # compute gradients
        # clip the gradients of the trainable parameters
        torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), gradient_clipping_norm)
        optimizer.step() # add portion of negative gradients to model parameters
        optimizer.zero_grad() # set gradients for all parameters to 0

        train_loss += loss.item() # .item() returns the Python number held by a 0-dimensional tensor

        if i % 300 == 0 and i > 0:
            # i is zero-based, so i + 1 batches have been accumulated so far
            # (the parentheses were missing before, which printed loss / i + 1)
            print(f"Training loss: {train_loss / (i + 1)}")

    # NOTE(review): this divides by the number of examples, which is only a per-example average
    # if lossfunction sums over the batch; with a mean-reducing loss the number of batches
    # would be the matching divisor - confirm against the loss passed in.
    return train_loss / len(train_iter.data())


In [None]:
@torch.no_grad() # evaluation needs no autograd graph, leaving it out saves memory
def validate(model: torch.nn.Module, lossfunction: _Loss, iter: Iterator,log_results=False) -> \
      Tuple[float, float, float]:
  """Evaluate ``model`` on the batches of ``iter`` and score it with the SQuAD script.

  A loss value is collected for every example; per question id only the lowest
  one is kept and the mean of those is reported. The predicted answer spans are
  dumped to a JSON file, read back and handed to ``evaluate`` together with the
  dev set.

  :param model: network called as ``model(batch)``, returning two score tensors
  :param lossfunction: loss whose result is turned into a list with ``.tolist()``,
      i.e. it is expected to yield one value per example
  :param iter: iterable of batches exposing ``id``, ``a_start``, ``a_end`` and ``gt_answer``
  :param log_results: if true, predictions and their probabilities are also passed to ``write_results``
  :return: ``(loss, exact_match, f1)``
  """
  # evaluation mode (disables dropout etc.)
  model.eval()

  example_ids, example_losses = [], []
  predicted_spans, predicted_probs = [], []
  gt_spans = []

  for batch in iter:
      example_ids.extend(batch.id)

      # forward pass and per-example loss (start loss + end loss)
      start_scores, end_scores = model(batch)
      start_loss = lossfunction(start_scores, batch.a_start)
      end_loss = lossfunction(end_scores, batch.a_end)
      example_losses.extend((start_loss + end_loss).tolist())

      # turn the scores into the most probable answer spans
      best_span_probs, candidates, _ = decode(start_scores, end_scores)
      predicted_probs.extend(best_span_probs.tolist())
      predicted_spans.extend(get_spans(batch, candidates))
      gt_spans.extend(batch.gt_answer)

  # The same question id can occur several times (once per reference answer);
  # keep the smallest loss for it and the prediction seen last.
  results = {}
  lowest_loss = defaultdict(lambda: math.inf)
  probs = defaultdict(lambda: 0)
  for qid, loss_value, span, span_prob in zip(example_ids, example_losses,
                                              predicted_spans, predicted_probs):
      lowest_loss[qid] = min(lowest_loss[qid], loss_value)
      results[qid] = span
      probs[qid] = span_prob

  if log_results:
      write_results(results, probs)

  # mean of the best loss per question
  loss = sum(lowest_loss.values()) / len(lowest_loss)

  # store the predictions where the evaluation script can pick them up
  prediction_path = f"./drive/MyDrive/data/squad/dev_results_{socket.gethostname()}.json"
  with open(prediction_path, "w") as out_handle:
      json.dump(results, out_handle)

  # load the reference data, warning when its version is not the expected one
  dataset_path = "./drive/MyDrive/data/squad/dev-v1.1.json"
  expected_version = '1.1'
  with open(dataset_path) as dataset_handle:
      dataset_json = json.load(dataset_handle)
      if (dataset_json['version'] != expected_version):
          print('Evaluation expects v-' + expected_version +
                       ', but got dataset with v-' + dataset_json['version'],
                       file=sys.stderr)
      dataset = dataset_json['data']
  with open(prediction_path) as in_handle:
      predictions = json.load(in_handle)

  # official SQuAD metrics
  scores = evaluate(dataset, predictions)
  return loss, scores["exact_match"], scores["f1"]

In [None]:
def fit(config, device):
  """Train a ``Baseline`` model on SQuAD with early stopping.

  :param config: dict of hyper-parameters. Read here: ``char_embeddings``,
      ``embedding_size``, ``char_maxsize_vocab`` (only when the character field
      is not a ``RawField``), ``train_batch_size``, ``validation_batch_size``,
      ``learning_rate``, ``max_iterations`` and ``patience``. The whole dict is
      also passed on to ``Baseline``.
  :param device: torch device on which the batches and the model are placed
  :return: ``(best_model, best_val_loss, best_val_f1, best_em)``. ``best_model``
      is a CPU copy of the model taken whenever exact match improved, or
      ``None`` if that never happened.

  A ``KeyboardInterrupt`` stops training early and the best results so far are
  still returned. Any other exception propagates to the caller.
  """
  # print configuration
  print(json.dumps(config, indent=4, sort_keys=True))

  # prepare torchtext fields (different in case of character embeddings)
  if config["char_embeddings"]:
      fields = SquadDataset.prepare_fields_char()
  else:
      fields = SquadDataset.prepare_fields()

  # create train/validation datasets
  train, val = SquadDataset.splits(fields)
  fields = dict(fields)

  # we use the same field for question and document
  # we can build vocabulary of words it represents by calling build_vocab [this takes a while]
  # for each used word, we can pick the glove embedding and create an embedding matrix with index to embedding mapping
  fields["question"].build_vocab(train, val, vectors=GloVe(name='6B', dim=config["embedding_size"]))

  # similarly, we can build character vocabulary, if needed
  # (exact type comparison on purpose: subclasses of RawField must build a vocabulary)
  if type(fields["question_char"]) is not torchtext.data.field.RawField:
      fields["question_char"].build_vocab(train, val, max_size=config["char_maxsize_vocab"])

  # shuffle the examples to get the best distribution estimate
  train_iter = BucketIterator(train, sort_key=lambda x: -(len(x.question) + len(x.document)),
                              shuffle=True, sort=False, sort_within_batch=True,
                              batch_size=config["train_batch_size"], train=True,
                              repeat=False,
                              device=device)

  # sort validation examples for faster validation
  val_iter = BucketIterator(val, sort_key=lambda x: -(len(x.question) + len(x.document)), sort=True,
                            batch_size=config["validation_batch_size"],
                            repeat=False,
                            device=device)
  # create model
  model = Baseline(config, fields["question"].vocab).to(device)

  # create optimizer
  optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()),
                   lr=config["learning_rate"])

  # training uses the batch-averaged loss, validation needs one value per example
  train_loss_fn = CrossEntropyLoss()
  validation_loss_fn = CrossEntropyLoss(reduction='none')

  start_time = time.time()
  best_model = None
  best_val_loss = math.inf
  best_val_f1 = 0
  best_em = 0
  no_improvement = 0
  try:
      for it in range(config["max_iterations"]):
          print(f"Iteration {it}")
          # run training epoch
          train_epoch(model, train_loss_fn, optimizer, train_iter)

          # validate model
          validation_loss, em, f1 = validate(model, validation_loss_fn, val_iter)
          print(f"Validation loss/F1/EM: {validation_loss:.2f}, {f1:.2f}, {em:.2f}")

          # increment the patience counter
          no_improvement += 1

          # Update the best statistics if needed
          if validation_loss < best_val_loss:
              best_val_loss = validation_loss
              no_improvement = 0
          if f1 > best_val_f1:
              best_val_f1 = f1
          if em > best_em:
              best_em = em
              # snapshot on the CPU so that the copy does not occupy accelerator memory
              model = model.to(torch.device("cpu"))
              best_model = copy.deepcopy(model)
              # move back to the device the caller asked for (not a hard-coded "cuda")
              model = model.to(device)
          print(f"BEST L/F1/EM = {best_val_loss:.2f}/{best_val_f1:.2f}/{best_em:.2f}")

          # Early stopping
          # if the validation loss did not improve for several iterations
          # the training is finished
          if no_improvement >= config["patience"]:
              break

  except KeyboardInterrupt:
      print('-' * 120)
      print('Exit from training early.')
  finally:
      print(f'Finished after {(time.time() - start_time) / 60} minutes.')

  # returning outside of `finally` keeps unexpected errors visible to the caller
  return best_model, best_val_loss, best_val_f1, best_em

In [None]:
# List the content of the current working directory, hidden entries included.
!ls -a


In [None]:
# Hyper-parameters of the training run.
# Keys read directly by fit(): train_batch_size, validation_batch_size,
# embedding_size (dimension of the GloVe vectors), learning_rate,
# max_iterations, patience (number of epochs without validation-loss
# improvement after which training stops) and char_embeddings.
# The whole dict is also handed to Baseline(config, vocab); the remaining keys
# are presumably consumed there - TODO confirm.
# NOTE(review): "optimizer" is not consulted by fit(), which always builds Adam.
baseline_config = {"modelname": "baseline",
                   "train_batch_size": 40,
                   "validation_batch_size": 128,
                   "embedding_size": 100,
                   "optimize_embeddings": False,
                   "scale_emb_grad_by_freq": False,
                   "RNN_input_dim": 100,
                   "dropout_rate": 0.2,
                   "RNN_nhidden": 256,
                   "learning_rate": 5e-3,
                   "RNN_layers": 4,
                   "max_iterations": 100,
                   "optimizer": "adam",
                   "patience":2,
                   "char_embeddings": False}

# Train; fit() returns (best model, best validation loss, best F1, best EM).
model, validation_loss, f1, em = fit(baseline_config, device)

In [None]:
# Re-declares `baseline_config`, replacing the dict from the training cell.
# It differs from that one in dropout_rate (0.1 instead of 0.2) and
# RNN_nhidden (200 instead of 256).
baseline_config = {"modelname": "baseline",
                   "train_batch_size": 40,
                   "validation_batch_size": 128,
                   "embedding_size": 100,
                   "optimize_embeddings": False,
                   "scale_emb_grad_by_freq": False,
                   "RNN_input_dim": 100,
                   "dropout_rate": 0.1,
                   "RNN_nhidden": 200,
                   "learning_rate": 5e-3,
                   "RNN_layers": 4,
                   "max_iterations": 100,
                   "optimizer": "adam",
                   "patience":2,
                   "char_embeddings": False}
                   
def revalidate(config):
  """Rebuild the validation pipeline and print loss / F1 / EM of the global ``model``.

  The fields, datasets and vocabularies are recreated the same way ``fit`` does
  it, then the module-level ``model`` is evaluated on the validation split with
  batches placed on the module-level ``device``.

  :param config: dict providing ``char_embeddings``, ``embedding_size``,
      ``validation_batch_size`` and, when a character field is in use,
      ``char_maxsize_vocab``
  :return: ``None``; the metrics are printed
  """
  # print configuration
  print(json.dumps(config, indent=4, sort_keys=True))

  # prepare torchtext fields (different in case of character embeddings)
  if config["char_embeddings"]:
      fields = SquadDataset.prepare_fields_char()
  else:
      fields = SquadDataset.prepare_fields()

  # create train/validation datasets
  train, val = SquadDataset.splits(fields)
  fields = dict(fields)

  # we use the same field for question and document
  # we can build vocabulary of words it represents by calling build_vocab [this takes a while]
  # for each used word, we can pick the glove embedding and create an embedding matrix with index to embedding mapping
  fields["question"].build_vocab(train, val, vectors=GloVe(name='6B', dim=config["embedding_size"]))

  # build the character vocabulary as well when it is used, mirroring fit()
  if type(fields["question_char"]) is not torchtext.data.field.RawField:
      fields["question_char"].build_vocab(train, val, max_size=config["char_maxsize_vocab"])

  # sort validation examples for faster validation
  val_iter = BucketIterator(val, sort_key=lambda x: -(len(x.question) + len(x.document)), sort=True,
                            batch_size=config["validation_batch_size"],
                            repeat=False,
                            device=device)

  # fit() returns a CPU copy of the best model while the batches above live on
  # `device`; make sure both are on the same device before running the model
  model.to(device)

  validation_loss, em, f1 = validate(model, CrossEntropyLoss(reduction='none'), val_iter)
  print(f"Validation loss/F1/EM: {validation_loss:.2f}, {f1:.2f}, {em:.2f}")


In [None]:
# Re-run validation of the current global `model` with the config declared above.
revalidate(baseline_config)

# Validate, save and reload the model


In [None]:
# Persist the whole model object on Google Drive.
MODEL_SAVE_PATH = "./drive/MyDrive/data/saved/model_saved_4layers_256hidden_40tB_100embb.pt"

# local "saved" directory (target of the optional wget in the loading cell)
os.makedirs("saved", exist_ok=True)
# torch.save does not create missing directories, so make sure the real target exists
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
torch.save(model, MODEL_SAVE_PATH)

In [None]:
# Uncomment to download and load pre-trained model
#!wget -P "saved/" -nc "www.stud.fit.vutbr.cz/~ifajcik/bissit19/checkpoint_Baseline_EM_28.33_F1_39.01_L_4.78.pt"
# NOTE: torch.load unpickles the file, which can execute arbitrary code - only load checkpoints you trust.
# NOTE(review): this file name differs from the one written by the saving cell - confirm which checkpoint is meant.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime as well.
model = torch.load("./drive/MyDrive/data/saved/model_saved_4layers200hidden.pt", map_location=device)
model = model.to(device)
model = model.eval()

In [None]:
def count_parameters(model):
    """Return the total number of elements in the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
  
# Report how many trainable parameters the current global `model` has.
print(f"Models has {count_parameters(model)} parameters")

In [None]:
def write_results(results, probs, val_file="./drive/MyDrive/data/squad/dev-v1.1.json"):
  """Write one CSV row per validation question comparing prediction and ground truth.

  The file is created as ``results/result_<timestamp>_<hostname>.csv``.

  :param results: mapping question id -> predicted answer text; must contain
      every question id present in ``val_file`` (``KeyError`` otherwise)
  :param probs: mapping question id -> confidence of the prediction
  :param val_file: path of the SQuAD-format JSON file the questions are read from
  """
  os.makedirs("results", exist_ok=True)
  out_path = f"results/result_{get_timestamp()}_{socket.gethostname()}.csv"
  HEADER = ["Correct", "Ground Truth(s)", "Prediction", "Confidence", "Question", "Context", "Topic", "ID"]

  # read the reference data first, so that no half-written CSV is left behind
  # when the validation file is missing
  with open(val_file) as fd:
      data_json = json.load(fd)

  # newline="" is what the csv module expects from files it writes to;
  # the context manager closes the file even if a row fails
  with open(out_path, mode="w", newline="", encoding="utf-8") as f:
      csvw = csv.writer(f, delimiter=',')
      csvw.writerow(HEADER)
      for data_topic in data_json["data"]:
          for paragraph in data_topic["paragraphs"]:
              for question_and_answers in paragraph['qas']:
                  question_id = question_and_answers["id"]
                  prediction = results[question_id]
                  confidence = f"{probs[question_id]:.2f}"
                  gold_answers = [x['text'] for x in question_and_answers['answers']]
                  answers = "|".join(gold_answers)
                  # correct = case-insensitive exact match with any reference answer
                  correct = int(prediction.lower() in (gold.lower() for gold in gold_answers))
                  csvw.writerow([correct,
                                 answers,
                                 prediction,
                                 confidence,
                                 question_and_answers['question'],
                                 paragraph["context"],
                                 data_topic["title"],
                                 question_id])

# GoogleTrans


In [None]:
# Quick manual check of the retriever on a single Czech question.
# NOTE: `retrieve` is defined in a later cell of this notebook; that cell has to
# be executed before this one.
# NOTE(review): "porařilo" looks like a typo of "podařilo" - confirm.
res = retrieve("Kdy se porařilo první prokazatelné dosažení severního pólu ?")
for result in res:
  print(result)
  print()

In [None]:
# Download the majka database file (majka.w-lt) and move it to Google Drive,
# where majka.Majka(...) loads it from later in this notebook.
!curl --remote-name-all https://nlp.fi.muni.cz/ma{/majka.w-lt}
!mv majka.w-lt drive/MyDrive/data/

In [None]:
# Download the Czech SQAD v3 archive, move it to Google Drive and unpack it.
# NOTE(review): `tar -xf` without `-C` unpacks into the current working
# directory, while extract_que_ans reads from drive/MyDrive/data/cz_sqad/data -
# confirm that the paths line up.
!curl --remote-name-all https://lindat.cz/repository/xmlui/bitstream/handle/11234/1-3069{/sqad_v3.tar.xz}
!mv sqad_v3.tar.xz drive/MyDrive/data/
!tar -xf drive/MyDrive/data/sqad_v3.tar.xz

In [None]:
!pip install googletrans==4.0.0-rc1
!pip install wikipedia
!pip install rank_bm25
!pip install majka

from googletrans import Translator
translator = Translator()
import wikipedia
wikipedia.set_lang("cs") 
from rank_bm25 import BM25Okapi
import re
import majka

wiki retriever


In [None]:
def extract_que_ans(dirnum, filename, data_dir="drive/MyDrive/data/cz_sqad/data"):
  """Read the text of one file of a Czech SQAD record.

  The file is treated as tab-separated with one token per line. Only the first
  column of every line is used and the structural tags ``<s>``, ``<g/>`` and
  ``</s>`` are skipped.

  :param dirnum: name of the record directory inside ``data_dir``
  :param filename: file inside the record directory, e.g. ``"01question.vert"``
  :param data_dir: root directory holding the record directories
  :return: the tokens joined into one string, each token followed by a space
  """
  structural_tags = {"<s>", "<g/>", "</s>"}

  # explicit encoding: the data is Czech text; `with` closes the file on errors too
  with open(f"{data_dir}/{dirnum}/{filename}", "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

  words = (line.split("\t")[0] for line in lines)
  return "".join(word + " " for word in words if word not in structural_tags)

In [None]:
# Frequent Czech words and punctuation marks, each kept as a list of strings.
common = "být a se v na ten on že s z který mít do já o k i jeho ale svůj jako za moci rok pro tak po tento co když všechen už jak aby od nebo říci jeden jen můj jenž člověk ty stát u muset velký chtít také až než ještě při jít pak před dva však ani vědět nový hodně podle další celý jiný mezi dát tady den tam kde doba každý místo dobrý takový strana protože nic začít něco vidět říkat ne sám bez či dostat nějaký proto".split()
punctuation = ". , ? ! ... \" ( ) ; - /".split()

def iscommon(x):
  """Tell whether ``x`` is one of the entries of ``common`` or ``punctuation``."""
  return x in common or x in punctuation

def delete_common(tokens):
  """Return a new list holding the tokens for which ``iscommon`` is false."""
  kept = []
  for token in tokens:
    if iscommon(token):
      continue
    kept.append(token)
  return kept

In [None]:
def search_again(tokens):
  """Search Wikipedia with progressively shorter queries built from ``tokens``.

  The tokens are joined with spaces and searched. While nothing is found, the
  first token is left out and the search is repeated.

  :param tokens: list of query tokens; it is not modified
  :return: the first non-empty list returned by ``wikipedia.search``, or ``[]``
      when ``tokens`` is empty or no suffix of it yields a result
  """
  # iterate over the suffixes instead of deleting from the caller's list;
  # an empty token list never reaches the search API
  for start in range(len(tokens)):
    doc_list = wikipedia.search(' '.join(tokens[start:]))
    if len(doc_list) > 0:
      return doc_list

  return []

In [None]:
# Morphological analyser used by lemmatize(), loaded from the database file on Drive.
morph = majka.Majka('drive/MyDrive/data/majka.w-lt')
# NOTE(review): the three `|=` lines below are immediately followed by
# `morph.flags = 0`, so the analyser appears to end up with no flag set -
# confirm that this is intended.
morph.flags |= majka.ADD_DIACRITICS  # find word forms with diacritics
morph.flags |= majka.DISALLOW_LOWERCASE  # do not enable to find lowercase variants
morph.flags |= majka.IGNORE_CASE  # ignore the word case whatsoever
morph.flags = 0  # unset all flags

morph.tags = False  # return just the lemma, do not process the tags
morph.first_only = True  # return only the first entry
morph.negative = "ne"

def lemmatize(text):
  """Lower-case ``text``, split it into word tokens and lemmatize each of them.

  Every token is looked up with ``morph.find``. When the lookup returns
  nothing, the lower-cased token itself is kept.

  :param text: input string
  :return: list with one lemma (or original token) per non-empty token
  """
  # split the lower-cased text (the original split the untouched input,
  # silently dropping the lower-casing)
  tok_text = re.split(r"\W", text.lower())

  # lemmatize each token
  lemmatized_tokens = []
  for token in tok_text:
    if token == '':
      continue
    lemma = morph.find(token)
    if len(lemma) == 0:
      lemmatized_tokens.append(token)
    else:
      lemmatized_tokens.append(lemma[0]['lemma'])

  return lemmatized_tokens


In [None]:
def _bm25_tokens(text):
  """Lower-case ``text`` and split it on non-word characters, dropping empty tokens."""
  return [tok for tok in re.split(r"\W", text.lower()) if tok]


def retrieve(question, max_docs=3, top_n=3):
  """Find the Wikipedia paragraphs that best match ``question``.

  Wikipedia is searched with the question itself. If that yields nothing, the
  question is lemmatized, stripped of common words and searched again with
  progressively shorter queries. The first ``max_docs`` pages that can be
  loaded are split into paragraphs, which are ranked against the question
  with BM25.

  :param question: question text
  :param max_docs: maximum number of pages whose paragraphs are considered
  :param top_n: maximum number of paragraphs returned
  :return: list of up to ``top_n`` paragraphs, best match first; an empty list
      when no page or paragraph was found
  """
  # search for documents
  doc_list = wikipedia.search(question)

  # simplify the query if the plain question finds nothing
  if len(doc_list) == 0:
    tokens = delete_common(lemmatize(question))
    doc_list = search_again(tokens)

  if len(doc_list) == 0:
    return []

  # split docs into paragraphs
  pars = []
  num_docs = 0

  for title in doc_list:
    # get whole page content; an ambiguous title falls back to its first option,
    # pages that still cannot be loaded are skipped instead of aborting the search
    try:
      try:
        page = wikipedia.page(title)
      except wikipedia.DisambiguationError as e:
        page = wikipedia.page(e.options[0])
    except (wikipedia.DisambiguationError, wikipedia.PageError, IndexError):
      continue

    # section headings and blank lines separate the paragraphs
    for par in re.split(r'== .*. ==|\n\n', page.content):
      # drop surrounding whitespace and leftover '=' of deeper heading levels
      par = par.strip().strip('=').strip()
      if par:
        pars.append(par)

    num_docs += 1
    if num_docs >= max_docs:
      break

  # BM25 cannot be built over an empty corpus
  if not pars:
    return []

  # build index over the tokenized paragraphs
  bm25 = BM25Okapi([_bm25_tokens(par) for par in pars])

  # rank the paragraphs against the tokenized query
  return bm25.get_top_n(_bm25_tokens(question), pars, n=top_n)


In [None]:
# create very simple batch object
class simple_batch():
  """Minimal stand-in for a batch that holds exactly one example.

  Attributes set by the constructor:
    question / document: int64 tensors of shape (1, number of indices)
    raw_document_context: one-element list with the raw document text
    document_token_positions: one-element list with the token positions
  """
  def __init__(self,q,d,raw_d,d_pos,device=None):
    """
    :param q: sequence of vocabulary indices of the question
    :param d: sequence of vocabulary indices of the document
    :param raw_d: raw document text
    :param d_pos: positions of the document tokens inside ``raw_d``
    :param device: device for the tensors; defaults to CUDA when it is
        available and to the CPU otherwise
    """
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def as_single_example(indices):
      # build the integer tensor directly (no detour through float32) and
      # prepend the batch dimension
      return torch.as_tensor(indices, dtype=torch.long, device=device).unsqueeze(0)

    self.question = as_single_example(q)
    self.document = as_single_example(d)
    self.raw_document_context = [raw_d]
    self.document_token_positions = [d_pos]

final pipeline


In [None]:
# Answer SQAD questions end to end: retrieve Czech Wikipedia paragraphs,
# translate question and paragraph to English, let the reader model pick an
# answer span, translate the best answer back and log everything to a file.
# The context manager closes the log even when a step in between fails.
with open("drive/MyDrive/data/saved_answers/test.txt", "w", encoding="utf-8") as f:

  # write first question-answer pairs in sqad
  for i in range(1, 2):
    # record directory name: the number padded with zeros to six digits
    name = str(i).zfill(6)

    # extract from dataset
    question = extract_que_ans(name, "01question.vert")
    correct_answer = extract_que_ans(name, "09answer_extraction.vert")
    question_cs = question # save czech question

    # wiki search
    documents = retrieve(question)
    bestAnswers = []
    bestProbs = []
    bestDocs = []
    bestLogProbs = []

    # the English question is needed for every paragraph, translate it only once
    question_en = None

    # make sure the current vocab is model's vocab
    vocab = model.embedder.vocab

    # iterate over retrieved paragraphs
    for document in documents:
      document_cs = document.strip()

      # nothing to read; the "not found" record is written once, after the loop
      if document_cs == "":
        continue

      # remove some trash (checked before translating to save a request)
      if document_cs.startswith("Obrázky, zvuky či videa k tématu"):
        continue

      # translate
      try:
        document = translator.translate(document_cs, dest='en').text
        if question_en is None:
          question_en = translator.translate(question_cs, dest='en').text
      except TypeError:
        continue

      # tokenization
      document_tokens, tokenized_document_list = tokenize(document)
      tokenized_question_list = tokenize(question_en)[1]
      if len(tokenized_document_list) == 0:
        continue

      # keep positions of each token in document, we will need this later, when decoding model outputs
      document_token_positions = [[token.idx, token.idx + len(token.text)] for token in document_tokens]

      # lowercasing and numericalization
      numericalized_document = [vocab.stoi[s.lower()] for s in tokenized_document_list]
      numericalized_question = [vocab.stoi[s.lower()] for s in tokenized_question_list]

      batch = simple_batch(numericalized_question,numericalized_document,document,document_token_positions)

      # inference only: no autograd graph is needed
      with torch.no_grad():
        # get predictions with arg_maxes
        logprobs_S, logprobs_E, argmax_Q = model(batch, return_max=True)

        # decode from log probabilities to predictions
        best_span_prob, candidate, logprobs = decode(logprobs_S, logprobs_E)
      confidence = best_span_prob.item()
      log_conf = logprobs.item()

      # get answer
      answer = get_spans(batch, candidate)[0]

      # save document, answer and scores together so the lists stay aligned
      bestDocs.append(document_cs)
      bestAnswers.append(answer)
      bestProbs.append(confidence)
      bestLogProbs.append(log_conf)

    # check if any answer was found
    if len(bestProbs) == 0:
      f.write("question: " + question_cs + "\n" +
              "answer: odpověď nenalezena" + "\n" +
              "correct answer: " + correct_answer + "\n\n")
      continue

    # get best answer from retriever according to reader
    best = int(np.argmax(bestLogProbs))
    document = bestDocs[best]
    answer = bestAnswers[best]
    confidence = bestProbs[best]

    # translate the final answer
    answer = translator.translate(answer, dest='cs').text

    f.write("otázka č." + name + ": " + question_cs + "\n" +
            "odpověď: " + answer + "\n" +
            "správná odpověď podle sqad : " + correct_answer + "\n" +
            "----------------------------------------------------------------\n"+
            "získaný dokument: " + document +
            "\n----------------------------------------------------------------\n"+
            "----------------------------------------------------------------"+
            "\n\n")
    print("wrote: " + name)

    # print(f"The answer is: \"{answer}\".")
    # print(f"The model is confident with {confidence:.2f} probability.")