# Requirments



In [None]:
"""
# get majka database
!curl --remote-name-all https://nlp.fi.muni.cz/ma{/majka.w-lt}
!mv majka.w-lt drive/MyDrive/data/
# download czech squad
!curl --remote-name-all https://lindat.cz/repository/xmlui/bitstream/handle/11234/1-3069{/sqad_v3.tar.xz}
!mv sqad_v3.tar.xz drive/MyDrive/data/
!tar -xf drive/MyDrive/data/sqad_v3.tar.xz
"""

In [None]:
!pip install progressbar2
!pip install sentencepiece
!pip install datasets transformers
!pip install googletrans==4.0.0-rc1
!pip install wikipedia
!pip install rank_bm25
!pip install majka

Importing important libraries

In [2]:
import torch
import string
import os
import sys
import time
import shutil
import json
import numpy as np
import collections
import datetime
from tqdm.auto import tqdm
import warnings
import json
import nltk.data
import nltk

from datasets import load_dataset, load_metric
from typing import List, Tuple, Dict
from collections import defaultdict
from transformers import AlbertTokenizerFast, AlbertForQuestionAnswering, TrainingArguments, Trainer, default_data_collator

from rank_bm25 import BM25Okapi, BM25Plus, BM25L
import re
import majka
import wikipedia
from googletrans import Translator
import requests

from google.colab import drive

warnings.filterwarnings("ignore")

nltk.download('punkt')

In [3]:
# Remove pre-cached sample data in colab's directory
if os.path.isdir("sample_data"):
  shutil.rmtree("sample_data")
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing SQUAD

In [None]:
# This flag is the difference between SQUAD v1 or 2 (if you're using another dataset, it indicates if impossible
# answers are allowed or not).
squad_v2 = True
if squad_v2:
  model_checkpoint = "./drive/MyDrive/albert_models/albert_squad2_finetuned"
else:
  model_checkpoint = "./drive/MyDrive/albert_models/albert_finetuned"
batch_size = 16

In [None]:
def prepare_train_features(examples):
    """
    This method has been borrowed from huggingface notebook
    https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb

    """

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

Create tokenizer, load squad dataset and prepare features for training

In [None]:
tokenizer = AlbertTokenizerFast.from_pretrained(model_checkpoint)
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

max_length=384
doc_stride=128
tokenized_datasets = datasets.map(prepare_train_features,
                                  batched=True, 
                                  remove_columns=datasets["train"].column_names)

# Albert model fine-tuning

Load model

In [None]:
model = AlbertForQuestionAnswering.from_pretrained(model_checkpoint)

Create trainer

In [None]:
# training arguments
args = TrainingArguments(
    f"./drive/MyDrive/data/checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)
data_collator = default_data_collator

# creating trainer class
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Train and save

In [None]:
trainer.train()

# saved model to stated dict
trainer.save_model("./drive/MyDrive/data/albert_finetuned")

# Model-reader evaluation

This part has been borrowed from Huggingface notebook
https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb

Prepare our validation dataset features

In [None]:
def prepare_validation_features(examples):
    """
    This method has been borrowed from huggingface notebook
    https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb

    """
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1
        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

Get model predictions on validation dataset

In [None]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """
    This method has been borrowed from huggingface notebook
    https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb

    """
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

Load features and predictions for validation

In [None]:
# get ground truth features
validation_features = datasets["validation"].map(prepare_validation_features,
                                                 batched=True,
                                                 remove_columns=datasets["validation"].column_names)

# get raw predictions
raw_predictions = trainer.predict(validation_features)

validation_features.set_format(type=validation_features.format["type"], 
                               columns=list(validation_features.features.keys()))

Evaluate

In [None]:
# hyperparameters
max_answer_length = 30
n_best_size = 20

# map examples to features
examples = datasets["validation"]
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

# get final predictions
final_predictions = postprocess_qa_predictions(datasets["validation"], validation_features, raw_predictions.predictions)
# get metric used
metric = load_metric("squad_v2" if squad_v2 else "squad")

#evaluate
if squad_v2:
    formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
else:
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]
metric.compute(predictions=formatted_predictions, references=references)

# Reader

In [4]:
class Reader():

  def __init__(self, model_checkpoint, max_answer_length=10, n_best_size=10, max_length=384, stride=128, use_cpu=False):
    # load all parameters of the reader
    self.max_answer_length = max_answer_length  # max answer span length
    self.n_best_size = n_best_size  # 
    self.max_length = max_length  # max count of tokens in one tokenized passage
    self.stride = stride  # the length of overlap between two mini-batches of tokenizer

    # choose device; cuda if available
    self.device = torch.device("cuda:0" if (torch.cuda.is_available() and use_cpu == False) else "cpu")

    # load tokenizer and model from pretrained checkpoint
    self.tokenizer = AlbertTokenizerFast.from_pretrained(model_checkpoint)
    # load model to device if possible
    self.model = AlbertForQuestionAnswering.from_pretrained(model_checkpoint).to(self.device)

    print("Model loaded from: " + model_checkpoint)
    print(f"Model has {self.count_parameters(self.model)} parameters")
    print("Device selected:")
    print(self.device)


  def decode(self, output, context, offset_mappings):
    """
    get the text span from the unnormalized log probabilities

    method has been partly borrowed from 
    https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb

    """
    # enumerate over all outputs (max output size is 500 tokens in a log prob tensor)
    valid_answers = []

    for i, _ in enumerate(output.start_logits):

      start_logits = output.start_logits[i].cpu().detach().numpy()
      end_logits = output.end_logits[i].cpu().detach().numpy()
      offset_mapping = offset_mappings[i]

      # Gather the indices the best start/end logits:
      start_indexes = np.argsort(start_logits)[-1 : - self.n_best_size - 1 : -1].tolist()
      end_indexes = np.argsort(end_logits)[-1 : - self.n_best_size - 1 : -1].tolist()
      for start_index in start_indexes:
          for end_index in end_indexes:
              # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
              # to part of the input_ids that are not in the context.
              if (
                  start_index >= len(offset_mapping)
                  or end_index >= len(offset_mapping)
                  or offset_mapping[start_index] is None
                  or offset_mapping[end_index] is None
              ):
                  continue
              # Don't consider answers with a length that is either < 0 or > max_answer_length.
              if end_index < start_index or end_index - start_index + 1 > self.max_answer_length:
                  continue
              if start_index <= end_index: # We need to refine that test to check the answer is inside the context
                  start_char = offset_mapping[start_index][0]
                  end_char = offset_mapping[end_index][1]
                  valid_answers.append(
                      {
                          "score": start_logits[start_index] + end_logits[end_index],
                          "text": context[start_char: end_char].strip()
                      }
                  )
    valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:self.n_best_size]
    return valid_answers


  def get_answers(self, question, context):
    """
    get the best answers from the context to the question 

    """
    inputs = self.tokenizer(question, context, 
                      return_tensors='pt',
                      truncation="only_second",
                      max_length=self.max_length, # to prevent cuda running out of memory
                      stride=self.stride,     # overlap within splitted long
                      return_offsets_mapping=True,
                      return_overflowing_tokens=True,
                      padding="max_length")
    inputs.to(self.device)

    outputs = self.model(inputs['input_ids'], 
                    token_type_ids=inputs['token_type_ids'],
                    attention_mask=inputs['attention_mask'])
    
    valid_answers = self.decode(outputs, context, inputs['offset_mapping'])
    
    return valid_answers


  def count_parameters(self, model):
    """
    Counts the parameters of the model

    """

    return sum(p.numel() for p in self.model.parameters() if p.requires_grad)
    

# Retriever


In [60]:
class Retriever():

  def __init__(self):
    wikipedia.set_lang("cs") 

    # save the most common czech words (stop words)
    common = "kdy být a se v na ten on že s z který mít do já o k i jeho ale svůj jako za moci pro tak po tento co když všechen už jak aby od nebo říci jeden jen můj jenž ty stát u muset chtít také až než ještě při jít pak před však ani vědět hodně podle další celý jiný mezi dát tady tam kde každý takový protože nic něco ne sám bez či dostat nějaký proto"
    self.common = common.split()

    # save punctuation to be removed
    punctuation = ". , ? ! ... \" ( ) ; - /"
    self.punctuation = punctuation.split()

    # majka lemmatizer settings
    self.morph = majka.Majka('drive/MyDrive/data/majka.w-lt')
    self.morph.flags = 0  # unset all flags
    self.morph.tags = False  # return just the lemma, do not process the tags
    self.morph.first_only = True  # return only the first entry
    self.morph.negative = "ne"

    # load wiki titles and build index for search
    self.bm25_articles_index, self.titles = self.get_title_search_index()

    # load tokenizer to split text into sentences
    self.tokenizer = nltk.data.load('tokenizers/punkt/czech.pickle')


  def get_title_search_index(self):
    """
    Build index for searching through relevant title names on czech wiki:

    """

    f = open("drive/MyDrive/data/wiki/cswiki-latest-all-titles-in-ns0", "r")
    titles = []

    for line in f: 
      title = ((" ").join(line.split("_"))).strip()
      title = title.strip('\n')
      titles.append(title)

    f.close()

    # tokenize for bm25
    tok_titles = []
    for title in titles:
      tok_tit = re.split(" ", title.lower())
      for tok in tok_tit:
        if tok == "":
          tok_tit.remove("")
      tok_titles.append(tok_tit)

    bm25 = BM25Okapi(tok_titles)

    return bm25, titles


  def search_titles(self, question):
    """
    Search with bm25 among the wiki titles

    """
    tokenized_query = self.delete_common(self.lemmatize(question.lower()))
    # print(tokenized_query)
    results = self.bm25_articles_index.get_top_n(tokenized_query, self.titles, n=5)

    return results


  def get_named_entities(self, question):
    """
    Extracts named entities from the question.

    """

    URL = "https://nlp.fi.muni.cz/projekty/ner/nerJSON.py"
    text = question
    PARAMS = {'text': text}
    r = requests.get(url = URL, params = PARAMS)
    data = r.json() 

    lemmatas = []

    if data != {}:
      for item in data:
        lemma = data[item]['lemma']
        lemmatas.append(lemma)

    return lemmatas

  
  def iscommon(self, x):
    """
    decides if query token is common

    """
    if x in self.common or x in self.punctuation:
      return True
    else:
      return False


  def delete_common(self, tokens):
    """
    Remove the most common czech words from the query tokens (low information value)

    """
    tokens = [x for x in tokens if not self.iscommon(x)]
        
    return tokens

  
  def lemmatize(self, text):
    """
    Returns lemma of each token in a list of lemmatized tokens

    """

    tok_text = text.lower()
    tok_text = re.split("\W", text)

    # lemmatize each token
    lemmatized_tokens = []
    for token in tok_text:
      if token == '':
        continue
      lemma = self.morph.find(token)
      if len(lemma) == 0:
        lemmatized_tokens.append(token)
      else:
        lemmatized_tokens.append(lemma[0]['lemma'])

    return lemmatized_tokens


  def search_again(self, tokens):
    """
    Performs repeated search in case wiki api didnt find any documents

    """
    searched_term = (' ').join(tokens)
    #print(searched_term)
    doc_list = wikipedia.search(searched_term, results=1)

    if len(tokens) == 0:
      return []

    if len(doc_list) == 0:
      del tokens[0]
      return self.search_again(tokens)

    return doc_list


  def get_doc_list(self, question):
    """
    Returns top 1-3 wiki arcitles that might answer the question topic

    """

    # get names entities if present
    named_ERs = self.get_named_entities(question)
    # get relevant article title names
    relevant_titles = self.search_titles(question)

    #search for documents
    max_docs = 1
    doc_list = []

    # search based on recognised named entity
    if len(named_ERs) > 0:
      article = wikipedia.search(named_ERs[0], results=max_docs)
      if len(article) > 0:
        doc_list.append(article[0])
    # search based on best wiki title match
    if len(relevant_titles) > 0:
      article = wikipedia.search(relevant_titles[0], results=max_docs)
      if len(article) > 0:
        doc_list.append(article[0])

    # basic search for the question
    article = wikipedia.search(question, results=max_docs)
    # simplify the search if its too bad
    if len(article) == 0:
      # extract important for wiki
      tokens = self.delete_common(self.lemmatize(question))
      article = self.search_again(tokens)
    doc_list.append(article[0])

    return doc_list

  
  def normalize_length(self, par):
    """
    Splits too long paragraph into smaller ones

    """

    #split long paragraph into sentences
    sentences = self.tokenizer.tokenize(par)

    normalized_pars = []
    new_paragraph = ""

    # iterate over sentences
    for idx, sentence in enumerate(sentences):
      
      if len(new_paragraph) + len(sentence) > 1500:
        normalized_pars.append(new_paragraph)
        new_paragraph = ""

        # make some overlap
        for k, trailing in enumerate(sentences[idx-2:idx]):
          new_paragraph += trailing

      else:
        new_paragraph += sentence
    
    return normalized_pars

  
  def split_documents(self, doc_list):
    """
    Splits each retrievede wiki article into paragraphs and normalizes its lengths

    """

    pars = []
    lemm_pars = []

    # iterate over articles and process each one
    for doc in doc_list:
      # get whole page content
      try:
        doc = wikipedia.page(doc)
      except wikipedia.DisambiguationError as e:
        s = e.options[0]
        try:
          doc = wikipedia.page(s)
        except wikipedia.DisambiguationError:
          continue
      
      # split article into paragraphs
      result = re.split('== .*. ==|\\n\\n', doc.content)

      # save stripped paragraphs
      for par in result:
        par = ((((par.strip()).strip('=')).strip('\n')).strip('\n\n')).strip('\r\n')

        # remove some trash
        if par == '' or par == '\n' or par.strip().startswith("Obrázky, zvuky či videa k tématu"):
          continue

        # check max paragraph length
        if len(par) > 1500:
          # split into smaller paragraphs
          normalized_paragraphs = self.normalize_length(par)
          # append each smaller paragraph
          for norm_par in normalized_paragraphs:
            pars.append(norm_par)
            lemm_pars.append((' ').join(self.delete_common(self.lemmatize(norm_par.lower()))))
        else:
          # append paragraph
          pars.append(par)

          # get lemmas and append
          lemm_pars.append((' ').join(self.delete_common(self.lemmatize(par.lower()))))

    return pars, lemm_pars

  def retrieve(self, question):  
    """
    Returns the top 3 paragraphs for the given question

    """
    # max question length
    if len(question) > 250:
      return ""
    # strip questionmark
    question = question.strip('?')

    # TODO SPEED THIS UP
    # get wiki documents
    doc_list = self.get_doc_list(question)

    # convert to set to only work with unique article names
    doc_list = set(doc_list)

    # TODO SPEED THIS UP
    # split docs into paragraphs
    pars, lemm_pars = self.split_documents(doc_list)

    # tokenize for bm25
    tok_text = []
    for par in lemm_pars:
      tok_par = re.split("\W", par)
      for tok in tok_par:
        if tok == "":
          tok_par.remove("")
      tok_text.append(tok_par)

    # build index
    bm25 = BM25Plus(tok_text)
    # bm25 = BM25Okapi(tok_text)

    # tokenize and lemmatize the query
    tokenized_query = (' ').join(self.delete_common(self.lemmatize(question.lower())))
    tokenized_query = re.split("\W", tokenized_query)

    # get results
    results = bm25.get_top_n(tokenized_query, pars, n=3)

    return results, doc_list


  @staticmethod
  def count_log_conf(best_answer, all_answers):
    """
    Returns the sum of log probs 
    
    """

    log_conf = 0
    for answer in all_answers:
      if (best_answer in answer['text']) or (answer['text'] in best_answer):
        log_conf += answer['score']
    
    return log_conf


# Preprocessing SQAD

In [12]:
class SqadDataset():

  def __init__(self, sqad_dir, save_dir="./sqad_processed", process_boolean=False):
    self.save_dir = save_dir
    self.sqad_dir = sqad_dir
    self.process_boolean = process_boolean

  def extract_answer(self, dirnum):
    """
    Parse the answer of current dataset record.
    Returns the parsed answer and its lemma.

    """

    f = open(f"{self.sqad_dir}/{dirnum}/09answer_extraction.vert", "r")

    q = f.read().split("\n")
    answer = ""
    answer_lemma = ""

    # parse answer
    for line in q:
      # split into columns
      line = line.split("\t")

      # end sign
      if line[0] == "</s>":
        break
      
      # get answer and its lemma
      line_a = line[0]
      if len(line) > 1:
        line_a_lemma = line[1]
        if line_a_lemma == "[number]":
          line_a_lemma = line[0]

      # process special signs
      if line_a in {"<s>", "<g/>", "</s>"}:
        answer = answer[:-1]
        answer_lemma = answer_lemma[:-1]
        continue

      # append answer
      answer += line_a + " "
      answer_lemma += line_a_lemma + " "
    
    f.close()
    return answer, answer_lemma


  def extract_question(self, dirnum):
    """
    Parse the answer of current dataset record.
    Returns the parsed answer and its lemma.
    
    """

    f = open(f"{self.sqad_dir}/{dirnum}/01question.vert", "r")

    q = f.read().split("\n")
    question = ""

    for line in q:
      line = line.split("\t")
      # end sign
      if line[0] == "</s>" and question[-1] == "?":
        break

      line = line[0]
      if line in {"<s>", "<g/>", "</s>"}:
        question = question[:-1]
        continue
      
      if line != "?":
        question += line + " "
      else:
        question += "? "

    f.close()
    return question


  def process_dataset(self, from_q, to_q):
    """
    Process the questions and answers from the sqad dataset and save it as a json file

    Process the dataset from record from_q to record to_q
    
    """

    sqad_dataset = {}
    counter = from_q

    for i in tqdm(range(from_q, to_q+1)):

      # get question number
      q_number = ""
      for _ in range(len(str(i)), 6):
        q_number += "0"
      q_number += str(i)

      # extract from dataset
      question = self.extract_question(q_number)
      correct_answer, lemmatized_answer = self.extract_answer(q_number)

      # exclude yes/no questions
      if not self.process_boolean:
        if correct_answer.strip() == "ano" or correct_answer.strip() == "ne":
          continue

      # save data
      data = {}
      data["question"] = question
      data["answer"] = correct_answer
      data["answer_lemma"] = lemmatized_answer

      sqad_dataset[counter] = data
      counter += 1

    # save extracted data as json
    with open(self.save_dir, "w") as f:
      json.dump(sqad_dataset, f)
      print("Sqad dataset has been processed to: " + self.save_dir)


  @staticmethod
  def load_sqad(saved_dataset_file):
    """
    Loads the saved json daataset as a dictionary
    
    """

    # load preprocessed sqad dataset
    with open(saved_dataset_file) as f: 
        data = json.load(f)
    print("Sqad dataset loaded from: " + saved_dataset_file)

    return data

In [None]:
sqad = SqadDataset(sqad_dir="drive/MyDrive/data/cz_sqad/data/", save_dir="drive/MyDrive/data/test.json")
sqad.process_dataset(100, 150)

HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Sqad dataset has been processed to: drive/MyDrive/data/test.json


# Loading model


Choose model checkpoint

In [13]:
squad_v2 = False
if squad_v2:
  model_checkpoint = "./drive/MyDrive/albert_models/albert_squad2_finetuned"
else:
  model_checkpoint = "./drive/MyDrive/albert_models/albert_finetuned"

# albert xlarge pretrained on squad v2 for experiments
# model_checkpoint = "ktrapeznikov/albert-xlarge-v2-squad-v2"

Create reader



In [14]:
reader = Reader(model_checkpoint)

Model loaded from: ./drive/MyDrive/albert_models/albert_finetuned
Model has 11094530 parameters
Device selected:
cuda:0


Create retriever

In [61]:
retriever = Retriever()

# Final pipeline

In [33]:
def translate(question_cs, documents_cs, translator):
  """
  Translates the czech question and documents and returns
  question and list of documents in english

  """
  # we concatenate the question with all the documents to be translated at once
  # so we minimize the number of requests for googletrans
  delimiter = " _____ "
  concatenated = question_cs

  for doc in documents_cs:
    concatenated += delimiter
    concatenated += doc

  # and translate as a whole
  concatenated = translator.translate(concatenated, src='cs', dest='en').text
  # and split again
  delimiter = "_____"
  concatenated = concatenated.split(delimiter)
  
  # get translated question and doc
  question = concatenated[0]
  documents = concatenated[1:]

  return question, documents

In [17]:
def find_answer(question, reader, retriever, translator):
  """
  Finds the answer to the question

  """

  question_cs = question # save czech question

  # wiki search
  documents_cs, article_list = retriever.retrieve(question)

  # for saving the best results
  bestAnswers = []
  bestDocs = []
  bestLogProbs = []
  bestSummedLogProbs = []

  # translate question and documents for reader
  question, documents = translate(question_cs, documents_cs, translator)

  # iterate over retrieved paragraphs
  for idx, document in enumerate(documents):
    # strip whitespaces
    document = document.strip()

    # chceck if any document has been found for the question
    if document == "":
      continue;

    #get answer -------------------------------------------
    answers = reader.get_answers(question, document)

    # choose valid answer
    answer = ''
    for answer in answers:
      if answer['text'] != '':
        log_conf = answer['score']
        answer = answer['text']
        log_conf_summed = Retriever.count_log_conf(answer, answers)
        break
    #######################################################

    # save probs and answer
    bestAnswers.append(answer)
    bestLogProbs.append(log_conf)
    bestSummedLogProbs.append(log_conf_summed)
    # save retrieved doc
    bestDocs.append(documents_cs[idx])

  ############################################################
  # check if any answer was found
  if len(bestLogProbs) == 0 or bestAnswers[np.argmax(bestLogProbs, axis=0)] == '':
    return ""

  # get the best doc
  # get best answer from retriever according to reader
  document = bestDocs[np.argmax(bestLogProbs, axis=0)]
  answer = bestAnswers[np.argmax(bestLogProbs, axis=0)]

  # translate the final answer
  answer_en = answer
  answer =  translator.translate(answer, src='en', dest='cs').text

  return answer, answer_en, document, bestAnswers, bestLogProbs, bestSummedLogProbs, article_list

# Evaluation on  sqad

In [44]:
def sqad_eval(from_q, to_q, data, save_results_to):
  """
  Evaluates the model on sqad dataset
  """
  # create translator
  translator = Translator()

  # write results to
  f = open(save_results_to, "w")

  # for counting correct answers
  score = 0

  for i in tqdm(range(from_q, to_q+1)):

    # get from dataset
    question = data[str(i)]["question"]
    correct_answer = data[str(i)]["answer"]
    lemmatized_answer = data[str(i)]["answer_lemma"]

    # get answer and other info to the specific question
    answer, answer_en, document, bestAnswers, bestLogProbs, bestSummedLogProbs, article_list = find_answer(question, reader, retriever, translator)

    # write the result to file
    f.write("----------------------------------------------------------------\n"+
            "-----------------------| otázka č." + str(i) + " |------------------------\n" +
            "----------------------------------------------------------------\n"+
            "::otázka : " + question + "\n" +
            "::odpověď: " + answer + " / " + answer_en + "\n" + 
            "::správná odpověď podle sqad : " + correct_answer + "\n" +
            "::lemma odpovědi podle sqad : " + lemmatized_answer + "\n\n" +
            "----------------------------------------------------------------\n"+
            "získaný dokument: " + document + 
            "\n----------------------------------------------------------------\n")
    # what answers did we get over-all - debugging info
    for listitem in bestAnswers:
      f.write('%s ;; ' % listitem)
    f.write("\n")
    for listitem in bestLogProbs:
      f.write('%s ;; ' % listitem)
    f.write("\n")
    for listitem in bestSummedLogProbs:
      f.write('%s ;; ' % listitem)
    f.write("\n")
    for listitem in article_list:
      f.write('%s ;; ' % listitem)
    f.write("\n\n\n")

    # convert to lowercase
    answer = answer.lower()
    answer_en = answer_en.lower()
    correct_answer = correct_answer.lower()
    lemmatized_answer = lemmatized_answer.lower()
    # increment score, if we got any match between the original and retrieved answer
    if (answer in correct_answer or correct_answer in answer or
        answer in lemmatized_answer or lemmatized_answer in answer or 
        answer_en in lemmatized_answer or lemmatized_answer in answer_en):
      score += 1

  # close file descriptor
  f.close()

  # get the count of questions answered for score calculation
  answered_count = to_q - from_q + 1

  print("done")
  print("score: " + str(score) + "/" + str(answered_count))

In [45]:
# load preprocessed sqad dataset
data = SqadDataset.load_sqad("drive/MyDrive/data/sqad_processed_without_yes_no.json")

Sqad dataset loaded from: drive/MyDrive/data/sqad_processed_without_yes_no.json


Run the evaluation and save the results to the chosen file

In [50]:
save_to = "drive/MyDrive/data/saved_answers/saved_answers_albert/test.txt"

sqad_eval(100, 200, data, save_to)

HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))


done
score: 52/101


# Ask a question

In [64]:
# create translator
translator = Translator()

question = "Kdy byla udělena první ceny Emmy?"
answer = find_answer(question, reader, retriever, translator)

if len(answer) == 0:
  print(answer)
else:
  print(answer[0])
  print()
  #print(answer[1])
  print(answer[2])

25. ledna 1949

Cena Emmy (anglicky Emmy Award) představuje každoročně udělované americké televizní ocenění, televizní obdoba filmových Oscarů.
Jsou vedeny ve více odvětvích amerického televizního průmyslu a jsou předávány na rozdílných každoročních ceremoniích během celého roku. Nejznámějšími předáváními jsou Primetime Emmys a Daytime Emmys, které hodnotí výjimečnou práci v americkém zábavném programu v průběhu dne a večera. Mezi další známé ceremonie cen Emmy patří ty, které hodnotí sportovní vysílání, národní televizní zprávy a dokumentární pořady, národní pracovní a finanční zprávy a technologické a inženýrské úspěchy v televizi.
Tři podobné, ale oddělené organizace organizují cen Emmy: akademie televizního umění a věd (ATAS), národní akademie televizního umění a věd (NATAS) a mezinárodní akademie televizního umění a věd (IATAS). Každá z organizací je zodpovědná za vytváření specifické složky na předávání cen.
První cena Emmy byla udělena 25. ledna 1949 při premiérovém slavnostním 