In [None]:
username = 'MarcelloCeresini'
repository = 'QuestionAnswering'

# COLAB ONLY CELLS
try:
    import google.colab
    IN_COLAB = True
    !pip3 install transformers
    !git clone https://www.github.com/{username}/{repository}.git
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd /content/QuestionAnswering/src
except:
    IN_COLAB = False

# Setup

 ## Imports & paths definitions

In [6]:
%matplotlib inline

from genericpath import exists
import os
from tqdm import tqdm
import random
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from functools import partial

from sklearn.feature_extraction.text import TfidfVectorizer

from config import Config
config = Config()
import utils

# Fix random seed for reproducibility
np.random.seed(config.RANDOM_SEED)
random.seed(config.RANDOM_SEED)
tf.random.set_seed(config.RANDOM_SEED)

from typing import List, Dict
#os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# Project root: the parent of the current working directory
ROOT_PATH = os.path.dirname(os.getcwd())

# Question-set files of the three splits
TRAINING_FILE = os.path.join(ROOT_PATH, 'data', 'training_set.json')
VALIDATION_FILE = os.path.join(ROOT_PATH, 'data', 'validation_set.json')
TEST_FILE = os.path.join(ROOT_PATH, 'data', 'dev_set.json')

# Model weights and working directories: Google Drive locations on Colab,
# repository-relative locations otherwise
if IN_COLAB:
    BEST_WEIGHTS_PATH = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_100_tpu_h5_cval/normal.h5'
    BERT_BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_BERT_100_tpu_h5_cval/bert.h5"
    checkpoint_dir = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/training_dpr/'
    datasets_dir = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/datasets/dpr/'
else:
    BEST_WEIGHTS_PATH = "./checkpoints/normal.h5"
    BERT_BEST_WEIGHTS_PATH = "./checkpoints/bert.h5"
    checkpoint_dir = os.path.join(ROOT_PATH, "data", "training_dpr")
    datasets_dir = os.path.join(checkpoint_dir, "dataset")

# Directory from which the pre-computed encodings are loaded later on
representations_dir = os.path.join(datasets_dir, 'representations')

# Make sure the working and results directories exist
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(datasets_dir, exist_ok=True)
os.makedirs(os.path.join(ROOT_PATH, 'data', 'results', 'dpr_results'), exist_ok=True)

## Preparing paragraphs and questions

In [7]:
# Read the 'data' entry of each split file (train, validation, test - in this order)
train_paragraphs_and_questions, val_paragraphs_and_questions, test_paragraphs_and_questions = (
    utils.read_question_set(split_file)['data']
    for split_file in (TRAINING_FILE, VALIDATION_FILE, TEST_FILE)
)

# Remove the validation set from the train set: keep only the train articles
# that do not also appear among the validation articles
train_paragraphs_and_questions = list(filter(
    lambda article: article not in val_paragraphs_and_questions,
    train_paragraphs_and_questions
))

In [8]:
from utils import get_questions_and_paragraphs

# Split every article list into its questions and its paragraphs
# (train, validation, test - in this order)
(
    (train_questions, train_paragraphs),
    (val_questions, val_paragraphs),
    (test_questions, test_paragraphs),
) = (
    get_questions_and_paragraphs(split_articles)
    for split_articles in (train_paragraphs_and_questions,
                           val_paragraphs_and_questions,
                           test_paragraphs_and_questions)
)

## Importing questions and paragraphs representations (according to DPR)

In [9]:
# Load the six pre-computed encoding arrays '<split>_<kind>_encodings.npy' from
# `representations_dir`: first the paragraph encodings of train/val/test, then
# the question encodings of train/val/test.
(
    train_paragraphs_encodings, val_paragraphs_encodings, test_paragraphs_encodings,
    train_questions_encodings, val_questions_encodings, test_questions_encodings,
) = (
    np.load(os.path.join(representations_dir, f'{split}_{kind}_encodings.npy'))
    for kind in ('paragraphs', 'questions')
    for split in ('train', 'val', 'test')
)

## Preparing vectorizers

In [10]:
# Three independent TfIdf vectorizers sharing the same configuration: accents
# stripped, lowercased text, terms appearing in more than 80% of the documents
# ignored, L2-normalized rows.
train_vectorizer, val_vectorizer, test_vectorizer = (
    TfidfVectorizer(strip_accents='unicode', lowercase=True, max_df=0.8, norm='l2')
    for _ in range(3)
)

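Fit the train and validation vectorizers on their own paragraphs while simultaneously creating the corresponding paragraph representations. The test vectorizer is instead fitted on the train and validation paragraphs and then used to transform the test paragraphs.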

In [11]:
# Train and validation: each vectorizer is fitted on the paragraphs of its own split
train_docs = train_vectorizer.fit_transform([paragraph['context'] for paragraph in train_paragraphs])
val_docs = val_vectorizer.fit_transform([paragraph['context'] for paragraph in val_paragraphs])

# Test: the vectorizer is fitted on train + validation paragraphs and only
# applied (not fitted) to the test paragraphs
test_vectorizer.fit([paragraph['context'] for paragraph in train_paragraphs] +
                    [paragraph['context'] for paragraph in val_paragraphs])
test_docs = test_vectorizer.transform([paragraph['context'] for paragraph in test_paragraphs])

## Utility functions

Some functions that can be used to facilitate the scoring of paragraphs with respect to a query question.

In [12]:
def score_documents(vectorizer, query, docs):
    '''
    Obtain the TfIdf scores between the question and the matrix of paragraphs.

    Args:
        vectorizer: fitted vectorizer exposing `transform(list_of_texts)` and
            returning a sparse matrix with one row per text.
        query: question dictionary; its text is read from query['qas']['question'].
        docs: sparse matrix of the vectorized paragraphs (one row per paragraph),
            produced by the same vectorizer.

    Returns:
        A 1-D numpy array containing one score per row of `docs`.
    '''
    question_text = query['qas']['question']
    # The question becomes a (sparse) matrix with dimensionality 1 x vocab_dim
    question_vector = vectorizer.transform([question_text])
    # All dot products at once. The `@` operator is used instead of np.dot because
    # np.dot is not aware of sparse containers: it only works by accident on
    # `spmatrix` objects and silently does the wrong thing on sparse arrays.
    return (docs @ question_vector.T).toarray().ravel()

def top_n_for_question(paragraphs, vectorizer, query, docs, n=5):
    '''
    Rank the paragraphs against the query question according to the vectorizer
    and keep the `n` best scoring ones.

    Returns a triple (paragraphs, scores, indices), each listing the `n` best
    entries by decreasing score.
    '''
    all_scores = score_documents(vectorizer, query, docs)
    # Sorting the negated scores yields the indices in descending score order
    best_indices = np.argsort(-all_scores)[:n]
    best_paragraphs = [paragraphs[position] for position in best_indices]
    return best_paragraphs, all_scores[best_indices], best_indices

def get_paragraph_encoding_index(question, dataset):
    '''
    Obtain the flat index, inside a specific dataset, of the paragraph the
    question refers to.

    `question['context_id']` holds the pair (article index, paragraph index
    within that article); the flat index is the number of paragraphs of all the
    preceding articles plus the paragraph index.
    '''
    article_position, paragraph_position = question['context_id']
    preceding_paragraphs = 0
    for earlier_article in range(article_position):
        preceding_paragraphs += len(dataset[earlier_article]['paragraphs'])
    return preceding_paragraphs + paragraph_position

# Dataset creation

We need to create a dataset to link each question to the **predicted** best paragraph.

First we create a generator that yields encoded pairs of the form (question - best predicted paragraph).

In [13]:
def predicted_paragraphs_dataset_generator(questions: List[Dict], predicted_paragraphs: List, 
                                            config: Config, return_question_id:bool=False):
    '''
    Yield, for every question, the tokenized pair (question text, text of the
    paragraph predicted for that question).

    `predicted_paragraphs[k]` is the paragraph associated with `questions[k]`.
    Each yielded item is the tokenizer output converted to a plain dict; when
    `return_question_id` is True the item is the tuple (dict, question id).
    '''
    for position, question in enumerate(questions):
        # Paragraph that was pre-selected for this question
        paragraph = predicted_paragraphs[position]
        # Question first, paragraph second; both padded/truncated to config.INPUT_LEN
        tokenized = config.tokenizer(
            question['qas']["question"],
            paragraph['context'],
            max_length=config.INPUT_LEN,
            truncation=True,
            padding='max_length',
            return_token_type_ids=config.bert,   # segment ids are requested only when config.bert is set
            return_attention_mask=True,          # marks padding tokens
        )
        model_inputs = dict(tokenized)
        if not return_question_id:
            yield model_inputs
            continue
        yield model_inputs, question['qas']['id']

Then we generate the "original" dataset containing only the text of the predicted paragraph and the offset mappings of its tokens, which is useful to retrieve the answer to the question.

In [14]:
def create_original_dataset_with_tf_idf(questions: List[Dict], 
                                        predicted_paragraphs: List,
                                        config: Config):
    '''
    Build the "original" dataset: for every question, the text of its predicted
    paragraph together with the offset mapping produced by the tokenizer for
    the (question, paragraph) pair.

    `predicted_paragraphs[k]` is the paragraph associated with `questions[k]`.
    Returns a tf.data.Dataset whose elements have the keys "context" and
    "offset_mapping".
    '''
    def _describe(position, question):
        # The paragraph is collected from those that were pre-predicted
        paragraph = predicted_paragraphs[position]
        tokenized = config.tokenizer(
            question['qas']["question"],
            paragraph["context"],
            max_length=config.INPUT_LEN,     # pad and truncate to this length
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=False,
            return_offsets_mapping=True,     # per-token first/last character positions
        )
        return {
            "context": paragraph["context"],
            "offset_mapping": tokenized["offset_mapping"],
        }

    records = [_describe(position, question) for position, question in enumerate(questions)]
    # Turn the list of per-question records into a dict of columns
    columns = pd.DataFrame.from_dict(records).to_dict(orient="list")
    return tf.data.Dataset.from_tensor_slices(columns)

Finally, we create the actual dataset using the generator we defined above.

In [15]:
def create_dataset_using_tf_idf_vectorizer( questions: List[Dict],
                                            predicted_paragraphs: List,
                                            config: Config  ) -> tf.data.Dataset:
    '''
    Create the dataset of tokenized (question, predicted paragraph) pairs, each
    paired with the ID of its question.

    Args:
        questions: the questions to encode.
        predicted_paragraphs: `predicted_paragraphs[k]` is the paragraph selected
            for `questions[k]`.
        config: provides the tokenizer, `INPUT_LEN` and the `bert` flag.

    Returns:
        A tf.data.Dataset of (features, question_id) elements whose cardinality
        is asserted to be `len(questions)`.
    '''
    # Create expected signature for the generator output. The generator pads and
    # truncates every sequence to config.INPUT_LEN, so the declared shape is read
    # from the configuration rather than hard-coded.
    feature_names = ['input_ids', 'attention_mask']
    if config.bert:
        # Segment ids are produced by the tokenizer only when config.bert is set
        feature_names.append('token_type_ids')
    features = {
        feature_name: tf.TensorSpec(shape=(config.INPUT_LEN,), dtype=tf.int32)
        for feature_name in feature_names
    }
    # The dataset contains the features and the question IDs (strings)
    signature = (features, tf.TensorSpec(shape=(), dtype=tf.string))
    # Instantiates a partial generator
    data_gen = partial(predicted_paragraphs_dataset_generator, 
        questions, predicted_paragraphs, config, return_question_id=True)
    # Creates the dataset with the computed signature
    dataset = tf.data.Dataset.from_generator(data_gen,
        output_signature=signature)
    # A generator-backed dataset has unknown length: declare it explicitly so
    # that len(dataset) works
    dataset = dataset.apply(tf.data.experimental.assert_cardinality(len(questions)))
    return dataset

We prepare some utility classes, objects and functions for handling the score mixing prior to the paragraph selection.

In [16]:
from enum import Enum, auto

class MixingType(Enum):
    '''Strategies for combining the DPR and TfIdf scores of the paragraphs.'''
    TF_IDF_ONLY = 1    # only the TfIdf scores are used
    DPR_ONLY = 2       # only the DPR scores are used
    SUM = 3            # plain sum of the two (max-normalized) scores
    MAX = 4            # element-wise maximum of the two (max-normalized) scores
    WEIGHTED_SUM = 5   # weighted sum of the two (standardized) scores

class DatasetType(Enum):
    '''Identifiers of the datasets on which predictions can be requested.'''
    TRAIN = 1
    VAL = 2
    TEST = 3
    TEST_WITH_TRAIN_VECT = 4

# Everything needed to run retrieval on each dataset: questions and paragraphs,
# their pre-computed encodings, the TfIdf vectorizer with the vectorized
# paragraphs, and the path of the question-set file.
# DatasetType.TEST_WITH_TRAIN_VECT has no entry in this dictionary.
datasets_info = {
    DatasetType.TRAIN: dict(
        questions=train_questions,
        question_encodings=train_questions_encodings,
        paragraphs=train_paragraphs,
        paragraph_encodings=train_paragraphs_encodings,
        vectorizer=train_vectorizer,
        docs_vectorized=train_docs,
        dataset_path=TRAINING_FILE,
    ),
    DatasetType.VAL: dict(
        questions=val_questions,
        question_encodings=val_questions_encodings,
        paragraphs=val_paragraphs,
        paragraph_encodings=val_paragraphs_encodings,
        vectorizer=val_vectorizer,
        docs_vectorized=val_docs,
        dataset_path=VALIDATION_FILE,
    ),
    DatasetType.TEST: dict(
        questions=test_questions,
        question_encodings=test_questions_encodings,
        paragraphs=test_paragraphs,
        paragraph_encodings=test_paragraphs_encodings,
        vectorizer=test_vectorizer,
        docs_vectorized=test_docs,
        dataset_path=TEST_FILE,
    ),
}

These functions handle all of the score-mixing options defined above.

In [17]:
def weighted_sum_func(dpr_scores, tf_idf_scores, op='normalize', w_dpr=0, w_tfidf=0):
    '''
    A general function for handling all weighted-sum-based mixing operations between scores.

    Args:
        dpr_scores: array-like with one DPR score per paragraph.
        tf_idf_scores: array-like with one TfIdf score per paragraph.
        op: 'normalize' divides each score vector by its maximum, 'standardize'
            subtracts its mean and divides by its standard deviation; any other
            value leaves the scores untouched.
        w_dpr: weight of the (rescaled) DPR scores.
        w_tfidf: weight of the (rescaled) TfIdf scores.

    Returns:
        The paragraph indices sorted by increasing mixed score (the best
        paragraph is the last element).
    '''
    def _rescale(values, scale):
        # A zero scale (e.g. a question sharing no term with any paragraph, or
        # constant scores) would turn the whole vector into NaN/inf, and NaN
        # poisons the weighted sum even when the corresponding weight is 0.
        # In that case the values are left as they are.
        return values / scale if scale != 0 else values

    dpr_scores = np.asarray(dpr_scores)
    tf_idf_scores = np.asarray(tf_idf_scores)
    if op == 'normalize':
        dpr_scores = _rescale(dpr_scores, np.max(dpr_scores))
        tf_idf_scores = _rescale(tf_idf_scores, np.max(tf_idf_scores))
    elif op == 'standardize':
        dpr_scores = _rescale(dpr_scores - np.mean(dpr_scores), np.std(dpr_scores))
        tf_idf_scores = _rescale(tf_idf_scores - np.mean(tf_idf_scores), np.std(tf_idf_scores))
    return np.argsort(dpr_scores*w_dpr + tf_idf_scores*w_tfidf)

def max_func(dpr_scores, tf_idf_scores):
    '''
    A general function for handling all max-based mixing operations between scores.

    Both score vectors are divided by their own maximum, then every paragraph
    receives the larger of its two rescaled scores.

    Args:
        dpr_scores: array-like with one DPR score per paragraph.
        tf_idf_scores: array-like with one TfIdf score per paragraph
            (same length as `dpr_scores`).

    Returns:
        The paragraph indices sorted by increasing mixed score (the best
        paragraph is the last element).

    NOTE(review): after the division each vector contains a 1.0 at its own
    best paragraph, so whenever the two vectors disagree on the best paragraph
    the two candidates tie and the argsort order decides between them.
    '''
    dpr_scores = np.asarray(dpr_scores)
    tf_idf_scores = np.asarray(tf_idf_scores)
    # A zero maximum (e.g. a question sharing no term with any paragraph) would
    # turn the whole vector into NaN/inf; such a vector is left as it is.
    dpr_peak = np.max(dpr_scores)
    if dpr_peak != 0:
        dpr_scores = dpr_scores / dpr_peak
    tf_idf_peak = np.max(tf_idf_scores)
    if tf_idf_peak != 0:
        tf_idf_scores = tf_idf_scores / tf_idf_peak
    # Vectorized element-wise maximum instead of a per-element Python loop
    return np.argsort(np.maximum(dpr_scores, tf_idf_scores))

def get_best_paragraph_for_question_using_mix_type(dataset_info, i, mix_type:MixingType, h=0.15):    
    '''
    A general function that is able to handle all mixing types transparently.
    It computes both DPR and TfIdf scores and sets up the parameters for mixing them
    according to the `mix_type` argument.

    Args:
        dataset_info: dictionary providing 'questions', 'question_encodings',
            'paragraph_encodings', 'vectorizer' and 'docs_vectorized'.
        i: index of the question inside `dataset_info['questions']`.
        mix_type: the mixing strategy to apply.
        h: the weighted sum hyperparameter, i.e. the weight of the TfIdf scores;
            the DPR scores get weight `1-h`. Only used by MixingType.WEIGHTED_SUM.

    Returns:
        The index of the best scoring paragraph.

    Raises:
        ValueError: if `mix_type` is not a supported MixingType.
    '''
    # Compute DPR scores: dot product of the question encoding with every paragraph encoding
    question_encoding = dataset_info['question_encodings'][i]
    dpr_scores = np.dot(question_encoding, dataset_info['paragraph_encodings'].T)
    # Compute TfIdf scores. `@` is used because np.dot is not aware of sparse containers.
    question_text = dataset_info['questions'][i]['qas']['question']
    question_vector = dataset_info['vectorizer'].transform([question_text])
    tfidf_scores = (dataset_info['docs_vectorized'] @ question_vector.T).toarray().ravel()
    # Both mixing functions return indices by increasing score: the best one is the last
    if mix_type is MixingType.MAX:
        return max_func(dpr_scores, tfidf_scores)[-1]
    # (DPR weight, TfIdf weight) of every weighted-sum-based strategy
    weights_by_type = {
        MixingType.DPR_ONLY: (1, 0),
        MixingType.SUM: (1, 1),
        MixingType.TF_IDF_ONLY: (0, 1),
        MixingType.WEIGHTED_SUM: (1-h, h),
    }
    if mix_type not in weights_by_type:
        # Fail with an explicit message instead of an UnboundLocalError on the weights
        raise ValueError(f"Unsupported mixing type: {mix_type!r}")
    w_dpr, w_tfidf = weights_by_type[mix_type]
    # Only the weighted sum works on standardized scores; the others on max-normalized ones
    op = 'standardize' if mix_type is MixingType.WEIGHTED_SUM else 'normalize'
    return weighted_sum_func(dpr_scores, tfidf_scores, op, w_dpr, w_tfidf)[-1]

Finally, we define the prediction function and start the evaluations:

In [18]:
def compute_predictions(best_weights_path:str, 
                         path_to_predictions_json:str,
                         config:Config,
                         mixing_type:MixingType=MixingType.WEIGHTED_SUM,
                         weighted_sum_h:float=0.15,
                         dataset_type:DatasetType=DatasetType.TEST,
                         hidden_state_list:List[int]=[3,4,5,6],
                         bert=False):
    '''
    Select a paragraph for every question of the requested dataset, run the
    question answering model on the (question, selected paragraph) pairs and
    save the predictions as JSON.

    Args:
        best_weights_path: path of the weights loaded into the model.
        path_to_predictions_json: path of the JSON file the predictions are written to.
        config: NOTE - this argument is not used: it is overwritten inside the
            function by a fresh `Config(bert=bert)`.
        mixing_type: how the DPR and TfIdf scores are mixed to select the paragraph.
        weighted_sum_h: weight of the TfIdf scores, forwarded to the paragraph
            selection function.
        dataset_type: key of `datasets_info` identifying the dataset to use.
        hidden_state_list: forwarded to `config.create_standard_model`.
            The default is a shared list object; this function does not modify it.
        bert: forwarded to `Config(bert=...)`.

    Raises:
        NotImplementedError: if `dataset_type` has no entry in `datasets_info`.

    The function returns nothing. Despite the final log message, it does not
    run the evaluation script itself.
    '''

    # Deal with dataset type
    print("Collecting the requested dataset and vectorizer...")
    if dataset_type in datasets_info:
        dataset = datasets_info[dataset_type]
    else:
        raise NotImplementedError("That dataset type does not exist. "
            "Change the dataset_type argument into one in the class DatasetType")
        
    # We pre-compute the predicted paragraph for each question in the set.
    print("Obtaining best paragraph for questions...")
    predicted_paragraphs = [dataset['paragraphs'][get_best_paragraph_for_question_using_mix_type(
                                dataset, i, mixing_type, weighted_sum_h)]
                            for i in tqdm(range(len(dataset['questions'])))]

    print("Creating model and dataset...")
    # The `config` argument is replaced here: only the `bert` flag decides the configuration
    config = Config(bert=bert)
    # Process questions
    tf_dataset = create_dataset_using_tf_idf_vectorizer(dataset['questions'], predicted_paragraphs, config)
    print("Number of samples: ", len(tf_dataset))
    tf_dataset = tf_dataset.batch(config.BATCH_SIZE)

    # Generate the original dataset that contains the original context and token-char mapping
    original_dataset = create_original_dataset_with_tf_idf(dataset['questions'], predicted_paragraphs, config)
    # Batched with the same size as tf_dataset
    original_dataset = original_dataset.batch(config.BATCH_SIZE)

    # Load model with the best obtained weights from the old project
    model = config.create_standard_model(hidden_state_list=hidden_state_list)
    model.load_weights(best_weights_path)

    # Predict the answers to the questions in the dataset
    print("Computing predictions...")
    predictions = utils.compute_predictions(tf_dataset, original_dataset, model)
    print(f"Done! Saving predictions at {path_to_predictions_json} and running evaluation script...")

    # Create a prediction file formatted like the one that is expected
    with open(path_to_predictions_json, 'w') as f:
        json.dump(predictions, f)

# Evaluation

We save the path of the test dataset since almost all evaluations will use it.

In [19]:
# Path of the test question-set file, as registered in datasets_info
TEST_DATASET_PATH = datasets_info[DatasetType.TEST]['dataset_path']

## Distilbert

### Tf-Idf-only evaluation

In [20]:
# Paragraph selection with the TfIdf scores only; model weights from BEST_WEIGHTS_PATH
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/tf_idf_only_test_predictions.json'
compute_predictions(BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.TF_IDF_ONLY)
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:49<00:00, 212.73it/s]


Creating model and dataset...
Number of samples:  10570


Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [04:38<00:00,  2.38it/s]


Done! Saving predictions at ../data/results/dpr_results/tf_idf_only_test_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 32.79091769157994,
  "f1": 42.08966814487265,
  "total": 10570,
  "HasAns_exact": 32.79091769157994,
  "HasAns_f1": 42.08966814487265,
  "HasAns_total": 10570
}


### DPR-only evaluation

In [21]:
# Paragraph selection with the DPR scores only; model weights from BEST_WEIGHTS_PATH
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/dpr_only_test_predictions.json'
compute_predictions(BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.DPR_ONLY)
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:47<00:00, 223.48it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [04:39<00:00,  2.36it/s]


Done! Saving predictions at ../data/results/dpr_results/dpr_only_test_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 30.57710501419111,
  "f1": 40.149694209963585,
  "total": 10570,
  "HasAns_exact": 30.57710501419111,
  "HasAns_f1": 40.149694209963585,
  "HasAns_total": 10570
}


### Sum between DPR and Tf-Idf scores evaluation

In [22]:
# Paragraph selection with the plain sum of DPR and TfIdf scores; model weights from BEST_WEIGHTS_PATH
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/sum_dpr_tf_idf_test_predictions.json'
compute_predictions(BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.SUM)
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:47<00:00, 220.90it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [04:35<00:00,  2.40it/s]


Done! Saving predictions at ../data/results/dpr_results/sum_dpr_tf_idf_test_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 36.51844843897824,
  "f1": 46.594106306624084,
  "total": 10570,
  "HasAns_exact": 36.51844843897824,
  "HasAns_f1": 46.594106306624084,
  "HasAns_total": 10570
}


### Max between DPR and Tf-Idf scores evaluation

In [23]:
# Paragraph selection with the maximum between DPR and TfIdf scores; model weights from BEST_WEIGHTS_PATH
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/max_dpr_tf_idf_test_predictions.json'
compute_predictions(BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.MAX)
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [01:06<00:00, 159.79it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [04:37<00:00,  2.38it/s]


Done! Saving predictions at ../data/results/dpr_results/max_dpr_tf_idf_test_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 31.598864711447494,
  "f1": 41.110529063689434,
  "total": 10570,
  "HasAns_exact": 31.598864711447494,
  "HasAns_f1": 41.110529063689434,
  "HasAns_total": 10570
}


### Weighted sum between DPR and Tf-Idf scores evaluation

In [24]:
# Paragraph selection with the weighted sum of DPR and TfIdf scores (default weighted_sum_h);
# model weights from BEST_WEIGHTS_PATH
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/weighted_sum_dpr_tf_idf_test_predictions.json'
compute_predictions(BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.WEIGHTED_SUM)
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:49<00:00, 215.04it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [04:39<00:00,  2.37it/s]


Done! Saving predictions at ../data/results/dpr_results/weighted_sum_dpr_tf_idf_test_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 44.210028382213814,
  "f1": 55.97332732000276,
  "total": 10570,
  "HasAns_exact": 44.210028382213814,
  "HasAns_f1": 55.97332732000276,
  "HasAns_total": 10570
}


## Bert

### Tf-Idf-only evaluation

In [25]:
# Paragraph selection with the TfIdf scores only; model weights from BERT_BEST_WEIGHTS_PATH.
# bert=True makes compute_predictions build its own Config(bert=True).
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/tf_idf_only_test_bert_predictions.json'
compute_predictions(BERT_BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.TF_IDF_ONLY, 
                    bert=True, hidden_state_list=[9,10,11,12])
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:44<00:00, 236.03it/s]


Creating model and dataset...


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Number of samples:  10570


Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [08:38<00:00,  1.27it/s]


Done! Saving predictions at ../data/results/dpr_results/tf_idf_only_test_bert_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 34.48438978240303,
  "f1": 43.71467771561827,
  "total": 10570,
  "HasAns_exact": 34.48438978240303,
  "HasAns_f1": 43.71467771561827,
  "HasAns_total": 10570
}


### DPR-only evaluation

In [26]:
# Paragraph selection with the DPR scores only; model weights from BERT_BEST_WEIGHTS_PATH.
# bert=True makes compute_predictions build its own Config(bert=True).
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/dpr_only_test_bert_predictions.json'
compute_predictions(BERT_BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.DPR_ONLY, 
                    bert=True, hidden_state_list=[9,10,11,12])
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:46<00:00, 227.79it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [08:43<00:00,  1.26it/s]


Done! Saving predictions at ../data/results/dpr_results/dpr_only_test_bert_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 31.807000946073792,
  "f1": 41.39915035613073,
  "total": 10570,
  "HasAns_exact": 31.807000946073792,
  "HasAns_f1": 41.39915035613073,
  "HasAns_total": 10570
}


### Sum between DPR and Tf-Idf scores evaluation

In [27]:
# Paragraph selection with the plain sum of DPR and TfIdf scores; model weights from BERT_BEST_WEIGHTS_PATH.
# bert=True makes compute_predictions build its own Config(bert=True).
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/sum_dpr_tf_idf_test_bert_predictions.json'
compute_predictions(BERT_BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.SUM, 
                    bert=True, hidden_state_list=[9,10,11,12])
print("------------------------------------------------------------------------------------")
print("Scores:")
# Shell call to the evaluation script, given the test set file and the predictions just saved
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:47<00:00, 224.84it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [08:33<00:00,  1.29it/s]


Done! Saving predictions at ../data/results/dpr_results/sum_dpr_tf_idf_test_bert_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 38.438978240302745,
  "f1": 48.43242543504654,
  "total": 10570,
  "HasAns_exact": 38.438978240302745,
  "HasAns_f1": 48.43242543504654,
  "HasAns_total": 10570
}


### Max of DPR and Tf-Idf scores evaluation

In [28]:
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/max_dpr_tf_idf_test_bert_predictions.json'
compute_predictions(BERT_BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.MAX, 
                    bert=True, hidden_state_list=[9,10,11,12])
print("------------------------------------------------------------------------------------")
print("Scores:")
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [01:09<00:00, 152.07it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [08:27<00:00,  1.30it/s]


Done! Saving predictions at ../data/results/dpr_results/max_dpr_tf_idf_test_bert_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 33.131504257332075,
  "f1": 42.52371311190534,
  "total": 10570,
  "HasAns_exact": 33.131504257332075,
  "HasAns_f1": 42.52371311190534,
  "HasAns_total": 10570
}


### Weighted sum of DPR and Tf-Idf scores evaluation

In [29]:
PATH_TO_PREDICTIONS_JSON = '../data/results/dpr_results/weighted_sum_dpr_tf_idf_test_bert_predictions.json'
compute_predictions(BERT_BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, config, mixing_type=MixingType.WEIGHTED_SUM, 
                    bert=True, hidden_state_list=[9,10,11,12])
print("------------------------------------------------------------------------------------")
print("Scores:")
!python eval/evaluate.py $TEST_DATASET_PATH $PATH_TO_PREDICTIONS_JSON

Collecting the requested dataset and vectorizer...
Obtaining best paragraph for questions...


100%|██████████| 10570/10570 [00:50<00:00, 208.53it/s]


Creating model and dataset...
Number of samples:  10570


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Computing predictions...


100%|██████████| 661/661 [08:29<00:00,  1.30it/s]


Done! Saving predictions at ../data/results/dpr_results/weighted_sum_dpr_tf_idf_test_bert_predictions.json and running evaluation script...
------------------------------------------------------------------------------------
Scores:
{
  "exact": 46.40491958372753,
  "f1": 58.028319798598396,
  "total": 10570,
  "HasAns_exact": 46.40491958372753,
  "HasAns_f1": 58.028319798598396,
  "HasAns_total": 10570
}
