# Predictions and evaluation notebook

## Setup

In [None]:
# PRIVATE CELL
git_token = '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'
username = 'MarcelloCeresini'
repository = 'QuestionAnswering'

# COLAB ONLY CELLS
try:
    import google.colab
    IN_COLAB = True
    !pip3 install transformers
    !nvidia-smi             # Check which GPU has been chosen for us
    !rm -rf logs
    !git clone https://{git_token}@github.com/{username}/{repository}
    %cd {repository}
    %ls
except:
    IN_COLAB = False

In [None]:
# Colab only: mount Google Drive under /content/drive/ and switch the working
# directory to the cloned repository's `src` folder.
# NOTE(review): presumably `src` is where `config`, `utils` and `eval/evaluate.py`
# used by later cells live - confirm against the repository layout.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd /content/QuestionAnswering/src

## Model definition for NER attention

In [3]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from transformers import TFDistilBertModel
from transformers.models.distilbert.modeling_tf_distilbert import TFMultiHeadSelfAttention as MHSA

# Index of the transformer block whose self-attention is replaced by
# TFInjectMultiHeadSelfAttention in QuestionAnsweringModel.__init__.
CHOSEN_ENHANCED_LAYER = 0
# Indices into the transformer's `hidden_states` output; the selected states are
# concatenated along the last axis and fed to the start/end dense heads.
CHOSEN_OUTPUT_STATES_IDX = [3, 4, 5, 6]

class TFInjectMultiHeadSelfAttention(MHSA):
    '''
    DistilBERT multi-head self-attention whose keys are rescaled by an external
    per-token NER weight before the regular attention computation runs.

    The weights are not part of the `call` signature (which must stay identical
    to the parent's so the transformer block can call it unchanged); they have
    to be stored with `load_NER_attention` before every forward pass.
    '''

    def load_NER_attention(self, NER_attention):
        '''
        Stores the NER weights that the next `call` will apply to the keys.

        - NER_attention: tensor with one weight per token; a trailing axis is
          added to it in `call` so that it broadcasts over the feature axis of
          `key` (presumably shaped (batch, sequence_length) - TODO confirm).
        '''
        self.NER_attention = NER_attention

    def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
        '''
        Scales `key` by the stored NER weights and delegates everything else to
        the parent attention implementation. Raises AttributeError if
        `load_NER_attention` was never called on this layer.
        '''
        # Match the key dtype so an integer mask or a mixed-precision policy
        # does not make the multiplication fail on a dtype mismatch.
        ner_weights = tf.cast(self.NER_attention, key.dtype)
        # Add a trailing axis so each token weight broadcasts over its features.
        key = key * tf.expand_dims(ner_weights, axis=-1)
        return super().call(query, key, value, mask, head_mask, output_attentions, training=training)

class QuestionAnsweringModel(keras.Model):
    '''
    Question answering model built on a DistilBERT transformer in which the
    self-attention of block `CHOSEN_ENHANCED_LAYER` is replaced by
    `TFInjectMultiHeadSelfAttention`, so that an extra "NER_attention" input can
    rescale the keys of that block.

    Two single-unit dense heads score every token as answer start / answer end;
    each score vector is flattened and passed through a softmax.
    '''

    def __init__(self, transformer_model: TFDistilBertModel) -> None:
        '''
        - transformer_model: the DistilBERT model to wrap. It is modified in
          place: the attention layer of the chosen block is swapped out.
        '''
        super(QuestionAnsweringModel, self).__init__()

        self.transformer_model = transformer_model
        # Apply layer change to the chosen attention block.
        # NOTE(review): the replacement is a newly constructed layer, so it
        # presumably does not carry the pretrained weights of the attention it
        # replaces - confirm that the checkpoint loaded afterwards covers it.
        self.transformer_model.layers[0].transformer.layer[CHOSEN_ENHANCED_LAYER].attention = \
            TFInjectMultiHeadSelfAttention(transformer_model.config)
        
        # Add all remaining layers
        self.dense_S = layers.Dense(1)
        self.dense_E = layers.Dense(1)
        self.flatten = layers.Flatten()
        self.softmax_S = layers.Softmax(name='out_S')
        self.softmax_E = layers.Softmax(name='out_E')

    def call(self, inputs, training=False):
        '''
        Inputs:
        - inputs: a dictionary with the keys "input_ids", "attention_mask" and
          "NER_attention".
        - training: accepted for Keras compatibility; it is not forwarded
          explicitly to the transformer.

        Returns the tuple `(out_S, out_E)`: the softmax over the flattened start
        scores and the softmax over the flattened end scores.
        '''
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        NER_attention = inputs["NER_attention"]

        # Load the NER tensor into the custom layer. The block index must be the
        # same one used in __init__, otherwise a plain attention layer (without
        # load_NER_attention) would be addressed.
        self.transformer_model.layers[0].transformer.layer[CHOSEN_ENHANCED_LAYER] \
            .attention.load_NER_attention(NER_attention)

        out = self.transformer_model(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
            }
        )

        # NOTE(review): `hidden_states` is only populated when the transformer
        # is configured to output hidden states - confirm in Config.get_transformer.
        hidden_states = out.hidden_states
        chosen_states_idx = CHOSEN_OUTPUT_STATES_IDX

        # Concatenate the selected hidden states along the feature axis
        chosen_hidden_states = tf.concat([hidden_states[i] for i in chosen_states_idx], axis=2)

        out_S = self.dense_S(chosen_hidden_states) # dot product between token representation and start vector
        out_S = self.flatten(out_S)
        out_S = self.softmax_S(out_S)

        out_E = self.dense_E(chosen_hidden_states) # dot product between token representation and end vector
        out_E = self.flatten(out_E)
        out_E = self.softmax_E(out_E)

        return out_S, out_E

## Prediction function

A single function builds the model, loads the given weights into it, writes the predictions to disk and scores them with SQuAD's official evaluation script.

In [4]:
import os
import sys
import json
from typing import List

from config import Config
import utils

def predict_and_evaluate(DATASET_PATH:str, 
                         BEST_WEIGHTS_PATH:str, 
                         PATH_TO_PREDICTIONS_JSON:str,
                         hidden_state_list:List[int]=[3,4,5,6],
                         use_NER_attention=False, NER_value=0,
                         bert=False):
    '''
    Predicts the answers to the dataset provided in `DATASET_PATH` using the
    selected weights (`BEST_WEIGHTS_PATH`), saves the predictions into
    `PATH_TO_PREDICTIONS_JSON` and executes SQuAD's evaluation script
    (`eval/evaluate.py`, resolved against the current working directory) to get
    the exact match accuracy and F1 score. The script's output is printed.

    Inputs:
    - DATASET_PATH: path of the JSON question set to evaluate.
    - BEST_WEIGHTS_PATH: path of the weights file loaded into the model.
    - PATH_TO_PREDICTIONS_JSON: path where the predictions are written as JSON.
    - hidden_state_list: hidden states used by the standard model; ignored when
        `use_NER_attention` is True (that model uses `CHOSEN_OUTPUT_STATES_IDX`).
    - use_NER_attention: if True, `QuestionAnsweringModel` is used instead of
        the standard model and the dataset is built with the NER feature.
    - NER_value: forwarded to `utils.create_dataset_from_generator`.
    - bert: forwarded to `Config`.

    Returns nothing.
    '''
    # Local import: keeps this cell self-contained and the function usable
    # outside IPython (no `!` shell magic needed).
    import subprocess

    config = Config(bert=bert)
    # Read dataset (JSON file)
    data = utils.read_question_set(DATASET_PATH)
    # Process questions
    dataset = utils.create_dataset_from_generator(data, config, for_training=False,
        use_NER_attention=use_NER_attention, NER_value=NER_value)
    # Generate the original dataset that contains the original context and token-char mapping
    original_dataset = utils.create_original_dataset(data, config)
    original_dataset = original_dataset.batch(config.BATCH_SIZE)
    print("Number of samples: ", len(dataset))
    dataset = dataset.batch(config.BATCH_SIZE)
    # Load model
    if not use_NER_attention:
        # Pass a copy so the shared default list can never be mutated downstream
        model = config.create_standard_model(hidden_state_list=list(hidden_state_list))
    else:
        model = QuestionAnsweringModel(config.get_transformer())
        # A subclassed model needs to be called at least once before loading the weights, 
        # because it needs to create the graph
        for sample in dataset.take(1):
            model(sample[0])
    # Load best model weights
    model.load_weights(BEST_WEIGHTS_PATH)
    # Predict the answers to the questions in the dataset
    predictions = utils.compute_predictions(dataset, original_dataset, model)
    # Create a prediction file formatted like the one that is expected
    with open(PATH_TO_PREDICTIONS_JSON, 'w') as f:
        json.dump(predictions, f)

    # Run the evaluation script with the interpreter of this kernel. Passing the
    # arguments as a list avoids shell word-splitting of the paths.
    evaluation = subprocess.run(
        [sys.executable, 'eval/evaluate.py', DATASET_PATH, PATH_TO_PREDICTIONS_JSON],
        capture_output=True, text=True
    )
    # The child's streams are captured, so echo them into the cell output
    print(evaluation.stdout)
    if evaluation.stderr:
        print(evaluation.stderr, file=sys.stderr)

## Evaluations

### Normal model (TPU)

#### Validation set

In [None]:
# Standard model with the default hidden states, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_100_tpu_h5_cval/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/normal_predictions_val_tpu_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
1409it [09:24,  2.50it/s]


{
  "exact": 48.70645662303084,
  "f1": 69.03455794280707,
  "total": 22535,
  "HasAns_exact": 48.70645662303084,
  "HasAns_f1": 69.03455794280707,
  "HasAns_total": 22535
}


#### Test set

In [None]:
# Standard model with the default hidden states, evaluated on the test split
# (stored in dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_100_tpu_h5_cval/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/normal_predictions_test_tpu_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON)

Number of samples:  10570


100%|██████████| 48/48 [00:06<00:00,  7.42it/s]
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
661it [04:19,  2.55it/s]

{
  "exact": 61.173131504257334,
  "f1": 75.52867201553052,
  "total": 10570,
  "HasAns_exact": 61.173131504257334,
  "HasAns_f1": 75.52867201553052,
  "HasAns_total": 10570
}


### Separate layer models

#### Layer 6

##### Validation set

In [None]:
# Model that uses only hidden state 6, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_6/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_6_val_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[6])

Number of samples:  22535


100%|██████████| 111/111 [00:13<00:00,  8.04it/s]
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
1409it [09:22,  2.51it/s]

{
  "exact": 48.320390503660974,
  "f1": 68.38944666755968,
  "total": 22535,
  "HasAns_exact": 48.320390503660974,
  "HasAns_f1": 68.38944666755968,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Model that uses only hidden state 6, evaluated on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_6/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_6_test_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[6])

Number of samples:  10570


100%|██████████| 48/48 [00:06<00:00,  7.35it/s]
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
661it [04:24,  2.50it/s]

{
  "exact": 60.74739829706717,
  "f1": 75.1497476516914,
  "total": 10570,
  "HasAns_exact": 60.74739829706717,
  "HasAns_f1": 75.1497476516914,
  "HasAns_total": 10570
}


#### Layer 5

##### Validation set

In [5]:
# Model that uses only hidden state 5, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_5/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_5_val_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[5])

100%|██████████| 111/111 [00:15<00:00,  6.98it/s]


Number of samples:  22535


Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 1409/1409 [14:26<00:00,  1.63it/s]


{
  "exact": 47.8411360106501,
  "f1": 67.92389418729441,
  "total": 22535,
  "HasAns_exact": 47.8411360106501,
  "HasAns_f1": 67.92389418729441,
  "HasAns_total": 22535
}


##### Test set

In [6]:
# Model that uses only hidden state 5, evaluated on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_5/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_5_test_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[5])

100%|██████████| 48/48 [00:07<00:00,  6.03it/s]


Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 661/661 [06:51<00:00,  1.61it/s]


{
  "exact": 60.23651844843898,
  "f1": 74.57716815141067,
  "total": 10570,
  "HasAns_exact": 60.23651844843898,
  "HasAns_f1": 74.57716815141067,
  "HasAns_total": 10570
}


#### Layer 4

##### Validation set

In [7]:
# Model that uses only hidden state 4, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_4/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_4_val_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[4])

100%|██████████| 111/111 [00:16<00:00,  6.64it/s]


Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 1409/1409 [12:06<00:00,  1.94it/s]


{
  "exact": 46.78943865098735,
  "f1": 66.21026117006551,
  "total": 22535,
  "HasAns_exact": 46.78943865098735,
  "HasAns_f1": 66.21026117006551,
  "HasAns_total": 22535
}


##### Test set

In [8]:
# Model that uses only hidden state 4, evaluated on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_4/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_4_test_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[4])

100%|██████████| 48/48 [00:08<00:00,  5.97it/s]


Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 661/661 [05:42<00:00,  1.93it/s]


{
  "exact": 57.59697256385998,
  "f1": 72.40476885312422,
  "total": 10570,
  "HasAns_exact": 57.59697256385998,
  "HasAns_f1": 72.40476885312422,
  "HasAns_total": 10570
}


#### Layer 3

##### Validation set

In [9]:
# Model that uses only hidden state 3, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_3/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_3_val_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[3])

100%|██████████| 111/111 [00:16<00:00,  6.84it/s]


Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 1409/1409 [09:48<00:00,  2.39it/s]


{
  "exact": 41.54426447747947,
  "f1": 61.10454005294163,
  "total": 22535,
  "HasAns_exact": 41.54426447747947,
  "HasAns_f1": 61.10454005294163,
  "HasAns_total": 22535
}


##### Test set

In [10]:
# Model that uses only hidden state 3, evaluated on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_3/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_3_test_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[3])

100%|██████████| 48/48 [00:08<00:00,  5.46it/s]


Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 661/661 [04:37<00:00,  2.38it/s]


{
  "exact": 51.81646168401135,
  "f1": 66.88629030679493,
  "total": 10570,
  "HasAns_exact": 51.81646168401135,
  "HasAns_f1": 66.88629030679493,
  "HasAns_total": 10570
}


#### Layer 2

##### Validation set

In [11]:
# Model that uses only hidden state 2, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_2/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_2_val_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[2])

100%|██████████| 111/111 [00:17<00:00,  6.46it/s]


Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 1409/1409 [07:52<00:00,  2.98it/s]


{
  "exact": 33.12624805857555,
  "f1": 51.57614379132821,
  "total": 22535,
  "HasAns_exact": 33.12624805857555,
  "HasAns_f1": 51.57614379132821,
  "HasAns_total": 22535
}


##### Test set

In [12]:
# Model that uses only hidden state 2, evaluated on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_2/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_2_test_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[2])

100%|██████████| 48/48 [00:08<00:00,  5.91it/s]


Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 661/661 [03:40<00:00,  2.99it/s]


{
  "exact": 39.71617786187323,
  "f1": 55.37620955911666,
  "total": 10570,
  "HasAns_exact": 39.71617786187323,
  "HasAns_f1": 55.37620955911666,
  "HasAns_total": 10570
}


#### Layer 1

##### Validation set

In [13]:
# Model that uses only hidden state 1, evaluated on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_1/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_1_val_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[1])

100%|██████████| 111/111 [00:16<00:00,  6.66it/s]


Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 1409/1409 [05:48<00:00,  4.04it/s]


{
  "exact": 9.496339028178388,
  "f1": 20.43760797464446,
  "total": 22535,
  "HasAns_exact": 9.496339028178388,
  "HasAns_f1": 20.43760797464446,
  "HasAns_total": 22535
}


##### Test set

In [14]:
# Model that uses only hidden state 1, evaluated on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_1/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_1_test_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[1])

100%|██████████| 48/48 [00:08<00:00,  5.99it/s]


Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 661/661 [02:43<00:00,  4.04it/s]


{
  "exact": 10.491958372753075,
  "f1": 20.898479815610294,
  "total": 10570,
  "HasAns_exact": 10.491958372753075,
  "HasAns_f1": 20.898479815610294,
  "HasAns_total": 10570
}


### Bert model (TPU)

#### Validation set

In [17]:
# BERT configuration (bert=True) using hidden states 9-12, evaluated on the
# validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_BERT_100_tpu_h5_cval/training_BERT_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/bert_predictions_val_tpu_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[9,10,11,12], bert=True)

100%|██████████| 111/111 [00:20<00:00,  5.35it/s]


Number of samples:  22535


Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 1409/1409 [31:07<00:00,  1.33s/it]


{
  "exact": 51.12935433769692,
  "f1": 71.45761788040576,
  "total": 22535,
  "HasAns_exact": 51.12935433769692,
  "HasAns_f1": 71.45761788040576,
  "HasAns_total": 22535
}


#### Test set

In [18]:
# BERT configuration (bert=True) using hidden states 9-12, evaluated on the
# test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_BERT_100_tpu_h5_cval/training_BERT_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/bert_predictions_test_tpu_CORRECT.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[9,10,11,12], bert=True)

100%|██████████| 48/48 [00:08<00:00,  5.97it/s]


Number of samples:  10570


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 661/661 [14:39<00:00,  1.33s/it]


{
  "exact": 64.33301797540209,
  "f1": 78.58992810021775,
  "total": 10570,
  "HasAns_exact": 64.33301797540209,
  "HasAns_f1": 78.58992810021775,
  "HasAns_total": 10570
}


### Ensemble DistilBert + Bert

In [20]:
import tensorflow as tf
from tqdm import tqdm

def compute_ensemble_predictions(dataset: tf.data.Dataset,
                                 original_dataset: tf.data.Dataset,
                                 model: List):
    '''
    Computes the answers predicted by a two-model (BERT + DistilBERT) ensemble.

    For every batch both models predict their start/end outputs, the outputs
    are summed element-wise, the answer span is extracted from the sums and the
    answer text is sliced out of the original context using the character
    offsets of the selected tokens.

    Inputs:
    - dataset: a batched `tf.data.Dataset` whose elements are indexed as
        `(features, question_ids)`; `features` is the dictionary of model
        inputs and `question_ids` holds UTF-8 encoded byte strings.
    - original_dataset: a `tf.data.Dataset`, batched and ordered like `dataset`,
        which contains the original `context` and the `offset_mapping`
        (start/end character of each token) of every sample.
    - model: a list of two models in the order `[bert_model, distilbert_model]`.
        The first one receives the full `features` dictionary, the second one
        receives the same dictionary without the `token_type_ids` entry.

    Returns:
    - a dictionary that maps each question id (`str`) to the predicted answer
        text (`str`).
    '''
    bert_model, distilbert_model = model[0], model[1]
    predictions = {}
    # For each sample we can extract from the dataset (it can be a single element or
    # a batch)
    for sample, original_sample in tqdm(zip(dataset, original_dataset), total=len(dataset)):
        contexts = original_sample["context"].numpy()
        offsets = original_sample["offset_mapping"].numpy()

        # We let both models predict the start/end outputs given the input features
        features = sample[0]
        pstartv_bert, pendv_bert = bert_model.predict(features)
        # The second model is fed everything except `token_type_ids`. A filtered
        # copy is used so that the batch dictionary itself is left untouched.
        distilbert_features = {
            name: tensor
            for name, tensor in features.items()
            if name != "token_type_ids"
        }
        pstartv_distil, pendv_distil = distilbert_model.predict(distilbert_features)

        # We sum the outputs of the two models
        pstartv = pstartv_bert + pstartv_distil
        pendv = pendv_bert + pendv_distil

        # We obtain the span (start token, end token) from the summed outputs
        predicted_limits = utils.start_end_token_from_probabilities(
            pstartv, pendv
        )
        # Question ids come out of the dataset as bytes
        question_ids = [x.decode('utf-8') for x in sample[1].numpy()]

        # Finally, we produce the output dictionary for the batch
        for i, question_id in enumerate(question_ids):
            start_token = predicted_limits[i][0]
            end_token = predicted_limits[i][1]
            # The answer goes from the first character of the start token
            # to the last character of the end token in the original context
            start_char = offsets[i][start_token, 0]
            end_char = offsets[i][end_token, 1]
            predictions[question_id] = contexts[i].decode()[start_char:end_char]

    return predictions

def predict_and_evaluate_ensemble(DATASET_PATH:str,
                         BEST_WEIGHTS_PATH:List,
                         PATH_TO_PREDICTIONS_JSON:str,
                         hidden_state_list:List=None):
    '''
    Uses the DistilBERT + BERT ensemble to predict the answers to the dataset
    provided in `DATASET_PATH` and saves the predictions as JSON into
    `PATH_TO_PREDICTIONS_JSON`.

    NOTE: despite its name this function does NOT run the evaluation script:
    it only writes the predictions file. The evaluation must be launched
    separately on that file.

    Inputs:
    - DATASET_PATH: path of the JSON question set to answer.
    - BEST_WEIGHTS_PATH: list of two weight files in the order
        `[distilbert_weights, bert_weights]`.
    - PATH_TO_PREDICTIONS_JSON: path of the file where the predictions
        dictionary is dumped.
    - hidden_state_list: list of two lists in the order
        `[distilbert_states, bert_states]`, each forwarded to
        `config.create_model` as `hidden_state_list`. Defaults to
        `[[3,4,5,6], [9,10,11,12]]`.

    Returns:
    - None; the only result is the file written to `PATH_TO_PREDICTIONS_JSON`.
    '''
    # A fresh default is built on every call instead of sharing one mutable
    # list object between calls.
    if hidden_state_list is None:
        hidden_state_list = [[3,4,5,6], [9,10,11,12]]

    config = Config()
    # Read dataset (JSON file)
    data = utils.read_question_set(DATASET_PATH)
    # Process questions (token_type_ids are requested here; they are removed
    # again in compute_ensemble_predictions before the second model is called)
    dataset = utils.create_dataset_from_generator(data, config, token_type_ids=True, for_training=False)
    print("Number of samples: ", len(dataset))
    dataset = dataset.batch(config.BATCH_SIZE)

    # Create the original dataset with the same order as the processed one
    original_dataset = utils.create_original_dataset(data, config)
    original_dataset = original_dataset.batch(config.BATCH_SIZE)

    # Load models (first argument of create_model: False for the DistilBERT
    # model, True for the BERT one)
    distilbert_model = config.create_model(False, hidden_state_list=hidden_state_list[0])
    bert_model = config.create_model(True, hidden_state_list=hidden_state_list[1])

    # Load best model weights
    distilbert_model.load_weights(BEST_WEIGHTS_PATH[0])
    bert_model.load_weights(BEST_WEIGHTS_PATH[1])

    # Predict the answers to the questions in the dataset; the model list must
    # be ordered [bert, distilbert], as expected by compute_ensemble_predictions
    predictions = compute_ensemble_predictions(dataset, original_dataset, [bert_model, distilbert_model])

    # Create a prediction file formatted like the one that is expected
    with open(PATH_TO_PREDICTIONS_JSON, 'w') as f:
        json.dump(predictions, f)

#### Validation set

In [None]:
# Ensemble (DistilBERT + BERT) run on the validation split.
DATASET_PATH = "/content/drive/MyDrive/NLP/data/validation_set.json"
BEST_WEIGHTS_PATH_BERT = "/content/drive/MyDrive/NLP/weights/training_BERT_tpu_last.h5"
BEST_WEIGHTS_PATH_DISTILBERT = "/content/drive/MyDrive/NLP/weights/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/NLP/data/ensemble_predictions_val_tpu_CORRECT.txt'

# The weights list is ordered [DistilBERT, BERT]: predict_and_evaluate_ensemble
# loads BEST_WEIGHTS_PATH[0] into the DistilBERT model and [1] into the BERT one.
predict_and_evaluate_ensemble(DATASET_PATH=DATASET_PATH, 
                              BEST_WEIGHTS_PATH=[BEST_WEIGHTS_PATH_DISTILBERT, BEST_WEIGHTS_PATH_BERT], 
                              PATH_TO_PREDICTIONS_JSON=PATH_TO_PREDICTIONS_JSON)
# predict_and_evaluate_ensemble only writes the predictions file, so the
# evaluation script is run here on the dataset and on that file.
!python /content/eval/evaluate.py $DATASET_PATH $PATH_TO_PREDICTIONS_JSON

#### Test set


In [None]:
# Ensemble (DistilBERT + BERT) run on the test split (dev_set.json).
DATASET_PATH = "/content/drive/MyDrive/NLP/data/dev_set.json"
BEST_WEIGHTS_PATH_BERT= "/content/drive/MyDrive/NLP/weights/training_BERT_tpu_last.h5"
BEST_WEIGHTS_PATH_DISTILBERT = "/content/drive/MyDrive/NLP/weights/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/NLP/data/ensemble_predictions_test_tpu_CORRECT.txt'

# The weights list is ordered [DistilBERT, BERT]: predict_and_evaluate_ensemble
# loads BEST_WEIGHTS_PATH[0] into the DistilBERT model and [1] into the BERT one.
predict_and_evaluate_ensemble(DATASET_PATH=DATASET_PATH, 
                              BEST_WEIGHTS_PATH=[BEST_WEIGHTS_PATH_DISTILBERT, BEST_WEIGHTS_PATH_BERT], 
                              PATH_TO_PREDICTIONS_JSON=PATH_TO_PREDICTIONS_JSON)
# predict_and_evaluate_ensemble only writes the predictions file, so the
# evaluation script is run here on the dataset and on that file.
!python /content/eval/evaluate.py $DATASET_PATH $PATH_TO_PREDICTIONS_JSON

### NER model

#### NER weight offset = 0.2

##### Validation set

In [None]:
# NER-attention run with NER weight offset 0.2 on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,2/training_NER_02_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_02_predictions_val.txt'
# Same value that is encoded in the weights path above (hyperparameter_0,2 / NER_02).
NER_VALUE = 0.2

# NOTE(review): predict_and_evaluate is defined outside this cell; use_NER_attention
# and NER_value presumably enable the NER-enhanced attention with this offset -- confirm.
predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [30:21<00:00,  5.16s/it]


{
  "exact": 40.29731528733082,
  "f1": 61.10515720895817,
  "total": 22535,
  "HasAns_exact": 40.29731528733082,
  "HasAns_f1": 61.10515720895817,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# NER-attention run with NER weight offset 0.2 on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,2/training_NER_02_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_02_predictions_test.txt'
# Same value that is encoded in the weights path above (hyperparameter_0,2 / NER_02).
NER_VALUE = 0.2

# NOTE(review): predict_and_evaluate is defined outside this cell; use_NER_attention
# and NER_value presumably enable the NER-enhanced attention with this offset -- confirm.
predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [13:53<00:00,  5.02s/it]


{
  "exact": 51.36234626300851,
  "f1": 68.03648925049636,
  "total": 10570,
  "HasAns_exact": 51.36234626300851,
  "HasAns_f1": 68.03648925049636,
  "HasAns_total": 10570
}


#### NER weight offset = 0.4

##### Validation set

In [None]:
# NER-attention run with NER weight offset 0.4 on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
# NOTE(review): this directory is spelled "hyperparameter_0.4" (dot) while the 0.2
# and 0.6 runs use a comma ("hyperparameter_0,2", "hyperparameter_0,6") -- verify
# that it matches the actual folder name on Drive.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0.4/training_NER_04_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_04_predictions_val.txt'
# Same value that is encoded in the weights path above (hyperparameter_0.4 / NER_04).
NER_VALUE = 0.4

# NOTE(review): predict_and_evaluate is defined outside this cell; use_NER_attention
# and NER_value presumably enable the NER-enhanced attention with this offset -- confirm.
predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [30:21<00:00,  5.16s/it]


{
  "exact": 40.35944086975815,
  "f1": 61.296511079528514,
  "total": 22535,
  "HasAns_exact": 40.35944086975815,
  "HasAns_f1": 61.296511079528514,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# NER-attention run with NER weight offset 0.4 on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
# NOTE(review): this directory is spelled "hyperparameter_0.4" (dot) while the 0.2
# and 0.6 runs use a comma ("hyperparameter_0,2", "hyperparameter_0,6") -- verify
# that it matches the actual folder name on Drive.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0.4/training_NER_04_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_04_predictions_test.txt'
# Same value that is encoded in the weights path above (hyperparameter_0.4 / NER_04).
NER_VALUE = 0.4

# NOTE(review): predict_and_evaluate is defined outside this cell; use_NER_attention
# and NER_value presumably enable the NER-enhanced attention with this offset -- confirm.
predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [14:21<00:00,  5.19s/it]


{
  "exact": 51.863765373699145,
  "f1": 68.43574692147928,
  "total": 10570,
  "HasAns_exact": 51.863765373699145,
  "HasAns_f1": 68.43574692147928,
  "HasAns_total": 10570
}


#### NER weight offset = 0.6

##### Validation set

In [None]:
# NER-attention run with NER weight offset 0.6 on the validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,6/training_NER_06_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_06_predictions_val.txt'
# Same value that is encoded in the weights path above (hyperparameter_0,6 / NER_06).
NER_VALUE = 0.6

# NOTE(review): predict_and_evaluate is defined outside this cell; use_NER_attention
# and NER_value presumably enable the NER-enhanced attention with this offset -- confirm.
predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  22535


Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [24:21<00:00,  4.14s/it]


{
  "exact": 40.32837807854449,
  "f1": 61.11215226289375,
  "total": 22535,
  "HasAns_exact": 40.32837807854449,
  "HasAns_f1": 61.11215226289375,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# NER-attention run with NER weight offset 0.6 on the test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,6/training_NER_06_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_06_predictions_test.txt'
# Same value that is encoded in the weights path above (hyperparameter_0,6 / NER_06).
NER_VALUE = 0.6

# NOTE(review): predict_and_evaluate is defined outside this cell; use_NER_attention
# and NER_value presumably enable the NER-enhanced attention with this offset -- confirm.
predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [11:21<00:00,  4.11s/it]


{
  "exact": 51.57048249763481,
  "f1": 68.12658908682066,
  "total": 10570,
  "HasAns_exact": 51.57048249763481,
  "HasAns_f1": 68.12658908682066,
  "HasAns_total": 10570
}
