# Predictions and evaluation notebook

## Setup

In [None]:
# PRIVATE CELL
git_token = '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'
username = 'MarcelloCeresini'
repository = 'QuestionAnswering'

# COLAB ONLY CELLS
try:
    import google.colab
    IN_COLAB = True
    !pip3 install transformers
    !nvidia-smi             # Check which GPU has been chosen for us
    !rm -rf logs
    !git clone https://{git_token}@github.com/{username}/{repository}
    %cd {repository}
    %ls
except:
    IN_COLAB = False

In [None]:
# On Colab only: mount Google Drive (several evaluation cells below read weights
# from and write predictions to /content/drive/...) and move into the repo's src/ folder.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')
    # Presumably the local imports (`config`, `utils`) and the relative
    # `eval/evaluate.py` path used later are resolved from this directory.
    %cd /content/QuestionAnswering/src

## Model definition for NER attention

In [3]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from transformers import TFDistilBertModel
from transformers.models.distilbert.modeling_tf_distilbert import TFMultiHeadSelfAttention as MHSA

# Index of the transformer block whose self-attention is replaced by
# TFInjectMultiHeadSelfAttention in QuestionAnsweringModel.__init__.
CHOSEN_ENHANCED_LAYER = 0
# Indices into the transformer's `hidden_states` output that
# QuestionAnsweringModel.call concatenates before the start/end dense heads.
CHOSEN_OUTPUT_STATES_IDX = [3, 4, 5, 6]

class TFInjectMultiHeadSelfAttention(MHSA):
    '''
    Multi-head self-attention layer whose keys are rescaled by an externally
    provided NER tensor before the parent attention is computed.
    '''

    def load_NER_attention(self, NER_attention):
        '''Stores the NER tensor that subsequent `call`s multiply the keys by.'''
        self.NER_attention = NER_attention

    def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
        '''
        Scales `key` by the stored NER tensor and delegates everything else to
        the parent attention layer. `load_NER_attention` must have been called
        beforehand.
        '''
        # A trailing axis is appended so the NER tensor broadcasts over the last
        # axis of `key` (presumably one weight per token -- TODO confirm shapes).
        ner_weights = tf.expand_dims(self.NER_attention, axis=-1)
        weighted_key = key * ner_weights
        return super().call(
            query, weighted_key, value, mask, head_mask, output_attentions,
            training=training
        )

class QuestionAnsweringModel(keras.Model):
    '''
    Question answering model built on a transformer whose
    `CHOSEN_ENHANCED_LAYER`-th block uses NER-aware self-attention.

    The hidden states selected by `CHOSEN_OUTPUT_STATES_IDX` are concatenated
    and fed to two independent dense heads, each followed by a softmax, that
    produce one distribution for the start and one for the end of the answer.
    '''

    def __init__(self, transformer_model: TFDistilBertModel) -> None:
        '''
        Inputs:
        - transformer_model: the transformer to wrap. Its chosen block's
          attention layer is replaced in place by a freshly created
          `TFInjectMultiHeadSelfAttention`.
        '''
        super().__init__()

        self.transformer_model = transformer_model
        # Apply layer change to the chosen attention block
        self.transformer_model.layers[0].transformer.layer[CHOSEN_ENHANCED_LAYER].attention = \
            TFInjectMultiHeadSelfAttention(transformer_model.config)

        # Add all remaining layers
        self.dense_S = layers.Dense(1)
        self.dense_E = layers.Dense(1)
        self.flatten = layers.Flatten()
        self.softmax_S = layers.Softmax(name='out_S')
        self.softmax_E = layers.Softmax(name='out_E')

    def call(self, inputs, training=False):
        '''
        Inputs:
        - inputs: a dictionary with the keys `input_ids`, `attention_mask`
          and `NER_attention`.
        - training: accepted for Keras compatibility.

        Returns the tuple `(out_S, out_E)` with the softmax outputs of the
        start and end heads.
        '''
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        NER_attention = inputs["NER_attention"]

        # Load the NER tensor into the custom layer. The same index used in
        # `__init__` must be used here: a hard-coded 0 would target a regular
        # attention layer whenever CHOSEN_ENHANCED_LAYER is not 0.
        enhanced_attention = \
            self.transformer_model.layers[0].transformer.layer[CHOSEN_ENHANCED_LAYER].attention
        enhanced_attention.load_NER_attention(NER_attention)

        # NOTE(review): `training` is not forwarded to the transformer, which
        # therefore runs with its own default for that flag -- confirm this is intended.
        out = self.transformer_model(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
            }
        )

        hidden_states = out.hidden_states
        # Concatenate the chosen hidden states along axis 2
        chosen_hidden_states = tf.concat(
            [hidden_states[i] for i in CHOSEN_OUTPUT_STATES_IDX], axis=2
        )

        out_S = self.dense_S(chosen_hidden_states) # dot product between token representation and start vector
        out_S = self.flatten(out_S)
        out_S = self.softmax_S(out_S)

        out_E = self.dense_E(chosen_hidden_states) # dot product between token representation and end vector
        out_E = self.flatten(out_E)
        out_E = self.softmax_E(out_E)

        return out_S, out_E

## Prediction function

In a single function we build the dataset and the model, load the given weights, compute the predictions, save them to a JSON file and score them with SQuAD's official evaluation script.

In [4]:
import os
import sys
import json
from typing import List

from config import Config
import utils

def predict_and_evaluate(DATASET_PATH:str, 
                         BEST_WEIGHTS_PATH:str, 
                         PATH_TO_PREDICTIONS_JSON:str,
                         hidden_state_list:List[int]=[3,4,5,6],
                         use_NER_attention=False, NER_value=0,
                         bert=False):
    '''
    Uses the standard model to predict the answers to the dataset provided in 
    `DATASET_PATH` using the selected weights (`BEST_WEIGHTS_PATH`), 
    saves the predictions into `PATH_TO_PREDICTIONS_JSON` and executes SQuAD's
    evaluation script to get the exact match accuracy and F1 score.
    '''
    config = Config(bert=bert)
    # Read dataset (JSON file)
    data = utils.read_question_set(DATASET_PATH)
    # Process questions
    dataset = utils.create_dataset_from_generator(data, config, for_training=False,
        use_NER_attention=use_NER_attention, NER_value=NER_value)
    # Generate the original dataset that contains the original context and token-char mapping
    original_dataset = utils.create_original_dataset(data, config)
    original_dataset = original_dataset.batch(config.BATCH_SIZE)
    print("Number of samples: ", len(dataset))
    dataset = dataset.batch(config.BATCH_SIZE)
    # Load model
    if not use_NER_attention:
        model = config.create_standard_model(hidden_state_list=hidden_state_list)
    else:
        model = QuestionAnsweringModel(config.get_transformer())
        # A subclassed model needs to be called at least once before loading the weights, 
        # because it needs to create the graph
        for sample in dataset.take(1):
            model(sample[0])
    # Load best model weights
    model.load_weights(BEST_WEIGHTS_PATH)
    # Predict the answers to the questions in the dataset
    predictions = utils.compute_predictions(dataset, original_dataset, model)
    # Create a prediction file formatted like the one that is expected
    with open(PATH_TO_PREDICTIONS_JSON, 'w') as f:
        json.dump(predictions, f)
    
    !python eval/evaluate.py $DATASET_PATH $PATH_TO_PREDICTIONS_JSON

## Evaluations

### Normal model (TPU)

#### Validation set

In [None]:
# "Normal" model with the last TPU-trained weights (.h5), validation split.
# hidden_state_list is left at its default ([3,4,5,6]). The predictions file
# contains JSON even though it is saved with a .txt extension.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_100_tpu_h5_cval/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/normal_predictions_val_tpu.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON)

Number of samples:  22535


Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [16:21<00:00,  2.78s/it]


{
  "exact": 42.57821167073441,
  "f1": 63.47665599138955,
  "total": 22535,
  "HasAns_exact": 42.57821167073441,
  "HasAns_f1": 63.47665599138955,
  "HasAns_total": 22535
}


#### Test set

In [None]:
# "Normal" model with the last TPU-trained weights (.h5), test split
# (dev_set.json plays the role of the test set in this notebook).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/normal_100_tpu_h5_cval/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/normal_predictions_test_tpu.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [07:21<00:00,  2.66s/it]


{
  "exact": 54.13434247871334,
  "f1": 70.4218316349117,
  "total": 10570,
  "HasAns_exact": 54.13434247871334,
  "HasAns_f1": 70.4218316349117,
  "HasAns_total": 10570
}


### Normal model (GPU)

#### Test set

In [None]:
# "Normal" model with the older non-TPU weights (checkpoint cp-0007.ckpt
# instead of an .h5 file), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/old_weights_no_tpu/normal/cp-0007.ckpt"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/normal_predictions_test_gpu.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [07:23<00:00,  2.67s/it]


{
  "exact": 53.78429517502365,
  "f1": 70.23643758487442,
  "total": 10570,
  "HasAns_exact": 53.78429517502365,
  "HasAns_f1": 70.23643758487442,
  "HasAns_total": 10570
}


### Separate layer models

#### Layer 6

##### Validation set

In [None]:
# Separate-layer model built with hidden_state_list=[6] and its own weights
# (layer_6 folder), validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_6/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_6_val.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[6])

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [15:37<00:00,  2.65s/it]


{
  "exact": 42.30752163301531,
  "f1": 62.873386169232035,
  "total": 22535,
  "HasAns_exact": 42.30752163301531,
  "HasAns_f1": 62.873386169232035,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Separate-layer model built with hidden_state_list=[6] and its own weights
# (layer_6 folder), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_6/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_6_test.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[6])

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [08:21<00:00,  3.02s/it]


{
  "exact": 53.841059602649004,
  "f1": 70.01180303541746,
  "total": 10570,
  "HasAns_exact": 53.841059602649004,
  "HasAns_f1": 70.01180303541746,
  "HasAns_total": 10570
}


#### Layer 5

##### Validation set

In [None]:
# Separate-layer model built with hidden_state_list=[5] and its own weights
# (layer_5 folder), validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_5/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_5_val.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[5])

Number of samples:  22535


Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [08:14<00:00,  1.40s/it]


{
  "exact": 41.74839139116929,
  "f1": 62.411157409621886,
  "total": 22535,
  "HasAns_exact": 41.74839139116929,
  "HasAns_f1": 62.411157409621886,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Separate-layer model built with hidden_state_list=[5] and its own weights
# (layer_5 folder), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_5/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_5_test.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[5])

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [04:21<00:00,  1.58s/it]


{
  "exact": 53.39640491958373,
  "f1": 69.51986803264565,
  "total": 10570,
  "HasAns_exact": 53.39640491958373,
  "HasAns_f1": 69.51986803264565,
  "HasAns_total": 10570
}


#### Layer 4

##### Validation set

In [None]:
# Separate-layer model built with hidden_state_list=[4] and its own weights
# (layer_4 folder), validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_4/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_4_val.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[4])

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [07:17<00:00,  1.24s/it]


{
  "exact": 40.74994453072998,
  "f1": 60.74684044239788,
  "total": 22535,
  "HasAns_exact": 40.74994453072998,
  "HasAns_f1": 60.74684044239788,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Separate-layer model built with hidden_state_list=[4] and its own weights
# (layer_4 folder), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_4/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_4_test.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[4])

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [03:27<00:00,  1.25s/it]


{
  "exact": 51.116367076631974,
  "f1": 67.53716270010537,
  "total": 10570,
  "HasAns_exact": 51.116367076631974,
  "HasAns_f1": 67.53716270010537,
  "HasAns_total": 10570
}


#### Layer 3

##### Validation set

In [None]:
# Separate-layer model built with hidden_state_list=[3] and its own weights
# (layer_3 folder), validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_3/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_3_val.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[3])

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [05:34<00:00,  1.06it/s]


{
  "exact": 36.2059019303306,
  "f1": 56.05652193236517,
  "total": 22535,
  "HasAns_exact": 36.2059019303306,
  "HasAns_f1": 56.05652193236517,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Separate-layer model built with hidden_state_list=[3] and its own weights
# (layer_3 folder), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_3/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_3_test.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[3])

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [02:40<00:00,  1.03it/s]


{
  "exact": 45.96972563859981,
  "f1": 62.36077645143444,
  "total": 10570,
  "HasAns_exact": 45.96972563859981,
  "HasAns_f1": 62.36077645143444,
  "HasAns_total": 10570
}


#### Layer 2

##### Validation set

In [None]:
# Separate-layer model built with hidden_state_list=[2] and its own weights
# (layer_2 folder), validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_2/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_2_val.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[2])

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [04:31<00:00,  1.30it/s]


{
  "exact": 28.697581539826935,
  "f1": 47.20514090463173,
  "total": 22535,
  "HasAns_exact": 28.697581539826935,
  "HasAns_f1": 47.20514090463173,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Separate-layer model built with hidden_state_list=[2] and its own weights
# (layer_2 folder), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_2/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_2_test.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[2])

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [02:21<00:00,  1.17it/s]


{
  "exact": 35.052034058656574,
  "f1": 51.60085646289401,
  "total": 10570,
  "HasAns_exact": 35.052034058656574,
  "HasAns_f1": 51.60085646289401,
  "HasAns_total": 10570
}


#### Layer 1

##### Validation set

In [None]:
# Separate-layer model built with hidden_state_list=[1] and its own weights
# (layer_1 folder), validation split.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_1/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_1_val.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[1])

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [03:21<00:00,  1.75it/s]


{
  "exact": 8.00088750832039,
  "f1": 18.539525957828342,
  "total": 22535,
  "HasAns_exact": 8.00088750832039,
  "HasAns_f1": 18.539525957828342,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# Separate-layer model built with hidden_state_list=[1] and its own weights
# (layer_1 folder), test split (dev_set.json).
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/separate_100_tpu_h5_cval/layer_1/tpu_epoch_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/layer_1_test.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[1])

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [01:28<00:00,  1.89it/s]


{
  "exact": 9.12961210974456,
  "f1": 19.420668617291476,
  "total": 10570,
  "HasAns_exact": 9.12961210974456,
  "HasAns_f1": 19.420668617291476,
  "HasAns_total": 10570
}


### BERT model (TPU)

#### Validation set

In [None]:
# BERT model (bert=True) built with hidden_state_list=[9,10,11,12], validation split.
# NOTE(review): unlike the other cells, these paths are relative, so this cell
# depends on the current working directory -- confirm where it is meant to run.
DATASET_PATH = "../data/validation_set.json"
BEST_WEIGHTS_PATH = "../weights/training_BERT_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '../data/bert_predictions_val_tpu.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[9,10,11,12], bert=True)

Number of samples:  22535


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 353/353 [23:48<00:00,  4.05s/it]


{
  "exact": 44.54848014200133,
  "f1": 65.65412241408059,
  "total": 22535,
  "HasAns_exact": 44.54848014200133,
  "HasAns_f1": 65.65412241408059,
  "HasAns_total": 22535
}


#### Test set

In [None]:
# BERT model (bert=True) built with hidden_state_list=[9,10,11,12], test split (dev_set.json).
# NOTE(review): dataset, weights and predictions all live under
# /content/drive/MyDrive/NLP/, a different Drive layout from the DistilBERT cells.
DATASET_PATH = "/content/drive/MyDrive/NLP/data/dev_set.json"
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/NLP/weights/training_BERT_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/NLP/data/bert_predictions_test_tpu.txt'

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, hidden_state_list=[9,10,11,12], bert=True)

Number of samples:  10570


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 166/166 [14:37<00:00,  5.29s/it]


{
  "exact": 57.086092715231786,
  "f1": 73.19355280319321,
  "total": 10570,
  "HasAns_exact": 57.086092715231786,
  "HasAns_f1": 73.19355280319321,
  "HasAns_total": 10570
}


### Ensemble DistilBERT + BERT

In [None]:
import tensorflow as tf
from tqdm import tqdm

def compute_ensemble_predictions(dataset: tf.data.Dataset,
                                 model: List,
                                 tokenizer=None):
    '''
    Computes the answers predicted by a BERT + DistilBERT ensemble on a dataset.

    The start and end outputs of the two models are summed element-wise and the
    answer span is extracted from the summed values.

    Inputs:
    - dataset: a `tf.data.Dataset` on which we will compute predictions. Each
        element is read as `sample[0]` (a dict of input features containing at 
        least `input_ids` and `token_type_ids`) and `sample[1]` (the UTF-8 
        encoded question IDs).
    - model: a list of two models in the order `[bert_model, distilbert_model]`.
        The `predict` method of each one must return a `(start, end)` pair.
    - tokenizer: the tokenizer used to decode the predicted token span into text.
        When omitted, the notebook-level `config.tokenizer` is used, which keeps
        the behaviour of callers written before this parameter existed.

    Returns:
    - a dictionary that maps each question ID to the decoded answer text.
    '''
    if tokenizer is None:
        # Backward compatibility: the original implementation always read the 
        # tokenizer from a global `config` object.
        tokenizer = config.tokenizer

    bert_model, distilbert_model = model[0], model[1]

    predictions = {}
    # For each sample we can extract from the dataset (it can be a single element or 
    # a batch)
    for sample in tqdm(dataset):
        # We let the models predict the probability tensors given the input features
        features = sample[0]
        pstartv_bert, pendv_bert = bert_model.predict(features)
        # The DistilBERT model is fed the same features without `token_type_ids`. 
        # We build a filtered copy instead of popping the key, so that the batch 
        # coming from the dataset is left untouched.
        distil_features = {
            name: tensor for name, tensor in features.items()
            if name != "token_type_ids"
        }
        pstartv_distil, pendv_distil = distilbert_model.predict(distil_features)

        # Ensemble: sum the outputs of the two models
        pstartv = pstartv_bert + pstartv_distil
        pendv = pendv_bert + pendv_distil
        
        # We obtain the span from the probabilities
        predicted_limits = utils.start_end_token_from_probabilities(
            pstartv, pendv
        )
        # Then we decode the question IDs (they are stored as bytes)
        question_ids = [x.decode('utf-8') for x in sample[1].numpy()]

        # Finally, we produce the output dictionary for the batch
        input_ids = features["input_ids"]
        for i in range(len(input_ids)):
            input_id = input_ids[i]
            question_id = question_ids[i]
            predicted_limit = predicted_limits[i]
            # In the output dictionary, the key is given by the question ID,
            # while the answer is provided as decoded text. The end index is
            # inclusive, hence the +1 in the slice.
            predictions[question_id] = tokenizer.decode(
                input_id[
                    predicted_limit[0]:predicted_limit[1]+1
                ], skip_special_tokens=True
            )
    
    return predictions

def predict_and_evaluate_ensemble(DATASET_PATH:str,
                                  BEST_WEIGHTS_PATH:List, 
                                  PATH_TO_PREDICTIONS_JSON:str,
                                  hidden_state_list:List=[[3,4,5,6], [9,10,11,12]]):
    '''
    Uses the DistilBERT + BERT ensemble to predict the answers to the dataset 
    provided in `DATASET_PATH` and saves the predictions as JSON into 
    `PATH_TO_PREDICTIONS_JSON`.

    This function does NOT run the evaluation script and returns nothing: the 
    calling cell is expected to run SQuAD's evaluation script on the saved file.

    Inputs:
    - DATASET_PATH: path of the JSON question set to read.
    - BEST_WEIGHTS_PATH: list of two weight files in the order 
        `[distilbert_weights, bert_weights]`.
    - PATH_TO_PREDICTIONS_JSON: path of the file where predictions are written.
    - hidden_state_list: list of two `hidden_state_list` values, forwarded to 
        `config.create_model`, in the order `[distilbert, bert]`.
    '''
    config = Config()
    # Read dataset (JSON file)
    data = utils.read_question_set(DATASET_PATH)
    # Process questions (token_type_ids are requested because the BERT model 
    # receives them; they are removed again before calling DistilBERT)
    dataset = utils.create_dataset_from_generator(data, config, token_type_ids=True, for_training=False)
    print("Number of samples: ", len(dataset))
    dataset = dataset.batch(config.BATCH_SIZE)

    # Load models
    distilbert_model = config.create_model(False, hidden_state_list=hidden_state_list[0])
    bert_model = config.create_model(True, hidden_state_list=hidden_state_list[1])

    # Load best model weights
    distilbert_model.load_weights(BEST_WEIGHTS_PATH[0])
    bert_model.load_weights(BEST_WEIGHTS_PATH[1])

    # Predict the answers to the questions in the dataset. The tokenizer of the
    # local `config` is passed explicitly so that decoding does not depend on a
    # `config` variable defined elsewhere in the notebook.
    predictions = compute_ensemble_predictions(
        dataset, [bert_model, distilbert_model], tokenizer=config.tokenizer
    )

    # Create a prediction file formatted like the one that is expected
    with open(PATH_TO_PREDICTIONS_JSON, 'w') as f:
        json.dump(predictions, f)

#### Validation set

In [None]:
# Run the DistilBERT + BERT ensemble on the validation set.
DATASET_PATH = "/content/drive/MyDrive/NLP/data/validation_set.json"
BEST_WEIGHTS_PATH_BERT = "/content/drive/MyDrive/NLP/weights/training_BERT_tpu_last.h5"
BEST_WEIGHTS_PATH_DISTILBERT = "/content/drive/MyDrive/NLP/weights/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/NLP/data/ensemble_predictions_val_tpu.txt'

# Weights are passed in [DistilBERT, BERT] order, which is the order in which
# predict_and_evaluate_ensemble indexes BEST_WEIGHTS_PATH.
predict_and_evaluate_ensemble(DATASET_PATH=DATASET_PATH, 
                              BEST_WEIGHTS_PATH=[BEST_WEIGHTS_PATH_DISTILBERT, BEST_WEIGHTS_PATH_BERT], 
                              PATH_TO_PREDICTIONS_JSON=PATH_TO_PREDICTIONS_JSON)
# Score the saved predictions against the dataset with the evaluation script.
!python /content/eval/evaluate.py $DATASET_PATH $PATH_TO_PREDICTIONS_JSON

{
  "exact": 44.8502329709341,
  "f1": 65.5578733295942,
  "total": 22535,
  "HasAns_exact": 44.8502329709341,
  "HasAns_f1": 65.5578733295942,
  "HasAns_total": 22535
}


#### Test set


In [None]:
# Run the DistilBERT + BERT ensemble on dev_set.json (this notebook reports it as the "Test set").
DATASET_PATH = "/content/drive/MyDrive/NLP/data/dev_set.json"
BEST_WEIGHTS_PATH_BERT= "/content/drive/MyDrive/NLP/weights/training_BERT_tpu_last.h5"
BEST_WEIGHTS_PATH_DISTILBERT = "/content/drive/MyDrive/NLP/weights/training_normal_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/NLP/data/ensemble_predictions_test_tpu.txt'

# Weights are passed in [DistilBERT, BERT] order, which is the order in which
# predict_and_evaluate_ensemble indexes BEST_WEIGHTS_PATH.
predict_and_evaluate_ensemble(DATASET_PATH=DATASET_PATH, BEST_WEIGHTS_PATH=[BEST_WEIGHTS_PATH_DISTILBERT, BEST_WEIGHTS_PATH_BERT], PATH_TO_PREDICTIONS_JSON=PATH_TO_PREDICTIONS_JSON)
# Score the saved predictions against the dataset with the evaluation script.
!python /content/eval/evaluate.py $DATASET_PATH $PATH_TO_PREDICTIONS_JSON

{
  "exact": 56.9914853358562,
  "f1": 72.94816595843261,
  "total": 10570,
  "HasAns_exact": 56.9914853358562,
  "HasAns_f1": 72.94816595843261,
  "HasAns_total": 10570
}


### NER model

#### NER weight offset = 0.2

##### Validation set

In [None]:
# NER-attention model, NER weight offset 0.2, evaluated on the validation set.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
# NOTE(review): the weights folder is spelled 'hyperparameter_0,2' (comma) — confirm it matches the Drive folder name.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,2/training_NER_02_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_02_predictions_val.txt'
# Same value as in the section heading and in the weights file name.
NER_VALUE = 0.2

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [30:21<00:00,  5.16s/it]


{
  "exact": 40.29731528733082,
  "f1": 61.10515720895817,
  "total": 22535,
  "HasAns_exact": 40.29731528733082,
  "HasAns_f1": 61.10515720895817,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# NER-attention model, NER weight offset 0.2, evaluated on dev_set.json (reported as the "Test set").
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
# NOTE(review): the weights folder is spelled 'hyperparameter_0,2' (comma) — confirm it matches the Drive folder name.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,2/training_NER_02_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_02_predictions_test.txt'
# Same value as in the section heading and in the weights file name.
NER_VALUE = 0.2

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [13:53<00:00,  5.02s/it]


{
  "exact": 51.36234626300851,
  "f1": 68.03648925049636,
  "total": 10570,
  "HasAns_exact": 51.36234626300851,
  "HasAns_f1": 68.03648925049636,
  "HasAns_total": 10570
}


#### NER weight offset = 0.4

##### Validation set

In [None]:
# NER-attention model, NER weight offset 0.4, evaluated on the validation set.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
# NOTE(review): this folder is spelled 'hyperparameter_0.4' (dot) while the 0.2 and 0.6 runs
# use a comma — confirm it matches the Drive folder name.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0.4/training_NER_04_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_04_predictions_val.txt'
# Same value as in the section heading and in the weights file name.
NER_VALUE = 0.4

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  22535


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [30:21<00:00,  5.16s/it]


{
  "exact": 40.35944086975815,
  "f1": 61.296511079528514,
  "total": 22535,
  "HasAns_exact": 40.35944086975815,
  "HasAns_f1": 61.296511079528514,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# NER-attention model, NER weight offset 0.4, evaluated on dev_set.json (reported as the "Test set").
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
# NOTE(review): this folder is spelled 'hyperparameter_0.4' (dot) while the 0.2 and 0.6 runs
# use a comma — confirm it matches the Drive folder name.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0.4/training_NER_04_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_04_predictions_test.txt'
# Same value as in the section heading and in the weights file name.
NER_VALUE = 0.4

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [14:21<00:00,  5.19s/it]


{
  "exact": 51.863765373699145,
  "f1": 68.43574692147928,
  "total": 10570,
  "HasAns_exact": 51.863765373699145,
  "HasAns_f1": 68.43574692147928,
  "HasAns_total": 10570
}


#### NER weight offset = 0.6

##### Validation set

In [None]:
# NER-attention model, NER weight offset 0.6, evaluated on the validation set.
DATASET_PATH = "/content/QuestionAnswering/data/validation_set.json"
# NOTE(review): the weights folder is spelled 'hyperparameter_0,6' (comma) — confirm it matches the Drive folder name.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,6/training_NER_06_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_06_predictions_val.txt'
# Same value as in the section heading and in the weights file name.
NER_VALUE = 0.6

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  22535


Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 353/353 [24:21<00:00,  4.14s/it]


{
  "exact": 40.32837807854449,
  "f1": 61.11215226289375,
  "total": 22535,
  "HasAns_exact": 40.32837807854449,
  "HasAns_f1": 61.11215226289375,
  "HasAns_total": 22535
}


##### Test set

In [None]:
# NER-attention model, NER weight offset 0.6, evaluated on dev_set.json (reported as the "Test set").
DATASET_PATH = "/content/QuestionAnswering/data/dev_set.json"
# NOTE(review): the weights folder is spelled 'hyperparameter_0,6' (comma) — confirm it matches the Drive folder name.
BEST_WEIGHTS_PATH = "/content/drive/MyDrive/Uni/Magistrale/NLP/Project/weights/NER_100_tpu_h5_cval/hyperparameter_0,6/training_NER_06_tpu_last.h5"
PATH_TO_PREDICTIONS_JSON = '/content/drive/MyDrive/Uni/Magistrale/NLP/Project/results/NER_06_predictions_test.txt'
# Same value as in the section heading and in the weights file name.
NER_VALUE = 0.6

predict_and_evaluate(DATASET_PATH, BEST_WEIGHTS_PATH, PATH_TO_PREDICTIONS_JSON, use_NER_attention=True, NER_value=NER_VALUE)

Number of samples:  10570


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 166/166 [11:21<00:00,  4.11s/it]


{
  "exact": 51.57048249763481,
  "f1": 68.12658908682066,
  "total": 10570,
  "HasAns_exact": 51.57048249763481,
  "HasAns_f1": 68.12658908682066,
  "HasAns_total": 10570
}
