In [None]:
# Authentication & Google Drive-free version of the below cells, uncomment if there are problems
# COLAB ONLY CELLS
#try:
#    import google.colab
#    IN_COLAB = True
#    !pip3 install transformers  # https://huggingface.co/docs/transformers/installation
#    !nvidia-smi                 # Check which GPU has been chosen for us
#    !rm -rf logs
#    # Download the dataset from personal drive
#    !mkdir data
#    !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=19jcMX4KFwVAp4yvgvw1GXSnSgpoQytqg' -O data/training_set.json
#except:
#    IN_COLAB = False

In [None]:
# PRIVATE CELL
# Credentials must never be stored in the notebook: the personal access token is
# read from the GIT_TOKEN environment variable (set it outside of any saved cell,
# e.g. with `%env GIT_TOKEN=<token>` in a scratch cell that is deleted afterwards).
# NOTE(review): a token that was previously committed here should be revoked.
import os

git_token = os.environ.get('GIT_TOKEN', '')  # empty string when no token is configured
username = 'MarcelloCeresini'
repository = 'QuestionAnswering'

In [None]:
# COLAB ONLY CELLS
try:
    import google.colab
    IN_COLAB = True
    !pip3 install transformers
    !nvidia-smi             # Check which GPU has been chosen for us
    !rm -rf logs
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/GitHub/
    !git clone https://{git_token}@github.com/{username}/{repository}
    %cd {repository}
    %ls
except:
    IN_COLAB = False

In [None]:
%load_ext tensorboard

import os
import requests
import zipfile
from tqdm import tqdm
import time
import random
import datetime
from IPython.display import display
from functools import partial

from typing import List, Dict, Callable, Sequence, Tuple

from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


%matplotlib inline

from config import ConfigFile

# Single configuration object shared by every cell of the notebook
config = ConfigFile()

# Reproducibility: feed the same seed to NumPy, the stdlib RNG and TensorFlow
for seed_everything in (np.random.seed, random.seed, tf.random.set_seed):
    seed_everything(config.RANDOM_SEED)

In [None]:
# ROOT_PATH is the parent of the current working directory; the dataset is
# looked up in its "data" sub-folder.
ROOT_PATH = os.path.dirname(os.getcwd())
TRAINING_FILE = os.path.join(ROOT_PATH, 'data', 'training_set.json') # comment this if directory works differently
# TRAINING_FILE = os.path.join('data', 'training_set.json') # uncomment this if directory works differently
# The encoding is stated explicitly so that decoding does not depend on the
# platform locale: the labels are built from character offsets into the text,
# so a wrongly decoded character would shift every answer position.
with open(TRAINING_FILE, 'r', encoding='utf-8') as f:
    questions = json.load(f)

In [None]:
# Number of top-level entries of questions['data'] that go to the training split
# (the remaining ones form the validation split in the next cell).
TRAIN_SPLIT_ELEM = int(len(questions['data']) * config.TRAIN_SPLIT)
# Sampling the whole population without replacement returns a shuffled copy and
# leaves questions['data'] itself untouched; the order depends on the seed set above.
data = random.sample(questions['data'], len(questions['data'])) # reshuffle the samples

In [None]:
# Re-wrap the shuffled lists in the {'data': [...]} structure of the original
# dataset, which was lost when the list was extracted for shuffling.
train_dataset = {'data': data[:TRAIN_SPLIT_ELEM]}
val_dataset = {'data': data[TRAIN_SPLIT_ELEM:]}

# We also create small training/validation sets to test the model while building it,
# just to speed up. The requested sizes are clamped to the size of each split because
# random.sample raises ValueError when asked for more elements than are available.
small_data = random.sample(
    train_dataset["data"],
    min(config.SMALL_TRAIN_LEN, len(train_dataset["data"]))
)
small_train_dataset = {'data': small_data}
small_val_data = random.sample(
    val_dataset["data"],
    min(config.SMALL_VAL_LEN, len(val_dataset["data"]))
)
small_val_dataset = {'data': small_val_data}

In [None]:
# Module-level alias of the tokenizer held by the configuration object;
# create_data_for_dataset and the prediction cell read this global directly.
tokenizer = config.tokenizer

In [None]:
def find_start_end_token_one_hot_encoded(answers: List[Dict], offsets: List[Tuple[int]]) -> Dict[str, np.ndarray]:
    '''
    Maps the character span of each answer onto token positions and returns the
    start/end positions as arrays with one entry per token (ready for binary crossentropy).
    Inputs:
        answers: List[Dict] --> the answers of one question. Each answer contains:
            - answer_start: the index of the starting character inside the context
            - text: the text of the answer, used only through its number of chars
        offsets: List[Tuple[int]] --> for each token of the sequence (question+context),
            its (start, end) character offsets, with "end" exclusive. Special and padding
            tokens are expected to carry (0, 0).
    Output:
        result: Dict --> keys "out_S" and "out_E", each an array of len(offsets) zeros in
            which the token containing the first / last char of an answer is incremented by 1.
            With several answers the counts accumulate, so values can exceed 1.
            If no separator is found, or a char falls in no context token, the
            corresponding entries simply stay at zero.
    '''
    result = {
        "out_S": np.zeros(len(offsets)),
        "out_E": np.zeros(len(offsets))
    }

    # The first (0, 0) after [CLS] (index 0) is the separator that closes the question:
    # the context tokens start right after it.
    sep_index = next(
        (i for i in range(1, len(offsets)) if tuple(offsets[i]) == (0, 0)),
        None
    )
    if sep_index is None:
        return result

    for answer in answers:
        first_char = answer['answer_start']
        last_char = first_char + len(answer['text']) - 1

        # Scan the context only; the very last token of the sequence is special and is skipped
        for token in range(sep_index + 1, len(offsets) - 1):
            tok_start, tok_end = offsets[token][0], offsets[token][1]
            # Offsets are end-exclusive, so both tests use "start <= char < end". An inclusive
            # upper bound would also flag the token that ends exactly where the answer
            # starts, as well as (0, 0) padding tokens when the answer starts at char 0.
            if tok_start <= first_char < tok_end:
                result["out_S"][token] += 1
            if tok_start <= last_char < tok_end:
                result["out_E"][token] += 1
                break # the span is closed: move on to the next answer

    return result

def create_data_for_dataset(data):
    '''
    This function takes in input the whole data structure and iteratively composes
    question+context pairs, plus their labels.
    Inputs:
        data: Dict --> {"data": [article, ...]}; each article has "paragraphs", each paragraph
            a "context" string and a list "qas" of entries with "question" and "answers"
    Outputs:
        tf.data.Dataset --> (features, labels) slices that will be fed to the model during fitting
        more specifically:
        features: Dict --> keys:
            - input_ids: array of token ids
            - attention_mask: array indicating if the corresponding token is padding or not
            - NER_attention: placeholder array of ones, one entry per token (see TODO below)
        labels: Dict --> keys:
            - out_S: array marking the initial token of the answer
            - out_E: array marking the final token of the answer

    For each question of each paragraph of each article, (question+context) is tokenized with
    the module-level "tokenizer", truncated and padded to config.INPUT_LEN tokens. The offset
    mapping returned by the tokenizer is used by "find_start_end_token_one_hot_encoded" to
    build the labels and is then dropped from the features.
    '''
    features = []
    labels = []

    for article in tqdm(data["data"]):
        for paragraph in article["paragraphs"]:
            for question_and_answer in paragraph["qas"]:
                ### QUESTION AND CONTEXT TOKENIZATION ###
                # For question answering we need to encode both question and context in
                # a single sequence. The tokenizer returns a dictionary containing all
                # the information we need.
                encoded_inputs = tokenizer(
                    question_and_answer["question"],    # First we pass the question
                    paragraph["context"],               # Then the context
                    max_length = config.INPUT_LEN,      # We want to pad and truncate to this length
                    truncation = True,
                    padding = 'max_length',             # Pads every sequence up to max_length.
                                                        # If "True" it would pad to the longest sentence in the batch
                                                        # (in this case we only use 1 sentence, so no padding at all)
                    # return_token_type_ids = True,     # IF USING BERT, DistilBert does not need it
                    return_token_type_ids = False,      # Return if the token is from sentence 0 or sentence 1
                    return_attention_mask = True,       # Return if it's a pad token or not
                    return_offsets_mapping = True       # Really important --> returns each token's first and last char position in the original sentence
                )

                ### MAPPING OF THE ANSWER BETWEEN CHARS AND TOKENS ###
                # We want to pass from positions in chars to positions in tokens
                label = find_start_end_token_one_hot_encoded(
                    # We pass the list of answers (usually there is only one per question,
                    #   but we mustn't assume anything)
                    answers = question_and_answer["answers"],
                    # And also the offset mapping just received from the tokenizer
                    offsets = encoded_inputs["offset_mapping"]
                )

                # The offset mapping is not a model input: remove it
                # ("None" avoids a KeyError if the key wasn't present)
                encoded_inputs.pop("offset_mapping", None)

                # TODO: Add NER attention vector
                # Placeholder of ones sized like the token sequence itself, so that it
                # always matches input_ids whatever config.INPUT_LEN is.
                encoded_inputs['NER_attention'] = np.ones(len(encoded_inputs["input_ids"]))

                features.append(encoded_inputs)
                labels.append(label)

    print("Creating dataset")
    # The lists of per-sample dictionaries are turned into one dictionary of columns
    # (key -> list of per-sample arrays), the layout from_tensor_slices slices along axis 0
    return tf.data.Dataset.from_tensor_slices((
        pd.DataFrame.from_dict(features).to_dict(orient="list"),  # features
        pd.DataFrame.from_dict(labels).to_dict(orient="list")     # labels
    ))

In [None]:
# Create the tf.data datasets used for fitting and validation.
#####################################################################
 ######## TODO: CHANGE LINE BELOW WITH "train_dataset" #############
#####################################################################
# For now only the small subsets are tokenized, to keep iterations fast.
train_ds = create_data_for_dataset(small_train_dataset)
val_ds = create_data_for_dataset(small_val_dataset)

In [None]:
# Group the samples in batches.
# NOTE: this cell is not idempotent. Both names are reassigned from themselves, so
# running it twice batches the already batched datasets again; re-run the dataset
# creation cell before re-running this one.
train_ds = train_ds.batch(config.BATCH_SIZE)
val_ds = val_ds.batch(config.VAL_BATCH_SIZE)

In [None]:
# Module-level alias of the transformer held by the configuration object
transformer_model = config.transformer_model
# FREEZE the layers to only train the head if needed
# (every top-level layer of the transformer is marked as non-trainable)
for layer in transformer_model.layers:
    layer.trainable = False

## Training separate layers

In [None]:
# training cell example: train layers separately
# One model is built, compiled and fitted for every value of hidden_state in 1..6;
# the History object returned by each fit is collected in "histories".
histories = []
for hidden_state in range(1, 7):

    # presumably hidden_state selects which transformer hidden state feeds the
    # output heads -- confirm in config.create_model
    model = config.create_model(hidden_state)

    model.compile(tf.keras.optimizers.Adam(3e-5), 
                loss={'out_S': 'binary_crossentropy', 'out_E': 'binary_crossentropy'},
                metrics={'out_S': 'accuracy', 'out_E': 'accuracy'})

    # Checkpoints go to "training<hidden_state>/", relative to the working directory.
    # NOTE(review): checkpoint_dir (under ROOT_PATH/data) is computed but not used by
    # any code in this cell, so the files are NOT written under the data folder.
    checkpoint_path = "training" + str(hidden_state) + "/cp-{epoch:04d}.ckpt"
    checkpoint_dir = os.path.dirname(os.path.join(ROOT_PATH, 'data', checkpoint_path))
    
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath = checkpoint_path,
        verbose=1,
        save_weights_only = True,
        save_best_only = False # a checkpoint is written regardless of the validation score
    )

    # Save the weights before training as "epoch 0"
    model.save_weights(checkpoint_path.format(epoch=0))

    history = model.fit(
        train_ds, 
        validation_data=val_ds,
        epochs=5, 
        callbacks=[cp_callback]
        )
    
    histories.append(history)


In [None]:
# plot the validation loss of every run, one curve per trained model
for history in histories:
    val_loss = history.history['val_loss']
    # The x axis follows the number of recorded epochs instead of a fixed 1..5
    plt.plot(range(1, len(val_loss) + 1), val_loss)

n_epochs = max((len(history.history['val_loss']) for history in histories), default=0)
plt.xticks(list(range(1, n_epochs + 1)))
plt.xlabel("epoch")
plt.ylabel("val_loss")
# Runs are labelled 1..N in the order in which they were appended to "histories"
plt.legend([str(i) for i in range(1, len(histories) + 1)])
plt.show()

# Keras History objects are not JSON serializable (json.dump raises TypeError on them):
# only their metric dictionaries are stored, with every value converted to a plain float.
serializable_histories = [
    {metric: [float(value) for value in values] for metric, values in history.history.items()}
    for history in histories
]
with open("training_histories.json", 'w') as f:
    json.dump(serializable_histories, f)

## Training with NER attention enhancement

In [None]:
# Walk down the transformer to reach the attention module of its first block:
# first top-level layer -> .transformer -> .layer[0] -> .attention
main_layer = transformer_model.layers[0]
transformer_layers = main_layer.transformer
first_transformer_block = transformer_layers.layer[0]
attention_layer = first_transformer_block.attention

# Show which object will be customised for the NER attention injection
print(attention_layer)

In [None]:
from transformers.models.distilbert.modeling_tf_distilbert import TFMultiHeadSelfAttention as MHSA

class TFInjectMultiHeadSelfAttention(MHSA):
    '''
    Multi-head self-attention whose keys are rescaled, token by token, by an
    externally provided "NER attention" tensor before the regular attention is computed.

    The tensor must be provided through load_NER_attention before every call.
    '''

    def load_NER_attention(self, NER_attention):
        '''Stores the per-token weights used by the next call (one weight per token of each sample).'''
        self.NER_attention = NER_attention

    def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
        '''
        Scales the keys by the stored NER attention and delegates to the parent attention.
        All the arguments are forwarded unchanged, except "key".
        '''
        # A trailing axis is appended so that each token weight broadcasts over the feature
        # axis of "key". tf.expand_dims is used instead of a reshape to the static shape
        # because the batch dimension is None when it is not known at graph construction
        # time, and a shape containing None cannot be passed to tf.reshape.
        # The cast keeps the product valid if the weights arrive with a different float type.
        ner_weights = tf.cast(tf.expand_dims(self.NER_attention, axis=-1), key.dtype)
        key = key * ner_weights
        return super().call(query, key, value, mask, head_mask, output_attentions, training=training)

In [None]:
from transformers import TFDistilBertModel

class QuestionAnsweringModel(keras.Model):
    '''
    Question answering model: a transformer whose first attention block is turned into a
    TFInjectMultiHeadSelfAttention, followed by two heads (start and end of the answer).

    Each head is a Dense(1) applied to every token representation, flattened and passed
    through a softmax over the tokens of the sequence.
    '''

    def __init__(self, transformer_model: TFDistilBertModel) -> None:
        '''
        Inputs:
            transformer_model: the transformer to wrap. NOTE: it is modified in place
                (its first attention block changes class).
        '''
        super(QuestionAnsweringModel, self).__init__()

        self.transformer_model = transformer_model
        # Apply layer change to first attention block.
        # The existing layer object is re-classed in place instead of being replaced by a
        # newly constructed one: a new layer would start from freshly initialised weights,
        # discarding the ones already held by the first attention block. The subclass only
        # adds behaviour (no new weights), so the existing state stays valid.
        first_attention = transformer_model.layers[0].transformer.layer[0].attention
        if not isinstance(first_attention, TFInjectMultiHeadSelfAttention):
            first_attention.__class__ = TFInjectMultiHeadSelfAttention

        # Add all remaining layers
        self.dense_S = layers.Dense(1)
        self.dense_E = layers.Dense(1)
        self.flatten = layers.Flatten()
        self.softmax_S = layers.Softmax(name='out_S')
        self.softmax_E = layers.Softmax(name='out_E')

    def call(self, inputs, training=False):
        '''
        Inputs:
            inputs: Dict --> keys "input_ids", "attention_mask" and "NER_attention"
        Output:
            Dict --> keys "out_S" and "out_E": for each sample, the softmax over the tokens
                for the start and for the end of the answer
        '''
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        NER_attention = inputs["NER_attention"]
        # token_type_ids = inputs["token_type_ids"] # uncomment if using BERT

        # Load the NER tensor into the custom layer (it is read during the transformer call below)
        self.transformer_model.layers[0].transformer.layer[0].attention.load_NER_attention(NER_attention)

        out = self.transformer_model(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                # "token_type_ids": token_type_ids # uncomment if using BERT
            }
        )

        # TODO: chose which layers
        # assumes the transformer is configured to return its hidden states -- TODO confirm
        hidden_states = out.hidden_states
        chosen_states_idx = [3, 4, 5, 6]

        # TODO: chose merging method
        # The chosen hidden states are concatenated along the feature axis
        chosen_hidden_states = tf.concat([hidden_states[i] for i in chosen_states_idx], axis=2)

        # output = layers.Bidirectional(layers.LSTM(64, return_sequences = True, activation = "relu"))(chosen_hidden_states)
        # output = layers.Dense(2, activation = "softmax")(output) # 2 because we need both 

        out_S = self.dense_S(chosen_hidden_states) # dot product between token representation and start vector
        out_S = self.flatten(out_S)
        out_S = self.softmax_S(out_S)

        out_E = self.dense_E(chosen_hidden_states) # dot product between token representation and end vector
        out_E = self.flatten(out_E)
        out_E = self.softmax_E(out_E)

        return {'out_S': out_S, 'out_E': out_E}

In [None]:
# Build and compile the model with the NER attention injection
model = QuestionAnsweringModel(transformer_model)

model.compile(tf.keras.optimizers.Adam(3e-5), 
                loss={'out_S': 'binary_crossentropy', 'out_E': 'binary_crossentropy'},
                metrics={'out_S': 'accuracy', 'out_E': 'accuracy'})

# This run has its own checkpoint folder. The path must not be built from "hidden_state":
# that name only exists as a leftover of the loop of the "Training separate layers" cell,
# so it would raise NameError on a fresh kernel and otherwise overwrite the checkpoints
# of the last model trained there.
checkpoint_path = "training_NER/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(os.path.join(ROOT_PATH, 'data', checkpoint_path))

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_path,
    verbose=1,
    save_weights_only = True,
    save_best_only = False # a checkpoint is written regardless of the validation score
)

# Save the weights before training as "epoch 0"
model.save_weights(checkpoint_path.format(epoch=0))

history = model.fit(
    train_ds, 
    validation_data=val_ds,
    epochs=5, 
    callbacks=[cp_callback]
    )


# Predictions

In [None]:
def start_end_token_from_probabilities(pstartv: np.array, 
                                       pendv: np.array, 
                                       dim:int=512) -> List[List[int]]:
    '''
    Returns a List of [StartToken, EndToken] elements computed from the batch outputs.

    For every sample, the pair (start, end) with start < end that maximises
    pstart[start] + pend[end] is selected. Pairs with end <= start are zeroed out,
    so a span made of a single token (start == end) is never returned.
    Inputs:
        pstartv: array (samples, tokens) --> score of each token as start of the answer
        pendv: array (samples, tokens) --> score of each token as end of the answer
        dim: kept only for backward compatibility; the number of tokens is read from
            pstartv, so it no longer has to match this value
    Output:
        List[List[int]] --> one [start, end] pair of token indexes per sample
    '''
    pstartv = np.asarray(pstartv)
    pendv = np.asarray(pendv)
    seq_len = pstartv.shape[1]

    idxs = []
    for i in range(pstartv.shape[0]):
        # Broadcasting builds the matrix sums[s, e] = pstart[s] + pend[e] directly,
        # without stacking seq_len copies of each vector
        sums = pstartv[i, :, np.newaxis] + pendv[i, np.newaxis, :]
        sums = np.triu(sums, k=1) # Zero out lower triangular matrix + diagonal
        # argmax works on the flattened matrix: recover (row, col) = (start, end)
        row, col = divmod(int(np.argmax(sums)), seq_len)
        idxs.append([row, col])
    return idxs

In [None]:
# Qualitative check: compare the real and the predicted answer of one random sample
for batch in train_ds.take(1):
    # The real number of samples is used: the batch may be smaller than config.BATCH_SIZE.
    # np.random.randint excludes its upper bound, so every sample can be drawn.
    batch_len = int(batch[0]["input_ids"].shape[0])
    random_in_batch = np.random.randint(0, batch_len)
    input_ids = batch[0]["input_ids"][random_in_batch]
    print("Random sample n°", random_in_batch, "in batch of", batch_len)
    
    print("Question + context: ")
    print(tokenizer.decode(input_ids, skip_special_tokens=True), "\n")

    # The labels mark the start/end tokens: argmax recovers their indexes
    real_start = np.argmax(batch[1]["out_S"][random_in_batch])
    real_end = np.argmax(batch[1]["out_E"][random_in_batch])
    real_limits = [real_start, real_end]

    print("Real limits: ", real_limits)
    print("Real answer tokens: ", input_ids[real_limits[0]:real_limits[1]+1].numpy())
    print("Real answer: ", tokenizer.decode(input_ids[real_limits[0]:real_limits[1]+1], skip_special_tokens=False))
    
    # predict may return a dictionary keyed by output name or a [start, end] sequence,
    # depending on how the model was built: unpacking a dictionary with "*" would pass
    # its keys (strings) instead of the arrays.
    predictions = model.predict(batch[0])
    if isinstance(predictions, dict):
        pred_start, pred_end = predictions["out_S"], predictions["out_E"]
    else:
        pred_start, pred_end = predictions
    predicted_limits = start_end_token_from_probabilities(
        pred_start, pred_end, dim=pred_start.shape[1]
    )[random_in_batch]
    print("Predicted_limits: ", predicted_limits)
    print("Predicted answer tokens: ", input_ids[predicted_limits[0]:predicted_limits[1]+1].numpy())
    print("Predicted answer: ", tokenizer.decode(input_ids[predicted_limits[0]:predicted_limits[1]+1], skip_special_tokens=True))
