In [None]:
# Authentication & Google Drive-free version of the below cells, uncomment if there are problems
# COLAB ONLY CELLS
#try:
#    import google.colab
#    IN_COLAB = True
#    !pip3 install transformers  # https://huggingface.co/docs/transformers/installation
#    !nvidia-smi                 # Check which GPU has been chosen for us
#    !rm -rf logs
#    # Download the dataset from personal drive
#    !mkdir data
#    !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=19jcMX4KFwVAp4yvgvw1GXSnSgpoQytqg' -O data/training_set.json
#except:
#    IN_COLAB = False

In [None]:
# PRIVATE CELL
# SECURITY: a personal access token must never be committed with the notebook.
# The token that was previously hard-coded here has to be treated as leaked
# and should be revoked on GitHub.
import os
from getpass import getpass

# Preferred source: the GITHUB_TOKEN environment variable (e.g. set with `%env`).
git_token = os.environ.get('GITHUB_TOKEN', '')
if not git_token:
    try:
        import google.colab  # noqa: F401 -- same Colab detection as the setup cell
    except ImportError:
        # Not on Colab: the repository is not cloned, so no token is needed.
        pass
    else:
        # On Colab without a configured token: ask interactively, never echo it.
        git_token = getpass('GitHub personal access token: ')
username = 'MarcelloCeresini'
repository = 'QuestionAnswering'

In [None]:
# COLAB ONLY CELLS
try:
    import google.colab
    IN_COLAB = True
    !pip3 install transformers
    !nvidia-smi             # Check which GPU has been chosen for us
    !rm -rf logs
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/GitHub/
    !git clone https://{git_token}@github.com/{username}/{repository}
    %cd {repository}
    %ls
except:
    IN_COLAB = False

In [None]:
%load_ext tensorboard

import os
from tqdm import tqdm
import random

from typing import List, Dict, Tuple

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

%matplotlib inline

from config import Config
config = Config()
import utils

# Seed every random number generator in use (NumPy, Python's `random` and
# TensorFlow) with the configured value so that runs are reproducible
for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
    set_seed(config.RANDOM_SEED)

In [None]:
# Project root: the parent of the current working directory
current_dir = os.getcwd()
ROOT_PATH = os.path.dirname(current_dir)

# Default location of the training set, relative to the project root.
# If the directory layout differs, comment out the first assignment and
# uncomment the second one.
TRAINING_FILE = os.path.join(ROOT_PATH, 'data', 'training_set.json')
# TRAINING_FILE = os.path.join('data', 'training_set.json')

# Load the question set from disk
questions = utils.read_question_set(TRAINING_FILE)

In [None]:
sample_pool = questions['data']
n_samples = len(sample_pool)
# Index at which the shuffled samples are cut into train / validation parts
TRAIN_SPLIT_ELEM = int(n_samples * config.TRAIN_SPLIT)
# Sampling as many elements as the list holds yields a shuffled copy of it
data = random.sample(sample_pool, n_samples)

In [None]:
# Wrap each part of the shuffled list back into the {'data': [...]} layout
# of the original dataset, which was lost when shuffling
train_dataset = dict(data=data[:TRAIN_SPLIT_ELEM])
val_dataset = dict(data=data[TRAIN_SPLIT_ELEM:])

# Small subsets of both splits, used to try the model out quickly while it is
# still being built
small_data = random.sample(train_dataset['data'], config.SMALL_TRAIN_LEN)
small_val_data = random.sample(val_dataset['data'], config.SMALL_VAL_LEN)
small_train_dataset = dict(data=small_data)
small_val_dataset = dict(data=small_val_data)

In [None]:
# Tokenizer provided by the project Config object
tokenizer = config.tokenizer

In [None]:
# Create the datasets.
#####################################################################
######## TODO: switch to "train_dataset" / "val_dataset" for the ####
######## full run; the small subsets are only for quick tests    ####
#####################################################################
train_ds = utils.create_dataset_and_ids(small_train_dataset, config, for_training=True)
# Validation has to use held-out samples: this was previously built from
# small_train_dataset, so the val_* metrics were measured on training data.
# for_training=True is left unchanged.
val_ds = utils.create_dataset_and_ids(small_val_dataset, config, for_training=True)

In [None]:
# Show the first element of the (still unbatched) training dataset.
# The previous loop iterated over the tuple `(train_ds, range(len(train_ds)))`,
# so it printed the dataset object itself instead of a sample.
for sample in train_ds:
    print(sample)
    break

In [None]:
# Group the samples into batches (separate batch sizes for training and validation).
# NOTE: this rebinds train_ds / val_ds, so running the cell a second time
# without recreating the datasets would batch the already-batched data.
train_ds = train_ds.batch(config.BATCH_SIZE)
val_ds = val_ds.batch(config.VAL_BATCH_SIZE)

In [None]:
# Inspect a single batch: print the keys of its first two elements in order
for first_batch in train_ds.take(1):
    for position in (0, 1):
        print(first_batch[position].keys())

In [None]:
# Transformer model provided by the project Config object
transformer_model = config.transformer_model
# Optional: FREEZE the transformer layers so that only the head is trained
# for layer in transformer_model.layers:
#     layer.trainable = False

## Normal Training

In [None]:
# Build the model; the list is presumably the indices of the hidden states fed
# to the head (TODO confirm against Config.create_standard_model)
hidden_state_indices = [3, 4, 5, 6]
model = config.create_standard_model(hidden_state_indices)

# Both outputs share the same loss and the same metric.
# NOTE(review): if out_S / out_E are softmax distributions over token
# positions, 'categorical_crossentropy' may be the intended loss -- confirm.
output_names = ('out_S', 'out_E')
model.compile(
    tf.keras.optimizers.Adam(3e-5),
    loss={name: 'binary_crossentropy' for name in output_names},
    metrics={name: 'accuracy' for name in output_names},
)

# One weights-only checkpoint per epoch, numbered with the epoch
checkpoint_path = "../data/training_normal" + "/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

checkpoint_options = dict(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_best_only=False,  # keep the checkpoint of every epoch, not only the best one
)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

# Store the weights the model has before fitting as "epoch 0"
initial_checkpoint = checkpoint_path.format(epoch=0)
model.save_weights(initial_checkpoint)

history = model.fit(train_ds, validation_data=val_ds, epochs=5, callbacks=[cp_callback])

In [None]:
# Report the most recent checkpoint found in the checkpoint folder
checkpoint_dir = os.path.dirname(checkpoint_path)
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
print(latest_checkpoint)

## Training separate layers

In [None]:
# # training cell example: train layers separately
# histories = []
# for hidden_state in range(1, 7):

#     model = config.create_model(hidden_state)

#     model.compile(tf.keras.optimizers.Adam(3e-5), 
#                 loss={'out_S': 'binary_crossentropy', 'out_E': 'binary_crossentropy'},
#                 metrics={'out_S': 'accuracy', 'out_E': 'accuracy'})

#     checkpoint_path = "training" + str(hidden_state) + "/cp-{epoch:04d}.ckpt"
#     checkpoint_dir = os.path.dirname(os.path.join(ROOT_PATH, 'data', checkpoint_path))
    
#     cp_callback = tf.keras.callbacks.ModelCheckpoint(
#         filepath = checkpoint_path,
#         verbose=1,
#         save_weights_only = True,
#         save_best_only = False # only in this case, 
#     )

#     model.save_weights(checkpoint_path.format(epoch=0))

#     history = model.fit(
#         train_ds, 
#         validation_data=val_ds,
#         epochs=5, 
#         callbacks=[cp_callback]
#         )
    
#     histories.append(history)


In [None]:
# # plot
# x = [i for i in range(1, 6)]
# for history in histories:
#     plt.plot(x, history.history['val_loss'])

# plt.xticks([i for i in range(1,6)])
# plt.xlabel("epoch")
# plt.ylabel("val_loss")
# plt.legend([str(i) for i in range(1,7)])
# plt.show()

# with open("training_histories.json", 'w') as f:
#     json.dump(histories, f)

## Training with NER attention enhancement

In [None]:
# main_layer = transformer_model.layers[0]
# transformer_layers = main_layer.transformer
# first_transformer_block = transformer_layers.layer[0]
# attention_layer = first_transformer_block.attention

# print(attention_layer)

In [None]:
from transformers.models.distilbert.modeling_tf_distilbert import TFMultiHeadSelfAttention as MHSA

class TFInjectMultiHeadSelfAttention(MHSA):
    """DistilBERT multi-head self-attention whose keys are rescaled by NER weights.

    `load_NER_attention` must be called with the weights of the current batch
    before the layer is called; `call` then multiplies `key` by those weights
    and delegates to the parent attention implementation.
    """

    def load_NER_attention(self, NER_attention):
        """Store the NER weights that the next `call` multiplies into the keys.

        Args:
            NER_attention: tensor whose first two dimensions are kept when it
                is reshaped to (dim0, dim1, 1) in `call` -- presumably
                (batch, sequence); confirm against the dataset builder.
        """
        self.NER_attention = NER_attention

    def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
        """Scale `key` by the loaded NER weights, then run the parent attention.

        Every argument is forwarded unchanged to the parent `call`, except
        `key`, which is first multiplied by the NER weights reshaped to
        (dim0, dim1, 1).
        """
        # Match the dtype of the keys so that non-float NER weights can be multiplied in
        ner_weights = tf.cast(self.NER_attention, key.dtype)
        # Use the dynamic shape: the static batch size can be None while the
        # model is traced, and tf.reshape cannot take None as a dimension
        ner_shape = tf.shape(ner_weights)
        ner_weights = tf.reshape(ner_weights, [ner_shape[0], ner_shape[1], 1])
        key = key * ner_weights
        return super().call(query, key, value, mask, head_mask, output_attentions, training=training)

In [None]:
from transformers import TFDistilBertModel

class QuestionAnsweringModel(keras.Model):
    """Question-answering head on top of a transformer with NER-weighted attention.

    The attention of the first transformer block is replaced by a
    `TFInjectMultiHeadSelfAttention` layer. Two dense heads then produce a
    softmax over the flattened output for the answer start ('out_S') and the
    answer end ('out_E').
    """

    def __init__(self, transformer_model: TFDistilBertModel,
                 chosen_states_idx: Tuple[int, ...] = (3, 4, 5, 6)) -> None:
        """Wrap `transformer_model` and create the start / end heads.

        Args:
            transformer_model: model whose first attention block is replaced
                in place. Its output must expose `hidden_states`.
            chosen_states_idx: indices into `hidden_states` of the states that
                are concatenated and fed to the heads. The default matches
                the values that used to be hard-coded in `call`.
        """
        super(QuestionAnsweringModel, self).__init__()

        self.transformer_model = transformer_model
        self.chosen_states_idx = tuple(chosen_states_idx)
        # Apply layer change to first attention block.
        # NOTE(review): the replacement layer is newly constructed, so it
        # presumably does not carry the weights of the layer it replaces -- confirm.
        transformer_model.layers[0].transformer.layer[0].attention = \
            TFInjectMultiHeadSelfAttention(transformer_model.config)

        # Add all remaining layers
        self.dense_S = layers.Dense(1)
        self.dense_E = layers.Dense(1)
        self.flatten = layers.Flatten()
        self.softmax_S = layers.Softmax(name='out_S')
        self.softmax_E = layers.Softmax(name='out_E')

    def call(self, inputs, training=False):
        """Run the transformer and both heads.

        Args:
            inputs: mapping with the keys "input_ids", "attention_mask" and
                "NER_attention".
            training: forwarded to the transformer model.

        Returns:
            Dict with the keys 'out_S' and 'out_E' holding the softmax outputs.

        Raises:
            ValueError: if the transformer output carries no hidden states.
        """
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        NER_attention = inputs["NER_attention"]
        # token_type_ids = inputs["token_type_ids"] # uncomment if using BERT

        # Load the NER tensor into the custom layer
        injected_attention = self.transformer_model.layers[0].transformer.layer[0].attention
        injected_attention.load_NER_attention(NER_attention)

        # The training flag is forwarded explicitly instead of relying on
        # implicit propagation from the outer model
        out = self.transformer_model(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                # "token_type_ids": token_type_ids # uncomment if using BERT
            },
            training=training,
        )

        hidden_states = out.hidden_states
        if hidden_states is None:
            raise ValueError(
                "The transformer model did not return hidden states; enable "
                "`output_hidden_states` in its configuration."
            )

        # TODO: choose merging method
        chosen_hidden_states = tf.concat(
            [hidden_states[i] for i in self.chosen_states_idx], axis=2
        )

        # output = layers.Bidirectional(layers.LSTM(64, return_sequences = True, activation = "relu"))(chosen_hidden_states)
        # output = layers.Dense(2, activation = "softmax")(output) # 2 because we need both 

        out_S = self.dense_S(chosen_hidden_states) # dot product between token representation and start vector
        out_S = self.flatten(out_S)
        out_S = self.softmax_S(out_S)

        out_E = self.dense_E(chosen_hidden_states) # dot product between token representation and end vector
        out_E = self.flatten(out_E)
        out_E = self.softmax_E(out_E)

        return {'out_S': out_S, 'out_E': out_E}

In [None]:
# model = QuestionAnsweringModel(transformer_model)

# model.compile(tf.keras.optimizers.Adam(3e-5), 
#                 loss={'out_S': 'binary_crossentropy', 'out_E': 'binary_crossentropy'},
#                 metrics={'out_S': 'accuracy', 'out_E': 'accuracy'})

# checkpoint_path = "training" + str(hidden_state) + "/cp-{epoch:04d}.ckpt"
# checkpoint_dir = os.path.dirname(os.path.join(ROOT_PATH, 'data', checkpoint_path))

# cp_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath = checkpoint_path,
#     verbose=1,
#     save_weights_only = True,
#     save_best_only = False # only in this case, 
# )

# model.save_weights(checkpoint_path.format(epoch=0))

# history = model.fit(
#     train_ds, 
#     validation_data=val_ds,
#     epochs=5, 
#     callbacks=[cp_callback]
#     )


# Predictions

In [None]:
# for batch in train_ds.take(1):
#     random_in_batch = np.random.randint(0, config.BATCH_SIZE-1)
#     input_ids = batch[0]["input_ids"][random_in_batch]
#     # attention_mask = sample[0]["attention_mask"][random_in_batch]
#     print("Random sample n°", random_in_batch, "in batch of", config.BATCH_SIZE)
    
#     print("Question + context: ")
#     print(tokenizer.decode(input_ids, skip_special_tokens=True), "\n")

#     real_start = np.argmax(batch[1]["out_S"][random_in_batch])
#     real_end = np.argmax(batch[1]["out_E"][random_in_batch])
#     real_limits = [real_start, real_end]

#     # print(np.shape(model.predict(batch[0])[0][random_in_batch]))
    
#     print("Real limits: ", real_limits)
#     print("Real answer tokens: ", input_ids[real_limits[0]:real_limits[1]+1].numpy())
#     print("Real answer: ", tokenizer.decode(input_ids[real_limits[0]:real_limits[1]+1], skip_special_tokens=False))
    
#     predicted_limits = utils.start_end_token_from_probabilities(*model.predict(batch[0]))[random_in_batch]
#     print("Predicted_limits: ", predicted_limits)
#     print("Predicted answer tokens: ", input_ids[predicted_limits[0]:predicted_limits[1]+1].numpy())
#     print("Predicted answer: ", tokenizer.decode(input_ids[predicted_limits[0]:predicted_limits[1]+1], skip_special_tokens=True))
