# Fine Tune modelo preentrenado BERT para SQUAD




## Fase 0: Configuración entorno de desarrollo

#### GPU


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime')
else:
  print(gpu_info)

#### RAM


In [None]:
from psutil import virtual_memory

# Total physical memory reported by psutil, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to tell a high-RAM runtime apart.
if ram_gb >= 20:
    print('You are using a high-RAM runtime!')
else:
    print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
    print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
    print('re-execute this cell.')

#### Drive

In [None]:
# Mount Google Drive under /content/drive; the project paths configured later
# in the notebook all live below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fase 1: Importar dependencias

#### Instalación librerías


In [None]:
# Install the packages imported in the next cell: `tokenizers`,
# `transformers` and the TensorFlow Model Garden (`official.*`).
# NOTE(review): `tokenizers` is installed without a version pin while the
# other two packages are pinned — confirm the resolved versions are compatible.
!pip install tokenizers
!pip install transformers==3.5.1
!pip install -q tf-models-official==2.3.0

#### Importación de librerías

In [None]:
import os
import re
import json
import string
import time
import datetime
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig, BertPreTrainedModel, TFBertForQuestionAnswering

from tqdm.notebook import tqdm

import tensorflow_hub as hub
from official.nlp.bert import configs
from official.nlp.bert.bert_models import squad_model
from official.nlp import optimization

# Load the TensorBoard notebook extension
%load_ext tensorboard


In [None]:
# Show the TensorFlow version that is actually loaded in this runtime.
print(f'Versión de tensorflow: {tf.__version__}')

Versión de tensorflow: 2.3.0


In [None]:
# List the devices visible to TensorFlow; as the last expression of the cell
# the returned list is displayed as the cell output.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

#### Configuraciones del proyecto

In [None]:
# Fixed sequence length of the model inputs: shorter examples are padded to
# this length and longer training examples are skipped during preprocessing.
max_len = 384

In [None]:
# Root folder of the project inside the mounted Google Drive.
user_path_drive = '/content/drive/My Drive/BibleModelBert'

# SQuAD v2.0 (Spanish) JSON files used for training and evaluation.
train_path = os.path.join(user_path_drive, "data/train-v2.0-es.json")
eval_path = os.path.join(user_path_drive, "data/dev-v2.0-es.json")
# TensorBoard log directory (a timestamped sub-folder is created per run).
tb_path = os.path.join(user_path_drive, "logs/")

# Target of the ModelCheckpoint callback and of the final model export.
checkpoint_path_callback = os.path.join(user_path_drive, "chkp/")

# NOTE(review): checkpoint_path, dir_config and dir_data are not referenced
# in the visible part of the notebook — confirm they are still needed.
checkpoint_path = os.path.join(user_path_drive, "checkpoints/")
dir_config = os.path.join(user_path_drive, "config/")
dir_data = os.path.join(user_path_drive, "data/")
# Folder where the tokenizer files are saved.
dir_tokenizer = os.path.join(user_path_drive, "tokenizer/")

## Fase 2: Preprocesado de Datos

In [None]:
# Download the slow (pure-Python) pretrained tokenizer and persist its files
# so that the fast tokenizer below can be built from the saved vocabulary.
slow_tokenizer = BertTokenizer.from_pretrained("mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es", do_lower_case=False)
save_path = dir_tokenizer
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved vocabulary file.
# `_vocab` was previously used without ever being defined (NameError);
# BertTokenizer.save_pretrained writes the WordPiece vocabulary as "vocab.txt".
_vocab = os.path.join(save_path, "vocab.txt")
# NOTE(review): the vocabulary comes from a *cased* checkpoint loaded with
# do_lower_case=False, yet the fast tokenizer lowercases its input — confirm
# that lowercase=True is intended.
# NOTE(review): this vocabulary belongs to a Spanish BERT checkpoint, while the
# model fine-tuned later is "bert-base-multilingual-cased"; the token ids of
# the two vocabularies presumably do not match — verify.
tokenizer = BertWordPieceTokenizer(_vocab, lowercase=True)

Ejemplo de tokenización de una frase de prueba en español:

In [None]:
# Sanity check: encode a sample Spanish sentence and show both the vocabulary
# ids and the corresponding WordPiece tokens.
_tokens = tokenizer.encode("Hola a todos, esto es una prueba")
print(f'ids: {_tokens.ids}')
print(f'tokens: {_tokens.tokens}')

ids: [4, 9050, 1013, 1495, 1017, 1468, 1058, 1108, 3889, 5]
tokens: ['[CLS]', 'hola', 'a', 'todos', ',', 'esto', 'es', 'una', 'prueba', '[SEP]']


#### Procesamiento de los datos

In [None]:
class SquadExample:
    """One (question, context, answer) triple and its model-ready encoding.

    After `preprocess()` either `skip` is True (the example cannot be used)
    or the attributes `input_ids`, `token_type_ids`, `attention_mask`,
    `start_token_idx`, `end_token_idx` and `context_token_to_char` are set.

    Relies on the module-level `tokenizer` and `max_len`.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        """Tokenize the example and locate the answer span in token space.

        Sets `self.skip = True` and returns early when the answer span falls
        outside the context, when no token overlaps the answer, or when the
        encoded sequence is longer than `max_len`.
        """
        # Collapse every run of whitespace to a single space.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())
        start_char_idx = self.start_char_idx

        # Character span [start_char_idx, end_char_idx) of the answer.
        end_char_idx = start_char_idx + len(answer)
        # An answer may end exactly at the end of the context, so only a span
        # that goes *past* the end is invalid. A negative start would index
        # from the end of the list below and mark the wrong characters.
        if start_char_idx < 0 or end_char_idx > len(context):
            self.skip = True
            return

        # Mark the characters of the context that belong to the answer.
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        tokenized_context = tokenizer.encode(context)

        # Tokens whose character range overlaps at least one answer character.
        ans_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context.offsets)
            if any(is_char_in_ans[start:end])
        ]
        if not ans_token_idx:
            self.skip = True
            return

        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        tokenized_question = tokenizer.encode(question)

        # Layout: all context ids first (segment 0), then the question ids
        # without their first id (segment 1).
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad up to max_len with zeros; sequences that would need truncation
        # are skipped instead.
        padding_length = max_len - len(input_ids)
        if padding_length < 0:
            self.skip = True
            return
        if padding_length > 0:
            padding = [0] * padding_length
            input_ids = input_ids + padding
            attention_mask = attention_mask + padding
            token_type_ids = token_type_ids + padding

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        # Character offsets of each context token, relative to the
        # whitespace-normalised context built above.
        self.context_token_to_char = tokenized_context.offsets


In [None]:
def create_squad_examples(raw_data):
    """Build one preprocessed SquadExample per answerable question.

    Walks `raw_data["data"]` -> "paragraphs" -> "qas". Questions with an empty
    "answers" list are ignored. The first answer supplies the text and start
    offset; the texts of all answers are kept as accepted references.
    Examples are returned even when preprocessing flagged them as skipped.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            paragraph_text = paragraph["context"]
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                answers = qa["answers"]
                if len(answers) == 0:
                    continue
                first_answer = answers[0]
                answer_text = first_answer["text"]
                reference_texts = [answer["text"] for answer in answers]
                example = SquadExample(
                    question_text,
                    paragraph_text,
                    first_answer["answer_start"],
                    answer_text,
                    reference_texts,
                )
                example.preprocess()
                examples.append(example)
    return examples


def create_inputs_targets(squad_examples):
    """Stack the encoded fields of all non-skipped examples into arrays.

    Returns `(x, y, dataset_dict)` where `x` is the list
    [input_ids, token_type_ids, attention_mask], `y` is the list
    [start_token_idx, end_token_idx] and `dataset_dict` maps each of those
    five field names to its NumPy array.
    """
    field_names = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )
    usable = [example for example in squad_examples if not example.skip]
    dataset_dict = {
        name: np.array([getattr(example, name) for example in usable])
        for name in field_names
    }

    x = [dataset_dict[name] for name in field_names[:3]]
    y = [dataset_dict[name] for name in field_names[3:]]
    return x, y, dataset_dict



Se aplican las transformaciones sobre el conjunto de entrenamiento y validación:

In [None]:
%%time

with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)

train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train, dataset_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval, dataset_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")


In [None]:
_FEATURE_KEYS = ["input_ids", "attention_mask", "token_type_ids"]
_LABEL_KEYS = ['start_token_idx', 'end_token_idx']


def _build_dataset(columns):
    """Return a tf.data.Dataset of (features dict, labels dict), cast to int32."""
    features = {key: tf.cast(columns[key], dtype="int32") for key in _FEATURE_KEYS}
    labels = {key: tf.cast(columns[key], dtype="int32") for key in _LABEL_KEYS}
    return tf.data.Dataset.from_tensor_slices((features, labels))


# One (unbatched) element per usable example.
train_dataset = _build_dataset(dataset_train)

eval_dataset = _build_dataset(dataset_eval)

## Fase 3: Construcción del modelo

In [None]:
# Spanish definite and indefinite articles (lowercase). The dataset is the
# Spanish SQuAD, so removing the English "a/an/the" would leave real articles
# in place and strip the Spanish preposition "a".
_SPANISH_ARTICLES = ("el", "la", "los", "las", "un", "una", "unos", "unas")

# ASCII punctuation plus the Spanish opening marks, which are not part of
# string.punctuation.
_PUNCTUATION = frozenset(string.punctuation) | frozenset("¿¡")


def _compile_articles(articles):
    """Compile a whole-word regex matching any of the given articles."""
    alternatives = "|".join(re.escape(article) for article in articles)
    return re.compile(r"\b(" + alternatives + r")\b", re.UNICODE)


_SPANISH_ARTICLES_RE = _compile_articles(_SPANISH_ARTICLES)


def normalize_text(text, articles=_SPANISH_ARTICLES):
    """Normalize an answer string for exact-match comparison.

    Lowercases the text, removes punctuation, removes the given articles
    (whole words only) and collapses whitespace.

    Args:
        text: the string to normalize.
        articles: lowercase words to drop; defaults to the Spanish articles.
            Pass an empty sequence to keep every word, or
            ("a", "an", "the") for English data.

    Returns:
        The normalized string.
    """
    text = text.lower()

    # Remove punctuation
    text = "".join(ch for ch in text if ch not in _PUNCTUATION)

    # Remove articles
    if articles:
        if articles is _SPANISH_ARTICLES:
            regex = _SPANISH_ARTICLES_RE
        else:
            regex = _compile_articles(articles)
        text = regex.sub(" ", text)

    # Remove extra white space
    return " ".join(text.split())


class ExactMatch(keras.callbacks.Callback):
    """Keras callback that prints the exact-match score after every epoch.

    For each usable evaluation example the predicted start/end tokens are
    mapped back to a character span of the context. The normalized predicted
    text counts as a hit when it equals any normalized reference answer.

    Reads the module-level `eval_squad_examples`; `x_eval` must contain the
    encodings of its non-skipped examples, in the same order.
    """

    def __init__(self, x_eval, y_eval):
        # Let the base class initialise its own state.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        """Run prediction on the evaluation inputs and print the score."""
        pred_start, pred_end = self.model.predict(self.x_eval)
        eval_examples_no_skip = [eg for eg in eval_squad_examples if not eg.skip]

        count = 0
        for squad_eg, start, end in zip(eval_examples_no_skip, pred_start, pred_end):
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            # A start position beyond the context tokens cannot be mapped
            # back to text.
            if start >= len(offsets):
                continue

            # The offsets were computed on the whitespace-normalised context
            # during preprocessing, so the same normalisation is applied
            # before slicing.
            context = " ".join(str(squad_eg.context).split())
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = context[pred_char_start:pred_char_end]
            else:
                pred_ans = context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(ans) for ans in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1

        # Avoid a ZeroDivisionError when the evaluation set is empty.
        total = len(self.y_eval[0])
        acc = count / total if total else 0.0
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")


### Generación del modelo y configuración del entrenamiento.


Se definen los parámetros de configuración para el entrenamiento.

In [None]:
# Training configuration.
# presumably the number of usable training examples — TODO confirm
TRAIN_DATA_SIZE = 88641
# NOTE(review): not referenced in the visible part of the notebook.
NB_BATCHES_TRAIN = 15000
# Number of examples per batch.
BATCH_SIZE = 16
# Number of passes over the training data (the `epochs` argument of fit).
NB_EPOCHS = 2

Se define el Modelo utilizado para realizar el finetune con el dataset en español.

En este caso el modelo utilizado es [bert-base-multilingual-cased](https://huggingface.co/bert-base-multilingual-cased), uno de los tantos modelos BERT preentrenados disponibles en Hugging Face (no se trata de una variante DistilBERT).

In [None]:
def create_model():
    """Load the pretrained question-answering model and compile it.

    The same sparse categorical cross-entropy (on logits) is used for both
    model outputs: the answer start position and the answer end position.

    Returns:
        (model, optimizer): the compiled model and its Adam optimizer.
    """
    # `learning_rate` is the supported keyword (`lr` is a deprecated alias)
    # and matches the spelling used in unfreeze_model.
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    # NOTE(review): the tokenizer used for preprocessing is built from a
    # different checkpoint's vocabulary than this model — confirm that the
    # token ids are compatible.
    model = TFBertForQuestionAnswering.from_pretrained("bert-base-multilingual-cased")
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.AUTO,
        name='sparse_categorical_crossentropy',
    )
    model.compile(optimizer=optimizer, loss=[loss, loss])

    return model, optimizer


In [None]:
# When True, unfreeze_model(model) is called right after the model is created.
UNFREEZE = False

In [None]:
def unfreeze_model(model, trainable=True):
    """Set the trainability of the model's first layer and recompile.

    The function previously set `trainable = False`, i.e. it froze the layer
    despite its name. It now unfreezes by default; pass `trainable=False` to
    get the former freezing behaviour.

    Args:
        model: a Keras model; modified and recompiled in place.
        trainable: value assigned to `model.layers[0].trainable`.
            presumably layers[0] is the pretrained BERT encoder — TODO confirm.
    """
    model.layers[0].trainable = trainable

    # Changing `trainable` only takes effect after the model is compiled again.
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.AUTO,
        name='sparse_categorical_crossentropy',
    )
    model.compile(optimizer=optimizer, loss=[loss, loss])

In [None]:
# Create (and optionally re-configure) the model with its variables placed on
# the first GPU.
with tf.device('/GPU:0'):
  model, optimizer = create_model()

  # Optional second configuration step controlled by the UNFREEZE flag.
  if UNFREEZE:
    unfreeze_model(model)

In [None]:
# Print the layers and parameter counts of the model.
model.summary()

Generación de los callbacks necesarios para el entrenamiento desde la API de Keras:

In [None]:
# Evaluation inputs keyed by name. `x_eval` is the list
# [input_ids, token_type_ids, attention_mask], but the Hugging Face TF models
# read a positional list as [input_ids, attention_mask, token_type_ids];
# passing a dict removes any dependency on the order.
eval_inputs = {
    key: dataset_eval[key]
    for key in ("input_ids", "attention_mask", "token_type_ids")
}
exact_match_callback = ExactMatch(eval_inputs, y_eval)

# One TensorBoard run directory per execution, named by timestamp.
logdir = os.path.join(tb_path, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, update_freq=100)

# Save the full model at the end of an epoch whenever the training loss
# improves.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path_callback, monitor='loss', verbose=0, save_best_only=True,
    save_weights_only=False, mode='auto', save_freq='epoch'
)

callbacks = [exact_match_callback, checkpoint_callback, tensorboard_callback]

## Fase 4: Proceso de entrenamiento y validación

### Proceso de entrenamiento

In [None]:
# `ds_train` was used without being defined anywhere above; build it here
# from `train_dataset`.
# The labels are converted from a dict to a (start, end) tuple so that they
# pair with the two losses by position rather than by key — a dict is
# presumably flattened in key-sorted order, which would put the end position
# first (verify against the Keras version in use).
# NOTE(review): the shuffle buffer is set to TRAIN_DATA_SIZE to shuffle the
# whole training set — confirm the intended buffer size.
ds_train = (
    train_dataset
    .map(lambda features, labels: (
        features,
        (labels["start_token_idx"], labels["end_token_idx"]),
    ))
    .shuffle(TRAIN_DATA_SIZE)
    .batch(BATCH_SIZE)
)

model.fit(ds_train,
          epochs=NB_EPOCHS,
          callbacks=callbacks)


In [None]:
# Save the trained model under <project>/chkp/<version>/ in SavedModel format.
# os.path.join avoids the doubled slash that string concatenation produced,
# because checkpoint_path_callback already ends with "/".
version_model = "1"
model.save(os.path.join(checkpoint_path_callback, version_model, ""), save_format='tf')

In [None]:
# Load the full TensorFlow SavedModel (pb) back from disk.
from tensorflow.keras.models import load_model

# Same location the model was saved to; os.path.join avoids the doubled slash
# that string concatenation produced.
version = "1"
loaded_model = load_model(os.path.join(checkpoint_path_callback, version, ""))
# NOTE(review): assumes the object returned by load_model exposes
# `.signatures` — confirm for the installed TensorFlow version.
print(list(loaded_model.signatures.keys()))
# Summarize the loaded model.
loaded_model.summary()

***

## Fase 5: Predicciones sobre el modelo entrenado

### Predicciones sobre el conjunto de validación

In [None]:
import collections

# Start/end logits of one example, stored as plain Python lists.
RawResult = collections.namedtuple("RawResult", ["start_logits", "end_logits"])


def get_raw_results(predictions):
    """Yield one RawResult per example of a batch of predictions.

    `predictions` maps 'start_logits' and 'end_logits' to iterables whose
    items provide `.numpy()`; each item is converted to a list.
    """
    per_example = zip(predictions['start_logits'], predictions['end_logits'])
    for example_start, example_end in per_example:
        start_as_list = example_start.numpy().tolist()
        end_as_list = example_end.numpy().tolist()
        yield RawResult(start_logits=start_as_list, end_logits=end_as_list)

In [None]:
# Run the model over the evaluation set batch by batch and collect the
# per-example start/end logits.
# The progress total is derived from the data instead of a hard-coded batch
# count (ceiling division, so a final partial batch is counted).
num_eval_examples = len(dataset_eval["input_ids"])
total_batches = -(-num_eval_examples // BATCH_SIZE)

all_results = []
for count, (x, _) in enumerate(eval_dataset.batch(BATCH_SIZE)):
    start_logits, end_logits = model(x, training=False)
    output_dict = dict(
        start_logits=start_logits,
        end_logits=end_logits)
    all_results.extend(get_raw_results(output_dict))
    # Report progress every 100 batches.
    if count % 100 == 0:
        print("{}/{}".format(count, total_batches))

### Predicciones sobre datos de entrada personalizados

In [None]:
def is_whitespace(c):
    """Return True if `c` is a space, tab, CR, LF or U+202F (narrow no-break space)."""
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

def whitespace_split(text):
    """Split `text` into words on the characters accepted by is_whitespace.

    Consecutive separators produce no empty words.
    """
    words = []
    current = []
    for char in text:
        if not is_whitespace(char):
            current.append(char)
        elif current:
            # A separator closes the word being accumulated.
            words.append("".join(current))
            current = []
    if current:
        words.append("".join(current))
    return words

def tokenize_context(text_words):
    """Tokenize each word and record which word every token came from.

    `token_to_id` returns a single id (or None), so the previous
    implementation failed on `+=` and `len()`. Each word is now encoded
    without special tokens to obtain its list of sub-word tokens.

    Args:
        text_words: list of words, e.g. the output of whitespace_split.

    Returns:
        (text_tok, tok_to_word_id): the flat list of sub-word tokens and, for
        each token, the index in `text_words` of the word that produced it.
    """
    text_tok = []
    tok_to_word_id = []
    for word_id, word in enumerate(text_words):
        word_tok = tokenizer.encode(word, add_special_tokens=False).tokens
        text_tok.extend(word_tok)
        tok_to_word_id.extend([word_id] * len(word_tok))
    return text_tok, tok_to_word_id

def get_ids(tokens):
    """Map tokens to vocabulary ids using the fast tokenizer.

    The fast `BertWordPieceTokenizer` has no `convert_tokens_to_ids` method;
    `token_to_id` is its per-token equivalent. Tokens that are missing from
    the vocabulary are mapped to the id of "[UNK]".
    """
    unk_id = tokenizer.token_to_id("[UNK]")
    ids = []
    for token in tokens:
        token_id = tokenizer.token_to_id(token)
        ids.append(unk_id if token_id is None else token_id)
    return ids

def get_mask(tokens):
    """Return an integer array holding 1 for every token except "[PAD]" (0)."""
    is_real_token = np.char.not_equal(tokens, "[PAD]")
    return is_real_token.astype(int)

def get_segments(tokens):
    """Return one segment id (0 or 1) per token.

    The id starts at 0 and flips after every "[SEP]" token; the "[SEP]"
    itself still belongs to the segment it closes.
    """
    segment_ids = []
    segment = 0
    for token in tokens:
        segment_ids.append(segment)
        if token == "[SEP]":
            # Toggle between 0 and 1.
            segment ^= 1
    return segment_ids

def create_input_dict(question, context):
    """Encode a (question, context) pair into model inputs.

    Args:
        question: the question text.
        context: the passage in which the answer is searched.

    Returns:
        A 4-tuple:
        - input_dict: int32 tensors with a leading batch dimension of 1 for
          "input_ids", "attention_mask" and "token_type_ids", zero-padded up
          to `max_len`;
        - context_words: the context split on whitespace;
        - context_tok_to_word_id: the token ids of the context encoded alone;
        - the number of ids of the question encoded alone.
    """
    # The previous code encoded the undefined global `my_question` here
    # instead of the `question` argument.
    question_tok = tokenizer.encode(question)

    context_words = whitespace_split(context)
    # NOTE(review): despite its name this holds vocabulary ids, not a
    # token-to-word index map — confirm what the callers expect.
    context_tok_to_word_id = tokenizer.encode(context).ids

    # Encode the pair once and reuse the result for every field.
    # NOTE(review): the pair is encoded question-first here, whereas the
    # training examples are laid out context-first — confirm that inference
    # and training use the same layout.
    encoding = tokenizer.encode(question, context)

    def _pad(values):
        # Zero-pad up to max_len; longer sequences are left as they are.
        return values + [0] * (max_len - len(values))

    input_ids = _pad(encoding.ids)
    input_mask = _pad(encoding.attention_mask)
    input_type_ids = _pad(encoding.type_ids)

    input_dict = {
        "input_ids": tf.expand_dims(tf.cast(input_ids, tf.int32), 0),
        "attention_mask": tf.expand_dims(tf.cast(input_mask, tf.int32), 0),
        "token_type_ids": tf.expand_dims(tf.cast(input_type_ids, tf.int32), 0),
    }

    return input_dict, context_words, context_tok_to_word_id, len(question_tok.ids)