# Fase 1: Importar dependencias

In [1]:
%reset -f
#!pip install sentencepiece
#!pip install tf-models-official
#!pip install tf-models-nightly # mejor instalar la versión en desarrollo
#!pip install tf-nightly
#!pip install tensorflow_hub
#!pip install numpy


In [2]:
import tensorflow as tf

In [3]:
tf.__version__

'2.7.0'

In [4]:
import tensorflow_hub as hub

from official.nlp.bert.tokenization import FullTokenizer
from official.nlp.bert.input_pipeline import create_squad_dataset
from official.nlp.data.squad_lib import generate_tf_record_from_json_file

from official.nlp import optimization

from official.nlp.data.squad_lib import read_squad_examples
from official.nlp.data.squad_lib import FeatureWriter
from official.nlp.data.squad_lib import convert_examples_to_features
from official.nlp.data.squad_lib import write_predictions

In [5]:
import numpy as np
import math
import random
import time
import json
import collections
import os

# Fase 2: Preprocesado de Datos

In [6]:
pwd

'/home/icarlos/BERT/Q&A'

In [7]:
input_meta_data = generate_tf_record_from_json_file(
    "/home/icarlos/BERT/Q&A/train-v1.1.json",
    "/home/icarlos/BERT/Q&A/vocab.txt",
    "/home/icarlos/BERT/Q&A/train-v1.1.tf_record")

In [8]:
with tf.io.gfile.GFile("/home/icarlos/BERT/Q&A/train_meta_data", "w") as writer:
    writer.write(json.dumps(input_meta_data, indent=4) + "\n")

In [9]:
BATCH_SIZE = 1

train_dataset = create_squad_dataset(
    "/home/icarlos/BERT/Q&A/train-v1.1.tf_record",
    input_meta_data['max_seq_length'], # 384
    BATCH_SIZE,
    is_training=True)

2021-11-24 22:05:14.076876: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-24 22:05:14.081641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-24 22:05:14.082098: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-24 22:05:14.083238: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

# Fase 3: Construcción del modelo

## Capa Squad

In [10]:
class BertSquadLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))

    def call(self, inputs):
        logits = self.final_dense(inputs) # (batch_size, seq_len, 2)
        logits = tf.transpose(logits, [2, 0, 1]) # (2, batch_size, seq_len)
        unstacked_logits = tf.unstack(logits, axis=0) # [(batch_size, seq_len), (batch_size, seq_len)] 
        return unstacked_logits[0], unstacked_logits[1]

## Modelo completo

In [15]:
class BERTSquad(tf.keras.Model):
    
    def __init__(self,
                 name="bert_squad"):
        super(BERTSquad, self).__init__(name=name)
        
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=True)
        
        self.squad_layer = BertSquadLayer()
    
    def apply_bert(self, inputs):
        _ , sequence_output = self.bert_layer([inputs["input_word_ids"],
                                               inputs["input_mask"],
                                               inputs["input_type_ids"]])
        return sequence_output

    def call(self, inputs):
        seq_output = self.apply_bert(inputs)

        start_logits, end_logits = self.squad_layer(seq_output)
        
        return start_logits, end_logits

# Fase 4: Entrenamiento

## Creación de la IA

In [12]:
TRAIN_DATA_SIZE = 88641
NB_BATCHES_TRAIN = 44000
BATCH_SIZE = 3
NB_EPOCHS = 3
INIT_LR = 5e-5
WARMUP_STEPS = int(NB_BATCHES_TRAIN * 0.1)

In [13]:
train_dataset_light = train_dataset.take(NB_BATCHES_TRAIN)

In [16]:
bert_squad = BERTSquad()

In [17]:
optimizer = optimization.create_optimizer(
    init_lr=INIT_LR,
    num_train_steps=NB_BATCHES_TRAIN,
    num_warmup_steps=WARMUP_STEPS)

In [18]:
def squad_loss_fn(labels, model_outputs):
    start_positions = labels['start_positions']
    end_positions = labels['end_positions']
    start_logits, end_logits = model_outputs

    start_loss = tf.keras.backend.sparse_categorical_crossentropy(
        start_positions, start_logits, from_logits=True)
    end_loss = tf.keras.backend.sparse_categorical_crossentropy(
        end_positions, end_logits, from_logits=True)
    
    total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2

    return total_loss

train_loss = tf.keras.metrics.Mean(name="train_loss")

In [19]:
next(iter(train_dataset_light))

({'input_word_ids': <tf.Tensor: shape=(1, 384), dtype=int32, numpy=
  array([[  101,  2000,  3183,  2001,  2198,  1038,  1012,  1047,  3217,
           2278,  2496,  1029,   102,  1996,  7437,  1038,  1012,  1047,
           3217,  2278,  2820,  2005,  2248,  3521,  2913,  2012,  1996,
           2118,  1997, 10289,  8214,  2003,  4056,  2000,  2470,  1010,
           2495,  1998, 15641,  2006,  1996,  5320,  1997,  6355,  4736,
           1998,  1996,  3785,  2005,  9084,  3521,  1012,  2009,  4107,
           8065,  1010,  3040,  1005,  1055,  1010,  1998,  8324,  5445,
           1999,  3521,  2913,  1012,  2009,  2001,  2631,  1999,  3069,
           2083,  1996, 11440,  1997,  7437,  1038,  1012,  1047,  3217,
           2278,  1010,  1996,  7794,  1997,  9383,  1005,  1055,  3954,
           4097,  1047,  3217,  2278,  1012,  1996,  2820,  2001,  4427,
           2011,  1996,  4432,  1997,  1996,  7065,  1012, 10117,  1049,
           1012,  2002,  9695,  2232, 20116,  2278,  101

In [20]:
bert_squad.compile(optimizer,
                   squad_loss_fn)

In [21]:
checkpoint_path = "/home/icarlos/BERT/Q&A/ckpt_bert_squad/"

ckpt = tf.train.Checkpoint(bert_squad=bert_squad)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Último checkpoint restaurado!!")

## Entrenamiento personalizado

In [22]:
for epoch in range(NB_EPOCHS):
    print("Inicio del Epoch {}".format(epoch+1))
    start = time.time()
    
    train_loss.reset_states()
    
    for (batch, (inputs, targets)) in enumerate(train_dataset_light):
        with tf.GradientTape() as tape:
            model_outputs = bert_squad(inputs)
            loss = squad_loss_fn(targets, model_outputs)
        
        gradients = tape.gradient(loss, bert_squad.trainable_variables)
        optimizer.apply_gradients(zip(gradients, bert_squad.trainable_variables))
        
        train_loss(loss)
        
        if batch % 50 == 0:
            print("Epoch {} Lote {} Pérdida {:.4f}".format(
                epoch+1, batch, train_loss.result()))
        """
        if batch % 500 == 0:
            ckpt_save_path = ckpt_manager.save()
            print("Guardando checkpoint para el epoch {} en el directorio {}".format(epoch+1,
                                                                ckpt_save_path))"""
    print("Tiempo total para entrenar 1 epoch: {} segs\n".format(time.time() - start))

Inicio del Epoch 1


2021-11-24 22:08:34.447445: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 1 Lote 0 Pérdida 6.0601
Epoch 1 Lote 50 Pérdida 5.9680
Epoch 1 Lote 100 Pérdida 5.9271
Epoch 1 Lote 150 Pérdida 5.9106
Epoch 1 Lote 200 Pérdida 5.8838
Epoch 1 Lote 250 Pérdida 5.8429
Epoch 1 Lote 300 Pérdida 5.7826
Epoch 1 Lote 350 Pérdida 5.7139
Epoch 1 Lote 400 Pérdida 5.6307
Epoch 1 Lote 450 Pérdida 5.5336
Epoch 1 Lote 500 Pérdida 5.4196
Epoch 1 Lote 550 Pérdida 5.3051
Epoch 1 Lote 600 Pérdida 5.1861
Epoch 1 Lote 650 Pérdida 5.0797
Epoch 1 Lote 700 Pérdida 4.9616
Epoch 1 Lote 750 Pérdida 4.8364
Epoch 1 Lote 800 Pérdida 4.7354
Epoch 1 Lote 850 Pérdida 4.5974
Epoch 1 Lote 900 Pérdida 4.4960
Epoch 1 Lote 950 Pérdida 4.4166
Epoch 1 Lote 1000 Pérdida 4.3473
Epoch 1 Lote 1050 Pérdida 4.2614
Epoch 1 Lote 1100 Pérdida 4.1880
Epoch 1 Lote 1150 Pérdida 4.1160
Epoch 1 Lote 1200 Pérdida 4.0581
Epoch 1 Lote 1250 Pérdida 3.9965
Epoch 1 Lote 1300 Pérdida 3.9571
Epoch 1 Lote 1350 Pérdida 3.9233
Epoch 1 Lote 1400 Pérdida 3.8766
Epoch 1 Lote 1450 Pérdida 3.8449
Epoch 1 Lote 1500 Pérdida 3.8059


Epoch 1 Lote 12400 Pérdida 2.2235
Epoch 1 Lote 12450 Pérdida 2.2204
Epoch 1 Lote 12500 Pérdida 2.2191
Epoch 1 Lote 12550 Pérdida 2.2198
Epoch 1 Lote 12600 Pérdida 2.2178
Epoch 1 Lote 12650 Pérdida 2.2171
Epoch 1 Lote 12700 Pérdida 2.2152
Epoch 1 Lote 12750 Pérdida 2.2117
Epoch 1 Lote 12800 Pérdida 2.2107
Epoch 1 Lote 12850 Pérdida 2.2116
Epoch 1 Lote 12900 Pérdida 2.2084
Epoch 1 Lote 12950 Pérdida 2.2045
Epoch 1 Lote 13000 Pérdida 2.2030
Epoch 1 Lote 13050 Pérdida 2.1996
Epoch 1 Lote 13100 Pérdida 2.1979
Epoch 1 Lote 13150 Pérdida 2.1981
Epoch 1 Lote 13200 Pérdida 2.1968
Epoch 1 Lote 13250 Pérdida 2.1949
Epoch 1 Lote 13300 Pérdida 2.1934
Epoch 1 Lote 13350 Pérdida 2.1920
Epoch 1 Lote 13400 Pérdida 2.1888
Epoch 1 Lote 13450 Pérdida 2.1853
Epoch 1 Lote 13500 Pérdida 2.1834
Epoch 1 Lote 13550 Pérdida 2.1828
Epoch 1 Lote 13600 Pérdida 2.1809
Epoch 1 Lote 13650 Pérdida 2.1803
Epoch 1 Lote 13700 Pérdida 2.1797
Epoch 1 Lote 13750 Pérdida 2.1757
Epoch 1 Lote 13800 Pérdida 2.1750
Epoch 1 Lote 1

Epoch 1 Lote 24450 Pérdida 1.9198
Epoch 1 Lote 24500 Pérdida 1.9189
Epoch 1 Lote 24550 Pérdida 1.9187
Epoch 1 Lote 24600 Pérdida 1.9181
Epoch 1 Lote 24650 Pérdida 1.9170
Epoch 1 Lote 24700 Pérdida 1.9163
Epoch 1 Lote 24750 Pérdida 1.9148
Epoch 1 Lote 24800 Pérdida 1.9141
Epoch 1 Lote 24850 Pérdida 1.9128
Epoch 1 Lote 24900 Pérdida 1.9110
Epoch 1 Lote 24950 Pérdida 1.9098
Epoch 1 Lote 25000 Pérdida 1.9078
Epoch 1 Lote 25050 Pérdida 1.9065
Epoch 1 Lote 25100 Pérdida 1.9057
Epoch 1 Lote 25150 Pérdida 1.9049
Epoch 1 Lote 25200 Pérdida 1.9059
Epoch 1 Lote 25250 Pérdida 1.9049
Epoch 1 Lote 25300 Pérdida 1.9037
Epoch 1 Lote 25350 Pérdida 1.9026
Epoch 1 Lote 25400 Pérdida 1.9011
Epoch 1 Lote 25450 Pérdida 1.8996
Epoch 1 Lote 25500 Pérdida 1.8987
Epoch 1 Lote 25550 Pérdida 1.8984
Epoch 1 Lote 25600 Pérdida 1.8977
Epoch 1 Lote 25650 Pérdida 1.8969
Epoch 1 Lote 25700 Pérdida 1.8948
Epoch 1 Lote 25750 Pérdida 1.8952
Epoch 1 Lote 25800 Pérdida 1.8946
Epoch 1 Lote 25850 Pérdida 1.8944
Epoch 1 Lote 2

Epoch 1 Lote 36500 Pérdida 1.7564
Epoch 1 Lote 36550 Pérdida 1.7575
Epoch 1 Lote 36600 Pérdida 1.7580
Epoch 1 Lote 36650 Pérdida 1.7589
Epoch 1 Lote 36700 Pérdida 1.7603
Epoch 1 Lote 36750 Pérdida 1.7601
Epoch 1 Lote 36800 Pérdida 1.7597
Epoch 1 Lote 36850 Pérdida 1.7589
Epoch 1 Lote 36900 Pérdida 1.7582
Epoch 1 Lote 36950 Pérdida 1.7583
Epoch 1 Lote 37000 Pérdida 1.7574
Epoch 1 Lote 37050 Pérdida 1.7568
Epoch 1 Lote 37100 Pérdida 1.7560
Epoch 1 Lote 37150 Pérdida 1.7554
Epoch 1 Lote 37200 Pérdida 1.7543
Epoch 1 Lote 37250 Pérdida 1.7538
Epoch 1 Lote 37300 Pérdida 1.7530
Epoch 1 Lote 37350 Pérdida 1.7528
Epoch 1 Lote 37400 Pérdida 1.7532
Epoch 1 Lote 37450 Pérdida 1.7535
Epoch 1 Lote 37500 Pérdida 1.7531
Epoch 1 Lote 37550 Pérdida 1.7519
Epoch 1 Lote 37600 Pérdida 1.7521
Epoch 1 Lote 37650 Pérdida 1.7510
Epoch 1 Lote 37700 Pérdida 1.7498
Epoch 1 Lote 37750 Pérdida 1.7494
Epoch 1 Lote 37800 Pérdida 1.7491
Epoch 1 Lote 37850 Pérdida 1.7489
Epoch 1 Lote 37900 Pérdida 1.7485
Epoch 1 Lote 3

Epoch 2 Lote 4650 Pérdida 1.2678
Epoch 2 Lote 4700 Pérdida 1.2624
Epoch 2 Lote 4750 Pérdida 1.2600
Epoch 2 Lote 4800 Pérdida 1.2581
Epoch 2 Lote 4850 Pérdida 1.2564
Epoch 2 Lote 4900 Pérdida 1.2552
Epoch 2 Lote 4950 Pérdida 1.2575
Epoch 2 Lote 5000 Pérdida 1.2589
Epoch 2 Lote 5050 Pérdida 1.2571
Epoch 2 Lote 5100 Pérdida 1.2577
Epoch 2 Lote 5150 Pérdida 1.2557
Epoch 2 Lote 5200 Pérdida 1.2558
Epoch 2 Lote 5250 Pérdida 1.2562
Epoch 2 Lote 5300 Pérdida 1.2575
Epoch 2 Lote 5350 Pérdida 1.2589
Epoch 2 Lote 5400 Pérdida 1.2573
Epoch 2 Lote 5450 Pérdida 1.2552
Epoch 2 Lote 5500 Pérdida 1.2521
Epoch 2 Lote 5550 Pérdida 1.2491
Epoch 2 Lote 5600 Pérdida 1.2473
Epoch 2 Lote 5650 Pérdida 1.2436
Epoch 2 Lote 5700 Pérdida 1.2387
Epoch 2 Lote 5750 Pérdida 1.2354
Epoch 2 Lote 5800 Pérdida 1.2324
Epoch 2 Lote 5850 Pérdida 1.2283
Epoch 2 Lote 5900 Pérdida 1.2239
Epoch 2 Lote 5950 Pérdida 1.2205
Epoch 2 Lote 6000 Pérdida 1.2169
Epoch 2 Lote 6050 Pérdida 1.2199
Epoch 2 Lote 6100 Pérdida 1.2168
Epoch 2 Lo

Epoch 2 Lote 16900 Pérdida 1.0114
Epoch 2 Lote 16950 Pérdida 1.0107
Epoch 2 Lote 17000 Pérdida 1.0101
Epoch 2 Lote 17050 Pérdida 1.0104
Epoch 2 Lote 17100 Pérdida 1.0110
Epoch 2 Lote 17150 Pérdida 1.0122
Epoch 2 Lote 17200 Pérdida 1.0120
Epoch 2 Lote 17250 Pérdida 1.0108
Epoch 2 Lote 17300 Pérdida 1.0110
Epoch 2 Lote 17350 Pérdida 1.0114
Epoch 2 Lote 17400 Pérdida 1.0122
Epoch 2 Lote 17450 Pérdida 1.0113
Epoch 2 Lote 17500 Pérdida 1.0104
Epoch 2 Lote 17550 Pérdida 1.0085
Epoch 2 Lote 17600 Pérdida 1.0070
Epoch 2 Lote 17650 Pérdida 1.0051
Epoch 2 Lote 17700 Pérdida 1.0041
Epoch 2 Lote 17750 Pérdida 1.0035
Epoch 2 Lote 17800 Pérdida 1.0024
Epoch 2 Lote 17850 Pérdida 1.0018
Epoch 2 Lote 17900 Pérdida 1.0011
Epoch 2 Lote 17950 Pérdida 1.0001
Epoch 2 Lote 18000 Pérdida 0.9992
Epoch 2 Lote 18050 Pérdida 0.9981
Epoch 2 Lote 18100 Pérdida 0.9961
Epoch 2 Lote 18150 Pérdida 0.9950
Epoch 2 Lote 18200 Pérdida 0.9930
Epoch 2 Lote 18250 Pérdida 0.9929
Epoch 2 Lote 18300 Pérdida 0.9913
Epoch 2 Lote 1

Epoch 2 Lote 28950 Pérdida 0.8249
Epoch 2 Lote 29000 Pérdida 0.8244
Epoch 2 Lote 29050 Pérdida 0.8236
Epoch 2 Lote 29100 Pérdida 0.8232
Epoch 2 Lote 29150 Pérdida 0.8234
Epoch 2 Lote 29200 Pérdida 0.8228
Epoch 2 Lote 29250 Pérdida 0.8220
Epoch 2 Lote 29300 Pérdida 0.8207
Epoch 2 Lote 29350 Pérdida 0.8197
Epoch 2 Lote 29400 Pérdida 0.8189
Epoch 2 Lote 29450 Pérdida 0.8183
Epoch 2 Lote 29500 Pérdida 0.8177
Epoch 2 Lote 29550 Pérdida 0.8179
Epoch 2 Lote 29600 Pérdida 0.8175
Epoch 2 Lote 29650 Pérdida 0.8176
Epoch 2 Lote 29700 Pérdida 0.8168
Epoch 2 Lote 29750 Pérdida 0.8175
Epoch 2 Lote 29800 Pérdida 0.8176
Epoch 2 Lote 29850 Pérdida 0.8170
Epoch 2 Lote 29900 Pérdida 0.8171
Epoch 2 Lote 29950 Pérdida 0.8164
Epoch 2 Lote 30000 Pérdida 0.8161
Epoch 2 Lote 30050 Pérdida 0.8150
Epoch 2 Lote 30100 Pérdida 0.8141
Epoch 2 Lote 30150 Pérdida 0.8130
Epoch 2 Lote 30200 Pérdida 0.8122
Epoch 2 Lote 30250 Pérdida 0.8114
Epoch 2 Lote 30300 Pérdida 0.8109
Epoch 2 Lote 30350 Pérdida 0.8107
Epoch 2 Lote 3

Epoch 2 Lote 41000 Pérdida 0.7517
Epoch 2 Lote 41050 Pérdida 0.7517
Epoch 2 Lote 41100 Pérdida 0.7519
Epoch 2 Lote 41150 Pérdida 0.7516
Epoch 2 Lote 41200 Pérdida 0.7520
Epoch 2 Lote 41250 Pérdida 0.7518
Epoch 2 Lote 41300 Pérdida 0.7520
Epoch 2 Lote 41350 Pérdida 0.7524
Epoch 2 Lote 41400 Pérdida 0.7526
Epoch 2 Lote 41450 Pérdida 0.7528
Epoch 2 Lote 41500 Pérdida 0.7524
Epoch 2 Lote 41550 Pérdida 0.7522
Epoch 2 Lote 41600 Pérdida 0.7517
Epoch 2 Lote 41650 Pérdida 0.7517
Epoch 2 Lote 41700 Pérdida 0.7514
Epoch 2 Lote 41750 Pérdida 0.7514
Epoch 2 Lote 41800 Pérdida 0.7514
Epoch 2 Lote 41850 Pérdida 0.7520
Epoch 2 Lote 41900 Pérdida 0.7524
Epoch 2 Lote 41950 Pérdida 0.7525
Epoch 2 Lote 42000 Pérdida 0.7534
Epoch 2 Lote 42050 Pérdida 0.7542
Epoch 2 Lote 42100 Pérdida 0.7550
Epoch 2 Lote 42150 Pérdida 0.7554
Epoch 2 Lote 42200 Pérdida 0.7560
Epoch 2 Lote 42250 Pérdida 0.7559
Epoch 2 Lote 42300 Pérdida 0.7564
Epoch 2 Lote 42350 Pérdida 0.7570
Epoch 2 Lote 42400 Pérdida 0.7570
Epoch 2 Lote 4

Epoch 3 Lote 9250 Pérdida 1.1953
Epoch 3 Lote 9300 Pérdida 1.1945
Epoch 3 Lote 9350 Pérdida 1.1917
Epoch 3 Lote 9400 Pérdida 1.1877
Epoch 3 Lote 9450 Pérdida 1.1851
Epoch 3 Lote 9500 Pérdida 1.1829
Epoch 3 Lote 9550 Pérdida 1.1794
Epoch 3 Lote 9600 Pérdida 1.1766
Epoch 3 Lote 9650 Pérdida 1.1737
Epoch 3 Lote 9700 Pérdida 1.1694
Epoch 3 Lote 9750 Pérdida 1.1668
Epoch 3 Lote 9800 Pérdida 1.1628
Epoch 3 Lote 9850 Pérdida 1.1586
Epoch 3 Lote 9900 Pérdida 1.1558
Epoch 3 Lote 9950 Pérdida 1.1547
Epoch 3 Lote 10000 Pérdida 1.1502
Epoch 3 Lote 10050 Pérdida 1.1485
Epoch 3 Lote 10100 Pérdida 1.1457
Epoch 3 Lote 10150 Pérdida 1.1419
Epoch 3 Lote 10200 Pérdida 1.1395
Epoch 3 Lote 10250 Pérdida 1.1388
Epoch 3 Lote 10300 Pérdida 1.1364
Epoch 3 Lote 10350 Pérdida 1.1339
Epoch 3 Lote 10400 Pérdida 1.1339
Epoch 3 Lote 10450 Pérdida 1.1355
Epoch 3 Lote 10500 Pérdida 1.1358
Epoch 3 Lote 10550 Pérdida 1.1358
Epoch 3 Lote 10600 Pérdida 1.1340
Epoch 3 Lote 10650 Pérdida 1.1330
Epoch 3 Lote 10700 Pérdida 1.

Epoch 3 Lote 21350 Pérdida 0.9369
Epoch 3 Lote 21400 Pérdida 0.9361
Epoch 3 Lote 21450 Pérdida 0.9346
Epoch 3 Lote 21500 Pérdida 0.9333
Epoch 3 Lote 21550 Pérdida 0.9321
Epoch 3 Lote 21600 Pérdida 0.9315
Epoch 3 Lote 21650 Pérdida 0.9304
Epoch 3 Lote 21700 Pérdida 0.9288
Epoch 3 Lote 21750 Pérdida 0.9272
Epoch 3 Lote 21800 Pérdida 0.9265
Epoch 3 Lote 21850 Pérdida 0.9255
Epoch 3 Lote 21900 Pérdida 0.9247
Epoch 3 Lote 21950 Pérdida 0.9247
Epoch 3 Lote 22000 Pérdida 0.9254
Epoch 3 Lote 22050 Pérdida 0.9255
Epoch 3 Lote 22100 Pérdida 0.9245
Epoch 3 Lote 22150 Pérdida 0.9233
Epoch 3 Lote 22200 Pérdida 0.9229
Epoch 3 Lote 22250 Pérdida 0.9221
Epoch 3 Lote 22300 Pérdida 0.9212
Epoch 3 Lote 22350 Pérdida 0.9199
Epoch 3 Lote 22400 Pérdida 0.9190
Epoch 3 Lote 22450 Pérdida 0.9180
Epoch 3 Lote 22500 Pérdida 0.9173
Epoch 3 Lote 22550 Pérdida 0.9164
Epoch 3 Lote 22600 Pérdida 0.9149
Epoch 3 Lote 22650 Pérdida 0.9146
Epoch 3 Lote 22700 Pérdida 0.9138
Epoch 3 Lote 22750 Pérdida 0.9124
Epoch 3 Lote 2

Epoch 3 Lote 33400 Pérdida 0.7780
Epoch 3 Lote 33450 Pérdida 0.7776
Epoch 3 Lote 33500 Pérdida 0.7773
Epoch 3 Lote 33550 Pérdida 0.7765
Epoch 3 Lote 33600 Pérdida 0.7763
Epoch 3 Lote 33650 Pérdida 0.7761
Epoch 3 Lote 33700 Pérdida 0.7755
Epoch 3 Lote 33750 Pérdida 0.7750
Epoch 3 Lote 33800 Pérdida 0.7745
Epoch 3 Lote 33850 Pérdida 0.7741
Epoch 3 Lote 33900 Pérdida 0.7737
Epoch 3 Lote 33950 Pérdida 0.7730
Epoch 3 Lote 34000 Pérdida 0.7724
Epoch 3 Lote 34050 Pérdida 0.7717
Epoch 3 Lote 34100 Pérdida 0.7710
Epoch 3 Lote 34150 Pérdida 0.7705
Epoch 3 Lote 34200 Pérdida 0.7703
Epoch 3 Lote 34250 Pérdida 0.7695
Epoch 3 Lote 34300 Pérdida 0.7693
Epoch 3 Lote 34350 Pérdida 0.7686
Epoch 3 Lote 34400 Pérdida 0.7682
Epoch 3 Lote 34450 Pérdida 0.7681
Epoch 3 Lote 34500 Pérdida 0.7679
Epoch 3 Lote 34550 Pérdida 0.7673
Epoch 3 Lote 34600 Pérdida 0.7669
Epoch 3 Lote 34650 Pérdida 0.7663
Epoch 3 Lote 34700 Pérdida 0.7656
Epoch 3 Lote 34750 Pérdida 0.7654
Epoch 3 Lote 34800 Pérdida 0.7652
Epoch 3 Lote 3

# Fase 5: Evaluación

## Preparación de la evaluación

Get the dev set in the session

In [23]:
eval_examples = read_squad_examples(
    "/home/icarlos/BERT/Q&A/dev-v1.1.json",
    is_training=False,
    version_2_with_negative=False)

Define the function that will write the tf_record file for the dev set

In [24]:
eval_writer = FeatureWriter(
    filename=os.path.join("/home/icarlos/BERT/Q&A/",
                          "eval.tf_record"),
    is_training=False)

Create a tokenizer for future information needs

In [25]:
my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1",
    trainable=False)
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

Define the function that add the features (feature is a protocol in tensorflow) to our eval_features list

In [26]:
def _append_feature(feature, is_padding):
    if not is_padding:
        eval_features.append(feature)
    eval_writer.process_feature(feature)

Create the eval features and the writes the tf.record file

In [27]:
eval_features = []
dataset_size = convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=_append_feature,
    batch_size=4)

In [28]:
eval_writer.close()

Load the ready-to-be-used dataset to our session

In [29]:
BATCH_SIZE = 4

eval_dataset = create_squad_dataset(
    "/home/icarlos/BERT/Q&A/eval.tf_record",
    384,#input_meta_data['max_seq_length'],
    BATCH_SIZE,
    is_training=False)

## Llevar a cabo las prediccioness

Definir un cierto tipo de colección (como un diccionario).

In [30]:
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

Devuelve cada elemento del lote de salida, uno por uno.

In [31]:
def get_raw_results(predictions):
    for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'],
                                                    predictions['start_logits'],
                                                    predictions['end_logits']):
        yield RawResult(
            unique_id=unique_ids.numpy(),
            start_logits=start_logits.numpy().tolist(),
            end_logits=end_logits.numpy().tolist())

Hacemos nuestras predicciones

In [32]:
all_results = []
for count, inputs in enumerate(eval_dataset):
    x, _ = inputs  
    unique_ids = x.pop("unique_ids")
    start_logits, end_logits = bert_squad(x, training=False)
    output_dict = dict(
        unique_ids=unique_ids,
        start_logits=start_logits,
        end_logits=end_logits)
    for result in get_raw_results(output_dict):
        all_results.append(result)
    if count % 100 == 0:
        print("{}/{}".format(count, 2709))

0/2709
100/2709
200/2709
300/2709
400/2709
500/2709
600/2709
700/2709
800/2709
900/2709
1000/2709
1100/2709
1200/2709
1300/2709
1400/2709
1500/2709
1600/2709
1700/2709
1800/2709
1900/2709
2000/2709
2100/2709
2200/2709
2300/2709
2400/2709
2500/2709
2600/2709
2700/2709


Escribimos nuestras predicciones en un fichero JSON que funcionará con el script de evaluación.

In [33]:
output_prediction_file = "/home/icarlos/BERT/Q&A/predictions.json"
output_nbest_file = "/home/icarlos/BERT/Q&A/nbest_predictions.json"
output_null_log_odds_file = "/home/icarlos/BERT/Q&A/null_odds.json"

write_predictions(
    eval_examples,
    eval_features,
    all_results,
    20,
    30,
    True,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose=False)

## Predicción casera

### Creación del diccionario de entrada

Concatenamos la pregunta y el contexto, separados por `["SEP"]`, tras la tokenización, tal cual como lo hicimos con el conjunto de entrenamiento.

Lo importante a recordar es que queremos que nuestra respuesta empiece y termine con una palabra real. Por ejemplo, la palabra "ecologically" es tokenizada como `["ecological", "##ly"]`, y si el token de fin es `["ecological"]` queremos usar la palabra "ecologically" como palabra final (del mismo modo si el token de fin es`["##ly"]`). Por eso, empezamos dividiendo nuestro contexto en palabras, y luego pasamos a tokens, recordando qué token se corresponde con qué palabra (ver la función `tokenize_context()` para más detalle).

#### Útiles varios

In [34]:
my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [35]:
def is_whitespace(c):
    '''
    Indica si un cadena de caracteres se corresponde con un espacio en blanco / separador o no.
    '''
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

In [36]:
def whitespace_split(text):
    '''
    Toma el texto y devuelve una lista de "palabras" separadas segun los 
    espacios en blanco / separadores anteriores.
    '''
    doc_tokens = []
    prev_is_whitespace = True
    for c in text:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
    return doc_tokens

In [37]:
def tokenize_context(text_words):
    '''
    Toma una lista de palabras (devueltas por whitespace_split()) y tokeniza cada
    palabra una por una. También almacena, para cada nuevo token, la palabra original
    del parámetro text_words.
    '''
    text_tok = []
    tok_to_word_id = []
    for word_id, word in enumerate(text_words):
        word_tok = tokenizer.tokenize(word)
        text_tok += word_tok
        tok_to_word_id += [word_id]*len(word_tok)
    return text_tok, tok_to_word_id

In [38]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_seg_id = 1-current_seg_id # Convierte 1 en 0 y viceversa
    return seg_ids

In [39]:
def create_input_dict(question, context):
    '''
    Take a question and a context as strings and return a dictionary with the 3
    elements needed for the model. Also return the context_words, the
    context_tok to context_word ids correspondance and the length of
    question_tok that we will need later.
    '''
    question_tok = tokenizer.tokenize(my_question)

    context_words = whitespace_split(context)
    context_tok, context_tok_to_word_id = tokenize_context(context_words)

    input_tok = question_tok + ["[SEP]"] + context_tok + ["[SEP]"]
    input_tok += ["[PAD]"]*(384-len(input_tok)) # in our case the model has been
                                                # trained to have inputs of length max 384
    input_dict = {}
    input_dict["input_word_ids"] = tf.expand_dims(tf.cast(get_ids(input_tok), tf.int32), 0)
    input_dict["input_mask"] = tf.expand_dims(tf.cast(get_mask(input_tok), tf.int32), 0)
    input_dict["input_type_ids"] = tf.expand_dims(tf.cast(get_segments(input_tok), tf.int32), 0)

    return input_dict, context_words, context_tok_to_word_id, len(question_tok)

#### Creación

In [40]:
my_context = '''Neoclassical economics views inequalities in the distribution of income as arising from differences in value added by labor, capital and land. Within labor income distribution is due to differences in value added by different classifications of workers. In this perspective, wages and profits are determined by the marginal value added of each economic actor (worker, capitalist/business owner, landlord). Thus, in a market economy, inequality is a reflection of the productivity gap between highly-paid professions and lower-paid professions.'''

Neoclassical economics views inequalities in the distribution of income as arising from differences in value added by labor, capital and land. Within labor income distribution is due to differences in value added by different classifications of workers. In this perspective, wages and profits are determined by the marginal value added of each economic actor (worker, capitalist/business owner, landlord). Thus, in a market economy, inequality is a reflection of the productivity gap between highly-paid professions and lower-paid professions.

In [41]:
#my_question = '''What philosophy of thought addresses wealth inequality?'''
my_question = '''What are examples of economic actors?'''
#my_question = '''In a market economy, what is inequality a reflection of?'''

In [42]:
my_input_dict, my_context_words, context_tok_to_word_id, question_tok_len = create_input_dict(my_question, my_context)

### Predicción

In [43]:
start_logits, end_logits = bert_squad(my_input_dict, training=False)

### Interpretación

We remove the ids corresponding to the question and the `["SEP"]` token:

In [44]:
start_logits_context = start_logits.numpy()[0, question_tok_len+1:]
end_logits_context = end_logits.numpy()[0, question_tok_len+1:]

First easy interpretation:

In [45]:
start_word_id = context_tok_to_word_id[np.argmax(start_logits_context)]
end_word_id = context_tok_to_word_id[np.argmax(end_logits_context)]

"Advanced" - making sure that the start of the answer is before the end:

In [46]:
pair_scores = np.ones((len(start_logits_context), len(end_logits_context)))*(-1E10)
for i in range(len(start_logits_context-1)):
    for j in range(i, len(end_logits_context)):
        pair_scores[i, j] = start_logits_context[i] + end_logits_context[j]
pair_scores_argmax = np.argmax(pair_scores)

In [47]:
start_word_id = context_tok_to_word_id[pair_scores_argmax // len(start_logits_context)]
end_word_id = context_tok_to_word_id[pair_scores_argmax % len(end_logits_context)]

Final answer:

In [49]:
predicted_answer = ' '.join(my_context_words[start_word_id:end_word_id+1])
print("The answer to:\n" + my_question + "\nis:\n" + predicted_answer)

The answer to:
What are examples of economic actors?
is:
(worker, capitalist/business owner, landlord).


In [50]:
from IPython.core.display import HTML
display(HTML(f'<h2>{my_question.upper()}</h2>'))
marked_text = str(my_context.replace(predicted_answer, f"<mark>{predicted_answer}</mark>"))
display(HTML(f"""<blockquote> {marked_text} </blockquote>"""))

#### Reto Final

In [51]:
my_context = '''
Coronavirus disease 2019 is an infectious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). It was first identified in December 2019 in Wuhan, Hubei, China, and has resulted in an ongoing pandemic.
Common symptoms include fever, cough, fatigue, shortness of breath, and loss of smell and taste.While most people have mild symptoms, some people develop acute respiratory distress syndrome (ARDS) possibly precipitated by cytokine storm, multi-organ failure, septic shock, and blood clots. The time from exposure to onset of symptoms is typically around five days, but may range from two to fourteen days.
The virus is spread primarily via nose and mouth secretions including small droplets produced by coughing,[a] sneezing, and talking. The droplets usually do not travel through air over long distances. However, those standing in close proximity may inhale these droplets and become infected.[b] People may also become infected by touching a contaminated surface and then touching their face. The transmission may also occur through smaller droplets that are able to stay suspended in the air for longer periods of time in enclosed spaces.'''

my_question = '''What are the common symptoms of the disease?'''

my_input_dict, my_context_words, context_tok_to_word_id, question_tok_len = create_input_dict(my_question, my_context)

start_logits, end_logits = bert_squad(my_input_dict, training=False)

pair_scores = np.ones((len(start_logits_context), len(end_logits_context)))*(-1E10)
for i in range(len(start_logits_context-1)):
    for j in range(i, len(end_logits_context)):
        pair_scores[i, j] = start_logits_context[i] + end_logits_context[j]
pair_scores_argmax = np.argmax(pair_scores)

start_word_id = context_tok_to_word_id[pair_scores_argmax // len(start_logits_context)]
end_word_id = context_tok_to_word_id[pair_scores_argmax % len(end_logits_context)]

predicted_answer = ' '.join(my_context_words[start_word_id:end_word_id+1])


from IPython.core.display import HTML
display(HTML(f'<h2>{my_question.upper()}</h2>'))
marked_text = str(my_context.replace(predicted_answer, f"<mark>{predicted_answer}</mark>"))
display(HTML(f"""<blockquote> {marked_text} </blockquote>"""))