
# Este modelo de red entrena un BiLSTM + CRF para la clasificación de NER sobre el corpus Biobert. 

In [64]:
try:
    import seqeval
except ModuleNotFoundError as err:
    !pip install seqeval

# Carga de archivos de entrada (input) 

In [65]:

import tensorflow as tf
#matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/libs2021/tf2crf.py
/kaggle/input/libs2021/utils.py
/kaggle/input/libs2021/mwrapper.py
/kaggle/input/europarl-jrc-fastttext/Europarl_fasttext_skip_model11_300.txt
/kaggle/input/biobert/Biobert_json.py
/kaggle/input/biobert/data56/train.json
/kaggle/input/biobert/data56/test.json
/kaggle/input/biobert/data56/valid.json
/kaggle/input/fasttext-spanish/cc.es.300.vec/cc.es.300.vec
/kaggle/input/utf8conll2002/ut8_esp-train.train
/kaggle/input/utf8conll2002/ut8_esp.testb
/kaggle/input/utf8conll2002/ut8_esp.testa
/kaggle/input/vectors/word2index.npy
/kaggle/input/vectors/tag2index.npy
/kaggle/input/embedding/embedding.py
/kaggle/input/embedding/text_mapping.json
/kaggle/input/embedding/text_embeddings.gensimmodel


In [66]:
import sys
sys.path.append('/kaggle/input/libs2021')
sys.path.append('/kaggle/input/embedding')
sys.path.append('/kaggle/input/libscrf4')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
#from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
#from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten
from tensorflow.keras.optimizers import Adam, schedules
#from crfta import CRF as crf4
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random



# Se trae el corpus nltk desde Python. Nltk es una libreria de pln en python que tiene un conjuntos de corpus anotados o corpora  de tareas lingüisticas como información morfológica, lemas, pos tagging,  Ner, relaciones, análisis de sentimientos, etc. Cada corpus en nltk tiene in id que lo identifica para poder ser llamado. En el caso del corpus Conll2002, el id='conll2002'

In [67]:
!pip install datasets



Es posible que sea necesario reiniciar el kernel en este punto si al cargar el paquete de dataset se tiene algun error

In [68]:
import datasets
from datasets import load_dataset

In [69]:
%%time
datasets = load_dataset("/kaggle/input/biobert/Biobert_json.py")

  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 44.1 ms, sys: 5.85 ms, total: 49.9 ms
Wall time: 51.6 ms


In [70]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentencia', 'tag'],
        num_rows: 9788
    })
    validation: Dataset({
        features: ['sentencia', 'tag'],
        num_rows: 2758
    })
    test: Dataset({
        features: ['sentencia', 'tag'],
        num_rows: 2496
    })
})

# PARTE  1. PREPROCESAMIENTO DE LOS DATOS

# Procedimientos de extracción de palabras y labels
Ahora se definen dos procedimients para extraer de  las sentencias  la palabra o token y la etiqueta o label. Conll2002 es un archivo de texto con tres columnas; el token o palabra, el POS que es el postagging y la tercera columna la etiqueta de NER.

In [71]:
datasets['train'][0]['sentencia']

['Abuela',
 'materna',
 'con',
 'cancer',
 'de',
 'mama',
 'a',
 'los',
 '70',
 'años',
 '.']

# Extracción de las palabras o tokens y labels  de las sentencias de training, testeo, y validación. 

In [72]:
labels = datasets["train"].features['tag'].feature.names
labels

['B_CANCER_CONCEPT',
 'B_CHEMOTHERAPY',
 'B_DATE',
 'B_DRUG',
 'B_FAMILY',
 'B_FREQ',
 'B_IMPLICIT_DATE',
 'B_INTERVAL',
 'B_METRIC',
 'B_OCURRENCE_EVENT',
 'B_QUANTITY',
 'B_RADIOTHERAPY',
 'B_SMOKER_STATUS',
 'B_STAGE',
 'B_SURGERY',
 'B_TNM',
 'I_CANCER_CONCEPT',
 'I_DATE',
 'I_DRUG',
 'I_FAMILY',
 'I_FREQ',
 'I_IMPLICIT_DATE',
 'I_INTERVAL',
 'I_METRIC',
 'I_OCURRENCE_EVENT',
 'I_SMOKER_STATUS',
 'I_STAGE',
 'I_SURGERY',
 'I_TNM',
 'O']

In [73]:
def get_labels_tags(dataset):
    X_value = []
    y_value = []
    
    for i in range(datasets[dataset].num_rows):
        train_tags = []
        for j in range(len(datasets[dataset][i]['tag'])):
            tag_int = datasets[dataset][i]['tag'][j]
            train_tags.append(labels[tag_int])

        X_value.append(datasets[dataset][i]['sentencia'])
        y_value.append(train_tags)

    return X_value, y_value


In [74]:
X_train, y_train = get_labels_tags('train')
X_test, y_test = get_labels_tags('test')
X_val, y_val = get_labels_tags('validation')

In [75]:
print(X_train[2])
print(y_train[2])

['-', 'Quiste', 'renal', 'izquierdo', 'complicado', '(', 'ecografia', 'noviembre', '2013', 'quistes', 'renales', 'bilaterales', ')', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_DATE', 'I_DATE', 'O', 'O', 'O', 'O', 'O']


# Conteo e indexación de las palabras y etiquetas
Indexación de las palabras (word2index) y las etiquetas (tag2index) sobre los conjuntos  de entrada x X_train + X_eval + X_test  y sobre las sentencias de etiquetas y_train + y_eval + y_test. Esta indexación se realiza con diccionarios en python.

In [76]:
import numpy as np

words, tagsss = set([]), set([])
 
for s in (X_train + X_val + X_test):
    for w in s:
        words.add(w.lower())

for ts in (y_train + y_val + y_test):
    for t in ts:
        tagsss.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 2 for i, t in enumerate(list(tagsss))}
tag2index['-PAD-'] = 0  # The special value used to padding
tag2index['-OOV-'] = 1  # The special value used to padding

print (len(word2index))
print (len(tag2index))
print(tag2index)


print(tagsss)

9883
32
{'B_SURGERY': 2, 'B_METRIC': 3, 'I_FAMILY': 4, 'B_FREQ': 5, 'I_STAGE': 6, 'B_IMPLICIT_DATE': 7, 'I_FREQ': 8, 'B_FAMILY': 9, 'B_TNM': 10, 'I_CANCER_CONCEPT': 11, 'B_STAGE': 12, 'B_QUANTITY': 13, 'I_METRIC': 14, 'B_DRUG': 15, 'B_SMOKER_STATUS': 16, 'B_CANCER_CONCEPT': 17, 'I_DATE': 18, 'B_CHEMOTHERAPY': 19, 'B_OCURRENCE_EVENT': 20, 'I_SMOKER_STATUS': 21, 'I_OCURRENCE_EVENT': 22, 'B_INTERVAL': 23, 'I_SURGERY': 24, 'I_INTERVAL': 25, 'I_IMPLICIT_DATE': 26, 'I_TNM': 27, 'O': 28, 'B_RADIOTHERAPY': 29, 'I_DRUG': 30, 'B_DATE': 31, '-PAD-': 0, '-OOV-': 1}
{'B_SURGERY', 'B_METRIC', 'I_FAMILY', 'B_FREQ', 'I_STAGE', 'B_IMPLICIT_DATE', 'I_FREQ', 'B_FAMILY', 'B_TNM', 'I_CANCER_CONCEPT', 'B_STAGE', 'B_QUANTITY', 'I_METRIC', 'B_DRUG', 'B_SMOKER_STATUS', 'B_CANCER_CONCEPT', 'I_DATE', 'B_CHEMOTHERAPY', 'B_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_OCURRENCE_EVENT', 'B_INTERVAL', 'I_SURGERY', 'I_INTERVAL', 'I_IMPLICIT_DATE', 'I_TNM', 'O', 'B_RADIOTHERAPY', 'I_DRUG', 'B_DATE'}


# Enterización de las sentencias de entrenamiento, testeo y validación.
# En  esta parte se realiza la enterización  del conjunto de sentencias, para ello usamos los ids de word2index y tag2index que  tienen un entero asignado por palabra.

In [77]:
train_sentences_X, val_sentences_X, test_sentences_X, train_tags_y, val_tags_y, test_tags_y = [], [], [], [], [], []

for s in X_train:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)

for s in X_val:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    val_sentences_X.append(s_int)

for s in X_test:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)

for s in y_train:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    train_tags_y.append(s_int)

for s in y_val:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    val_tags_y.append(s_int)

for s in y_test:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    test_tags_y.append(s_int)

# Las matrices de los tags son de números indexados pequeños porque solo son 11 tags.  ({ORG, LOC, PER}  X IOB)

In [78]:
print("Longitudes de las Matrices:")
print(len(train_sentences_X))
print(len(val_sentences_X))
print(len( test_sentences_X))
print(len(train_tags_y))
print(len(val_tags_y))
print(len(test_tags_y))

print("\nMuestra de Datos presentes en las Matrices con las transformaciones:\n")


print(train_sentences_X[0])
print(val_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(val_tags_y[0])
print(test_tags_y[0])

Longitudes de las Matrices:
9788
2758
2496
9788
2758
2496

Muestra de Datos presentes en las Matrices con las transformaciones:

[6727, 344, 6143, 7637, 6298, 7515, 6629, 8841, 6458, 7116, 7175]
[8823, 4652, 1065, 1093, 1715, 5664, 551, 5499, 1760, 4317, 5098, 6138, 1338, 8094, 5045, 7175]
[566, 6298, 3150, 7116, 6143, 6360, 3163, 3648, 199, 5205, 7175]
[9, 4, 28, 17, 11, 11, 28, 28, 13, 3, 28]
[3, 13, 15, 3, 13, 28, 15, 13, 3, 28, 3, 28, 5, 8, 8, 28]
[28, 28, 13, 3, 28, 28, 28, 28, 28, 31, 28]


# Se procede a Normalizar las matrices con la longitud de la columna=MAX_LENGTH para que todas tengan la misma dimensión matricial, con la longitud máxima de palabras encontradas anteriormente y se agregan ceros a la derecha en las posiciones que hacen falta en el vector. 

In [79]:

MAX_LENGTH=202
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
val_sentences_X = pad_sequences(val_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
val_tags_y = pad_sequences(val_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(train_sentences_X.shape)
print(val_sentences_X[0])
print(val_sentences_X.shape)
print(test_sentences_X[0])
print(test_sentences_X.shape)
print(train_tags_y[0])
print(train_tags_y.shape)
print(val_tags_y[0])
print(val_tags_y.shape)
print(test_tags_y[0])
print(test_tags_y.shape)





[6727  344 6143 7637 6298 7515 6629 8841 6458 7116 7175    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

# Categorización de las sentencias de etiquetas, estas no se introducen  a red neuronal como enteros sino como vectores one-hot.

In [80]:
def to_categoricals(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [81]:


def encode(data):
    print('Shape of data (BEFORE encode): %s' % str(data.shape))
    encoded = to_categorical(data)
    print('Shape of data (AFTER  encode): %s\n' % str(encoded.shape))
    return encoded

# Se realiza la categorización one-hot de las etiquetas o labels de entrenamiento, testeo y validación

In [82]:
cat_train_tags_y = to_categoricals(train_tags_y, len(tag2index))
cat_val_tags_y  = to_categoricals(val_tags_y, len(tag2index))
cat_test_tags_y  = to_categoricals(test_tags_y, len(tag2index))

print(cat_train_tags_y[1])
print(len(cat_train_tags_y))
print(cat_train_tags_y.shape)
print(len(cat_test_tags_y))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
9788
(9788, 202, 32)
2496


# PARTE 2. ENTRENAMIENTO DEL MODELO DE RED.

# Vectorización de palabras 
# En esta rutina se trae el archivo vectorizado de fasttext con una dimensión de 300, y contiene la información de la información de la biblia. Es importante tener en cuenta que este es un archivo preentrenado con un modelo de vectorización que trata el tema de la similaridad entre palabras. De igual maneara  en esta parte se pueden. El procedimiento de embedding_matrix prepara la matriz  W que se define como un espacio de solución de la red neuronal. W.x + b.

In [83]:

EMBED_DIM=300
file = '/kaggle/input/europarl-jrc-fastttext/Europarl_fasttext_skip_model11_300.txt'
embedding_matrix = bme(file, len(word2index), EMBED_DIM, word2index)

Cargando archivo...


0it [00:00, ?it/s]

Encontrado 489289 Word Vectors.


  0%|          | 0/9883 [00:00<?, ?it/s]

Convertidos: 4644 Tokens | Perdidos: 5425 Tokens


# Modelo matématico
# 
# Aqui  se define el modelo matémático de la  máquina de aprendizaje que en este caso en una red neuronal recursiva RNN. Esepecíficamente es un Bilstm que  es un Lstm en dos direcciones 

In [84]:
from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules
input = Input(shape=(MAX_LENGTH,))
word_embedding_size = 300

# Embedding Layer
#model = Embedding(input_dim=len(word2index), 
    #            output_dim=word_embedding_size, 
     #           input_length=MAX_LENGTH,
     #           mask_zero=False)(input)

model = Embedding(len(word2index),
                        EMBED_DIM,
                        input_length=MAX_LENGTH,  
                        weights=[embedding_matrix],
                        trainable=False,
                        mask_zero=True)(input)

# BI-LSTM Layer
model = Bidirectional(LSTM(units=word_embedding_size, 
                     return_sequences=True, 
                     dropout=0.5, 
                     recurrent_dropout=0.5))(model)
model  = Dropout(0.5, name='dropout_lstm')(model)
model  = Dense(units=EMBED_DIM * 2, activation='relu')(model)
model  = Dense(units=len(tag2index), activation='relu')(model)
    
model  = Masking(mask_value=0.,input_shape=(MAX_LENGTH, len(tag2index)))(model)
    
crf = crf6(units=len(tag2index), name="ner_crf")
predictions = crf(model)

base_model = Model(inputs=input, outputs=predictions)
model = ModelWithCRFLoss(base_model, sparse_target=True)

    
model.compile(optimizer='adam')
#model.summary()

  return py_builtins.overload_of(f)(*args)


In [85]:
history= model.fit(train_sentences_X, cat_train_tags_y,
                       validation_data=(val_sentences_X, cat_val_tags_y),
                       batch_size=32, 
                       epochs=30,
                       verbose=2)

Epoch 1/30
306/306 - 581s - loss: 122.2926 - accuracy: 0.8800 - val_loss_val: 19.9140 - val_val_accuracy: 0.9779
Epoch 2/30
306/306 - 571s - loss: 18.4312 - accuracy: 0.9783 - val_loss_val: 17.4863 - val_val_accuracy: 0.9779
Epoch 3/30
306/306 - 574s - loss: 16.2778 - accuracy: 0.9783 - val_loss_val: 15.2724 - val_val_accuracy: 0.9779
Epoch 4/30
306/306 - 570s - loss: 14.2263 - accuracy: 0.9786 - val_loss_val: 13.0576 - val_val_accuracy: 0.9789
Epoch 5/30
306/306 - 567s - loss: 12.6747 - accuracy: 0.9800 - val_loss_val: 11.6631 - val_val_accuracy: 0.9811
Epoch 6/30
306/306 - 567s - loss: 11.4805 - accuracy: 0.9816 - val_loss_val: 10.5260 - val_val_accuracy: 0.9823
Epoch 7/30
306/306 - 568s - loss: 10.4916 - accuracy: 0.9831 - val_loss_val: 9.6243 - val_val_accuracy: 0.9838
Epoch 8/30
306/306 - 567s - loss: 9.6638 - accuracy: 0.9844 - val_loss_val: 8.8915 - val_val_accuracy: 0.9855
Epoch 9/30
306/306 - 567s - loss: 9.0110 - accuracy: 0.9854 - val_loss_val: 8.3505 - val_val_accuracy: 0.9

In [86]:
print(tag2index)
y_pred= model.predict(test_sentences_X)
print(y_pred.shape)

{'B_SURGERY': 2, 'B_METRIC': 3, 'I_FAMILY': 4, 'B_FREQ': 5, 'I_STAGE': 6, 'B_IMPLICIT_DATE': 7, 'I_FREQ': 8, 'B_FAMILY': 9, 'B_TNM': 10, 'I_CANCER_CONCEPT': 11, 'B_STAGE': 12, 'B_QUANTITY': 13, 'I_METRIC': 14, 'B_DRUG': 15, 'B_SMOKER_STATUS': 16, 'B_CANCER_CONCEPT': 17, 'I_DATE': 18, 'B_CHEMOTHERAPY': 19, 'B_OCURRENCE_EVENT': 20, 'I_SMOKER_STATUS': 21, 'I_OCURRENCE_EVENT': 22, 'B_INTERVAL': 23, 'I_SURGERY': 24, 'I_INTERVAL': 25, 'I_IMPLICIT_DATE': 26, 'I_TNM': 27, 'O': 28, 'B_RADIOTHERAPY': 29, 'I_DRUG': 30, 'B_DATE': 31, '-PAD-': 0, '-OOV-': 1}
(2496, 202)


In [87]:
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
index2tag = {i: t for t, i in tag2index.items()}
print(index2tag)
y1_pred = logits_to_tokens(y_pred, index2tag)
print(y1_pred[10])

{2: 'B_SURGERY', 3: 'B_METRIC', 4: 'I_FAMILY', 5: 'B_FREQ', 6: 'I_STAGE', 7: 'B_IMPLICIT_DATE', 8: 'I_FREQ', 9: 'B_FAMILY', 10: 'B_TNM', 11: 'I_CANCER_CONCEPT', 12: 'B_STAGE', 13: 'B_QUANTITY', 14: 'I_METRIC', 15: 'B_DRUG', 16: 'B_SMOKER_STATUS', 17: 'B_CANCER_CONCEPT', 18: 'I_DATE', 19: 'B_CHEMOTHERAPY', 20: 'B_OCURRENCE_EVENT', 21: 'I_SMOKER_STATUS', 22: 'I_OCURRENCE_EVENT', 23: 'B_INTERVAL', 24: 'I_SURGERY', 25: 'I_INTERVAL', 26: 'I_IMPLICIT_DATE', 27: 'I_TNM', 28: 'O', 29: 'B_RADIOTHERAPY', 30: 'I_DRUG', 31: 'B_DATE', 0: '-PAD-', 1: '-OOV-'}
['B_OCURRENCE_EVENT', 'I_OCURRENCE_EVENT', 'O', 'B_QUANTITY', 'B_METRIC', 'B_DRUG', 'O', 'B_DRUG', 'O', 'B_QUANTITY', 'B_METRIC', 'O', 'B_OCURRENCE_EVENT', 'O', 'B_DATE', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD

In [88]:
#print(Y_test[4])
print(test_tags_y.shape)

(2496, 202)


In [89]:
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
index2tag = {i: t for t, i in tag2index.items()}
print(index2tag)
y1_true = logits_to_tokens(test_tags_y, index2tag)
print(y1_true[10])

{2: 'B_SURGERY', 3: 'B_METRIC', 4: 'I_FAMILY', 5: 'B_FREQ', 6: 'I_STAGE', 7: 'B_IMPLICIT_DATE', 8: 'I_FREQ', 9: 'B_FAMILY', 10: 'B_TNM', 11: 'I_CANCER_CONCEPT', 12: 'B_STAGE', 13: 'B_QUANTITY', 14: 'I_METRIC', 15: 'B_DRUG', 16: 'B_SMOKER_STATUS', 17: 'B_CANCER_CONCEPT', 18: 'I_DATE', 19: 'B_CHEMOTHERAPY', 20: 'B_OCURRENCE_EVENT', 21: 'I_SMOKER_STATUS', 22: 'I_OCURRENCE_EVENT', 23: 'B_INTERVAL', 24: 'I_SURGERY', 25: 'I_INTERVAL', 26: 'I_IMPLICIT_DATE', 27: 'I_TNM', 28: 'O', 29: 'B_RADIOTHERAPY', 30: 'I_DRUG', 31: 'B_DATE', 0: '-PAD-', 1: '-OOV-'}
['B_OCURRENCE_EVENT', 'I_OCURRENCE_EVENT', 'O', 'B_QUANTITY', 'B_METRIC', 'B_DRUG', 'O', 'B_DRUG', 'O', 'B_QUANTITY', 'B_METRIC', 'O', 'B_OCURRENCE_EVENT', 'O', 'B_DATE', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD

In [90]:
#hh1 = seqclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', hh1)
from seqeval.metrics import classification_report as seqclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
print("precision: {:.1%}".format(precision_score(y1_true, y1_pred)))
print("   recall: {:.1%}".format(recall_score(y1_true,    y1_pred)))
print(" accuracy: {:.1%}".format(accuracy_score(y1_true,  y1_pred)))
print(" F1-score: {:.1%}".format(f1_score(y1_true,        y1_pred)))



precision: 88.4%
   recall: 69.7%
 accuracy: 99.4%
 F1-score: 77.9%


In [91]:
import pandas as pd
li1 = sum(y1_true, [])
li2 = sum(y1_pred, [])

results = pd.DataFrame(columns=['Expected', 'Predicted'])

results['Expected'] = li1
results['Predicted'] = li2

In [92]:
from sklearn.metrics import classification_report as eskclarep
report = eskclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', report)

print(report_to_df(report))

  _warn_prf(average, modifier, msg_start, len(result))


           Class Name precision recall f1-score support
0               -PAD-      1.00   1.00     1.00  466544
1    B_CANCER_CONCEPT      0.97   0.62     0.75     689
2      B_CHEMOTHERAPY      0.96   0.83     0.89     191
3              B_DATE      0.82   0.52     0.64     779
4              B_DRUG      0.92   0.84     0.88     675
5            B_FAMILY      0.94   0.74     0.83     147
6              B_FREQ      0.92   0.58     0.71     161
7     B_IMPLICIT_DATE      0.78   0.69     0.73      26
8          B_INTERVAL      0.00   0.00     0.00      21
9            B_METRIC      0.94   0.87     0.90    1461
10  B_OCURRENCE_EVENT      0.88   0.65     0.75     597
11         B_QUANTITY      0.90   0.76     0.83    1493
12     B_RADIOTHERAPY      0.93   0.76     0.84      89
13    B_SMOKER_STATUS      0.95   0.38     0.55      55
14            B_STAGE      0.97   0.93     0.95     155
15          B_SURGERY      1.00   0.15     0.26     108
16              B_TNM      0.83   0.47     0.60 

In [93]:
test_samples = []

test_samples.append(datasets['test'][203]['sentencia'])
test_samples.append(datasets['test'][133]['sentencia'])
test_samples.append("Padre fallecido a los 78 años por ECV isquémica. Madre con HTA y DM2, fallecida a los 82 años.".split())
test_samples.append("Paciente sin antecedentes de tabaquismo ni consumo de alcohol. No refiere alergias conocidas.".split())

print(test_samples)

[['Padre', 'fallecio', 'por', 'IAM', 'con', '85', 'años', ',', 'Madre', 'fallecida', 'con', '84', 'años', '.'], ['No', 'HTA', ',', 'Dislipemia', '.'], ['Padre', 'fallecido', 'a', 'los', '78', 'años', 'por', 'ECV', 'isquémica.', 'Madre', 'con', 'HTA', 'y', 'DM2,', 'fallecida', 'a', 'los', '82', 'años.'], ['Paciente', 'sin', 'antecedentes', 'de', 'tabaquismo', 'ni', 'consumo', 'de', 'alcohol.', 'No', 'refiere', 'alergias', 'conocidas.']]


In [94]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)

test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)
print(test_samples_X.shape)

[[3282 5382 3822 2889 6143 9882 7116 5962  766 4940 6143 5974 7116 7175
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [95]:
predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

  return py_builtins.overload_of(f)(*args)


[[ 9 28 28 28 28 13  3 28  9 20 28 13  3 28  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0]
 [28 28 28 28 28  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0 

In [96]:
#print(len(predictions))
log_tokens = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})
print(log_tokens)

[['B_FAMILY', 'O', 'O', 'O', 'O', 'B_QUANTITY', 'B_METRIC', 'O', 'B_FAMILY', 'B_OCURRENCE_EVENT', 'O', 'B_QUANTITY', 'B_METRIC', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-

In [97]:
#!pip install tabulate
from tabulate import tabulate

for i in range(len(test_samples)):
    heads = test_samples[i]
    body = [log_tokens[i][:len(test_samples[i])]]

    print(tabulate(body, headers=heads))


Padre     fallecio    por    IAM    con    85          años      ,    Madre     fallecida          con    84          años      .
--------  ----------  -----  -----  -----  ----------  --------  ---  --------  -----------------  -----  ----------  --------  ---
B_FAMILY  O           O      O      O      B_QUANTITY  B_METRIC  O    B_FAMILY  B_OCURRENCE_EVENT  O      B_QUANTITY  B_METRIC  O
No    HTA    ,    Dislipemia    .
----  -----  ---  ------------  ---
O     O      O    O             O
Padre     fallecido          a    los    78          años      por    ECV    isquémica.    Madre     con    HTA    y    DM2,    fallecida    a    los    82    años.
--------  -----------------  ---  -----  ----------  --------  -----  -----  ------------  --------  -----  -----  ---  ------  -----------  ---  -----  ----  -------
B_FAMILY  B_OCURRENCE_EVENT  O    O      B_QUANTITY  B_METRIC  O      O      O             B_FAMILY  O      O      O    O       O            O    O      O     O
Paciente   