
# Este modelo de red entrena un BiLSTM + CRF para la clasificación de NER sobre el corpus Biobert. Este modelo tiene como entrada a la red la enterización del conjunto X de entrenamiento y la enterización y categorización  de los vectores de etiquetas. En este modelo no hay matriz de embedding.

Taller 2 PLN 2024 - Punto 2.1

In [1]:
try:
    import seqeval
except ModuleNotFoundError as err:
    !pip install seqeval

In [2]:

import tensorflow as tf
#matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/embedding/embedding.py
/kaggle/input/embedding/text_mapping.json
/kaggle/input/embedding/text_embeddings.gensimmodel
/kaggle/input/vectors/word2index.npy
/kaggle/input/vectors/tag2index.npy
/kaggle/input/libscrf4/crfta.py
/kaggle/input/libscrf4/utils.py
/kaggle/input/libs2021/tf2crf.py
/kaggle/input/libs2021/utils.py
/kaggle/input/libs2021/mwrapper.py
/kaggle/input/fasttext-spanish/cc.es.300.vec/cc.es.300.vec
/kaggle/input/biobert/Biobert_json.py
/kaggle/input/biobert/data56/train.json
/kaggle/input/biobert/data56/test.json
/kaggle/input/biobert/data56/valid.json
/kaggle/input/fasttextspanish/word2vec_skip-gram_model_300.txt
/kaggle/input/utf8conll2002/ut8_esp-train.train
/kaggle/input/utf8conll2002/ut8_esp.testb
/kaggle/input/utf8conll2002/ut8_esp.testa


In [3]:
import sys
sys.path.append('/kaggle/input/libs2021')
sys.path.append('/kaggle/input/embedding')
sys.path.append('/kaggle/input/libscrf4')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
#from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
#from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten
from tensorflow.keras.optimizers import Adam, schedules
from crfta import CRF as crf4
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

from IPython.core.display import display, HTML

import datetime, os
import random



# Instalación de Biobert

In [4]:
!pip install datasets



In [5]:
import datasets
from datasets import load_dataset

In [6]:
%%time
datasets = load_dataset("/kaggle/input/biobert/Biobert_json.py")

Downloading and preparing dataset biobert_json/Biobert_json to /root/.cache/huggingface/datasets/biobert_json/Biobert_json/1.0.0/0f24ba34d3a708d805a8cfe89936a0d237c160e596311aa3df1bbfc5e9f033f1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset biobert_json downloaded and prepared to /root/.cache/huggingface/datasets/biobert_json/Biobert_json/1.0.0/0f24ba34d3a708d805a8cfe89936a0d237c160e596311aa3df1bbfc5e9f033f1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 1.57 s, sys: 43.8 ms, total: 1.62 s
Wall time: 1.7 s


In [7]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentencia', 'tag'],
        num_rows: 9788
    })
    validation: Dataset({
        features: ['sentencia', 'tag'],
        num_rows: 2758
    })
    test: Dataset({
        features: ['sentencia', 'tag'],
        num_rows: 2496
    })
})

# PARTE  1. PREPROCESAMIENTO DE LOS DATOS

In [8]:
datasets['train'][0]['sentencia']

['Abuela',
 'materna',
 'con',
 'cancer',
 'de',
 'mama',
 'a',
 'los',
 '70',
 'años',
 '.']

In [9]:
labels = datasets["train"].features['tag'].feature.names
labels

['B_CANCER_CONCEPT',
 'B_CHEMOTHERAPY',
 'B_DATE',
 'B_DRUG',
 'B_FAMILY',
 'B_FREQ',
 'B_IMPLICIT_DATE',
 'B_INTERVAL',
 'B_METRIC',
 'B_OCURRENCE_EVENT',
 'B_QUANTITY',
 'B_RADIOTHERAPY',
 'B_SMOKER_STATUS',
 'B_STAGE',
 'B_SURGERY',
 'B_TNM',
 'I_CANCER_CONCEPT',
 'I_DATE',
 'I_DRUG',
 'I_FAMILY',
 'I_FREQ',
 'I_IMPLICIT_DATE',
 'I_INTERVAL',
 'I_METRIC',
 'I_OCURRENCE_EVENT',
 'I_SMOKER_STATUS',
 'I_STAGE',
 'I_SURGERY',
 'I_TNM',
 'O']

In [10]:
def get_labels_tags(dataset):
    X_value = []
    y_value = []
    
    for i in range(datasets[dataset].num_rows):
        train_tags = []
        for j in range(len(datasets[dataset][i]['tag'])):
            tag_int = datasets[dataset][i]['tag'][j]
            train_tags.append(labels[tag_int])

        X_value.append(datasets[dataset][i]['sentencia'])
        y_value.append(train_tags)

    return X_value, y_value


In [11]:
X_train, y_train = get_labels_tags('train')
X_test, y_test = get_labels_tags('test')
X_val, y_val = get_labels_tags('validation')

In [12]:
print(X_train[2])
print(y_train[2])

['-', 'Quiste', 'renal', 'izquierdo', 'complicado', '(', 'ecografia', 'noviembre', '2013', 'quistes', 'renales', 'bilaterales', ')', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_DATE', 'I_DATE', 'O', 'O', 'O', 'O', 'O']


In [13]:
import numpy as np

words, tagsss = set([]), set([])
 
for s in (X_train + X_val + X_test):
    for w in s:
        words.add(w.lower())

for ts in (y_train + y_val + y_test):
    for t in ts:
        tagsss.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 2 for i, t in enumerate(list(tagsss))}
tag2index['-PAD-'] = 0  # The special value used to padding
tag2index['-OOV-'] = 1  # The special value used to padding

print (len(word2index))
print (len(tag2index))
print(tag2index)


print(tagsss)

9883
32
{'B_SMOKER_STATUS': 2, 'B_STAGE': 3, 'B_INTERVAL': 4, 'B_RADIOTHERAPY': 5, 'I_SMOKER_STATUS': 6, 'B_DATE': 7, 'I_FREQ': 8, 'I_DRUG': 9, 'I_STAGE': 10, 'B_FREQ': 11, 'I_TNM': 12, 'B_CANCER_CONCEPT': 13, 'B_OCURRENCE_EVENT': 14, 'B_FAMILY': 15, 'I_CANCER_CONCEPT': 16, 'B_TNM': 17, 'B_METRIC': 18, 'B_IMPLICIT_DATE': 19, 'I_METRIC': 20, 'B_QUANTITY': 21, 'I_IMPLICIT_DATE': 22, 'B_SURGERY': 23, 'I_SURGERY': 24, 'B_CHEMOTHERAPY': 25, 'I_INTERVAL': 26, 'O': 27, 'I_FAMILY': 28, 'I_DATE': 29, 'B_DRUG': 30, 'I_OCURRENCE_EVENT': 31, '-PAD-': 0, '-OOV-': 1}
{'B_SMOKER_STATUS', 'B_STAGE', 'B_INTERVAL', 'B_RADIOTHERAPY', 'I_SMOKER_STATUS', 'B_DATE', 'I_FREQ', 'I_DRUG', 'I_STAGE', 'B_FREQ', 'I_TNM', 'B_CANCER_CONCEPT', 'B_OCURRENCE_EVENT', 'B_FAMILY', 'I_CANCER_CONCEPT', 'B_TNM', 'B_METRIC', 'B_IMPLICIT_DATE', 'I_METRIC', 'B_QUANTITY', 'I_IMPLICIT_DATE', 'B_SURGERY', 'I_SURGERY', 'B_CHEMOTHERAPY', 'I_INTERVAL', 'O', 'I_FAMILY', 'I_DATE', 'B_DRUG', 'I_OCURRENCE_EVENT'}


In [14]:
train_sentences_X, val_sentences_X, test_sentences_X, train_tags_y, val_tags_y, test_tags_y = [], [], [], [], [], []

for s in X_train:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)

for s in X_val:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    val_sentences_X.append(s_int)

for s in X_test:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)

for s in y_train:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    train_tags_y.append(s_int)

for s in y_val:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    val_tags_y.append(s_int)

for s in y_test:
    s_int = []
    for w in s:
        try:
            s_int.append(tag2index[w])
        except KeyError:
            s_int.append(tag2index['-OOV-'])
            
    test_tags_y.append(s_int)


# Las matrices de los tags son de números indexados pequeños porque solo son 11 tags.  ({ORG, LOC, PER}  X IOB)

In [15]:
print("Longitudes de las Matrices:")
print(len(train_sentences_X))
print(len(val_sentences_X))
print(len( test_sentences_X))
print(len(train_tags_y))
print(len(val_tags_y))
print(len(test_tags_y))

print("\nMuestra de Datos presentes en las Matrices con las transformaciones:\n")


print(train_sentences_X[0])
print(val_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(val_tags_y[0])
print(test_tags_y[0])


Longitudes de las Matrices:
9788
2758
2496
9788
2758
2496

Muestra de Datos presentes en las Matrices con las transformaciones:

[6410, 7992, 6021, 7162, 221, 5027, 2919, 5810, 8944, 9446, 7011]
[7352, 2444, 4023, 3862, 1821, 8404, 4971, 3663, 235, 2405, 3835, 3964, 131, 4151, 880, 7011]
[7696, 221, 1932, 9446, 6021, 5562, 1613, 96, 4600, 4790, 7011]
[15, 28, 27, 13, 16, 16, 27, 27, 21, 18, 27]
[18, 21, 30, 18, 21, 27, 30, 21, 18, 27, 18, 27, 11, 8, 8, 27]
[27, 27, 21, 18, 27, 27, 27, 27, 27, 7, 27]


# Se procede a Normalizar las matrices con la longitud de la columna=MAX_LENGTH1 para que todas contengan el mismo numero de columnas, con la longitud máxima de palabras encontradas anteriormente y se agregan ceros a la derecha en las posiciones que hacen falta en el vector. 

In [16]:

MAX_LENGTH=202
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
val_sentences_X = pad_sequences(val_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
val_tags_y = pad_sequences(val_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(train_sentences_X.shape)
print(val_sentences_X[0])
print(val_sentences_X.shape)
print(test_sentences_X[0])
print(test_sentences_X.shape)
print(train_tags_y[0])
print(train_tags_y.shape)
print(val_tags_y[0])
print(val_tags_y.shape)
print(test_tags_y[0])
print(test_tags_y.shape)



[6410 7992 6021 7162  221 5027 2919 5810 8944 9446 7011    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [17]:
def to_categoricals(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [18]:


def encode(data):
    print('Shape of data (BEFORE encode): %s' % str(data.shape))
    encoded = to_categorical(data)
    print('Shape of data (AFTER  encode): %s\n' % str(encoded.shape))
    return encoded

# Se realiza la categorización one-hot de las etiquetas o labels de entrenamiento, testeo y validación

In [19]:
cat_train_tags_y = to_categoricals(train_tags_y, len(tag2index))
cat_val_tags_y  = to_categoricals(val_tags_y, len(tag2index))
cat_test_tags_y  = to_categoricals(test_tags_y, len(tag2index))

print(cat_train_tags_y[1])
print(len(cat_train_tags_y))
print(cat_train_tags_y.shape)
print(len(cat_test_tags_y))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
9788
(9788, 202, 32)
2496


# PARTE 2. ENTRENAMIENTO DEL MODELO DE RED.

In [20]:
#EMBED_DIM=300
#file = '/kaggle/input/fasttextspanish/word2vec_skip-gram_model_300.txt'
#embedding_matrix = bme(file, len(word2index), EMBED_DIM, word2index)

In [21]:
#EMBED_DIM=300
#file = '/kaggle/input/fasttext-spanish/cc.es.300.vec/cc.es.300.vec'
#embedding_matrix = bme(file, len(word2index), EMBED_DIM, word2index)

# 2. MODELO DE LA RED NEURONAL RECURSIVA (BILSTM+ CRF)

#  Modelo matématico
# Aqui  se define el modelo matémático de la  máquina de aprendizaje que en este caso en una red neuronal recursiva RNN. Específicamente es un Bilstm que  es un Lstm en dos direcciones  y un CRF  para mejorar la distribución probabilística producto de la  red neunoral. En esta red de prueba una estructura de Masking para mejorar las distribuciones probabilídticas. En pocas palabras cumple la función de un distributed. Esta red neuronal tiene una entrada **input = Input(shape=(MAX_LENGTH,))**  esta entrada tiene la longitud de la máxima sentencia  que es 202. Luego se define la matriz de embedding que es una vectorización de palabras usando Word2vec. Esta es una matriz que se vuelve de tres dimensiones 28384 X 202 X 300 y cuando se tiene embedding de palabras se llena son los pesos de la matriz preentrenada, ya sea de word2vec, fasttext o glove. También se pueden concatenar matrices de embedding de otras características lingúísticas como el POS, lema, ect. Luego viene la 

In [22]:
from tf2crf import CRF as crf6
from mwrapper import ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from tensorflow.keras.layers import Concatenate, Lambda, Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, InputLayer, Activation, Flatten, Masking
from tensorflow.keras.optimizers import Adam, schedules
input = Input(shape=(MAX_LENGTH,))
word_embedding_size = 300
EMBED_DIM=300
# Embedding Layer
model = Embedding(input_dim=len(word2index), 
                #weights=[embedding_matrix],  
                output_dim=word_embedding_size, 
                input_length=MAX_LENGTH,
                mask_zero=False)(input)

# BI-LSTM Layer
model = Bidirectional(LSTM(units=50, 
                     return_sequences=True, 
                     dropout=0.5, 
                     recurrent_dropout=0.5))(model)
model  = Dropout(0.5, name='dropout_lstm')(model)
model  = Dense(units=EMBED_DIM * 2, activation='relu')(model)
model  = Dense(units=len(tag2index), activation='relu')(model)
    
model  = Masking(mask_value=0.,input_shape=(MAX_LENGTH, len(tag2index)))(model)
    
crf = crf6(units=len(tag2index), name="ner_crf")
predictions = crf(model)

base_model = Model(inputs=input, outputs=predictions)
model = ModelWithCRFLoss(base_model, sparse_target=True)
    
model.compile(optimizer='adam')
#model.summary()

  return py_builtins.overload_of(f)(*args)


In [23]:
history= model.fit(train_sentences_X, cat_train_tags_y,
                       validation_data=(val_sentences_X, cat_val_tags_y),
                       batch_size=128, 
                       epochs=30,
                       verbose=2)

Epoch 1/30
77/77 - 123s - loss: 97.5495 - accuracy: 0.9188 - val_loss_val: 23.9212 - val_val_accuracy: 0.9770
Epoch 2/30
77/77 - 114s - loss: 19.6216 - accuracy: 0.9779 - val_loss_val: 17.4776 - val_val_accuracy: 0.9778
Epoch 3/30
77/77 - 114s - loss: 15.5216 - accuracy: 0.9782 - val_loss_val: 14.9460 - val_val_accuracy: 0.9779
Epoch 4/30
77/77 - 114s - loss: 13.4538 - accuracy: 0.9793 - val_loss_val: 13.2176 - val_val_accuracy: 0.9805
Epoch 5/30
77/77 - 114s - loss: 11.5170 - accuracy: 0.9827 - val_loss_val: 11.0284 - val_val_accuracy: 0.9843
Epoch 6/30
77/77 - 114s - loss: 8.9216 - accuracy: 0.9873 - val_loss_val: 8.3345 - val_val_accuracy: 0.9885
Epoch 7/30
77/77 - 114s - loss: 6.6354 - accuracy: 0.9907 - val_loss_val: 6.4854 - val_val_accuracy: 0.9914
Epoch 8/30
77/77 - 113s - loss: 5.0531 - accuracy: 0.9932 - val_loss_val: 5.3196 - val_val_accuracy: 0.9932
Epoch 9/30
77/77 - 114s - loss: 3.9053 - accuracy: 0.9948 - val_loss_val: 4.4887 - val_val_accuracy: 0.9947
Epoch 10/30
77/77 

In [24]:
print(tag2index)
print(test_sentences_X)
y_pred= model.predict(test_sentences_X)
print(y_pred.shape)
print(y_pred)

{'B_SMOKER_STATUS': 2, 'B_STAGE': 3, 'B_INTERVAL': 4, 'B_RADIOTHERAPY': 5, 'I_SMOKER_STATUS': 6, 'B_DATE': 7, 'I_FREQ': 8, 'I_DRUG': 9, 'I_STAGE': 10, 'B_FREQ': 11, 'I_TNM': 12, 'B_CANCER_CONCEPT': 13, 'B_OCURRENCE_EVENT': 14, 'B_FAMILY': 15, 'I_CANCER_CONCEPT': 16, 'B_TNM': 17, 'B_METRIC': 18, 'B_IMPLICIT_DATE': 19, 'I_METRIC': 20, 'B_QUANTITY': 21, 'I_IMPLICIT_DATE': 22, 'B_SURGERY': 23, 'I_SURGERY': 24, 'B_CHEMOTHERAPY': 25, 'I_INTERVAL': 26, 'O': 27, 'I_FAMILY': 28, 'I_DATE': 29, 'B_DRUG': 30, 'I_OCURRENCE_EVENT': 31, '-PAD-': 0, '-OOV-': 1}
[[7696  221 1932 ...    0    0    0]
 [3398 6021 4224 ...    0    0    0]
 [2875 5459  221 ...    0    0    0]
 ...
 [5157 1579 5287 ...    0    0    0]
 [8256 9110  221 ...    0    0    0]
 [8256 9110  221 ...    0    0    0]]
(2496, 202)
[[27 27 21 ...  0  0  0]
 [27 27 30 ...  0  0  0]
 [27 27 27 ...  0  0  0]
 ...
 [27 27 27 ...  0  0  0]
 [14 31 27 ...  0  0  0]
 [14 31 27 ...  0  0  0]]


In [25]:
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
index2tag = {i: t for t, i in tag2index.items()}
print(index2tag)
y1_pred = logits_to_tokens(y_pred, index2tag)
print(y1_pred[10])

{2: 'B_SMOKER_STATUS', 3: 'B_STAGE', 4: 'B_INTERVAL', 5: 'B_RADIOTHERAPY', 6: 'I_SMOKER_STATUS', 7: 'B_DATE', 8: 'I_FREQ', 9: 'I_DRUG', 10: 'I_STAGE', 11: 'B_FREQ', 12: 'I_TNM', 13: 'B_CANCER_CONCEPT', 14: 'B_OCURRENCE_EVENT', 15: 'B_FAMILY', 16: 'I_CANCER_CONCEPT', 17: 'B_TNM', 18: 'B_METRIC', 19: 'B_IMPLICIT_DATE', 20: 'I_METRIC', 21: 'B_QUANTITY', 22: 'I_IMPLICIT_DATE', 23: 'B_SURGERY', 24: 'I_SURGERY', 25: 'B_CHEMOTHERAPY', 26: 'I_INTERVAL', 27: 'O', 28: 'I_FAMILY', 29: 'I_DATE', 30: 'B_DRUG', 31: 'I_OCURRENCE_EVENT', 0: '-PAD-', 1: '-OOV-'}
['B_OCURRENCE_EVENT', 'I_OCURRENCE_EVENT', 'O', 'B_QUANTITY', 'B_METRIC', 'B_DRUG', 'O', 'B_DRUG', 'O', 'B_QUANTITY', 'B_METRIC', 'O', 'O', 'O', 'B_DATE', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-P

In [26]:
#print(Y_test[4])
print(test_tags_y.shape)

(2496, 202)


In [27]:
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
index2tag = {i: t for t, i in tag2index.items()}
print(index2tag)
y1_true = logits_to_tokens(test_tags_y, index2tag)
print(y1_true[10])

{2: 'B_SMOKER_STATUS', 3: 'B_STAGE', 4: 'B_INTERVAL', 5: 'B_RADIOTHERAPY', 6: 'I_SMOKER_STATUS', 7: 'B_DATE', 8: 'I_FREQ', 9: 'I_DRUG', 10: 'I_STAGE', 11: 'B_FREQ', 12: 'I_TNM', 13: 'B_CANCER_CONCEPT', 14: 'B_OCURRENCE_EVENT', 15: 'B_FAMILY', 16: 'I_CANCER_CONCEPT', 17: 'B_TNM', 18: 'B_METRIC', 19: 'B_IMPLICIT_DATE', 20: 'I_METRIC', 21: 'B_QUANTITY', 22: 'I_IMPLICIT_DATE', 23: 'B_SURGERY', 24: 'I_SURGERY', 25: 'B_CHEMOTHERAPY', 26: 'I_INTERVAL', 27: 'O', 28: 'I_FAMILY', 29: 'I_DATE', 30: 'B_DRUG', 31: 'I_OCURRENCE_EVENT', 0: '-PAD-', 1: '-OOV-'}
['B_OCURRENCE_EVENT', 'I_OCURRENCE_EVENT', 'O', 'B_QUANTITY', 'B_METRIC', 'B_DRUG', 'O', 'B_DRUG', 'O', 'B_QUANTITY', 'B_METRIC', 'O', 'B_OCURRENCE_EVENT', 'O', 'B_DATE', 'O', 'O', 'O', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD

In [28]:
#hh1 = seqclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', hh1)
from seqeval.metrics import classification_report as seqclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
print("precision: {:.1%}".format(precision_score(y1_true, y1_pred)))
print("   recall: {:.1%}".format(recall_score(y1_true,    y1_pred)))
print(" accuracy: {:.1%}".format(accuracy_score(y1_true,  y1_pred)))
print(" F1-score: {:.1%}".format(f1_score(y1_true,        y1_pred)))



precision: 90.6%
   recall: 90.2%
 accuracy: 99.7%
 F1-score: 90.4%


In [29]:
import pandas as pd
li1 = sum(y1_true, [])
li2 = sum(y1_pred, [])

results = pd.DataFrame(columns=['Expected', 'Predicted'])

results['Expected'] = li1
results['Predicted'] = li2

In [30]:
from sklearn.metrics import classification_report as eskclarep
report = eskclarep(results['Expected'], results['Predicted'])
#print('\nclassification_report:\n', report)

print(report_to_df(report))

           Class Name precision recall f1-score support
0               -PAD-      1.00   1.00     1.00  466544
1    B_CANCER_CONCEPT      0.95   0.96     0.95     689
2      B_CHEMOTHERAPY      0.96   1.00     0.98     191
3              B_DATE      0.97   0.71     0.82     779
4              B_DRUG      0.94   0.97     0.96     675
5            B_FAMILY      0.99   0.99     0.99     147
6              B_FREQ      0.87   0.98     0.92     161
7     B_IMPLICIT_DATE      0.38   0.69     0.49      26
8          B_INTERVAL      0.73   0.76     0.74      21
9            B_METRIC      0.94   0.95     0.95    1461
10  B_OCURRENCE_EVENT      0.88   0.89     0.88     597
11         B_QUANTITY      0.94   0.96     0.95    1493
12     B_RADIOTHERAPY      0.90   1.00     0.95      89
13    B_SMOKER_STATUS      0.81   0.98     0.89      55
14            B_STAGE      0.97   0.98     0.98     155
15          B_SURGERY      0.85   0.80     0.82     108
16              B_TNM      0.94   0.83     0.88 

In [42]:
test_samples = []

test_samples.append(datasets['test'][203]['sentencia'])
test_samples.append(datasets['test'][133]['sentencia'])
test_samples.append("Padre fallecido a los 78 años por ECV isquémica. Madre con HTA y DM2, fallecida a los 82 años.".split())
test_samples.append("Paciente sin antecedentes de tabaquismo ni consumo de alcohol. No refiere alergias conocidas.".split())

print(test_samples)

[['Padre', 'fallecio', 'por', 'IAM', 'con', '85', 'años', ',', 'Madre', 'fallecida', 'con', '84', 'años', '.'], ['No', 'HTA', ',', 'Dislipemia', '.'], ['Padre', 'fallecido', 'a', 'los', '78', 'años', 'por', 'ECV', 'isquémica.', 'Madre', 'con', 'HTA', 'y', 'DM2,', 'fallecida', 'a', 'los', '82', 'años.'], ['Paciente', 'sin', 'antecedentes', 'de', 'tabaquismo', 'ni', 'consumo', 'de', 'alcohol.', 'No', 'refiere', 'alergias', 'conocidas.']]


In [43]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)

test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)
print(test_samples_X.shape)

[[ 406 9262 8139 5008 6021 4191 9446 6307 6934 5417 6021 5935 9446 7011
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [44]:
predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

[[15 14 27 27 27 21 18 27 15 14 27 21 18 27  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0]
 [27 27 27 27 27  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0 

In [45]:
#print(len(predictions))
log_tokens = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})
print(log_tokens)

[['B_FAMILY', 'B_OCURRENCE_EVENT', 'O', 'O', 'O', 'B_QUANTITY', 'B_METRIC', 'O', 'B_FAMILY', 'B_OCURRENCE_EVENT', 'O', 'B_QUANTITY', 'B_METRIC', 'O', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PA

In [46]:
from tabulate import tabulate

for i in range(len(test_samples)):
    heads = test_samples[i]
    body = [log_tokens[i][:len(test_samples[i])]]

    print(tabulate(body, headers=heads))

Padre     fallecio           por    IAM    con    85          años      ,    Madre     fallecida          con    84          años      .
--------  -----------------  -----  -----  -----  ----------  --------  ---  --------  -----------------  -----  ----------  --------  ---
B_FAMILY  B_OCURRENCE_EVENT  O      O      O      B_QUANTITY  B_METRIC  O    B_FAMILY  B_OCURRENCE_EVENT  O      B_QUANTITY  B_METRIC  O
No    HTA    ,    Dislipemia    .
----  -----  ---  ------------  ---
O     O      O    O             O
Padre     fallecido          a    los    78          años      por    ECV    isquémica.    Madre     con    HTA    y    DM2,    fallecida          a    los    82          años.
--------  -----------------  ---  -----  ----------  --------  -----  -----  ------------  --------  -----  -----  ---  ------  -----------------  ---  -----  ----------  --------
B_FAMILY  B_OCURRENCE_EVENT  O    O      B_QUANTITY  B_METRIC  O      O      O             B_FAMILY  O      O      O    O     