In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer
import pandas as pd
import matplotlib.pyplot as plt

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert
import re

# GPU가 있으면 GPU를 사용하고, 없으면 CPU를 사용합니다 (GPU 사용을 권장합니다)

In [2]:
# Only surface TensorFlow log records at ERROR level or above.
tf.get_logger().setLevel('ERROR')

gpus = tf.config.list_physical_devices('GPU')

if gpus:
    try:
        # Restrict TensorFlow to the first GPU only, and let it grow its
        # memory allocation on demand.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as err:
        # Both settings must be applied before the GPUs are initialised.
        # Re-running this cell in a live kernel raises RuntimeError; report it
        # instead of aborting the notebook run.
        print(err)

# Data 읽어오기 및 전처리
result_text가 원본 text이며, result_text_parse가 전처리된 text입니다.

이때, THE UPDATED SYDNEY SYSTEM 표를 삭제하고 그 아래에 있는 텍스트만 추출합니다. 앞으로 이것을 "텍스트 전처리" 라고 부르겠습니다.

텍스트 전처리를 수행한 이유는 아래의 "텍스트 전처리의 이유" 절에서 설명합니다.

In [14]:
# Load the sample reports; 'result_text' holds the raw report text.
csv = pd.read_csv('./helico_sample.csv')

# Marker that ends the table section of a report. Splitting on it and keeping
# the last piece retains only the text after the final marker; a report
# without the marker is kept unchanged.
p = re.compile('==\r\n\r')
csv['result_text_parse'] = [p.split(report)[-1] for report in csv['result_text']]

# 텍스트 전처리 전

In [15]:
# Raw text of the report with index label 0 (before text preprocessing).
print(csv.loc[0, 'result_text'])



(219350)

                              THE UPDATED SYDNEY SYSTEM
Site            H. pylori       Neutrophil    Mononuclear   Atrophy     Intestinal
                colonization     activity          cells                 metaplasia
------------------------------------------------------------------------------------
1.#1x1 : antrum absent       absent         mild          not applicable  marked    
------------------------------------------------------------------------------------
2.#2x1 : antrum absent       absent         mild          not applicable  marked

1. Stomach, #1x1 : Posterior wall of distal antrum, biopsy :

  . Chronic gastritis, inactive, with intestinal metaplasia (incomplete type) and erosion
  . No H. pylori identified.

2. Stomach, #2x1 : Posterior wall of mid antrum, biopsy :

  . Chronic gastritis, inactive, with intestinal metaplasia (incomplete type) and erosion
  . No H. pylori identified.






# 텍스트 전처리 후

In [16]:
# Same report (index label 0) after the table section has been stripped.
print(csv.loc[0, 'result_text_parse'])


1. Stomach, #1x1 : Posterior wall of distal antrum, biopsy :

  . Chronic gastritis, inactive, with intestinal metaplasia (incomplete type) and erosion
  . No H. pylori identified.

2. Stomach, #2x1 : Posterior wall of mid antrum, biopsy :

  . Chronic gastritis, inactive, with intestinal metaplasia (incomplete type) and erosion
  . No H. pylori identified.






In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# 'h_pyl_positive' label and seeded so it is reproducible.
train, val = train_test_split(
    csv,
    stratify=csv['h_pyl_positive'],
    test_size=0.2,
    random_state=1004,
)

# Bert 전처리 모델 읽어오기

In [20]:
# Encoder to fine-tune; must be a key of both lookup tables below.
bert_model_name = 'experts_wiki_books'

# Preprocessing model shared by every uncased English encoder in the tables.
_UNCASED_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# The 24 "small BERT" variants: each layer count L combined with each
# (hidden size H, attention heads A) pair, in the order L-2 ... L-12.
_SMALL_BERT_NAMES = [
    f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{heads}'
    for layers in (2, 4, 6, 8, 10, 12)
    for hidden, heads in ((128, 2), (256, 4), (512, 8), (768, 12))
]

# Model name -> TF-Hub handle of the encoder.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    # Every small-BERT handle is the model name under the tensorflow
    # publisher, version 1.
    **{
        name: f'https://tfhub.dev/tensorflow/{name}/1'
        for name in _SMALL_BERT_NAMES
    },
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

# Model name -> TF-Hub handle of the matching text preprocessing model.
map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12': _UNCASED_PREPROCESS,
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    **dict.fromkeys(_SMALL_BERT_NAMES, _UNCASED_PREPROCESS),
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small': _UNCASED_PREPROCESS,
    'electra_base': _UNCASED_PREPROCESS,
    'experts_pubmed': _UNCASED_PREPROCESS,
    'experts_wiki_books': _UNCASED_PREPROCESS,
    'talking-heads_base': _UNCASED_PREPROCESS,
}

# Resolve the handles for the selected model (KeyError on an unknown name).
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

# Wrap the selected TF-Hub preprocessing model as a Keras layer so raw strings
# can be passed to it directly in the cells below.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

BERT model selected           : https://tfhub.dev/google/experts/bert/wiki_books/2
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


# 텍스트 전처리의 이유
## 텍스트 전처리 이전

THE UPDATED SYDNEY SYSTEM 표를 삭제한 이유는 다음과 같습니다. 텍스트 전처리 이전의 text인 'result_text'를 넣으면 bert preprocess model의 출력인 input_word_ids에 1027('=' 문자에 해당하는 토큰 ID)이라는 숫자가 매우 많이 보입니다. 즉, 분류에 크게 기여하지 않으리라고 예상되는 의미 없는 토큰이 128개로 제한된 입력 길이의 대부분을 차지하고, 그만큼 실제 진단 문장이 들어갈 자리가 줄어듭니다.

In [23]:
# NOTE: 0 is an index *label* (row 0 of the original CSV), not a position in
# the shuffled training split; this raises KeyError if that row was assigned
# to the validation split.
bert_preprocess_model([train.loc[0, 'result_text']])

{'input_word_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[  101,  1006, 20636, 19481,  2692,  1007,  1996,  7172,  3994,
          2291,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  2609,  1044,  1012,  1052,  8516,
         10050, 11265,  4904, 18981, 19466, 18847, 11231, 14321,  2906,
          2012, 18981, 10536, 20014, 19126, 18962,  4023,  4442, 1880

In [24]:
# NOTE: 1 is an index *label* (row 1 of the original CSV), not a position in
# the shuffled training split; this raises KeyError if that row was assigned
# to the validation split.
bert_preprocess_model([train.loc[1, 'result_text']])

{'input_word_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[  101,  1006, 16798, 11387,  2683,  1007,  1996,  7172,  3994,
          2291,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,  1027,
          1027,  1027,  1027,  1027,  2609,  1044,  1012,  1052,  8516,
         10050, 11265,  4904, 18981, 19466, 18847, 11231, 14321,  2906,
          2012, 18981, 10536, 20014, 19126, 18962,  4023,  4442, 1880

## 텍스트 전처리 후
반대로, THE UPDATED SYDNEY SYSTEM 표를 제거한 뒤 bert_preprocess_model에 넣은 결과는 다음과 같습니다. 텍스트 전처리 이전과 달리, 많은 부분을 차지하던 1027이 사라진 것을 확인할 수 있습니다.

In [22]:
# Same report as the raw example (index label 0), after text preprocessing.
# The lookup is by label, so it raises KeyError if row 0 is not in `train`.
bert_preprocess_model([train.loc[0, 'result_text_parse']])

{'input_word_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[  101,  1015,  1012,  4308,  1010,  1001,  1015,  2595,  2487,
          1024, 15219,  2813,  1997, 29333, 14405,  6824,  1010, 16012,
         18075,  1024,  1012, 11888,  3806, 18886,  7315,  1010, 16389,
          1010,  2007, 20014, 19126, 18804, 24759, 15396,  1006, 12958,
          2828,  1007,  1998, 14173,  1012,  2053,  1044,  1012,  1052,
          8516, 10050,  4453,  1012,  1016,  1012,  4308,  1010,  1001,
          1016,  2595,  2487,  1024, 15219,  2813,  1997,  3054, 14405,
          6824,  1010, 16012, 18075,  1024,  1012, 11888,  3806, 18886,
          7315,  1010, 16389,  1010,  2007, 20014, 19126, 18804, 24759,
         15396,  1006, 12958,  2828,  1007,  1998, 14173,  1012,  2053,
          1044,  1012,  1052,  8516, 10050,  4453,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

In [25]:
# Same report as the raw example (index label 1), after text preprocessing.
# The lookup is by label, so it raises KeyError if row 1 is not in `train`.
bert_preprocess_model([train.loc[1, 'result_text_parse']])

{'input_word_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[  101,  4308,  1010,  1001,  1015,  2595,  2487,  1024,  1043,
          2278,  1997,  2659,  2303,  1010, 16012, 18075,  1024,  1012,
         11888,  3806, 18886,  7315,  1010, 16389,  1010,  2007, 14173,
          1012,  2053,  1044,  1012,  1052,  8516, 10050,  4453,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

# 모델 build 및 학습.

In [26]:
def build_classifier_model():
    """Build the end-to-end binary text classifier.

    The model takes raw strings, runs them through the TF-Hub preprocessing
    layer (`tfhub_handle_preprocess`) and the trainable TF-Hub encoder
    (`tfhub_handle_encoder`), then applies dropout (rate 0.1) and a single
    sigmoid unit to the encoder's 'pooled_output'.

    Returns:
        An uncompiled `tf.keras.Model` mapping a batch of strings to one
        sigmoid output per example.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Raw strings -> encoder input tensors.
    tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)

    # trainable=True: the encoder weights are fine-tuned, not frozen.
    bert_outputs = hub.KerasLayer(
        tfhub_handle_encoder, trainable=True, name='BERT_encoder')(tokenized)

    pooled = tf.keras.layers.Dropout(0.1)(bert_outputs['pooled_output'])
    probability = tf.keras.layers.Dense(
        1, activation='sigmoid', name='classifier')(pooled)
    return tf.keras.Model(raw_text, probability)

In [27]:
classifier_model = build_classifier_model()

batch_size = 32
# The warmup/decay schedule is sized from `epochs`, so it must equal the number
# of epochs actually passed to `fit` for this model (2). With a larger value
# the learning rate never finishes decaying and the warmup phase is too long.
epochs = 2
# Round up: `fit` also runs the final, partially filled batch of each epoch.
steps_per_epoch = (len(train) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# The model ends in a sigmoid unit, so the loss is plain binary cross-entropy
# on probabilities.
classifier_model.compile(optimizer=optimizer,
                         loss='binary_crossentropy',
                         metrics=['accuracy'])

# 텍스트 전처리 후의 데이터를 사용한 모델

In [28]:
# Fine-tune on the preprocessed reports ('result_text_parse') for 2 epochs.
history = classifier_model.fit(
    x=train['result_text_parse'],
    y=train['h_pyl_positive'],
    epochs=2,
    batch_size=batch_size,
)

Epoch 1/2
Epoch 2/2


In [29]:
# Loss and accuracy of the fine-tuned model on the held-out validation split.
classifier_model.evaluate(x=val['result_text_parse'], y=val['h_pyl_positive'])



[0.012008952908217907, 0.998388409614563]

# 텍스트 전처리 이전의 데이터를 사용한 모델

In [36]:
# Fresh, untrained model for the raw-text experiment. This rebinds
# `classifier_model`, so the previously trained model is no longer reachable
# under that name.
classifier_model = build_classifier_model()

batch_size = 32
epochs = 2
# Round up: `fit` also runs the final, partially filled batch of each epoch.
steps_per_epoch = (len(train) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

# The model ends in a sigmoid unit, so the loss is plain binary cross-entropy
# on probabilities.
classifier_model.compile(optimizer=optimizer,
                         loss='binary_crossentropy',
                         metrics=['accuracy'])

In [37]:
# Fine-tune on the raw reports ('result_text', table section included), using
# the epoch count the learning-rate schedule was sized for.
history = classifier_model.fit(
    x=train['result_text'],
    y=train['h_pyl_positive'],
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/2
Epoch 2/2
