In [1]:
# We will use the official tokenization script created by the Google team
# (fetched from the tensorflow/models repository on GitHub).
# NOTE(review): the URL points at the `master` branch, so the downloaded file
# is not pinned to a fixed revision — confirm it still resolves and consider
# pinning a tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
!pip install sentencepiece



In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

# Helper Functions

In [4]:
def bert_encode(texts, tokenizer, max_len=40):
    """Convert raw texts into fixed-length BERT inputs.

    Each text is tokenized with ``tokenizer.tokenize``, truncated to
    ``max_len - 2`` tokens, wrapped as ``[CLS] ... [SEP]``, mapped to ids with
    ``tokenizer.convert_tokens_to_ids`` and right-padded with zeros.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, including the two special
            tokens. Must be at least 2.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each
        of shape ``(number_of_texts, max_len)``. ``pad_masks`` is 1 on real
        tokens and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because there would be
            no room for the ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len < 2:
        # With max_len < 2 the slice below would use a negative bound and the
        # rows would come out with inconsistent lengths.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        token_ids += [0] * pad_len

        all_tokens.append(token_ids)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # reshape keeps the (n, max_len) shape even when `texts` is empty,
        # where np.array([]) alone would yield a float array of shape (0,).
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [5]:
from keras import backend as K

def recall(y_target, y_pred):
    """Recall = TP / (TP + FN), counted over every element of the tensors.

    Both tensors are clipped to the range [0, 1] and rounded, so each element
    becomes 0 (negative) or 1 (positive) before counting.

    Returns a single tensor value.
    """
    actual = K.round(K.clip(y_target, 0, 1))      # ground truth as 0/1
    predicted = K.round(K.clip(y_pred, 0, 1))     # predictions as 0/1

    # A true positive is an element where both truth and prediction are 1.
    true_positives = K.sum(actual * predicted)

    # TP + FN is simply every element whose ground truth is positive.
    actual_positives = K.sum(actual)

    # K.epsilon() is a small constant that guards against division by zero.
    return true_positives / (actual_positives + K.epsilon())


def precision(y_target, y_pred):
    """Precision = TP / (TP + FP), counted over every element of the tensors.

    Both tensors are clipped to the range [0, 1] and rounded, so each element
    becomes 0 (negative) or 1 (positive) before counting.

    Returns a single tensor value.
    """
    predicted = K.round(K.clip(y_pred, 0, 1))     # predictions as 0/1
    actual = K.round(K.clip(y_target, 0, 1))      # ground truth as 0/1

    # A true positive is an element where both truth and prediction are 1.
    true_positives = K.sum(actual * predicted)

    # TP + FP is simply every element that was predicted positive.
    predicted_positives = K.sum(predicted)

    # K.epsilon() is a small constant that guards against division by zero.
    return true_positives / (predicted_positives + K.epsilon())


def f1score(y_target, y_pred):
    """F1 score: harmonic mean of `recall` and `precision` for the given tensors.

    Returns a single tensor value.
    """
    rec = recall(y_target, y_pred)
    prec = precision(y_target, y_pred)

    numerator = 2 * rec * prec
    # K.epsilon() is a small constant that guards against division by zero.
    denominator = rec + prec + K.epsilon()
    return numerator / denominator

In [6]:
def build_model(bert_layer, max_len=40, num_classes=12, learning_rate=1e-6):
    """Build and compile a softmax classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs; the second one is used here as the per-token sequence
            output (presumably the first is the pooled output — confirm
            against the hub module's documentation).
        max_len: Sequence length of the three integer inputs.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` with inputs named ``input_word_ids``,
        ``input_mask`` and ``segment_ids``, trained with categorical
        cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence holds the [CLS] token written by bert_encode;
    # its vector is used as the representation of the whole sentence.
    clf_output = sequence_output[:, 0, :]
    out = Dense(num_classes, activation='softmax')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['acc'],
    )

    return model

# Load and Preprocess

- Load BERT from TensorFlow Hub
- Load the CSV file containing the training data
- Load the tokenizer from the BERT layer
- Encode the text into tokens, masks, and segment flags

In [7]:
%%time
# TF Hub handle of the BERT encoder to fine-tune.
# NOTE(review): the handle names an English uncased model, while the column
# names and the test sentence in this notebook are Korean — confirm the
# vocabulary covers the input language (a multilingual model may fit better).
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True so the BERT weights are updated during training as well.
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 11.8 s, sys: 2.54 s, total: 14.4 s
Wall time: 14.1 s


In [8]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset is read from it and the
# model checkpoint is written to it further down.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# List the contents of the Drive root folder.
# NOTE(review): this listing exposes personal file names — consider clearing
# the cell output before sharing the notebook.
!ls '/content/drive/My Drive/'

 프로젝트
'증상 발화데이터 메타 (1).csv'
'2020 공모전 제출서식 및 참고자료_파일.zip'
'[서울-2반-8조] 2차 산출물 제출 건.hwp'
'(서울-2반-8조) 발표자료.pdf'
'[서울-2반-8조] 발표 PPT 초안.pdf'
'(서울-2반-8조) 발표자료.pptx'
'증상 발화데이터 메타 (2).csv'
'(서울 2반 - 홍윤표) 전기차.zip'
'(서울 2반_홍윤표) 민원분석.zip'
'(서울2반_홍윤표) 관광분석.zip'
 3-min-pytorch-master
 %5B%EC%B2%A8%EB%B6%80%5D+2020%EB%85%84+%ED%95%98%EB%B0%98%EA%B8%B0+%EA%B8%80%EB%A1%9C%EB%B2%8C%EA%B3%BC%EC%A0%95+%EC%97%B0%EC%88%98%EC%97%85%EC%B2%B4+%ED%98%84%ED%99%A9.xlsx.exe
 ADV
'BERT 감성분석 모델.ipynb'
 BiLSTM.ipynb
 boaz_study
'Colab Notebooks'
'복부 발화데이터.csv'
'증상 발화데이터 메타.csv'
'증상 발화데이터.csv'
'하이닥 데이터 전처리.csv'
'하이닥 데이터.csv'
 dacon
'제목 없는 스프레드시트.gsheet'
 mnist_assignment.ipynb
 model.h5
 출석체크.png
 작업관리자.png
'Pytorch DNN.ipynb'
'pytorch tutorial'
'Resume Template 5의 사본 (1).gdoc'


In [10]:
# Load the dataset CSV from Google Drive; the file is decoded as cp949.
data = pd.read_csv("/content/drive/My Drive/증상 발화데이터 메타 (2).csv", encoding = 'cp949')

In [11]:
# Shuffle all rows. The fixed random_state keeps the row order — and therefore
# the tail of the data that `validation_split` later holds out — reproducible
# across kernel restarts.
data = data.sample(frac = 1, axis = 0, random_state = 42)

In [12]:
# Distinct values of the label column '병명' (disease name).
data['병명'].unique()

array(['위염', '장염', '비염', '간염', '축농증', '중이염', '구내염', '고막염', '외이도염', '질염',
       '방광염', '다낭성 난소 증후군'], dtype=object)

In [13]:
# Replace the shuffled index with a fresh 0..n-1 range, discarding the old one.
data = data.reset_index(drop = True)

In [14]:
# (rows, columns) of the loaded dataset.
data.shape

(17302, 2)

In [15]:
# Store the text column '증상질문' with pandas' dedicated "string" dtype.
data["증상질문"] = data["증상질문"].astype("string")

In [16]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of disease names in '병명' and replace each one with its
# integer class id in a single step.
encoder = LabelEncoder()
data['병명'] = encoder.fit_transform(data['병명'])

In [17]:
# Lookup table from integer class id back to the original disease name.
mapping = dict(enumerate(encoder.classes_))
mapping

{0: '간염',
 1: '고막염',
 2: '구내염',
 3: '다낭성 난소 증후군',
 4: '방광염',
 5: '비염',
 6: '외이도염',
 7: '위염',
 8: '장염',
 9: '중이염',
 10: '질염',
 11: '축농증'}

In [18]:
# Integer class ids of every row, taken from the encoded '병명' column.
train_labels = data['병명'].values

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import utils
# Feature frame: everything except the label column.
train = data.drop(columns = "병명")
# Build the one-hot labels straight from the encoded column instead of from the
# previous value of `train_labels`, so re-running this cell cannot one-hot
# encode an array that is already one-hot. num_classes pins the width to the
# number of classes the encoder learned.
train_labels = utils.to_categorical(data['병명'].values, num_classes = len(encoder.classes_))

In [20]:
# Read the vocabulary file path and the lower-casing flag stored on the loaded
# hub module, then build the tokenizer from them so tokenization matches the
# BERT layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [21]:
# Drop rows that contain a missing value — and the matching rows of
# `train_labels`. The labels were built before this cell, so removing rows from
# `data` alone would leave the encoded texts and the labels misaligned (or with
# different lengths) when both are passed to `model.fit`.
rows_complete = data.notna().all(axis = 1).to_numpy()
train_labels = train_labels[rows_complete]
data = data.loc[rows_complete]

In [22]:
# Encode every text into the (token ids, masks, segment ids) triple returned by
# bert_encode; max_len must match the value used in build_model.
train_input = bert_encode(data["증상질문"].values, tokenizer, max_len=40)

# Model: Build, Train, Predict

In [23]:
# Build the classifier with the same max_len that was used for encoding, then
# print its layer summary.
model = build_model(bert_layer, max_len=40)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 40)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 40)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [24]:
# Write the model to Drive only when the monitored validation loss improves.
checkpoint = ModelCheckpoint(
    filepath='/content/drive/My Drive/final_model.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Train for 5 epochs in batches of 16, holding out 30% of the data for validation.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=5,
    validation_split=0.3,
    callbacks=[checkpoint],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Load the weights from the checkpoint file that the ModelCheckpoint callback
# wrote during training.
model.load_weights('/content/drive/My Drive/final_model.h5')

In [58]:
# One-row frame holding the sentence to classify, in a column named '증상'.
test = pd.DataFrame({'증상': ['테스트문장']})

In [59]:
# Encode the test sentence exactly like the training text (same tokenizer and max_len).
test_input = bert_encode(test['증상'].values, tokenizer, max_len=40)

In [60]:
# Class probabilities for each encoded test sentence.
pred = model.predict(test_input)
# Index of the most probable class for the first sentence, mapped back to its
# disease name. np.argmax returns the first maximal index directly instead of
# searching for positions equal to the maximum.
mapping[int(np.argmax(pred[0]))]

'위염'

In [61]:
# Probability the model assigns to the predicted class of the first sentence.
pred[0].max()

0.80330306