In [27]:
import os
import re
import numpy as np
from tqdm import tqdm
import json
import copy

import tensorflow as tf
from transformers import *

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import matplotlib.pyplot as plt

try:
#     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    import setGPU
except:
    print('no setGPU')

no setGPU


In [28]:
# 시각화

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

In [29]:
#random seed 고정
tf.random.set_seed(0)
np.random.seed(0)

BATCH_SIZE = 64
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN = 41 # EDA에서 추출된 Max Length
DATA_IN_PATH = 'data_in/KOR'
DATA_OUT_PATH = "data_out/KOR"

In [30]:
# 데이터 전처리 준비
DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, "NER", "train.tsv")
DATA_LABEL_PATH = os.path.join(DATA_IN_PATH, "NER", "label.txt")

In [31]:
def read_file(input_path):
    """Read tsv file, and return words and label as list"""
    with open(input_path, "r", encoding="utf-8") as f:
        sentences = []
        labels = []
        for line in f:
            split_line = line.strip().split("\t")
            sentences.append(split_line[0])
            labels.append(split_line[1])
        return sentences, labels

train_sentences, train_labels = read_file(DATA_TRAIN_PATH)

train_ner_dict = {"sentence": train_sentences, "label": train_labels}
train_ner_df = pd.DataFrame(train_ner_dict)

print("개체명 인식 학습 데이터 개수: {}".format(len(train_ner_df)))

개체명 인식 학습 데이터 개수: 81000


In [32]:
# Label 불러오기

def get_labels(label_path):
    return [label.strip() for label in open(os.path.join(label_path), 'r', encoding='utf-8')]

ner_labels = get_labels(DATA_LABEL_PATH)

print("개체명 인식 레이블 개수: {}".format(len(ner_labels)))

개체명 인식 레이블 개수: 30


In [33]:
# train_ner_df = train_ner_df[:100]

In [34]:
# 버트 토크나이저 설정

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", cache_dir='bert_ckpt')

pad_token_id = tokenizer.pad_token_id # 0
pad_token_label_id=-100

In [35]:
def bert_tokenizer(sent, MAX_LEN):
    
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        truncation=True,
        add_special_tokens = True, #'[CLS]'와 '[SEP]' 추가
        max_length = MAX_LEN,           # 문장 패딩 및 자르기 진행
        pad_to_max_length = True,
        return_attention_mask = True   # 어탠션 마스크 생성
    )
    
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask'] 
    token_type_id = encoded_dict['token_type_ids']
    
    return input_id, attention_mask, token_type_id

def convert_label(words, labels_idx, max_seq_len):
            
    tokens = []
    label_ids = []

    for word, slot_label in zip(words, labels_idx):

        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [unk_token]
        tokens.extend(word_tokens)
        
        label_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
    
    # [CLS] and [SEP] 설정
    special_tokens_count = 2
    if len(label_ids) > max_seq_len - special_tokens_count:
        label_ids = label_ids[: (max_seq_len - special_tokens_count)]

    # [SEP] 토큰 추가
    label_ids += [pad_token_label_id]

    # [CLS] 토큰 추가
    label_ids = [pad_token_label_id] + label_ids
    
    padding_length = max_seq_len - len(label_ids)
    label_ids = label_ids + ([pad_token_label_id] * padding_length)
    
    return label_ids

In [36]:
train_input_ids = []
train_attention_masks = []
train_token_type_ids = []
train_labels = []

for i, data in enumerate(train_ner_df[['sentence', 'label']].values):
    sentence, labels = data
    words = sentence.split()
    labels = labels.split()
    
#     print(words)
#     print(labels)
    
    labels_idx = []
    for label in labels:
        labels_idx.append(ner_labels.index(label) if label in ner_labels else ner_labels.index("UNK"))
        
#     print(labels_idx)
    
    assert len(words) == len(labels_idx)

    input_ids, attention_mask, token_type_ids = bert_tokenizer(sentence, MAX_LEN)
    
    convert_label_ids = convert_label(words, labels_idx, MAX_LEN)
    
#     print(input_ids)
#     print(convert_label_ids)
    
    train_input_ids.append(input_ids)
    train_attention_masks.append(attention_mask)
    train_token_type_ids.append(token_type_ids)
    train_labels.append(convert_label_ids)
    
train_input_ids = np.array(train_input_ids, dtype=int)
train_attention_masks = np.array(train_attention_masks, dtype=int)
train_token_type_ids = np.array(train_token_type_ids, dtype=int)
train_labels = np.asarray(train_labels, dtype=int) #레이블 토크나이징 리스트
train_inputs = (train_input_ids, train_attention_masks, train_token_type_ids)

In [50]:
class TFBertNERClassification(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        
        self.num_labels = config.num_labels
        
        print(self.num_labels)
        
        self.bert = TFBertMainLayer(config, name="bert")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(config.num_labels, 
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(config.initializer_range), 
                                                name="classifier")
        
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        
        #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)
        
#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

#         if labels is not None:
#             loss = self.compute_loss(labels, logits)
#             outputs = (loss,) + outputs

        return logits

In [51]:
config =  BertConfig.from_pretrained('bert-base-multilingual-cased',
                                    num_labels=len(ner_labels),
                                    finetuning_task="naver_ner",
                                    id2label={str(i): label for i, label in enumerate(ner_labels)},
                                    label2id={label: i for i, label in enumerate(ner_labels)}
                                    )

model = TFBertNERClassification.from_pretrained("bert-base-multilingual-cased", config=config, cache_dir='bert_ckpt')
# model = TFBertForTokenClassification.from_pretrained("bert-base-multilingual-cased", config=config, cache_dir='bert_ckpt')

30


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertNERClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertNERClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertNERClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertNERClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['dropout_227', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
def compute_loss(labels, logits):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    # -100의 레이블 값은 손실 값에서 제외 한다.
    active_loss = tf.reshape(labels, (-1,)) != -100    
    reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
    labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
    
    return loss_fn(labels, reduced_logits)

In [53]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=compute_loss, metrics=[metric])

In [54]:
model_name = "tf2_bert_ner"

# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)
# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)
# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\

checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

history = model.fit(train_inputs, train_labels, batch_size=2, epochs=10, validation_split = 0.2, callbacks=[earlystop_callback, cp_callback])

print(history.history)

data_out/KOR/tf2_bert_ner -- Folder already exists 

Epoch 1/10

KeyboardInterrupt: 

In [36]:
# 테스트
DATA_TEST_PATH = os.path.join(DATA_IN_PATH, "NER", "test.tsv")

test_sentences, test_labels = read_file(DATA_TEST_PATH)

test_ner_dict = {"sentence": test_sentences, "label": test_labels}
test_ner_df = pd.DataFrame(test_ner_dict)

print("개체명 인식 테스트 데이터 개수: {}".format(len(test_ner_df)))

개체명 인식 테스트 데이터 개수: 9000


In [None]:
test_input_ids = []
test_attention_masks = []
test_token_type_ids = []
test_labels = []

for i, data in enumerate(test_ner_df[['sentence', 'label']].values):
    sentence, labels = data
    words = sentence.split()
    labels = labels.split()
        
    labels_idx = []
    for label in labels:
        labels_idx.append(ner_labels.index(label) if label in ner_labels else ner_labels.index("UNK"))
        
    assert len(words) == len(labels_idx)

    input_ids, attention_mask, token_type_ids = bert_tokenizer(sentence, MAX_LEN)
    
    convert_label_ids = convert_label(words, labels_idx, MAX_LEN)
        
    test_input_ids.append(input_ids)
    test_attention_masks.append(attention_mask)
    test_token_type_ids.append(token_type_ids)
    test_labels.append(convert_label_ids)
    
test_input_ids = np.array(test_input_ids, dtype=int)
test_attention_masks = np.array(test_attention_masks, dtype=int)
test_token_type_ids = np.array(test_token_type_ids, dtype=int)
test_labels = np.asarray(test_labels, dtype=int) #레이블 토크나이징 리스트
test_inputs = (test_input_ids, test_attention_masks, test_token_type_ids)

result = model.evaluat(test_inputs, test_labels, batch_size=64)
print("개체명 인식 테스트 결과 값 {}", results)

In [121]:
for i, data in enumerate(train_ner_df[['sentence', 'label']].values):
    sentence, labels = data
    words = sentence.split()
    labels = labels.split()
    
    labels_idx = []
    for label in labels:
        labels_idx.append(ner_labels.index(label) if label in ner_labels else ner_labels.index("UNK"))
    
    assert len(words) == len(labels_idx)

    input_ids, attention_mask, token_type_ids = bert_tokenizer(sentence, MAX_LEN)
    
    label_ids = []
    tokens = []
    
    for word, slot_label in zip(words, labels_idx):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            word_tokens = [unk_token]
        tokens.extend(word_tokens)
        label_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
        
    # Account for [CLS] and [SEP]
    special_tokens_count = 2
    if len(label_ids) > MAX_LEN - special_tokens_count:
        label_ids = label_ids[: (MAX_LEN - special_tokens_count)]

    # Add [SEP] token
    label_ids += [pad_token_label_id]

    # Add [CLS] token
    label_ids = [pad_token_label_id] + label_ids
    
    padding_length = MAX_LEN - len(label_ids)
    label_ids = label_ids + ([pad_token_label_id] * padding_length)
    
    ######
    tokens += [sep_token]
    token_type_ids = [sequence_a_segment_id] * len(tokens)
    
    tokens = [cls_token] + tokens
    token_type_ids = [cls_token_segment_id] + token_type_ids
    
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    
    
    attention_mask = None
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
    
    input_ids = input_ids + ([pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

    

In [23]:
train_input_ids = []
train_attention_masks = []
train_token_type_ids = []
train_labels = []

for (i, data) in enumerate(train_ner_data):

    sentence, labels = data.split('\t')
    
    input_ids, attention_mask, token_type_ids = bert_tokenizer(sentence, MAX_LEN)
    
    words = sentence.split()
    labels = labels.split()
    
    new_ner_label = []
    
    for label in labels:
        new_ner_label.append(ner_labels.index(label) if label in ner_labels else ner_labels.index("UNK"))

    assert len(words) == len(new_ner_label)
        
    labels_idx = convert_label(words, ner_labels, MAX_LEN)

    train_input_ids.append(input_ids)
    train_attention_masks.append(attention_mask)
    train_token_type_ids.append(token_type_ids)
    train_labels.append(labels_idx)
        
train_input_ids = np.array(train_input_ids, dtype=int)
train_attention_masks = np.array(train_attention_masks, dtype=int)
train_token_type_ids = np.array(train_token_type_ids, dtype=int)
train_labels = np.asarray(train_labels, dtype=int) #레이블 토크나이징 리스트
train_inputs = (train_input_ids, train_attention_masks, train_token_type_ids)

ValueError: invalid literal for int() with base 10: 'ORG-B'

In [25]:
# import easydict
 
# args = easydict.EasyDict({
#         "max_seq_len": 32,
#         "epoch": 20,
#         "gpu": 0,
#         "out": "result",
#         "resume": False,
#         "unit": 1000
# })

# class TFBertNER(TFBertPreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)
        
#         self.num_labels = len(ner_labels)
#         self.bert = TFBertMainLayer(config, name="bert")
#         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
#         self.classifier = tf.keras.layers.Dense(config.num_labels, 
#                                                 kernel_initializer=tf.keras.initializers.TruncatedNormal(config.initializer_range), 
#                                                 name="classifier")
        
#     def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        
#         #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)
#         outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
#         sequence_output = outputs[0]
#         sequence_output = self.dropout(sequence_output, training=training)
#         logits = self.classifier(sequence_output)
        
#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

# #         if labels is not None:
# #             loss = self.compute_loss(labels, logits)
# #             outputs = (loss,) + outputs

#         return outputs