In [1]:
# !git clone https://github.com/ukairia777/tensorflow-bert-ner.git

In [2]:
%ls

README.md  [0m[01;34mdata[0m/  tensorflow_bert_ner.ipynb


In [3]:
# %cd tensorflow-bert-ner/

In [4]:
# !pip install transformers

In [5]:
# !pip install seqeval

In [6]:
import glob, re
import numpy as np
import codecs
import os
import json
from tqdm import tqdm
import tensorflow as tf
from seqeval.metrics import f1_score, classification_report
from transformers import shape_list, BertTokenizer, TFBertModel

2023-06-07 17:04:57.305114: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-07 17:04:57.343865: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-07 17:04:57.472710: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-07 17:04:57.473530: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
train_dataset = glob.glob('data/train_data/*.txt')
test_dataset = glob.glob('data/validation_data/*.txt')

In [8]:
len(train_dataset)

1425

In [9]:
len(test_dataset)

2

In [10]:
def read_ner_data(dataset):

  tagged_sentences = []
  sentence = []

  for file_name in tqdm(dataset):
    f = open(file_name, 'r', encoding='utf-8-sig')
    for line in f:
      if len(line)==0 or line[:2]=="##" or line[0]=="\n":
          if len(sentence) > 0:
              tagged_sentences.append(sentence)
              sentence = []
          continue
      splits = line.split('\t') # 공백을 기준으로 속성을 구분한다.
      splits[-1] = re.sub(r'\n', '', splits[-1]) # 줄바꿈 표시 \n을 제거한다.
      word = splits[0]
      sentence.append([word, splits[-1]]) # 단어와 개체명 태깅만 기록한다.

  return tagged_sentences

In [11]:
train_tagged_sentences = read_ner_data(train_dataset)
test_tagged_sentences = read_ner_data(test_dataset)

100%|██████████| 1425/1425 [00:01<00:00, 798.03it/s]
100%|██████████| 2/2 [00:00<00:00, 47.63it/s]


In [12]:
print(len(train_tagged_sentences))
print(len(test_tagged_sentences))

23033
931


In [13]:
def split_sentence_and_label(tagged_sentences):
  index = 0
  sentences = []
  ner_tags = []

  for tagged_sentence in tqdm(tagged_sentences):
    sentence = []
    ner_tag = []
    for word, label in tagged_sentence:
      if word == '_' or word == '\xad':
        continue
      sentence.append(word)
      ner_tag.append(label)

    assert len(sentence) == len(ner_tag), "Error with input length {} vs {}".format(len(sentence), len(ner_tag))

    sentences.append(sentence)
    ner_tags.append(ner_tag)
  
  return sentences, ner_tags

In [14]:
train_sentences, train_labels = split_sentence_and_label(train_tagged_sentences)
test_sentences, test_labels = split_sentence_and_label(test_tagged_sentences)

100%|██████████| 23033/23033 [00:00<00:00, 193354.80it/s]
100%|██████████| 931/931 [00:00<00:00, 163967.96it/s]


In [15]:
print(train_sentences[:3])
print(train_labels[:3])

[['인천시', '는', '최근', '김정은', '북한', '국방위원회', '제', '1', '위원장', '이', '신년사', '에서', '남북', '관계', '개선', '을', '강조한', '만큼', '친선', '경기', '성사', '가능성', '을', '높', '게', '보', '고', '있', '다', '.'], ['1990', '년', '명량', '대첩', '을', '기념하', '는', '기념', '공원', '이', '조성되', '면서', '국민', '관광지', '가', '됐', '고', '1992', '년', '전남', '도', '기념물', '제', '139', '호', '로', '지정', '됐', '다', '.'], ['고궁', '앞', '에서', '포즈', '를', '취하', '며', '사진', '을', '찍', '는', '이', '들', '10', '명', '중', '7', ',', '8', '명', '은', '한복', '차림', '.', '갓', '이나', '댕기', '까지', '갖춰', '전통', '한복', '을', '차려', '입', '은', '사람', '도', '많', '았', '지만', '짧', '은', '치마', '나', '긴', '저고리', ',', '레이스', '를', '활용한', '것', '등', '독특한', '한복', '을', '입', '은', '이', '들', '도', '눈', '에', '띄', '었', '다', '.']]
[['B-ORG', 'O', 'O', 'B-PER', 'B-ORG', 'I-ORG', 'B-NOH', 'I-NOH', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-DAT', 'I-DAT', 'B-POH', 'I-POH', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-

In [16]:
print(test_sentences[:3])
print(test_labels[:3])

[['SBS', '골프', '채널', '이준실', '본부', '장', '은', '4', '일', '“', '여자', '개막', '전', '인', '스포츠서울', '김영주', '골프', '여자', '오픈', '부터', '디지털', '고화질', '(', 'HD', ')', '방송', '을', '시작할', '예정', '이', '며', '중계', '홀', '을', '종전', '5', '~', '6', '개', '홀', '에서', '9', '개', '홀로', '늘릴', '것', '”', '이라고', '밝', '혔', '다', '.'], ['신지애', '-', '최경주', '부진한', '출발'], ['한국', '선수', '중', '에서', '는', '‘', '슈퍼', '땅콩', '’', '김미현', '(', 'KTF', ')', '이', '2', '언더파', '70', '타', '를', '기록해', '공동', '6', '위', '에', '오르', '는', '가장', '좋', '은', '출발', '을', '했', '다', '.']]
[['B-ORG', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'B-DAT', 'I-DAT', 'O', 'O', 'O', 'O', 'O', 'B-POH', 'I-POH', 'I-POH', 'I-POH', 'I-POH', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NOH', 'O', 'B-NOH', 'I-NOH', 'O', 'O', 'B-NOH', 'I-NOH', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PER', 'O', 'B-PER', 'O', 'O'], ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-ORG', 'O', 'O', 'B-NOH', 'I-NOH', 'B-NOH', 'I-NOH

In [17]:
labels = set(tag for train_label in train_labels for tag in train_label)

In [18]:
print(labels)

{'B-LOC', 'O', 'B-POH', 'I-NOH', 'I-DUR', 'B-PER', 'B-DAT', 'I-MNY', 'I-POH', 'I-ORG', 'B-DUR', 'I-LOC', 'I-TIM', 'I-PER', 'B-PNT', 'B-ORG', 'I-PNT', 'B-MNY', 'B-NOH', 'I-DAT', 'B-TIM'}


In [19]:
tag_to_index = {tag: index for index, tag in enumerate(labels)}
index_to_tag = {index: tag for index, tag in enumerate(labels)}

In [20]:
tag_to_index = {tag: index for index, tag in enumerate(labels)}
index_to_tag = {index: tag for index, tag in enumerate(labels)}

In [21]:
print(tag_to_index)
print(index_to_tag)

{'B-LOC': 0, 'O': 1, 'B-POH': 2, 'I-NOH': 3, 'I-DUR': 4, 'B-PER': 5, 'B-DAT': 6, 'I-MNY': 7, 'I-POH': 8, 'I-ORG': 9, 'B-DUR': 10, 'I-LOC': 11, 'I-TIM': 12, 'I-PER': 13, 'B-PNT': 14, 'B-ORG': 15, 'I-PNT': 16, 'B-MNY': 17, 'B-NOH': 18, 'I-DAT': 19, 'B-TIM': 20}
{0: 'B-LOC', 1: 'O', 2: 'B-POH', 3: 'I-NOH', 4: 'I-DUR', 5: 'B-PER', 6: 'B-DAT', 7: 'I-MNY', 8: 'I-POH', 9: 'I-ORG', 10: 'B-DUR', 11: 'I-LOC', 12: 'I-TIM', 13: 'I-PER', 14: 'B-PNT', 15: 'B-ORG', 16: 'I-PNT', 17: 'B-MNY', 18: 'B-NOH', 19: 'I-DAT', 20: 'B-TIM'}


In [22]:
tag_size = len(tag_to_index)
print('개체명 태깅 정보의 개수 :',tag_size)

개체명 태깅 정보의 개수 : 21


In [23]:
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [24]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer,
                                 pad_token_id_for_segment=0, pad_token_id_for_label=-100):

    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        tokens = []
        labels_ids = []
        for one_word, label_token in zip(example, label):
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            labels_ids.extend([tag_to_index[label_token]]+ [pad_token_id_for_label] * (len(subword_tokens) - 1))

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            labels_ids = labels_ids[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        labels_ids += [pad_token_id_for_label]

        tokens = [cls_token] + tokens
        labels_ids = [pad_token_id_for_label] + labels_ids


        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label = labels_ids + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label) == max_seq_len, "Error with labels length {} vs {}".format(len(label), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [25]:
X_train, y_train = convert_examples_to_features(train_sentences, train_labels, max_seq_len=128, tokenizer=tokenizer)

100%|██████████| 23033/23033 [00:09<00:00, 2508.43it/s]


In [26]:
X_test, y_test = convert_examples_to_features(test_sentences, test_labels, max_seq_len=128, tokenizer=tokenizer)

100%|██████████| 931/931 [00:00<00:00, 2861.37it/s]


In [27]:
class TFBertForTokenClassification(tf.keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(num_labels,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        all_output = outputs[0]
        prediction = self.classifier(all_output)

        return prediction

In [28]:
def compute_loss(labels, logits):

  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
  active_loss = tf.reshape(labels, (-1,)) != -100
  reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
  labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)

  return loss_fn(labels, reduced_logits)

In [29]:
model = TFBertForTokenClassification("klue/bert-base", num_labels=tag_size)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=compute_loss)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'bert.embeddings.position_ids', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [30]:
class F1score(tf.keras.callbacks.Callback):
    def __init__(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test

    def sequences_to_tags(self, label_ids, pred_ids):
      label_list = []
      pred_list = []

      for i in range(0, len(label_ids)):
        label_tag = []
        pred_tag = []

        for label_index, pred_index in zip(label_ids[i], pred_ids[i]):
          if label_index != -100:
            label_tag.append(index_to_tag[label_index])
            pred_tag.append(index_to_tag[pred_index])
        
        label_list.append(label_tag)``
        pred_list.append(pred_tag)

      return label_list, pred_list

    def on_epoch_end(self, epoch, logs={}):

      y_predicted = self.model.predict(self.X_test)
      y_predicted = np.argmax(y_predicted, axis = 2)

      label_list, pred_list = self.sequences_to_tags(self.y_test, y_predicted)

      score = f1_score(label_list, pred_list)
      print(' - f1: {:04.2f}'.format(score * 100))
      print(classification_report(label_list, pred_list))

In [31]:
f1_score_report = F1score(X_test, y_test)

In [32]:
model.fit(
    X_train, y_train, epochs=3, batch_size=32,
    callbacks = [f1_score_report]
)

Epoch 1/3
  3/720 [..............................] - ETA: 1:09:45 - loss: 2.0552

: 

: 

In [None]:
# Only for korean dataset
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

In [None]:
from konlpy.tag import Mecab

mecab = Mecab()

In [None]:
def convert_examples_to_features_for_prediction(examples, max_seq_len, tokenizer,
                                 pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []

    for example in tqdm(examples):
        tokens = []
        label_mask = []
        for one_word in example:
            subword_tokens = tokenizer.tokenize(one_word)
            tokens.extend(subword_tokens)
            label_mask.extend([0]+ [pad_token_id_for_label] * (len(subword_tokens) - 1))

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            label_mask = label_mask[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        label_mask += [pad_token_id_for_label]

        tokens = [cls_token] + tokens
        label_mask = [pad_token_id_for_label] + label_mask


        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        padding_count = max_seq_len - len(input_id)
        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label_mask = label_mask + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label_mask) == max_seq_len, "Error with labels length {} vs {}".format(len(label_mask), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_masks.append(label_mask)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks

In [None]:
def ner_prediction(examples, max_seq_len, tokenizer, lang='ko'):
  
  if lang == 'ko':
    examples = [mecab.morphs(sent) for sent in examples]
  else:
    examples = [sent.split() for sent in examples]

  X_pred, label_masks = convert_examples_to_features_for_prediction(examples, max_seq_len=128, tokenizer=tokenizer)
  y_predicted = model.predict(X_pred)
  y_predicted = np.argmax(y_predicted, axis = 2)

  pred_list = []
  result_list = []

  for i in range(0, len(label_masks)):
    pred_tag = []
    for label_index, pred_index in zip(label_masks[i], y_predicted[i]):
      if label_index != -100:
        pred_tag.append(index_to_tag[pred_index])

    pred_list.append(pred_tag)

  for example, pred in zip(examples, pred_list):
    one_sample_result = []
    for one_word, label_token in zip(example, pred):
      one_sample_result.append((one_word, label_token))
    result_list.append(one_sample_result)

  return result_list

In [None]:
sent1 = '오리온스는 리그 최정상급 포인트가드 김동훈을 앞세우는 빠른 공수전환이 돋보이는 팀이다'
sent2 = '하이신사에 속한 섬들도 위로 솟아 있는데 타인은 살고 있어요'
sent3 = '유원준 연구원은 심심해서 탐앤탐스에서 커피를 마시면서 BERT 기반의 개체명 인식기를 만들었다.'

In [None]:
test_samples = [sent1, sent2, sent3]

In [None]:
result_list = ner_prediction(test_samples, max_seq_len=128, tokenizer=tokenizer, lang='ko')

In [None]:
result_list