In [1]:
!pip install transformers



In [2]:
from transformers import pipeline
# Build a fill-mask pipeline backed by the klue/bert-base checkpoint.
fm = pipeline('fill-mask', model='klue/bert-base')

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# '[MASK]' marks the position the fill-mask pipeline fills in; the call
# returns candidate tokens with their scores (see the output below).
text = '한국의 수도는 [MASK]이다.'
fm(text)

[{'score': 0.6390591263771057,
  'token': 3671,
  'token_str': '서울',
  'sequence': '한국의 수도는 서울 이다.'},
 {'score': 0.04642043262720108,
  'token': 9474,
  'token_str': '광화문',
  'sequence': '한국의 수도는 광화문 이다.'},
 {'score': 0.037859588861465454,
  'token': 7141,
  'token_str': '평양',
  'sequence': '한국의 수도는 평양 이다.'},
 {'score': 0.018155217170715332,
  'token': 4873,
  'token_str': '수원',
  'sequence': '한국의 수도는 수원 이다.'},
 {'score': 0.016970166936516762,
  'token': 3902,
  'token_str': '부산',
  'sequence': '한국의 수도는 부산 이다.'}]

In [4]:
from transformers import AutoTokenizer

In [5]:
# Load the tokenizer published with the klue/bert-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [6]:
from transformers import AutoModelForSequenceClassification

In [7]:
# Load klue/bert-base with a two-class sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "klue/bert-base",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import pipeline

In [9]:
# Human-readable names for the two classifier outputs.
model.config.id2label = {0: 'NEGATIVE', 1: 'POSITIVE'}
# Keep label2id as the exact inverse; otherwise it would still hold the
# default 'LABEL_n' keys and disagree with id2label when the config is saved.
model.config.label2id = {
    name: idx for idx, name in model.config.id2label.items()}

In [10]:
import torch

# Prefer the first GPU, but fall back to the CPU so the cell also runs on
# machines without CUDA (a hard-coded 'cuda:0' fails there).
sm = pipeline('text-classification', model=model, tokenizer=tokenizer,
              device='cuda:0' if torch.cuda.is_available() else 'cpu')

In [11]:
# NOTE: the classification head was newly initialized and has not been
# fine-tuned (see the warning printed when the model was loaded), so the
# label and score below are not a meaningful sentiment prediction.
sm('이 영화 재밌다')

[{'label': 'POSITIVE', 'score': 0.5377184152603149}]

In [12]:
# Same caveat as the previous call: the classifier head is untrained, so the
# score printed below is close to chance.
sm('돈 아깝다')

[{'label': 'NEGATIVE', 'score': 0.5085831880569458}]

In [13]:
!pip install transformers datasets



In [14]:
import tensorflow as tf
from transformers import AutoTokenizer, BertForTokenClassification

In [15]:
# NOTE: `tokenizer` and `model` are rebound here. From this cell on, `model`
# is the 13-label token-classification model, not the sentence classifier
# created earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = BertForTokenClassification.from_pretrained(
 "klue/bert-base", num_labels=13)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from datasets import load_dataset

# Fetch the train and validation splits of the 'ner' configuration of 'klue'.
ner_train, ner_val = (
    load_dataset('klue', 'ner', split=split_name)
    for split_name in ('train', 'validation')
)

In [17]:
from tokenizers.pre_tokenizers import BertPreTokenizer
# Pre-tokenizer used by convert_example to split the raw text into words
# together with their (begin, end) character offsets.
pre_tokenizer = BertPreTokenizer()

In [18]:
def align_label(word, word_tokens, char_labels, ignore_label=-100):
  """Collapse the per-character labels of one word into one label per token.

  Args:
    word: the pre-tokenized word the tokens were produced from.
    word_tokens: word-piece tokens of `word`; continuation pieces carry a
      leading '##'.
    char_labels: one label id per character of `word`.
    ignore_label: label given to a token that has no character left to align
      with. The default -100 is the index that PyTorch's cross-entropy loss
      skips by default.

  Returns:
    A list with exactly one label per entry of `word_tokens`. Each token gets
    the smallest label id among the characters it covers.
  """
  # Never read labels past the end of the word, even if the caller passed more.
  usable = char_labels[:len(word)]
  token_labels = []
  pos = 0
  for token in word_tokens:
    # Only a leading '##' is the continuation marker; strip just that prefix
    # (a bare '##' token is literal text and keeps its two characters).
    piece = token[2:] if token.startswith('##') and len(token) > 2 else token
    span = usable[pos:pos + len(piece)]
    # Always emit a label, so tokens and labels can never drift out of step
    # when the tokenizer output is longer than the word.
    token_labels.append(min(span) if span else ignore_label)
    pos += len(piece)
  return token_labels

In [19]:
def convert_example(example):
  """Convert one dataset example into token-classification model inputs.

  The entries of example['tokens'] are joined into one string, which is split
  into words by `pre_tokenizer` and into word pieces by `tokenizer`.
  example['ner_tags'] is sliced with the character offsets of each word and
  reduced to one label per word piece by `align_label`.

  Label id 12 is used for '[CLS]', '[SEP]' and for every piece of a word whose
  tokenization contains '[UNK]'.

  Returns:
    A dict with 'input_ids', 'token_type_ids' (all 0), 'attention_mask'
    (all 1) and 'labels'.
  """
  outside = 12
  text = ''.join(example['tokens'])

  tokens = ['[CLS]']
  labels = [outside]
  for word, (begin, end) in pre_tokenizer.pre_tokenize_str(text):
    pieces = tokenizer.tokenize(word)
    if '[UNK]' in pieces:
      # Pieces of an unknown word cannot be aligned character by character.
      piece_labels = [outside] * len(pieces)
    else:
      piece_labels = align_label(word, pieces, example['ner_tags'][begin:end])
    tokens.extend(pieces)
    labels.extend(piece_labels)
  tokens.append('[SEP]')
  labels.append(outside)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  return {'input_ids' : input_ids,
          'token_type_ids' : [0] * len(input_ids),
          'attention_mask' : [1] * len(input_ids),
          'labels' : labels}

In [20]:
# Sanity check: convert the first training example and inspect the result.
example = ner_train[0]
convert_example(example)

{'input_ids': [2,
  3727,
  30032,
  7825,
  4367,
  1091,
  2395,
  2198,
  2318,
  2024,
  27135,
  1038,
  2033,
  2377,
  2015,
  2532,
  2299,
  2118,
  25,
  3565,
  5757,
  2170,
  2259,
  8960,
  5119,
  5937,
  551,
  2454,
  2232,
  2200,
  2021,
  2138,
  3792,
  31302,
  2200,
  1902,
  2062,
  18,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [12,
  12,
  2,
  2,
  12,
  2,
  3,
  3,
  3,
  3,
  12,
  2,
  3,
  3,
  3,
  3,
  12,
  12,
  8,
  9,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12]}

In [21]:
# Run convert_example over both splits with Dataset.map.
train_ds = ner_train.map(convert_example)
eval_ds = ner_val.map(convert_example)

In [22]:
from transformers import DataCollatorForTokenClassification
# Batch collator handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
!pip install accelerate -U



In [24]:
import inspect

from transformers import TrainingArguments, Trainer

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. Pick whichever keyword the installed version accepts so this
# cell runs on both old and new versions.
eval_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy')

# Train for one epoch and evaluate at the end of it.
training_args = TrainingArguments(
    output_dir="test_trainer", num_train_epochs=1,
    **{eval_key: "epoch"})

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_ds,
    eval_dataset=eval_ds, data_collator=data_collator)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0551,0.065404


TrainOutput(global_step=2626, training_loss=0.07811687179675832, metrics={'train_runtime': 387.7133, 'train_samples_per_second': 54.184, 'train_steps_per_second': 6.773, 'total_flos': 572113064143440.0, 'train_loss': 0.07811687179675832, 'epoch': 1.0})

In [25]:
# Tag names for the 13 label ids used during training (12 is the 'O' tag).
model.config.id2label = {
    0: 'B-DT', 1: 'I-DT', 2: 'B-LC', 3: 'I-LC',
    4: 'B-OG', 5: 'I-OG', 6: 'B-PS', 7: 'I-PS',
    8: 'B-QT', 9: 'I-QT', 10: 'B-TI', 11: 'I-TI',
    12: 'O'
}
# Keep label2id as the exact inverse; otherwise it would still hold the
# default 'LABEL_n' keys and disagree with id2label when the config is saved.
model.config.label2id = {
    tag: idx for idx, tag in model.config.id2label.items()}

In [30]:
import torch
from transformers import pipeline

# Prefer the first GPU, but fall back to the CPU so the cell also runs on
# machines without CUDA (a hard-coded 'cuda:0' fails there).
ner = pipeline(
    'token-classification', model=model, tokenizer=tokenizer,
    device='cuda:0' if torch.cuda.is_available() else 'cpu')

In [31]:
# Run the fine-tuned tagger on a sample sentence.
# NOTE(review): the literal contains a line break inside '경영대학원에서'
# (between '에' and '서'). This looks like a line-wrap artifact and presumably
# affects how that word is tokenized — confirm whether it is intended.
ner('''오늘 12시 서울특별시 성북구에 있는 국민대 경영대학원에
서 유재명 교수가 1가지 주제로 수업을 한다.''')

[{'entity': 'B-DT',
  'score': 0.93459827,
  'index': 1,
  'word': '오늘',
  'start': 0,
  'end': 2},
 {'entity': 'B-TI',
  'score': 0.97116077,
  'index': 2,
  'word': '12',
  'start': 3,
  'end': 5},
 {'entity': 'I-TI',
  'score': 0.9945708,
  'index': 3,
  'word': '##시',
  'start': 5,
  'end': 6},
 {'entity': 'B-LC',
  'score': 0.9757789,
  'index': 4,
  'word': '서울특별시',
  'start': 7,
  'end': 12},
 {'entity': 'I-LC',
  'score': 0.9563617,
  'index': 5,
  'word': '성북구',
  'start': 13,
  'end': 16},
 {'entity': 'B-OG',
  'score': 0.93390363,
  'index': 9,
  'word': '국민대',
  'start': 21,
  'end': 24},
 {'entity': 'I-OG',
  'score': 0.92530555,
  'index': 10,
  'word': '경영',
  'start': 25,
  'end': 27},
 {'entity': 'I-OG',
  'score': 0.8847801,
  'index': 11,
  'word': '##대',
  'start': 27,
  'end': 28},
 {'entity': 'I-OG',
  'score': 0.8656683,
  'index': 12,
  'word': '##학',
  'start': 28,
  'end': 29},
 {'entity': 'I-OG',
  'score': 0.7711842,
  'index': 13,
  'word': '##원',
  'start'