# Readme

- This example illustrates processing CoNLL-2003 corpus with BERT for sequence tagging from https://github.com/huggingface/transformers
- Using this script it is possible to achieve 91.4 F1 entity-level score and 92.8 F1 token-level score with bert-base
- With 100 epochs and lr=1e-5, batch_size=8 it is possible to achieve the same score as in https://gluon-nlp.mxnet.io/model_zoo/bert/index.html 

# Install dependencies

In [None]:
!pip install pytorch_transformers flair seqeval

# Download CoNLL-2003

In [None]:
!mkdir -p conll2003
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa -O ./conll2003/eng.testa
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb -O ./conll2003/eng.testb
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train -O ./conll2003/eng.train

# Initialization

In [1]:
# Choosing cuda devices if there are multiple

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger('sequence_tagger_bert')

In [3]:
import torch

device = torch.device('cuda')
n_gpu = torch.cuda.device_count()

for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

Tesla V100-DGXS-16GB


In [4]:
CACHE_DIR = 'cache'
BATCH_SIZE = 16
PRED_BATCH_SIZE = 100
MAX_LEN = 128
MAX_N_EPOCHS = 4
WEIGHT_DECAY = 0.01
LEARNING_RATE = 5e-5

In [5]:
import torch
torch.manual_seed(117)

<torch._C.Generator at 0x7fd95505e530>

# Load corpus

In [6]:
from flair.datasets import ColumnCorpus


data_folder = 'conll2003'
corpus = ColumnCorpus(data_folder, 
                      {0 : 'text', 3 : 'ner'},
                      train_file='eng.train',
                      test_file='eng.testb',
                      dev_file='eng.testa')

print(corpus.obtain_statistics())

INFO:transformers.file_utils:TensorFlow version 2.0.0 available.
INFO:transformers.file_utils:PyTorch version 1.3.1 available.
INFO:transformers.modeling_xlnet:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2019-12-13 00:35:35,712 Reading data from conll2003
2019-12-13 00:35:35,713 Train: conll2003/eng.train
2019-12-13 00:35:35,713 Dev: conll2003/eng.testa
2019-12-13 00:35:35,714 Test: conll2003/eng.testb
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 14987,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 204567,
            "min": 1,
            "max": 113,
            "avg": 13.649629679055181
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 3684,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "t

# Create model

In [7]:
from bert_sequence_tagger import SequenceTaggerBert, BertForTokenClassificationCustom
from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus

from pytorch_transformers import BertTokenizer, BertForTokenClassification


bpe_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', 
                                              cache_dir=CACHE_DIR, 
                                              do_lower_case=False)

idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)

model = BertForTokenClassificationCustom.from_pretrained('bert-base-cased', 
                                                         cache_dir=CACHE_DIR, 
                                                         num_labels=len(tag2idx)).cuda()

seq_tagger = SequenceTaggerBert(bert_model=model, bpe_tokenizer=bpe_tokenizer, 
                                idx2tag=idx2tag, tag2idx=tag2idx, max_len=MAX_LEN,
                                pred_batch_size=PRED_BATCH_SIZE)

INFO:pytorch_transformers.modeling_bert:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
INFO:pytorch_transformers.modeling_xlnet:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at cache/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at cache/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.d7a3af18ce3a2ab7c0f48f04dc8daff45ed9a3ed333b9e9a79d012a0dedf87a6
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidd

# Train model

In [8]:
from bert_sequence_tagger.bert_utils import get_model_parameters, prepare_flair_corpus
from bert_sequence_tagger.model_trainer_bert import ModelTrainerBert
from bert_sequence_tagger.metrics import f1_entity_level, f1_token_level

from pytorch_transformers import AdamW, WarmupLinearSchedule


train_dataset = prepare_flair_corpus(corpus.train)
val_dataset = prepare_flair_corpus(corpus.dev)

optimizer = AdamW(get_model_parameters(model), 
                  lr=LEARNING_RATE, betas=(0.9, 0.999), 
                  eps =1e-6, weight_decay=0.01, correct_bias=True)
lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1, 
                                    t_total=(len(corpus.train) / BATCH_SIZE)*MAX_N_EPOCHS)

trainer = ModelTrainerBert(model=seq_tagger, 
                           optimizer=optimizer, 
                           lr_scheduler=lr_scheduler,
                           train_dataset=train_dataset, 
                           val_dataset=val_dataset,
                           update_scheduler='es',
                           validation_metrics=[f1_entity_level],
                           batch_size=BATCH_SIZE)

trainer.train(epochs=MAX_N_EPOCHS)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:sequence_tagger_bert:Current learning rate: 3.828420055249356e-05
INFO:sequence_tagger_bert:Train loss: 0.08635571034180195
INFO:sequence_tagger_bert:Validation loss: 0.03886955951792047
INFO:sequence_tagger_bert:Validation metrics: (0.9362345627152818,)


Epoch:  25%|██▌       | 1/4 [02:33<07:41, 153.72s/it]

INFO:sequence_tagger_bert:Current learning rate: 2.6567066579477667e-05
INFO:sequence_tagger_bert:Train loss: 0.022030211970791358
INFO:sequence_tagger_bert:Validation loss: 0.032396575959865004
INFO:sequence_tagger_bert:Validation metrics: (0.9428475374012439,)


Epoch:  50%|█████     | 2/4 [05:02<05:04, 152.37s/it]

INFO:sequence_tagger_bert:Current learning rate: 1.4849932606461773e-05
INFO:sequence_tagger_bert:Train loss: 0.009602448526282073
INFO:sequence_tagger_bert:Validation loss: 0.031968948459534935
INFO:sequence_tagger_bert:Validation metrics: (0.9522608841822156,)


Epoch:  75%|███████▌  | 3/4 [07:31<02:31, 151.32s/it]

INFO:sequence_tagger_bert:Current learning rate: 3.1327986334458785e-06
INFO:sequence_tagger_bert:Train loss: 0.003817373911283591
INFO:sequence_tagger_bert:Validation loss: 0.03424496421454629
INFO:sequence_tagger_bert:Validation metrics: (0.9553616378587012,)


Epoch: 100%|██████████| 4/4 [10:05<00:00, 151.47s/it]


In [9]:
test_dataset = prepare_flair_corpus(corpus.test)

_, __, test_metrics = seq_tagger.predict(test_dataset, evaluate=True, 
                                         metrics=[f1_entity_level, f1_token_level])
logger.info(f'Entity-level f1: {test_metrics[1]}')
logger.info(f'Token-level f1: {test_metrics[2]}')

INFO:sequence_tagger_bert:Entity-level f1: 0.9146330719760374
INFO:sequence_tagger_bert:Token-level f1: 0.9302211603259204


# Predicting

In [10]:
seq_tagger.predict([['We', 'are', 'living', 'in', 'New', 'York', 'city', '.'],
                    ['Satya', 'Narayana', 'Nadella', 'is', 'an', 'engineer', 'and', 'business', 'executive', '.']])

([['O', 'O', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'O'],
  ['I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']],
 [10.285023, 10.41528])

# Save model

In [8]:
seq_tagger.save_serialize('./model')

# Restore model

In [9]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
import torch

device = torch.device('cuda')
n_gpu = torch.cuda.device_count()

for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

In [None]:
from bert_sequence_tagger.bert_for_token_classification_custom import BertForTokenClassificationCustom
from bert_sequence_tagger import SequenceTaggerBert


seq_tagger = SequenceTaggerBert.load_serialized('./model', BertForTokenClassificationCustom)

In [None]:
seq_tagger.predict([['We', 'are', 'living', 'in', 'New', 'York', 'city', '.'],
                    ['Satya', 'Narayana', 'Nadella', 'is', 'an', 'engineer', 'and', 'business', 'executive', '.']])