# Readme

- This example illustrates processing CoNLL-2003 corpus with BERT for sequence tagging from https://github.com/huggingface/transformers
- Using this script it is possible to achieve 91.4 F1 entity-level score and 92.8 F1 token-level score with bert-base
- With 100 epochs and lr=1e-5, batch_size=8 it is possible to achieve the same score as in https://gluon-nlp.mxnet.io/model_zoo/bert/index.html 

# Install dependencies

In [None]:
!pip install pytorch_transformers flair seqeval

# Download CoNLL-2003

In [11]:
!mkdir -p data
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa && mv eng.testa ./data/dev.txt
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb && mv eng.testb ./data/test.txt
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train && mv eng.train ./data/train.txt

--2019-11-23 15:05:42--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827012 (808K) [text/plain]
Saving to: ‘eng.testa’


2019-11-23 15:05:43 (2.64 MB/s) - ‘eng.testa’ saved [827012/827012]

--2019-11-23 15:05:43--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748096 (731K) [text/plain]
Saving to: ‘eng.testb’


2019-11-23 15:05:44 (2.91 MB/s) - ‘eng.testb’ saved [748096/748096]

# Initialization

In [None]:
%load_ext autoreload
%autoreload 2

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [8]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger('sequence_tagger_bert')

In [9]:
import torch

device = torch.device('cuda')
n_gpu = torch.cuda.device_count()

for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

Tesla V100-DGXS-16GB


In [10]:
CACHE_DIR = 'cache'
BATCH_SIZE = 16
PRED_BATCH_SIZE = 100
MAX_LEN = 128
MAX_N_EPOCHS = 4
WEIGHT_DECAY = 0.01
LEARNING_RATE = 5e-5

# Load corpus

In [5]:
from flair.datasets import ColumnCorpus


data_folder = 'data'
corpus = ColumnCorpus(data_folder, 
                      {0 : 'text', 3 : 'ner'},
                      train_file='train.txt',
                      test_file='test.txt',
                      dev_file='dev.txt')

print(corpus.obtain_statistics())

2019-11-23 15:14:09,439 Reading data from data
2019-11-23 15:14:09,440 Train: data/train.txt
2019-11-23 15:14:09,440 Dev: data/dev.txt
2019-11-23 15:14:09,441 Test: data/test.txt
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 14987,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 204567,
            "min": 1,
            "max": 113,
            "avg": 13.649629679055181
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 3684,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 46666,
            "min": 1,
            "max": 124,
            "avg": 12.667209554831704
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 3466,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_ta

# Create model

In [6]:
from bert_sequence_tagger import SequenceTaggerBert, BertForTokenClassificationCustom
from pytorch_transformers import BertTokenizer, BertForTokenClassification
import torch.nn as nn

from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus


bpe_tokenizer = BertTokenizer.from_pretrained('bert-base-cased', 
                                              cache_dir=CACHE_DIR, 
                                              do_lower_case=False)

idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)

model = BertForTokenClassificationCustom.from_pretrained('bert-base-cased', 
                                                         cache_dir=CACHE_DIR, 
                                                         num_labels=len(tag2idx)).cuda()

seq_tagger = SequenceTaggerBert(bert_model=model, bpe_tokenizer=bpe_tokenizer, 
                                idx2tag=idx2tag, tag2idx=tag2idx, max_len=MAX_LEN)

100%|██████████| 435779157/435779157 [01:14<00:00, 5816928.08B/s] 


# Train model

In [7]:
from torch.utils.data import RandomSampler, SequentialSampler

from bert_sequence_tagger.bert_utils import create_loader_from_flair_corpus, get_model_parameters, get_parameters_without_decay
from bert_sequence_tagger.model_trainer_bert import ModelTrainerBert

from pytorch_transformers import AdamW, WarmupLinearSchedule

from bert_sequence_tagger.metrics import f1_entity_level, f1_token_level


train_dataloader = create_loader_from_flair_corpus(corpus.train, 
                                                   RandomSampler, 
                                                   batch_size=BATCH_SIZE)
val_dataloader = create_loader_from_flair_corpus(corpus.dev,
                                                 SequentialSampler,
                                                 batch_size=PRED_BATCH_SIZE)

optimizer = AdamW(get_model_parameters(model), 
                  lr=LEARNING_RATE, betas=(0.9, 0.999), 
                  eps =1e-6, weight_decay=0.01, correct_bias=True)
lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1, 
                                    t_total=(len(corpus.train) / BATCH_SIZE)*MAX_N_EPOCHS)

trainer = ModelTrainerBert(model=seq_tagger, 
                           optimizer=optimizer, 
                           lr_scheduler=lr_scheduler,
                           train_dataloader=train_dataloader, 
                           val_dataloader=val_dataloader,
                           update_scheduler='es',
                           keep_best_model=False,
                           restore_bm_on_lr_change=False,
                           max_grad_norm=1.,
                           validation_metrics=[f1_entity_level],
                           decision_metric=lambda metrics: -metrics[1])

trainer.train(epochs=MAX_N_EPOCHS)

Epoch: 100%|██████████| 4/4 [10:21<00:00, 155.32s/it]


In [11]:
test_dataloader = create_loader_from_flair_corpus(corpus.test,
                                                  SequentialSampler,
                                                  batch_size=PRED_BATCH_SIZE)

_, __, test_metrics = seq_tagger.predict(test_dataloader, evaluate=True, 
                                         metrics=[f1_entity_level, f1_token_level])
logger.info(f'Entity-level f1: {test_metrics[1]}')
logger.info(f'Token-level f1: {test_metrics[2]}')

INFO:sequence_tagger_bert:Entity-level f1: 0.9160722148833114
INFO:sequence_tagger_bert:Token-level f1: 0.930110159118727
