In [None]:
%pip install flair spacy

In [1]:
import os
os.chdir('/mnt/d/demo')

In [2]:
import torch
print(torch.__version__)

[torch.cuda.get_device_name(d) for d in range(torch.cuda.device_count())]

  from .autonotebook import tqdm as notebook_tqdm


1.11.0


['NVIDIA GeForce RTX 3070 Laptop GPU']

In [3]:
from flair.datasets.sequence_labeling import ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from transformers import BertConfig, BertTokenizerFast

In [4]:
BERT_MODEL_DIR = 'FinBERT-FinVocab-Uncased'
CACHE_DIR = 'embeddings'
INPUT_PATH = 'data'
torch.set_default_tensor_type(torch.FloatTensor)

config = BertConfig.from_pretrained(BERT_MODEL_DIR, cache_dir=CACHE_DIR)
config.save_pretrained(BERT_MODEL_DIR)
tokenizer = BertTokenizerFast(vocab_file=os.path.join(BERT_MODEL_DIR, 'vocab.txt'))
tokenizer.save_pretrained(BERT_MODEL_DIR)

('FinBERT-FinVocab-Uncased/tokenizer_config.json',
 'FinBERT-FinVocab-Uncased/special_tokens_map.json',
 'FinBERT-FinVocab-Uncased/vocab.txt',
 'FinBERT-FinVocab-Uncased/added_tokens.json',
 'FinBERT-FinVocab-Uncased/tokenizer.json')

In [5]:
MAX_LENGTH = 512
BATCH_SIZE = 4
MAX_EPOCHS = 5
HIDDEN_SIZE = 256

In [6]:
def filter_long(dataset):
    # Bug in Flair: skip very long sentences to avoid errors!
    return [x for x in dataset if len(x) <= MAX_LENGTH]

def train_tagger(input_path, train_file, test_file, dev_file):
    columns = {0 : 'text', 1 : 'ner'}
    corpus = ColumnCorpus(input_path, columns, train_file=train_file, test_file=test_file, dev_file=dev_file)
    corpus._train = filter_long(corpus.train)
    corpus._dev = filter_long(corpus.dev)
    corpus._test = filter_long(corpus.test)
    embeddings = TransformerWordEmbeddings(BERT_MODEL_DIR, cache_dir=CACHE_DIR, allow_long_sentences=False, fine_tune=True, layers='-1')
    tagger = SequenceTagger(hidden_size=HIDDEN_SIZE, embeddings=embeddings, tag_dictionary=corpus.make_label_dictionary(label_type='ner'), tag_type='ner', use_crf=True)
    trainer = ModelTrainer(tagger, corpus)
    trainer.train('finbert', learning_rate=0.1, mini_batch_size=BATCH_SIZE, max_epochs=MAX_EPOCHS, embeddings_storage_mode='gpu')

In [7]:
train_tagger('data', 'data_train.txt', 'data_test.txt', None)

2022-05-04 15:17:06,229 Reading data from data
2022-05-04 15:17:06,229 Train: data/data_train.txt
2022-05-04 15:17:06,230 Dev: None
2022-05-04 15:17:06,230 Test: data/data_test.txt
2022-05-04 15:17:11,896 Computing label dictionary. Progress:


1046it [00:00, 95009.25it/s]

2022-05-04 15:17:11,910 Dictionary created for label 'ner' with 5 values: PERSON (seen 661 times), ORG (seen 230 times), LOC (seen 162 times), MISC (seen 7 times)
2022-05-04 15:17:11,911 SequenceTagger predicts: Dictionary with 17 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-ORG, B-ORG, E-ORG, I-ORG, S-LOC, B-LOC, E-LOC, I-LOC, S-MISC, B-MISC, E-MISC, I-MISC





2022-05-04 15:17:12,568 ----------------------------------------------------------------------------------------------------
2022-05-04 15:17:12,570 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30873, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): D

100%|██████████| 29/29 [00:01<00:00, 19.40it/s]

2022-05-04 15:17:38,748 Evaluating as a multi-label problem: False
2022-05-04 15:17:38,754 DEV : loss 0.06763340532779694 - f1-score (micro avg)  0.7484
2022-05-04 15:17:38,758 BAD EPOCHS (no improvement): 0
2022-05-04 15:17:38,761 saving best model





2022-05-04 15:17:40,505 ----------------------------------------------------------------------------------------------------
2022-05-04 15:17:43,374 epoch 2 - iter 26/262 - loss 0.19393792 - samples/sec: 36.28 - lr: 0.100000
2022-05-04 15:17:45,941 epoch 2 - iter 52/262 - loss 0.20235445 - samples/sec: 40.55 - lr: 0.100000
2022-05-04 15:17:48,613 epoch 2 - iter 78/262 - loss 0.22486070 - samples/sec: 38.94 - lr: 0.100000
2022-05-04 15:17:50,985 epoch 2 - iter 104/262 - loss 0.22641256 - samples/sec: 43.89 - lr: 0.100000
2022-05-04 15:17:53,429 epoch 2 - iter 130/262 - loss 0.22687826 - samples/sec: 42.57 - lr: 0.100000
2022-05-04 15:17:55,808 epoch 2 - iter 156/262 - loss 0.23221175 - samples/sec: 43.76 - lr: 0.100000
2022-05-04 15:17:58,383 epoch 2 - iter 182/262 - loss 0.24994298 - samples/sec: 40.42 - lr: 0.100000
2022-05-04 15:18:00,774 epoch 2 - iter 208/262 - loss 0.25629811 - samples/sec: 43.52 - lr: 0.100000
2022-05-04 15:18:03,231 epoch 2 - iter 234/262 - loss 0.26105162 - sam

100%|██████████| 29/29 [00:01<00:00, 21.51it/s]

2022-05-04 15:18:06,918 Evaluating as a multi-label problem: False
2022-05-04 15:18:06,923 DEV : loss 0.19801780581474304 - f1-score (micro avg)  0.0
2022-05-04 15:18:06,927 BAD EPOCHS (no improvement): 1
2022-05-04 15:18:06,930 ----------------------------------------------------------------------------------------------------





2022-05-04 15:18:09,391 epoch 3 - iter 26/262 - loss 0.32531911 - samples/sec: 42.30 - lr: 0.100000
2022-05-04 15:18:11,741 epoch 3 - iter 52/262 - loss 0.36047539 - samples/sec: 44.29 - lr: 0.100000
2022-05-04 15:18:14,526 epoch 3 - iter 78/262 - loss 0.34787363 - samples/sec: 37.37 - lr: 0.100000
2022-05-04 15:18:17,147 epoch 3 - iter 104/262 - loss 0.33406861 - samples/sec: 39.70 - lr: 0.100000
2022-05-04 15:18:19,696 epoch 3 - iter 130/262 - loss 0.31748376 - samples/sec: 40.83 - lr: 0.100000
2022-05-04 15:18:22,131 epoch 3 - iter 156/262 - loss 0.29436651 - samples/sec: 42.74 - lr: 0.100000
2022-05-04 15:18:24,544 epoch 3 - iter 182/262 - loss 0.28469263 - samples/sec: 43.13 - lr: 0.100000
2022-05-04 15:18:26,846 epoch 3 - iter 208/262 - loss 0.28003047 - samples/sec: 45.21 - lr: 0.100000
2022-05-04 15:18:29,181 epoch 3 - iter 234/262 - loss 0.27645516 - samples/sec: 44.58 - lr: 0.100000
2022-05-04 15:18:31,669 epoch 3 - iter 260/262 - loss 0.26877406 - samples/sec: 41.84 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 21.40it/s]

2022-05-04 15:18:33,169 Evaluating as a multi-label problem: False
2022-05-04 15:18:33,175 DEV : loss 0.1679811179637909 - f1-score (micro avg)  0.0
2022-05-04 15:18:33,179 BAD EPOCHS (no improvement): 2
2022-05-04 15:18:33,181 ----------------------------------------------------------------------------------------------------





2022-05-04 15:18:35,622 epoch 4 - iter 26/262 - loss 0.29412592 - samples/sec: 42.67 - lr: 0.100000
2022-05-04 15:18:38,270 epoch 4 - iter 52/262 - loss 0.23144518 - samples/sec: 39.29 - lr: 0.100000
2022-05-04 15:18:40,586 epoch 4 - iter 78/262 - loss 0.24261276 - samples/sec: 44.94 - lr: 0.100000
2022-05-04 15:18:43,079 epoch 4 - iter 104/262 - loss 0.23874697 - samples/sec: 41.75 - lr: 0.100000
2022-05-04 15:18:45,967 epoch 4 - iter 130/262 - loss 0.23686611 - samples/sec: 36.03 - lr: 0.100000
2022-05-04 15:18:48,427 epoch 4 - iter 156/262 - loss 0.23985840 - samples/sec: 42.30 - lr: 0.100000
2022-05-04 15:18:50,911 epoch 4 - iter 182/262 - loss 0.24235813 - samples/sec: 41.90 - lr: 0.100000
2022-05-04 15:18:53,278 epoch 4 - iter 208/262 - loss 0.24547239 - samples/sec: 43.97 - lr: 0.100000
2022-05-04 15:18:55,530 epoch 4 - iter 234/262 - loss 0.24109890 - samples/sec: 46.23 - lr: 0.100000
2022-05-04 15:18:58,220 epoch 4 - iter 260/262 - loss 0.24817566 - samples/sec: 38.69 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 21.25it/s]

2022-05-04 15:18:59,755 Evaluating as a multi-label problem: False
2022-05-04 15:18:59,760 DEV : loss 0.1901572346687317 - f1-score (micro avg)  0.0
2022-05-04 15:18:59,764 BAD EPOCHS (no improvement): 3
2022-05-04 15:18:59,768 ----------------------------------------------------------------------------------------------------





2022-05-04 15:19:02,523 epoch 5 - iter 26/262 - loss 0.16439327 - samples/sec: 37.77 - lr: 0.100000
2022-05-04 15:19:04,686 epoch 5 - iter 52/262 - loss 0.20681163 - samples/sec: 48.13 - lr: 0.100000
2022-05-04 15:19:07,519 epoch 5 - iter 78/262 - loss 0.19798867 - samples/sec: 36.74 - lr: 0.100000
2022-05-04 15:19:09,964 epoch 5 - iter 104/262 - loss 0.21422582 - samples/sec: 42.55 - lr: 0.100000
2022-05-04 15:19:12,296 epoch 5 - iter 130/262 - loss 0.23460667 - samples/sec: 44.65 - lr: 0.100000
2022-05-04 15:19:14,790 epoch 5 - iter 156/262 - loss 0.23826677 - samples/sec: 41.73 - lr: 0.100000
2022-05-04 15:19:17,551 epoch 5 - iter 182/262 - loss 0.22842899 - samples/sec: 37.70 - lr: 0.100000
2022-05-04 15:19:20,111 epoch 5 - iter 208/262 - loss 0.23721945 - samples/sec: 40.65 - lr: 0.100000
2022-05-04 15:19:22,473 epoch 5 - iter 234/262 - loss 0.24012122 - samples/sec: 44.05 - lr: 0.100000
2022-05-04 15:19:24,786 epoch 5 - iter 260/262 - loss 0.24124429 - samples/sec: 45.00 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 21.00it/s]

2022-05-04 15:19:26,380 Evaluating as a multi-label problem: False
2022-05-04 15:19:26,385 DEV : loss 0.15713681280612946 - f1-score (micro avg)  0.0
2022-05-04 15:19:26,389 Epoch     5: reducing learning rate of group 0 to 5.0000e-02.
2022-05-04 15:19:26,391 BAD EPOCHS (no improvement): 4
2022-05-04 15:19:26,392 ----------------------------------------------------------------------------------------------------





2022-05-04 15:19:28,825 epoch 6 - iter 26/262 - loss 0.24538227 - samples/sec: 42.80 - lr: 0.050000
2022-05-04 15:19:31,643 epoch 6 - iter 52/262 - loss 0.25067848 - samples/sec: 36.93 - lr: 0.050000
2022-05-04 15:19:34,117 epoch 6 - iter 78/262 - loss 0.21943403 - samples/sec: 42.06 - lr: 0.050000
2022-05-04 15:19:36,771 epoch 6 - iter 104/262 - loss 0.21974063 - samples/sec: 39.21 - lr: 0.050000
2022-05-04 15:19:38,957 epoch 6 - iter 130/262 - loss 0.21515353 - samples/sec: 47.63 - lr: 0.050000
2022-05-04 15:19:41,470 epoch 6 - iter 156/262 - loss 0.20713159 - samples/sec: 41.43 - lr: 0.050000
2022-05-04 15:19:43,763 epoch 6 - iter 182/262 - loss 0.21636648 - samples/sec: 45.39 - lr: 0.050000
2022-05-04 15:19:46,326 epoch 6 - iter 208/262 - loss 0.21682485 - samples/sec: 40.61 - lr: 0.050000
2022-05-04 15:19:48,716 epoch 6 - iter 234/262 - loss 0.21491867 - samples/sec: 43.55 - lr: 0.050000
2022-05-04 15:19:51,210 epoch 6 - iter 260/262 - loss 0.21310584 - samples/sec: 41.73 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 21.28it/s]

2022-05-04 15:19:52,721 Evaluating as a multi-label problem: False
2022-05-04 15:19:52,726 DEV : loss 0.15270304679870605 - f1-score (micro avg)  0.0
2022-05-04 15:19:52,730 BAD EPOCHS (no improvement): 1
2022-05-04 15:19:52,733 ----------------------------------------------------------------------------------------------------





2022-05-04 15:19:54,938 epoch 7 - iter 26/262 - loss 0.25855455 - samples/sec: 47.22 - lr: 0.050000
2022-05-04 15:19:57,587 epoch 7 - iter 52/262 - loss 0.23541923 - samples/sec: 39.28 - lr: 0.050000
2022-05-04 15:20:00,020 epoch 7 - iter 78/262 - loss 0.22740225 - samples/sec: 42.78 - lr: 0.050000
2022-05-04 15:20:02,608 epoch 7 - iter 104/262 - loss 0.22660614 - samples/sec: 40.22 - lr: 0.050000
2022-05-04 15:20:05,076 epoch 7 - iter 130/262 - loss 0.21518701 - samples/sec: 42.18 - lr: 0.050000
2022-05-04 15:20:07,762 epoch 7 - iter 156/262 - loss 0.21259904 - samples/sec: 38.74 - lr: 0.050000
2022-05-04 15:20:10,236 epoch 7 - iter 182/262 - loss 0.20854599 - samples/sec: 42.07 - lr: 0.050000
2022-05-04 15:20:12,476 epoch 7 - iter 208/262 - loss 0.21941108 - samples/sec: 46.49 - lr: 0.050000
2022-05-04 15:20:14,797 epoch 7 - iter 234/262 - loss 0.21313670 - samples/sec: 44.85 - lr: 0.050000
2022-05-04 15:20:17,445 epoch 7 - iter 260/262 - loss 0.21412424 - samples/sec: 39.30 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 20.96it/s]

2022-05-04 15:20:19,018 Evaluating as a multi-label problem: False
2022-05-04 15:20:19,023 DEV : loss 0.14573818445205688 - f1-score (micro avg)  0.0
2022-05-04 15:20:19,028 BAD EPOCHS (no improvement): 2
2022-05-04 15:20:19,030 ----------------------------------------------------------------------------------------------------





2022-05-04 15:20:21,904 epoch 8 - iter 26/262 - loss 0.21247075 - samples/sec: 36.22 - lr: 0.050000
2022-05-04 15:20:24,412 epoch 8 - iter 52/262 - loss 0.21374248 - samples/sec: 41.50 - lr: 0.050000
2022-05-04 15:20:27,027 epoch 8 - iter 78/262 - loss 0.19672160 - samples/sec: 39.80 - lr: 0.050000
2022-05-04 15:20:29,525 epoch 8 - iter 104/262 - loss 0.19664724 - samples/sec: 41.66 - lr: 0.050000
2022-05-04 15:20:31,877 epoch 8 - iter 130/262 - loss 0.20012333 - samples/sec: 44.26 - lr: 0.050000
2022-05-04 15:20:34,318 epoch 8 - iter 156/262 - loss 0.20301133 - samples/sec: 42.64 - lr: 0.050000
2022-05-04 15:20:36,928 epoch 8 - iter 182/262 - loss 0.20675537 - samples/sec: 39.87 - lr: 0.050000
2022-05-04 15:20:39,250 epoch 8 - iter 208/262 - loss 0.20905129 - samples/sec: 44.82 - lr: 0.050000
2022-05-04 15:20:41,628 epoch 8 - iter 234/262 - loss 0.20382814 - samples/sec: 43.77 - lr: 0.050000
2022-05-04 15:20:44,109 epoch 8 - iter 260/262 - loss 0.20641349 - samples/sec: 41.95 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 21.04it/s]

2022-05-04 15:20:45,768 Evaluating as a multi-label problem: False
2022-05-04 15:20:45,773 DEV : loss 0.1505756378173828 - f1-score (micro avg)  0.0
2022-05-04 15:20:45,778 BAD EPOCHS (no improvement): 3
2022-05-04 15:20:45,781 ----------------------------------------------------------------------------------------------------





2022-05-04 15:20:48,267 epoch 9 - iter 26/262 - loss 0.16672544 - samples/sec: 41.87 - lr: 0.050000
2022-05-04 15:20:50,709 epoch 9 - iter 52/262 - loss 0.22085376 - samples/sec: 42.62 - lr: 0.050000
2022-05-04 15:20:53,331 epoch 9 - iter 78/262 - loss 0.22643759 - samples/sec: 39.70 - lr: 0.050000
2022-05-04 15:20:55,911 epoch 9 - iter 104/262 - loss 0.21694350 - samples/sec: 40.34 - lr: 0.050000
2022-05-04 15:20:58,560 epoch 9 - iter 130/262 - loss 0.20652611 - samples/sec: 39.28 - lr: 0.050000
2022-05-04 15:21:01,139 epoch 9 - iter 156/262 - loss 0.19397056 - samples/sec: 40.36 - lr: 0.050000
2022-05-04 15:21:03,581 epoch 9 - iter 182/262 - loss 0.20702496 - samples/sec: 42.61 - lr: 0.050000
2022-05-04 15:21:06,327 epoch 9 - iter 208/262 - loss 0.20730400 - samples/sec: 37.90 - lr: 0.050000
2022-05-04 15:21:08,839 epoch 9 - iter 234/262 - loss 0.20487681 - samples/sec: 41.44 - lr: 0.050000
2022-05-04 15:21:11,139 epoch 9 - iter 260/262 - loss 0.20530463 - samples/sec: 45.26 - lr: 0.

100%|██████████| 29/29 [00:01<00:00, 21.13it/s]

2022-05-04 15:21:12,653 Evaluating as a multi-label problem: False
2022-05-04 15:21:12,658 DEV : loss 0.1411881297826767 - f1-score (micro avg)  0.0
2022-05-04 15:21:12,663 Epoch     9: reducing learning rate of group 0 to 2.5000e-02.
2022-05-04 15:21:12,665 BAD EPOCHS (no improvement): 4
2022-05-04 15:21:12,667 ----------------------------------------------------------------------------------------------------





2022-05-04 15:21:15,481 epoch 10 - iter 26/262 - loss 0.17958983 - samples/sec: 36.99 - lr: 0.025000
2022-05-04 15:21:18,021 epoch 10 - iter 52/262 - loss 0.18921819 - samples/sec: 40.97 - lr: 0.025000
2022-05-04 15:21:20,254 epoch 10 - iter 78/262 - loss 0.18866043 - samples/sec: 46.62 - lr: 0.025000
2022-05-04 15:21:23,024 epoch 10 - iter 104/262 - loss 0.18026621 - samples/sec: 37.57 - lr: 0.025000
2022-05-04 15:21:25,618 epoch 10 - iter 130/262 - loss 0.17473228 - samples/sec: 40.11 - lr: 0.025000
2022-05-04 15:21:27,719 epoch 10 - iter 156/262 - loss 0.17772628 - samples/sec: 49.56 - lr: 0.025000
2022-05-04 15:21:30,377 epoch 10 - iter 182/262 - loss 0.18227537 - samples/sec: 39.15 - lr: 0.025000
2022-05-04 15:21:32,703 epoch 10 - iter 208/262 - loss 0.18994752 - samples/sec: 44.74 - lr: 0.025000
2022-05-04 15:21:35,123 epoch 10 - iter 234/262 - loss 0.19844643 - samples/sec: 43.01 - lr: 0.025000
2022-05-04 15:21:37,796 epoch 10 - iter 260/262 - loss 0.19398956 - samples/sec: 38.9

100%|██████████| 29/29 [00:01<00:00, 18.83it/s]

2022-05-04 15:21:39,479 Evaluating as a multi-label problem: False
2022-05-04 15:21:39,484 DEV : loss 0.14075566828250885 - f1-score (micro avg)  0.0
2022-05-04 15:21:39,489 BAD EPOCHS (no improvement): 1





2022-05-04 15:21:41,262 ----------------------------------------------------------------------------------------------------
2022-05-04 15:21:41,265 loading file finbert/best-model.pt
2022-05-04 15:23:10,072 SequenceTagger predicts: Dictionary with 19 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-ORG, B-ORG, E-ORG, I-ORG, S-LOC, B-LOC, E-LOC, I-LOC, S-MISC, B-MISC, E-MISC, I-MISC, <START>, <STOP>


100%|██████████| 75/75 [00:04<00:00, 15.60it/s]

2022-05-04 15:23:16,177 Evaluating as a multi-label problem: False
2022-05-04 15:23:16,184 0.6	0.594	0.597	0.455
2022-05-04 15:23:16,185 
Results:
- F-score (micro) 0.597
- F-score (macro) 0.1867
- Accuracy 0.455

By class:
              precision    recall  f1-score   support

      PERSON     0.6390    0.8985    0.7468       197
         ORG     0.0000    0.0000    0.0000        56
         LOC     0.0000    0.0000    0.0000        39
        MISC     0.0000    0.0000    0.0000         6

   micro avg     0.6000    0.5940    0.5970       298
   macro avg     0.1597    0.2246    0.1867       298
weighted avg     0.4224    0.5940    0.4937       298

2022-05-04 15:23:16,186 ----------------------------------------------------------------------------------------------------



