<a href="https://colab.research.google.com/github/Jefffish09/MachineLearning/blob/dev/NER/bert_ner_simpletransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

* https://github.com/ThilinaRajapakse/simpletransformers
* https://simpletransformers.ai/

Data Source:

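* https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus (the notebook itself downloads `ner_dataset.csv.zip` from a GitHub repository rather than from Kaggle directly; see the dataset cell below)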


In [1]:
!pip install -U -q seqeval
!pip install -U simpletransformers

[?25l[K     |███████▌                        | 10kB 20.9MB/s eta 0:00:01[K     |███████████████                 | 20kB 27.3MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 22.4MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 25.6MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 7.4MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/6d/ee/99e2809fb311841376fe01f3524a912b1907d7b45d445f16ad27b4422c9f/simpletransformers-0.60.9-py3-none-any.whl (206kB)
[K     |████████████████████████████████| 215kB 17.1MB/s 
Collecting transformers>=4.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 42.7MB/s 
Collecting streamlit
[?25l  Downloading https://files.pythonhost

In [1]:
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.ner import NERModel
from seqeval.metrics import classification_report

In [2]:
# Fixed random seed; passed as `random_state` to train_test_split so the
# train/test partition is the same on every run.
seed = 2021

In [3]:
# Get the raw dataset

# Download the zipped CSV: -s silences progress output, -L follows redirects,
# -O saves under the remote file name.
!curl -s -LO https://github.com/MahmooudTaha/NLP-2019/raw/master/ner_dataset.csv.zip
# Extract ner_dataset.csv (-o overwrites an existing copy without prompting),
# then remove the archive (-f: no error if it is already gone).
!unzip -o ner_dataset.csv.zip
!rm -f ner_dataset.csv.zip


class SentenceGetter(object):
    """Group a token-level table into per-sentence lists of (word, POS, tag).

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token, with the columns "Sentence #", "Word", "POS"
        and "Tag".

    Attributes
    ----------
    n_sent : int
        1-based number of the sentence that `get_next` will return next.
    data : pandas.DataFrame
        The frame passed to the constructor.
    grouped : pandas.Series
        Result of grouping `data` by "Sentence #"; each value is a list of
        (word, POS, tag) tuples.
    sentences : list
        The values of `grouped` as a plain list, in the group order that
        `groupby` produces (sorted by the "Sentence #" key).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False  # not used by this class; kept for compatibility

        def collect_tokens(group):
            # One (word, POS, tag) tuple per token row of the sentence.
            return list(zip(group["Word"].values.tolist(),
                            group["POS"].values.tolist(),
                            group["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(collect_tokens)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the token list for "Sentence: <n_sent>" and advance the counter.

        Returns None, leaving the counter unchanged, when no sentence with
        that label exists.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other
            # exception (including KeyboardInterrupt) now propagates.
            return None
        self.n_sent += 1
        return s

# Load the token-level corpus and forward-fill missing cells.
# NOTE(review): presumably "Sentence #" is only populated on the first token of
# each sentence, which is why a forward fill is needed -- confirm against the CSV.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
df = pd.read_csv("ner_dataset.csv", encoding="Windows-1252").ffill()

getter = SentenceGetter(df)

# Each sentence is a list of (word, POS, tag) tuples: keep words and tags as
# two parallel lists of lists.
sentences = [[token[0] for token in sentence] for sentence in getter.sentences]
labels = [[token[2] for token in sentence] for sentence in getter.sentences]
# Sorted so the label list has the same order on every kernel restart
# (iteration order of a set of strings is not stable between processes).
tag_values = sorted(set(df["Tag"].values))

Archive:  ner_dataset.csv.zip
  inflating: ner_dataset.csv         


In [4]:
def _to_token_frame(sents, sent_labels):
    """Flatten parallel sentence/label lists into one row per token.

    Parameters
    ----------
    sents : list of list of str
        Words of each sentence.
    sent_labels : list of list of str
        Tags of each sentence, aligned with `sents`.

    Returns
    -------
    pandas.DataFrame
        Columns "sentence_id" (0-based position of the sentence in `sents`),
        "words" and "labels".
    """
    rows = [
        (sent_id, word, tag)
        for sent_id, (sent, tags) in enumerate(zip(sents, sent_labels))
        for word, tag in zip(sent, tags)
    ]
    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])


# Hold out 10% of the sentences; `seed` keeps the partition repeatable.
tr_sents, test_sents, tr_labels, test_labels = train_test_split(
    sentences, labels, random_state=seed, test_size=0.1
)

train_data = _to_token_frame(tr_sents, tr_labels)
test_data = _to_token_frame(test_sents, test_labels)

In [5]:
# Show INFO logs in general, but keep the transformers logger at WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Configure the model
# https://simpletransformers.ai/docs/usage/
model_args = {
    "output_dir": "outputs/",
    "best_model_dir": "outputs/best_model",
    # Seed the training run with the same seed used for the data split, so
    # that repeated runs are comparable.
    "manual_seed": seed,
    # Upper bound only; early stopping (below) is expected to end training sooner.
    "num_train_epochs": 100,
    "train_batch_size": 128,
    "learning_rate": 4e-5,
    "max_seq_length": 128,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 1,
    "fp16": True,
    "optimizer": "AdamW",
    "labels_list": tag_values,
    "do_lower_case": True,
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "evaluate_during_training_steps": 1000,
    "eval_batch_size": 128,
    # NOTE(review): presumably a warmup_steps of 0 lets warmup_ratio decide the
    # warmup length -- confirm against the simpletransformers docs.
    "warmup_steps": 0,
    "warmup_ratio": 0.06,
    "weight_decay": 0,
    "use_early_stopping": True,
    "early_stopping_patience": 3,
    "early_stopping_metric": "eval_loss",
    "early_stopping_delta": 0.001,
    "early_stopping_consider_epochs": True,
    "early_stopping_metric_minimize": True,
    "logging_steps": 10,
    "reprocess_input_data": True,
    # NOTE(review): with no_save enabled, presumably nothing is written to
    # best_model_dir, so the final evaluation below uses the weights from the
    # last training step rather than the best checkpoint -- confirm.
    "no_save": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "overwrite_output_dir": True,
    "use_multiprocessing": True,
    "use_multiprocessing_for_evaluation": False,
    "no_cache": True
}

model = NERModel(
    # https://simpletransformers.ai/docs/ner-specifics/
    # https://huggingface.co/transformers/pretrained_models.html
    "bert",
    "bert-base-uncased",
    args=model_args,
    use_cuda=True
    )

# Train the model
# NOTE(review): test_data doubles as the evaluation set that drives early
# stopping, so the report printed at the end is not measured on data unseen
# during model selection. Consider a separate validation split.
model.train_model(
    train_data,
    eval_data=test_data,
    show_running_loss=True
    )

# Evaluate the model
result, model_outputs, preds_list = model.eval_model(
    test_data,
    )

# Rebuild the per-sentence gold label lists (grouped by sentence_id) and score
# the predictions against them with seqeval's entity-level report.
test_data_new = test_data.groupby("sentence_id")["labels"].apply(list).reset_index(name="grouped_labels")
true_list = test_data_new["grouped_labels"].tolist()
report = classification_report(y_true=true_list, y_pred=preds_list, digits=2)
print(report)

INFO:filelock:Lock 140660220264720 acquired on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

INFO:filelock:Lock 140660220264720 released on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock
INFO:filelock:Lock 140660076169168 acquired on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

INFO:filelock:Lock 140660076169168 released on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

INFO:filelock:Lock 140660071514704 released on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Running Epoch 0 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.17628910863085798, 'precision': 0.7004254609159923, 'recall': 0.7485734664764622, 'f1_score': 0.7236995216135844}


Running Epoch 1 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1312605615117048, 'precision': 0.7779024600776867, 'recall': 0.8034950071326676, 'f1_score': 0.7904916451032851}


Running Epoch 2 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10979211840190385, 'precision': 0.8022574153469245, 'recall': 0.8174928673323824, 'f1_score': 0.8098034886288364}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11281444563677437, 'precision': 0.7918939984411536, 'recall': 0.8152639087018545, 'f1_score': 0.8034090409875675}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 1
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 3 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10183607669253099, 'precision': 0.8048567435359888, 'recall': 0.8215049928673324, 'f1_score': 0.8130956583127427}


Running Epoch 4 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10457245064409156, 'precision': 0.8056426332288401, 'recall': 0.8248930099857347, 'f1_score': 0.8151541850220264}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 1
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 5 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.09909482496349435, 'precision': 0.8218416173744151, 'recall': 0.8299750356633381, 'f1_score': 0.8258883023554983}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10192971127597909, 'precision': 0.8143749450162752, 'recall': 0.8253388017118403, 'f1_score': 0.8198202187486163}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 1
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 6 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10433638252710041, 'precision': 0.8174938401971137, 'recall': 0.828281027104137, 'f1_score': 0.8228520814880426}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 2
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 7 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11243889559256404, 'precision': 0.8059187887130076, 'recall': 0.8352353780313837, 'f1_score': 0.8203152364273204}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 3
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 8 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11404411769226978, 'precision': 0.8261711870376893, 'recall': 0.8364835948644793, 'f1_score': 0.8312954102427786}
INFO:simpletransformers.ner.ner_model: Patience of 3 steps reached
INFO:simpletransformers.ner.ner_model: Training terminated.
INFO:simpletransformers.ner.ner_model: Training of bert model complete. Saved to outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11404411769226978, 'precision': 0.8261711870376893, 'recall': 0.8364835948644793, 'f1_score': 0.8312954102427786}


              precision    recall  f1-score   support

         art       0.38      0.27      0.31        49
         eve       0.33      0.33      0.33        30
         geo       0.84      0.90      0.87      3712
         gpe       0.96      0.92      0.94      1649
         nat       0.36      0.53      0.43        17
         org       0.70      0.66      0.68      2033
         per       0.79      0.80      0.79      1699
         tim       0.87      0.88      0.87      2027

   micro avg       0.83      0.84      0.83     11216
   macro avg       0.65      0.66      0.65     11216
weighted avg       0.83      0.84      0.83     11216

