<a href="https://colab.research.google.com/github/Jefffish09/MachineLearning/blob/dev/NER/bert_ner_simpletransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

* https://github.com/ThilinaRajapakse/simpletransformers
* https://simpletransformers.ai/

Data Source:

* https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


In [1]:
!pip install -U -q seqeval
!pip install -U simpletransformers

[?25l[K     |███████▌                        | 10kB 22.8MB/s eta 0:00:01[K     |███████████████                 | 20kB 17.3MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 14.3MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 13.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 4.5MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/6d/ee/99e2809fb311841376fe01f3524a912b1907d7b45d445f16ad27b4422c9f/simpletransformers-0.60.9-py3-none-any.whl (206kB)
[K     |████████████████████████████████| 215kB 8.5MB/s 
Collecting tensorboardx
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |████████████████████████████████| 317kB 14.3MB/s 
[?25hCollecting wandb
[?25l  Downloading https://files.pythonhosted.o

In [1]:
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.ner import NERModel
from seqeval.metrics import classification_report, accuracy_score

In [2]:
# Random seed; passed as `random_state` to train_test_split so the train/test partition is repeatable.
seed = 2021

In [3]:
# Get the raw dataset
# curl: -s silences progress output, -L follows redirects, -O keeps the remote file name.
# unzip -o overwrites an existing ner_dataset.csv without prompting; the archive is then removed.

!curl -s -LO https://github.com/MahmooudTaha/NLP-2019/raw/master/ner_dataset.csv.zip
!unzip -o ner_dataset.csv.zip
!rm -f ner_dataset.csv.zip


class SentenceGetter(object):
    """Group a token-per-row DataFrame into per-sentence lists of (word, POS, tag) tuples.

    The frame must provide the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes:
        n_sent: number of the sentence the next `get_next()` call looks up (starts at 1).
        data: the DataFrame passed to the constructor.
        empty: flag initialised to False; nothing in this class updates it.
        grouped: `data` grouped by "Sentence #", each group collapsed to a list of tuples.
        sentences: the entries of `grouped` as a plain list, one per sentence.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._collect_tokens)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _collect_tokens(group):
        """Return the rows of one sentence group as a list of (word, POS, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the tokens of sentence number `n_sent` and advance the counter.

        Returns:
            The list of (word, POS, tag) tuples stored under the key
            "Sentence: <n_sent>", or None (counter unchanged) when no such key exists.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing sentence key means "no more sentences"; any other
            # error is a real problem and should propagate instead of being hidden.
            return None
        self.n_sent += 1
        return s

# Load the token-per-row corpus and forward-fill missing cells from the row above.
# `DataFrame.ffill()` replaces `fillna(method="ffill")`, whose `method` argument is deprecated in pandas.
# NOTE(review): presumably only the first token of each sentence carries a "Sentence #" value,
# which is why the forward fill is needed -- confirm against the CSV.
df = pd.read_csv("ner_dataset.csv", encoding="Windows-1252").ffill()

getter = SentenceGetter(df)

# Parallel lists: sentences[i] holds the words of sentence i, labels[i] the matching tags.
sentences = [[word for word, _pos, _tag in sentence] for sentence in getter.sentences]
labels = [[tag for _word, _pos, tag in sentence] for sentence in getter.sentences]
# Sorted so the label list has the same order on every run; the iteration order of a
# set of strings can differ between interpreter sessions.
tag_values = sorted(set(df["Tag"].values))

Archive:  ner_dataset.csv.zip
  inflating: ner_dataset.csv         


In [4]:
def to_token_frame(sents, sent_labels):
    """Flatten parallel sentence/label lists into a token-level DataFrame.

    Args:
        sents: list of sentences, each a list of words.
        sent_labels: list of label lists aligned element-wise with `sents`.

    Returns:
        DataFrame with columns "sentence_id", "words" and "labels", one row per
        token; "sentence_id" is the position of the sentence within `sents`.
    """
    rows = [
        (sent_id, word, label)
        for sent_id, (sent, sent_label) in enumerate(zip(sents, sent_labels))
        for word, label in zip(sent, sent_label)
    ]
    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])


# Hold out 10% of the sentences for testing; `seed` makes the split repeatable.
tr_sents, test_sents, tr_labels, test_labels = train_test_split(sentences, labels, random_state=seed, test_size=0.1)

# Both splits go through the same helper so their layouts cannot drift apart.
train_data = to_token_frame(tr_sents, tr_labels)
test_data = to_token_frame(test_sents, test_labels)

In [5]:
logging.basicConfig(level=logging.INFO)
# Keep the "transformers" logger at WARNING while the root logger stays at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Configure the model
# https://simpletransformers.ai/docs/usage/
model_args = {
    "output_dir": "outputs/",
    "best_model_dir": "outputs/best_model",
    "num_train_epochs": 100,
    "train_batch_size": 128,
    "learning_rate": 4e-5,
    "max_seq_length": 128,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 1,
    "fp16": True,
    "optimizer": "AdamW",
    "labels_list": tag_values,
    "do_lower_case": True,
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "evaluate_during_training_steps": 1000,
    "eval_batch_size": 128,
    "warmup_steps": 0,
    "warmup_ratio": 0.06,
    "weight_decay": 0,
    "use_early_stopping": True,
    "early_stopping_patience": 3,
    "early_stopping_metric": "eval_loss",
    "early_stopping_delta": 0.001,
    "early_stopping_consider_epochs": True,
    "early_stopping_metric_minimize": True,
    "logging_steps": 10,
    "reprocess_input_data": True,
    # NOTE(review): with no_save=True presumably nothing is written to best_model_dir,
    # so the final evaluation below would use the last weights, not the best ones -- confirm.
    "no_save": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "overwrite_output_dir": True,
    "use_multiprocessing": True,
    "use_multiprocessing_for_evaluation": False,
    "no_cache": True,
    # Seed the training run with the same seed as the data split; without it the
    # run is not repeatable even though the split is.
    "manual_seed": seed,
}

model = NERModel(
    # https://simpletransformers.ai/docs/ner-specifics/
    # https://huggingface.co/transformers/pretrained_models.html
    "bert",
    "bert-base-uncased",
    args=model_args,
    use_cuda=True
    )

# Train the model
# NOTE(review): test_data drives the in-training evaluation / early stopping here and is
# also the set scored in the final report below, so the reported numbers are not from
# data unseen during model selection. Consider carving a validation split out of train_data.
model.train_model(
    train_data,
    eval_data=test_data,
    show_running_loss=True
    )

# Evaluate the model
result, model_outputs, preds_list = model.eval_model(test_data)

# Regroup the gold labels into one list per sentence, the nested-list shape
# passed to classification_report / accuracy_score alongside preds_list.
test_data_new = test_data.groupby("sentence_id")["labels"].apply(list).reset_index(name="grouped_labels")
true_list = test_data_new["grouped_labels"].tolist()
report = classification_report(y_true=true_list, y_pred=preds_list, digits=2)
print(report)
print("")
print("Accuracy: {}".format(accuracy_score(y_true=true_list, y_pred=preds_list)))

INFO:filelock:Lock 140172668846352 acquired on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

INFO:filelock:Lock 140172668846352 released on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock
INFO:filelock:Lock 140172798629904 acquired on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

INFO:filelock:Lock 140172798629904 released on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

INFO:filelock:Lock 140172649782672 released on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Running Epoch 0 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.18422266919361918, 'precision': 0.6471736414740787, 'recall': 0.7390335235378032, 'f1_score': 0.6900599400599401}


Running Epoch 1 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12810308388189265, 'precision': 0.7915024091108191, 'recall': 0.8055456490727532, 'f1_score': 0.7984622862445319}


Running Epoch 2 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10971223464921902, 'precision': 0.8012837421964302, 'recall': 0.8125, 'f1_score': 0.8068528930010181}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11043327321347438, 'precision': 0.8035446609646416, 'recall': 0.8125, 'f1_score': 0.8079975174003635}


Running Epoch 3 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10193549939676334, 'precision': 0.811300827319134, 'recall': 0.8218616262482168, 'f1_score': 0.8165470812295155}


Running Epoch 4 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10317617125417057, 'precision': 0.8032545659136155, 'recall': 0.8273894436519258, 'f1_score': 0.8151433967236155}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 1
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 5 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10196255127850332, 'precision': 0.8091903719912472, 'recall': 0.8242689015691869, 'f1_score': 0.8166600415176007}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.09935791790485382, 'precision': 0.8167120639662595, 'recall': 0.8287268188302425, 'f1_score': 0.8226755764039473}


Running Epoch 6 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10462512644497972, 'precision': 0.811270047680971, 'recall': 0.8343437945791726, 'f1_score': 0.8226451584545734}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 1
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 7 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11154128297379143, 'precision': 0.8161739130434783, 'recall': 0.8368402282453637, 'f1_score': 0.8263778834301814}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 2
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 8 of 100:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12159324594234165, 'precision': 0.8147441457068517, 'recall': 0.8375534950071327, 'f1_score': 0.8259913831003254}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 3
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11559421725963291, 'precision': 0.8179036629075969, 'recall': 0.8341654778887304, 'f1_score': 0.8259545354226441}
INFO:simpletransformers.ner.ner_model: Patience of 3 steps reached
INFO:simpletransformers.ner.ner_model: Training terminated.
INFO:simpletransformers.ner.ner_model: Training of bert model complete. Saved to outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11559421725963291, 'precision': 0.8179036629075969, 'recall': 0.8341654778887304, 'f1_score': 0.8259545354226441}


              precision    recall  f1-score   support

         art       0.33      0.24      0.28        49
         eve       0.30      0.37      0.33        30
         geo       0.84      0.89      0.87      3712
         gpe       0.96      0.93      0.94      1649
         nat       0.58      0.41      0.48        17
         org       0.69      0.66      0.67      2033
         per       0.75      0.80      0.78      1699
         tim       0.87      0.88      0.87      2027

   micro avg       0.82      0.83      0.83     11216
   macro avg       0.67      0.65      0.65     11216
weighted avg       0.82      0.83      0.83     11216


Accuracy: 0.9692265434086647
