<a href="https://colab.research.google.com/github/Jefffish09/MachineLearning/blob/main/NER/bert/bert_simpletransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Get the raw dataset
# curl flags: -f  fail on HTTP errors instead of saving the error page as the zip,
#             -sS stay quiet but still print errors,
#             -L  follow redirects,
#             -O  keep the remote file name.

!curl -fsSLO https://github.com/MahmooudTaha/NLP-2019/raw/master/ner_dataset.csv.zip
!unzip -o ner_dataset.csv.zip
!rm -f ner_dataset.csv.zip


import pandas as pd
from sklearn.model_selection import train_test_split


class SentenceGetter(object):
    """Group a one-token-per-row frame into sentences of ``(word, pos, tag)`` tuples.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``; rows sharing the same ``"Sentence #"`` value form one sentence.

    Attributes:
        n_sent: 1-based running number of the sentence ``get_next`` returns next.
        data: the frame passed to the constructor (stored as-is).
        empty: always ``False``; not read by this class, kept for compatibility.
        grouped: Series indexed by ``"Sentence #"`` whose values are lists of
            ``(word, pos, tag)`` tuples.
        sentences: the values of ``grouped`` as a plain list, in groupby key
            order (string order, so "Sentence: 10" sorts before "Sentence: 2").
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per token row of the sentence.
            return list(zip(s["Word"].tolist(),
                            s["POS"].tolist(),
                            s["Tag"].tolist()))

        # Select the token columns explicitly so apply() does not operate on the
        # grouping column (that implicit behaviour is deprecated in recent pandas).
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "POS", "Tag"]].apply(agg_func)
        )
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence labelled ``"Sentence: <n_sent>"`` and advance the counter.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the current sentence, or
            ``None`` when no sentence with that label exists (counter unchanged).
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and must not be silently swallowed.
            return None
        self.n_sent += 1
        return s

def _to_token_frame(sents, sent_labels):
    """Flatten parallel sentence/label lists into a one-token-per-row DataFrame.

    Args:
        sents: list of sentences, each a list of words.
        sent_labels: list of label lists, aligned with ``sents``.

    Returns:
        DataFrame with columns ``sentence_id`` (position of the sentence in
        ``sents``), ``words`` and ``labels``.
    """
    rows = [
        [sentence_id, word, tag]
        for sentence_id, (sent, tags) in enumerate(zip(sents, sent_labels))
        for word, tag in zip(sent, tags)
    ]
    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])


# The download cell unpacks the CSV into the current working directory, so read
# it from there instead of a hard-coded absolute path.
# .ffill() replaces the deprecated fillna(method="ffill"); it carries the last
# non-null value of every column (including "Sentence #") down onto the
# following rows.
df = pd.read_csv("ner_dataset.csv", encoding="Windows-1252").ffill()

getter = SentenceGetter(df)

# Each sentence is a list of (word, pos, tag) tuples: keep words and tags.
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
labels = [[s[2] for s in sentence] for sentence in getter.sentences]

tr_sents, eval_sents, tr_labels, eval_labels = train_test_split(sentences, labels, random_state=2018, test_size=0.1)

train_data = _to_token_frame(tr_sents, tr_labels)
eval_data = _to_token_frame(eval_sents, eval_labels)

Archive:  ner_dataset.csv.zip
  inflating: ner_dataset.csv         


In [1]:
# Install/upgrade the notebook's dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the download progress bars out of the saved output.
# NOTE(review): versions are unpinned; the recorded run resolved pandas 1.2.2 and
# tqdm 4.57.0 (and pip reported a conflict with google-colab's pandas~=1.1.0
# requirement) -- consider pinning for reproducibility.
# NOTE(review): this cell sits after the cell that already imports pandas, so an
# upgraded pandas presumably only takes effect after a runtime restart --
# consider moving this cell to the top of the notebook.
%pip install -q -U pandas
%pip install -q -U tqdm
%pip install -q -U seqeval
%pip install -q -U simpletransformers


Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/4c/33/87b15a5baeeb71bd677da3579f907e97476c5247c0e56a37079843af5424/pandas-1.2.2-cp37-cp37m-manylinux1_x86_64.whl (9.9MB)
[K     |████████████████████████████████| 9.9MB 19.1MB/s 
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.1.0; python_version >= "3.0", but you'll have pandas 1.2.2 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
Successfully installed pandas-1.2.2


Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/d9/13/f3f815bb73804a8af9cfbb6f084821c037109108885f46131045e8cf044e/tqdm-4.57.0-py2.py3-none-any.whl (72kB)
[K     |████▌                           | 10kB 23.5MB/s eta 0:00:01[K     |█████████                       | 20kB 30.9MB/s eta 0:00:01[K     |█████████████▌                  | 30kB 20.9MB/s eta 0:00:01[K     |██████████████████              | 40kB 24.3MB/s eta 0:00:01[K     |██████████████████████▌         | 51kB 24.4MB/s eta 0:00:01[K     |███████████████████████████     | 61kB 26.9MB/s eta 0:00:01[K     |███████████████████████████████▍| 71kB 24.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 10.4MB/s 
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.41.1
    Uninstalling tqdm-4.41.1:
      Successfully uninstalled tqdm-4.41.1
Successfully installed tqdm-4.57.0
Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/

In [2]:
import logging
from simpletransformers.ner import NERModel
from seqeval.metrics import classification_report

# Show INFO logs from simpletransformers but only warnings from transformers.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Configure the model
# https://simpletransformers.ai/docs/usage/
model_args = {
    # Output locations
    "output_dir": "outputs/",
    "best_model_dir": "outputs/best_model",
    # Optimisation
    "num_train_epochs": 10,
    "train_batch_size": 128,
    "learning_rate": 4e-5,
    "max_seq_length": 128,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 1,
    "fp16": True,
    "optimizer": "AdamW",
    # Tag set of the dataset
    "labels_list": ["O", "B-geo", "B-gpe", "B-tim", "B-org", "I-geo", "B-per", "I-per", "I-org", "I-tim", "B-art", "I-art", "B-nat", "I-gpe", "I-nat", "B-eve", "I-eve"],
    "do_lower_case": True,
    # Evaluation during training
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "evaluate_during_training_steps": 1000,
    "eval_batch_size": 128,
    "warmup_steps": 0,
    "warmup_ratio": 0.06,
    "weight_decay": 0,
    # Early stopping on eval_loss
    "use_early_stopping": True,
    "early_stopping_patience": 3,
    "early_stopping_metric": "eval_loss",
    "early_stopping_delta": 0.01,
    "early_stopping_consider_epochs": True,
    "early_stopping_metric_minimize": True,
    "logging_steps": 10,
    # Caching / checkpointing
    "reprocess_input_data": True,
    # NOTE(review): with no_save=True nothing appears to be written to
    # best_model_dir, so the final eval_model call below scores the last-step
    # weights (eval_loss 0.1319 in the recorded run) rather than the best
    # evaluation seen during training (eval_loss 0.1041) -- confirm, and
    # consider saving and reloading the best model before the final evaluation.
    "no_save": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "overwrite_output_dir": True,
    "use_multiprocessing": True,
    "use_multiprocessing_for_evaluation": False,
    "no_cache": True,
    # Fix the RNG seed so training runs are reproducible; same value as the
    # random_state used for the train/eval split.
    "manual_seed": 2018,
}

model = NERModel(
    # https://simpletransformers.ai/docs/ner-specifics/
    # https://huggingface.co/transformers/pretrained_models.html
    "bert",
    "bert-base-uncased",
    args=model_args,
    use_cuda=True
    )

# Train the model
model.train_model(
    train_data,
    eval_data=eval_data
    )

# Evaluate the model; the seqeval classification report is returned under the
# "acc" key of `result` alongside the default metrics.
result, model_outputs, preds_list = model.eval_model(
    eval_data,
    acc=classification_report
    )

# Print one metric per line so the multi-line classification report is readable
# instead of appearing as an escaped string inside the dict repr.
for metric_name, metric_value in result.items():
    print("{} = {}".format(metric_name, metric_value))

INFO:filelock:Lock 139840446922640 acquired on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

INFO:filelock:Lock 139840446922640 released on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170.lock
INFO:filelock:Lock 139840442206416 acquired on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

INFO:filelock:Lock 139840442206416 released on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

INFO:filelock:Lock 139840442358608 released on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12184307018392965, 'precision': 0.7997734599634051, 'recall': 0.8080816973325117, 'f1_score': 0.803906113154668}


Running Epoch 1 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10999647959282524, 'precision': 0.8100859300407951, 'recall': 0.8216392288053526, 'f1_score': 0.8158216783216783}


Running Epoch 2 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10452365894850932, 'precision': 0.8279049295774648, 'recall': 0.827977814948499, 'f1_score': 0.8279413706589199}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10406274368104182, 'precision': 0.8266584333098095, 'recall': 0.8249845937142354, 'f1_score': 0.8258206653447897}


Running Epoch 3 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10954253355923452, 'precision': 0.8148212275993421, 'recall': 0.8285940663790826, 'f1_score': 0.8216499345264076}


Running Epoch 4 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10905046682608754, 'precision': 0.8238961038961039, 'recall': 0.837749801919183, 'f1_score': 0.8307652014492121}


Running Epoch 5 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12041746530877917, 'precision': 0.8160810926896315, 'recall': 0.836341227220706, 'f1_score': 0.8260869565217392}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 1
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12038034514376991, 'precision': 0.8272552783109405, 'recall': 0.8347565806849194, 'f1_score': 0.8309890013583979}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 2
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 6 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12418072298169136, 'precision': 0.8208102493074793, 'recall': 0.8347565806849194, 'f1_score': 0.8277246737375059}
INFO:simpletransformers.ner.ner_model: No improvement in eval_loss
INFO:simpletransformers.ner.ner_model: Current step: 3
INFO:simpletransformers.ner.ner_model: Early stopping patience: 3


Running Epoch 7 of 10:   0%|          | 0/338 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13190018522896266, 'precision': 0.8277840563331305, 'recall': 0.8382780174311119, 'f1_score': 0.8329979879275653}
INFO:simpletransformers.ner.ner_model: Patience of 3 steps reached
INFO:simpletransformers.ner.ner_model: Training terminated.
INFO:simpletransformers.ner.ner_model: Training of bert model complete. Saved to outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/4796 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13190018522896266, 'precision': 0.8277840563331305, 'recall': 0.8382780174311119, 'f1_score': 0.8329979879275653, 'acc': '              precision    recall  f1-score   support\n\n         art       0.31      0.20      0.25        49\n         eve       0.39      0.42      0.41        33\n         geo       0.85      0.90      0.87      3735\n         gpe       0.96      0.95      0.95      1596\n         nat       0.44      0.35      0.39        23\n         org       0.70      0.67      0.68      2071\n         per       0.76      0.80      0.78      1694\n         tim       0.88      0.87      0.88      2158\n\n   micro avg       0.83      0.84      0.83     11359\n   macro avg       0.66      0.65      0.65     11359\nweighted avg       0.83      0.84      0.83     11359\n'}


{'eval_loss': 0.13190018522896266, 'precision': 0.8277840563331305, 'recall': 0.8382780174311119, 'f1_score': 0.8329979879275653, 'acc': '              precision    recall  f1-score   support\n\n         art       0.31      0.20      0.25        49\n         eve       0.39      0.42      0.41        33\n         geo       0.85      0.90      0.87      3735\n         gpe       0.96      0.95      0.95      1596\n         nat       0.44      0.35      0.39        23\n         org       0.70      0.67      0.68      2071\n         per       0.76      0.80      0.78      1694\n         tim       0.88      0.87      0.88      2158\n\n   micro avg       0.83      0.84      0.83     11359\n   macro avg       0.66      0.65      0.65     11359\nweighted avg       0.83      0.84      0.83     11359\n'}


acc =               precision    recall  f1-score   support

         art       0.31      0.20      0.25        49
         eve       0.39      0.42      0.41        33
         geo       0.85      0.90      0.87      3735
         gpe       0.96      0.95      0.95      1596
         nat       0.44      0.35      0.39        23
         org       0.70      0.67      0.68      2071
         per       0.76      0.80      0.78      1694
         tim       0.88      0.87      0.88      2158

   micro avg       0.83      0.84      0.83     11359
   macro avg       0.66      0.65      0.65     11359
weighted avg       0.83      0.84      0.83     11359

eval_loss = 0.13190018522896266
f1_score = 0.8329979879275653
precision = 0.8277840563331305
recall = 0.8382780174311119
