<a href="https://colab.research.google.com/github/Krojan/NER-using-BERT/blob/master/NER_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers
!pip install seqeval

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/542.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m358.4/542.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
import pandas as pd
import numpy as np
import datasets
import csv

from sklearn import preprocessing
from transformers import BertTokenizerFast

In [None]:
# Configuration shared by the cells below.

# MAX_LEN = 128  (disabled; no cell shown in this notebook references MAX_LEN)
# Number of training epochs; also multiplies the step count of the LR schedule in train_and_save.
EPOCHS = 2
# Checkpoint name used for both the tokenizer and the pretrained model weights.
MODEL_CHECKPOINT = 'bert-base-uncased'
# Target of model.save_pretrained() and source for pipeline(model=...).
# NOTE(review): presumably treated as a directory despite the ".bin" suffix - confirm.
MODEL_PATH = "model.bin"
# NOTE(review): not referenced by any cell visible here; the data is loaded with load_dataset('conll2003').
TRAINING_FILE_PATH = 'ner_dataset_sm.csv'
# Single shared fast tokenizer, loaded once for MODEL_CHECKPOINT and reused by the functions below.
TOKENIZER = BertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Expand the word-level labels to one label per token, using the tokenizer's word ids.
def align_labels_with_tokens(labels, word_ids):
    """Produce one label per token from word-level labels.

    Args:
        labels: Sequence of integer labels, indexed by word position.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``:
        ``-100`` where the word id is ``None``; the word's label for the
        first token of a word; and, for following tokens of the same word,
        the word's label bumped by one when it is odd (B-XXX -> I-XXX).
    """
    aligned = []
    previous = None
    for wid in word_ids:
        if wid is None:
            # Token without a source word: mark it with the -100 sentinel.
            aligned.append(-100)
        elif wid != previous:
            # First token of a new word keeps the word's own label.
            aligned.append(labels[wid])
        else:
            # Continuation of the same word: odd ids are B- tags, and the
            # matching I- tag is the next id.
            tag = labels[wid]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous = wid

    return aligned


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (one word list per
            sentence) and a ``"ner_tags"`` entry (one label list per sentence).

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry
        holding, for each sentence, the labels produced by
        ``align_labels_with_tokens`` for that sentence's word ids.
    """
    encoded = TOKENIZER(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) maps every token of sentence `idx` back to its word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Batch the tokenized splits, padding inputs and labels so that all sentences within a batch have the same length.
from transformers import DataCollatorForTokenClassification


def apply_padding(tokenized_datasets):
  """Turn the tokenized splits into batched TensorFlow datasets.

  Args:
    tokenized_datasets: Mapping with "train" and "test" splits whose items
      expose ``to_tf_dataset`` and carry the columns ``input_ids``,
      ``token_type_ids``, ``attention_mask`` and ``labels``.

  Returns:
    A ``(tf_train_data, tf_eval_data)`` tuple. Both use batches of 16 and the
    token-classification data collator; only the training data is shuffled.
  """
  data_collator = DataCollatorForTokenClassification(
      tokenizer=TOKENIZER, return_tensors="np"
  )
  feature_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

  def _to_tf(split, shuffle):
    # Shared conversion so both splits are guaranteed to use the same
    # columns, collator and batch size.
    return tokenized_datasets[split].to_tf_dataset(
        columns=feature_columns,
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=16,
    )

  tf_train_data = _to_tf("train", shuffle=True)

  # Evaluation does not benefit from shuffling; a fixed order keeps
  # evaluation batches reproducible between runs.
  # NOTE(review): the "test" split is used for evaluation although the dataset
  # also generates a "validation" split - confirm this is intended.
  tf_eval_data = _to_tf("test", shuffle=False)

  return tf_train_data, tf_eval_data


In [None]:
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification


def train_and_save(id2label, label2id, tf_train_data, tf_eval_data):
  """Fine-tune the pretrained checkpoint for token classification and save it.

  Args:
    id2label: Mapping from label id to label name, stored in the model config.
    label2id: Mapping from label name to label id, stored in the model config.
    tf_train_data: Batched training dataset; its length (number of batches)
      determines the length of the learning-rate schedule.
    tf_eval_data: Batched dataset passed to ``fit`` as validation data.

  Returns:
    The trained model. Earlier versions returned nothing, which left the
    in-memory model unreachable for follow-up evaluation; callers that ignore
    the return value are unaffected.

  Side effects:
    Saves the model to ``MODEL_PATH`` and the tokenizer to ``'tokenizer'``.
  """
  model = TFAutoModelForTokenClassification.from_pretrained(
      MODEL_CHECKPOINT,
      id2label=id2label,
      label2id=label2id,
  )

  # One optimizer step per batch, repeated for every epoch.
  num_train_steps = len(tf_train_data) * EPOCHS

  # create_optimizer also returns the schedule object; only the optimizer is
  # needed here.
  optimizer, _ = create_optimizer(
      init_lr=2e-5,
      num_warmup_steps=0,
      num_train_steps=num_train_steps,
      weight_decay_rate=0.01,
  )

  # No loss is passed to compile().
  # NOTE(review): presumably the model falls back to its built-in loss - confirm.
  model.compile(optimizer=optimizer)

  model.fit(
      tf_train_data,
      validation_data=tf_eval_data,
      epochs=EPOCHS,
  )

  model.save_pretrained(MODEL_PATH)
  TOKENIZER.save_pretrained('tokenizer')

  return model

In [None]:
def compute_metrics(classifier=None, eval_data=None, label_map=None, scorer=None):
  """Score a token-classification model on an evaluation dataset.

  Every argument is optional. When an argument is omitted, the notebook-level
  name the original zero-argument version read is used instead, so a bare
  ``compute_metrics()`` call keeps working wherever those names are defined.

  Args:
    classifier: Object with ``predict_on_batch(batch)`` returning a mapping
      with a ``"logits"`` entry. Defaults to the global ``model``.
    eval_data: Iterable of batches, each a mapping with a ``"labels"`` entry.
      Defaults to the global ``tf_eval_data``.
    label_map: Mapping from integer label id to label name. Defaults to the
      global ``id2label``.
    scorer: Object with ``compute(predictions=..., references=...)`` returning
      the ``overall_*`` scores. Defaults to the global ``metric``.

  Returns:
    Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
    the scorer's ``overall_*`` results.

  Raises:
    NameError: If an argument is omitted and its notebook-level fallback has
      not been defined.
  """
  if classifier is None:
    classifier = model
  if eval_data is None:
    eval_data = tf_eval_data
  if label_map is None:
    label_map = id2label
  if scorer is None:
    scorer = metric

  all_predictions = []
  all_labels = []
  for batch in eval_data:
    logits = classifier.predict_on_batch(batch)["logits"]
    predictions = np.argmax(logits, axis=-1)
    # The labels may arrive as a framework tensor; converting to NumPy yields
    # plain integer scalars that are safe to compare and to use as dict keys.
    labels = np.asarray(batch["labels"])

    for prediction, label in zip(predictions, labels):
      for predicted_idx, label_idx in zip(prediction, label):
        # -100 marks positions that carry no label; they are skipped.
        if label_idx == -100:
          continue
        all_predictions.append(label_map[int(predicted_idx)])
        all_labels.append(label_map[int(label_idx)])

  # All tokens are passed to the scorer as one single flattened sequence.
  results = scorer.compute(predictions=[all_predictions], references=[all_labels])
  return {
      "precision": results["overall_precision"],
      "recall": results["overall_recall"],
      "f1": results["overall_f1"],
      "accuracy": results["overall_accuracy"],
  }

In [None]:
#predict

from transformers import pipeline

def predict(text):
  """Run the saved token-classification model on ``text``.

  Args:
    text: Input passed straight to the token-classification pipeline.

  Returns:
    Whatever the pipeline returns for ``text``, built with
    ``aggregation_strategy="simple"``.
  """
  # The tokenizer is passed explicitly: training saves only the model to
  # MODEL_PATH (the tokenizer goes to a separate 'tokenizer' location), so the
  # pipeline cannot be relied on to find tokenizer files under MODEL_PATH.
  token_classifier = pipeline(
      "token-classification",
      model=MODEL_PATH,
      tokenizer=TOKENIZER,
      aggregation_strategy="simple",
  )

  return token_classifier(text)

In [None]:
#load dataset
# Load the dataset and derive the tag <-> id lookup tables from its
# "ner_tags" feature.
data = datasets.load_dataset('conll2003')
label_names = data["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Tokenize every split in batches; the original columns are dropped so only
# the tokenizer outputs and the aligned labels remain.
tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data['train'].column_names,
)

# Batch, train, save, then try the saved model on a sample sentence.
tf_train_data, tf_eval_data = apply_padding(tokenized_datasets)
train_and_save(
    id2label=id2label,
    label2id=label2id,
    tf_train_data=tf_train_data,
    tf_eval_data=tf_eval_data,
)
predict("My name is Rojan, from ktm, Nepal")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/2
122/878 [===>..........................] - ETA: 1:58:18 - loss: 0.0479

In [None]:
import tensorflow as tf
# Ask TensorFlow to run functions eagerly.
# NOTE(review): presumably a workaround for the "for/else statement not yet
# supported" messages printed during training - confirm. This cell sits after
# the training cell, so on a top-to-bottom run it takes effect only for code
# executed afterwards.
tf.config.run_functions_eagerly(True)


In [None]:
# Rebuild the tag lookup tables from the dataset features, then print the
# name list and both mappings, one per line, for inspection.
label_names = data["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

print(label_names, id2label, label2id, sep="\n")

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}


In [None]:
# Take one training sentence (already split into words) and show its word
# count together with one of its words.
sentence = data["train"][5]["tokens"]
print(len(sentence), sentence[11])

# Encode the pre-split words. The encoding is longer than the word list:
# some words are split into several tokens, and the word ids are None at
# both ends.
tokenized_sentence = TOKENIZER(sentence, is_split_into_words=True)

print(tokenized_sentence)
print(len(tokenized_sentence.tokens()))

# word_ids() maps each token back to the index of the word it came from.
print(tokenized_sentence.word_ids())

#align labels with tokens, add -100 for corresponding CLS,SEP tags



33 n't
{'input_ids': [101, 1000, 2057, 2079, 1050, 1005, 1056, 2490, 2151, 2107, 12832, 2138, 2057, 2079, 1050, 1005, 1056, 2156, 2151, 5286, 2005, 2009, 1010, 1000, 1996, 3222, 1005, 1055, 2708, 14056, 24794, 2271, 3158, 4315, 14674, 2409, 1037, 2739, 27918, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
41
[None, 0, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 21, 22, 23, 24, 24, 25, 26, 27, 28, 29, 30, 31, 32, None]


In [None]:
# Shell out to the NVIDIA CLI to report the GPU(s) visible to this runtime.
!nvidia-smi


