# NER implementation using transformers



First, we install the Hugging Face `datasets` library, which is used to download the dataset.

In [2]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

Import libraries

In [3]:
import os
import keras
from keras import ops
import numpy as np
import tensorflow as tf
from keras import layers
from datasets import load_dataset
from collections import Counter

## Load the dataset from the Hugging Face Hub and process it

Define functions

In [4]:
def export_to_file(export_file_path, data):
    """Write NER records to a tab-separated text file, one record per line.

    Each line holds, separated by tab characters: the number of tokens, the
    tokens themselves, and then the tags converted with ``str``. Records whose
    ``tokens`` sequence is empty are skipped.

    Args:
        export_file_path: Path of the file to create (or overwrite).
        data: Iterable of mappings, each providing a ``"tokens"`` sequence of
            strings and a ``"tags"`` sequence.
    """
    # The encoding is pinned to UTF-8 instead of the platform default so that
    # non-ASCII tokens are written as the same bytes on every platform.
    with open(export_file_path, "w", encoding="utf-8") as f:
        for record in data:
            ner_tags = record["tags"]
            tokens = record["tokens"]
            if len(tokens) > 0:
                # Length prefix first, so a reader knows where tokens end
                # and tags begin.
                token_part = "\t".join([str(len(tokens)), *tokens])
                tag_part = "\t".join(map(str, ner_tags))
                f.write(token_part + "\t" + tag_part + "\n")

def make_tag_lookup_table():
    """Return a dict mapping integer tag ids to their label strings.

    Id 0 maps to "[PAD]", id 1 to "O", and the following ids to the entity
    labels in the order they are listed below.
    """
    entity_labels = ["1"]
    ordered_labels = ["[PAD]", "O", *entity_labels]
    return dict(enumerate(ordered_labels))

Download the dataset (only the first 10% of the train and test splits)

In [5]:
# Request only the first 10% of each split. The later cells index the result
# as ds[0] for the train slice and ds[1] for the test slice.
ds = load_dataset(
    "Gepe55o/mountain-ner-dataset",
    split=["train[:10%]", "test[:10%]"],
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/27.5M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/6.86M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/88619 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22110 [00:00<?, ? examples/s]

Process dataset, retrieve tokens and vocabulary

In [6]:

# Dump both splits to tab-separated text files that tf.data can stream.
os.makedirs("data", exist_ok=True)
export_to_file("./data/train.txt", ds[0])
export_to_file("./data/test.txt", ds[1])

mapping = make_tag_lookup_table()

# Flatten the per-sentence token lists in a single linear pass.
# (`sum(list_of_lists, [])` re-copies the growing list on every addition,
# which is quadratic in the number of tokens.)
all_tokens = [token for sentence in ds[0]["tokens"] for token in sentence]

all_tokens_array = np.array(list(map(str.lower, all_tokens)))

# Frequency of every lowercased training token.
counter = Counter(all_tokens_array)

num_tags = len(mapping)
vocab_size = 20000

# Keep the (vocab_size - 1) most frequent tokens for the lookup layer.
vocabulary = [token for token, count in counter.most_common(vocab_size - 1)]

lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)

# Line-by-line readers over the files exported above.
train_data = tf.data.TextLineDataset("./data/train.txt")
test_data = tf.data.TextLineDataset("./data/test.txt")




## Describe model

Define TransformerBlock layer

In [7]:

class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.

    Args:
        embed_dim: Used as the attention ``key_dim`` and as the output width
            of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) feed-forward layer.
        rate: Dropout rate shared by both dropout layers.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to ``inputs``; ``training`` toggles the dropout layers."""
        # Self-attention: the same tensor is passed as query and value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        attention_residual = self.layernorm1(inputs + attended)
        # Position-wise feed-forward sub-layer with its own residual path.
        transformed = self.dropout2(self.ffn(attention_residual), training=training)
        return self.layernorm2(attention_residual + transformed)


Define TokenAndPositionEmbedding layer:

In [8]:

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token-id embedding and a position-index embedding.

    Args:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Output size of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position along the last axis."""
        # Positions run 0 .. (size of the last axis of inputs) - 1.
        seq_len = ops.shape(inputs)[-1]
        position_ids = ops.arange(start=0, stop=seq_len, step=1)
        position_vectors = self.pos_emb(position_ids)
        return self.token_emb(inputs) + position_vectors


## Build the NER model class as a keras.Model subclass

In [9]:

class NERModel(keras.Model):
    """Token classifier: embeddings -> Transformer block -> two Dense layers.

    The final Dense layer has ``num_tags`` units with a softmax activation.

    Args:
        num_tags: Number of units in the final softmax layer.
        vocab_size: Forwarded to the token embedding as its input dimension.
        maxlen: Forwarded to the position embedding as its input dimension.
        embed_dim: Embedding size, also passed to the Transformer block.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Width of the Transformer feed-forward layer and of the
            intermediate Dense layer.
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=1024, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(
            maxlen, vocab_size, embed_dim
        )
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        """Run the layers in sequence; ``training`` is passed to both dropout layers."""
        hidden = self.transformer_block(self.embedding_layer(inputs))
        hidden = self.ff(self.dropout1(hidden, training=training))
        hidden = self.dropout2(hidden, training=training)
        return self.ff_final(hidden)


Define functions for matching tokens with vocabulary

In [10]:
def map_record_to_training_data(record):
    """Parse one exported text line into a ``(tokens, tags)`` pair.

    The line is split on tabs; the first field is the token count, the next
    ``count`` fields are the tokens and the remaining fields are the tags.

    Args:
        record: A scalar string tensor holding one line of the exported file.

    Returns:
        A tuple ``(tokens, tags)``: the token strings and the tags parsed as
        int64 and incremented by one.
    """
    fields = tf.strings.split(record, sep="\t")
    token_count = tf.strings.to_number(fields[0], out_type=tf.int32)
    tags_start = token_count + 1
    tokens = fields[1:tags_start]
    raw_tags = tf.strings.to_number(fields[tags_start:], out_type=tf.int64)
    # Shift every tag up by one, in line with the tag lookup table where
    # id 0 is "[PAD]" and id 1 is "O".
    return tokens, raw_tags + 1


def lowercase_and_convert_to_ids(tokens):
    """Lowercase ``tokens`` and map them to integer ids via ``lookup_layer``.

    Args:
        tokens: String tensor (or a value convertible to one) of tokens.

    Returns:
        The ids produced by the module-level ``lookup_layer`` for the
        lowercased tokens.
    """
    # The vocabulary was lowercased with Python's Unicode-aware `str.lower`.
    # `tf.strings.lower` only lowercases ASCII unless told the input is UTF-8,
    # so without the encoding argument capitalised non-ASCII tokens would never
    # match their vocabulary entry.
    tokens = tf.strings.lower(tokens, encoding="utf-8")
    return lookup_layer(tokens)

Transform the data into padded batches of token ids and tags, ready for training

In [11]:
batch_size = 32


def _to_padded_batches(lines):
    """Parse text lines, convert tokens to ids and group them into padded batches."""
    return (
        lines.map(map_record_to_training_data)
        .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
        .padded_batch(batch_size)
    )


# Both splits go through exactly the same pipeline.
train_dataset = _to_padded_batches(train_data)
test_dataset = _to_padded_batches(test_data)

Set up the model

In [12]:
# Instantiate the tagger; maxlen is left at the NERModel default.
ner_model = NERModel(
    num_tags=num_tags,
    vocab_size=vocab_size,
    embed_dim=32,
    num_heads=4,
    ff_dim=64,
)

## Compile and fit the model

In [13]:
# NOTE(review): eager execution of tf.functions is forced here; confirm it is
# still required, since it applies to everything that runs afterwards.
tf.config.run_functions_eagerly(True)

# The model's last layer already applies softmax, hence from_logits=False.
token_loss = keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=None
)
ner_model.compile(optimizer="Adam", loss=token_loss, metrics=["accuracy"])
ner_model.fit(train_dataset, epochs=5, validation_data=test_dataset)

Epoch 1/5
    277/Unknown [1m62s[0m 192ms/step - accuracy: 0.9281 - loss: 0.2428

  self.gen.throw(typ, value, traceback)


[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 206ms/step - accuracy: 0.9282 - loss: 0.2425 - val_accuracy: 0.9742 - val_loss: 0.0760
Epoch 2/5
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 203ms/step - accuracy: 0.9789 - loss: 0.0643 - val_accuracy: 0.9798 - val_loss: 0.0593
Epoch 3/5
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 205ms/step - accuracy: 0.9832 - loss: 0.0506 - val_accuracy: 0.9814 - val_loss: 0.0542
Epoch 4/5
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 205ms/step - accuracy: 0.9848 - loss: 0.0446 - val_accuracy: 0.9740 - val_loss: 0.0673
Epoch 5/5
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 203ms/step - accuracy: 0.9839 - loss: 0.0469 - val_accuracy: 0.9797 - val_loss: 0.0604


<keras.src.callbacks.history.History at 0x7aa6b80b1450>

In [14]:
def tokenize_and_convert_to_ids(text):
    """Split ``text`` on whitespace and convert the resulting tokens to ids."""
    return lowercase_and_convert_to_ids(text.split())

# Run the trained model on one hand-written sentence.
sample_ids = tokenize_and_convert_to_ids(
    "The highest mountain in the world is Everest"
)
# Add a leading axis of size 1 so the sentence forms a batch of one.
sample_input = ops.reshape(sample_ids, [1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
# Highest-scoring tag id per token, for the single sentence in the batch.
predicted_tag_ids = np.argmax(output, axis=-1)[0]
prediction = [mapping[tag_id] for tag_id in predicted_tag_ids]

print(prediction)

tf.Tensor([[  1  68  20   4   1 206   7 325]], shape=(1, 8), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
['O', 'O', 'O', 'O', 'O', 'O', 'O', '1']


