# ***Named Entity Recognition using Recurrent Neural Networks***

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
import tabulate

# 1 - Data Loading

In [None]:
# Load the NER corpus from CSV. `df` and `data` are two names bound to the
# same DataFrame object; the bare `data` expression renders it as the cell output.
df = pd.read_csv("/content/ner.csv")
data = df
data

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
...,...,...,...,...
47954,Sentence: 47955,Indian border security forces are accusing the...,"['JJ', 'NN', 'NN', 'NNS', 'VBP', 'VBG', 'PRP$'...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe..."
47955,Sentence: 47956,Indian officials said no one was injured in Sa...,"['JJ', 'NNS', 'VBD', 'DT', 'NN', 'VBD', 'VBN',...","['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '..."
47956,Sentence: 47957,Two more landed in fields belonging to a nearb...,"['CD', 'JJR', 'VBD', 'IN', 'NNS', 'VBG', 'TO',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
47957,Sentence: 47958,They say not all of the rockets exploded upon ...,"['PRP', 'VBP', 'RB', 'DT', 'IN', 'DT', 'NNS', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47959 entries, 0 to 47958
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence #  47959 non-null  object
 1   Sentence    47959 non-null  object
 2   POS         47959 non-null  object
 3   Tag         47959 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
Sentence #,0
Sentence,0
POS,0
Tag,0


In [None]:
def _flatten_tag_list(raw):
    """Turn a stringified list such as "['O', 'B-geo']" into "O B-geo"."""
    inner = raw[1:-1]                # drop the enclosing brackets
    inner = inner.replace("'", "")   # drop the quotes around each tag
    return inner.replace(",", "")    # drop the separators, leaving single spaces


# One raw sentence string and one space-separated tag string per row.
sentences = df['Sentence'].to_numpy()
tags = df['Tag'].apply(_flatten_tag_list).to_numpy()

In [None]:
# Display the raw sentence strings.
sentences

array(['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
       'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "',
       'They marched from the Houses of Parliament to a rally in Hyde Park .',
       ..., 'Two more landed in fields belonging to a nearby village .',
       'They say not all of the rockets exploded upon impact .',
       'Indian forces said they responded to the attack'], dtype=object)

In [None]:
# Display the space-separated tag string built for each sentence.
tags

array(['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O',
       'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O',
       'O O O O O O O O O O O B-geo I-geo O', ...,
       'O O O O O O O O O O O', 'O O O O O O O O O O O',
       'B-gpe O O O O O O O'], dtype=object)

In [None]:
# Hold out 1000 sentences, then halve that hold-out into validation and test sets.
# The same seed is used for both splits.
SPLIT_SEED = 101

X_train, X_tmp, y_train, y_tmp = train_test_split(
    sentences,
    tags,
    test_size=1000,
    random_state=SPLIT_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [None]:
# Report how many sentences ended up in each split.
for split_name, split in (
    ("training", X_train),
    ("validation", X_val),
    ("testing", X_test),
):
    print(f"Total {split_name} samples: {len(split)}")

Total training samples: 46959
Total validation samples: 500
Total testing samples: 500


# 2 - Encoding

In [None]:
def get_sentence_vectorizer(sentences):
    """
    Build a TextVectorization layer fitted on the given sentences.

    Parameters:
    sentences: Corpus handed to the layer's `adapt` method.

    Returns:
    tuple: The adapted layer and the vocabulary it reports via `get_vocabulary()`.
    """
    # standardize=None: the layer is asked not to standardize the text before tokenizing.
    vectorizer = layers.TextVectorization(standardize=None)
    vectorizer.adapt(sentences)
    return vectorizer, vectorizer.get_vocabulary()

In [None]:
# Fit the vectorizer on the training sentences only.
sentence_vectorizer, vocab = get_sentence_vectorizer(X_train)

In [None]:
# Report the vocabulary size, then show one training sentence before and after vectorization.
print(f"Number of unique words in the training vocab: {len(vocab)}\n")

print(f"Sentence: {X_train[0]}\n")

print(f"Sentence vectorized: {sentence_vectorizer(X_train[0])}")

Number of unique words in the training vocab: 34894

Sentence: Nigeria 's parliament is debating a constitutional amendment proposed by supporters of Mr. Obasanjo that would permit presidents three terms in office instead of two .

Sentence vectorized: [  723    11   273    14 10143     8  1177  3763   944    23   761     5
    34  3601    16    93  4461  2413    94  1892     6   425  1853     5
    44     3]


In [None]:
# Show the first training sentence next to its space-separated tag string.
print(f"Sentence: {X_train[0]}")
print(f"Labels: {y_train[0]}")

Sentence: Nigeria 's parliament is debating a constitutional amendment proposed by supporters of Mr. Obasanjo that would permit presidents three terms in office instead of two .
Labels: B-geo O O O O O O O O O O O B-per B-org O O O O O O O O O O O O


In [None]:
def get_tags(labels):
    """
    Collect the distinct tags used across all label strings.

    Parameters:
    labels: Iterable of strings, each holding tags separated by single spaces.

    Returns:
    list: The unique tags in sorted order.
    """
    unique_tags = {tag for label_str in labels for tag in label_str.split(' ')}
    return sorted(unique_tags)

In [None]:
# Build the sorted tag inventory from the training labels.
# NOTE(review): this rebinds `tags`, which until this cell held the per-sentence
# label strings passed to train_test_split; re-running the split cell after this
# one would pass this list instead.
tags = get_tags(y_train)
print(tags)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [None]:
def label_vectorizer(lables, tags):
    """
    Convert space-separated tag strings into a padded matrix of tag ids.

    Parameters:
    lables: Iterable of strings, each holding tags separated by single spaces.
    tags: Sequence of known tags; a tag's id is its (first) position in it.

    Returns:
    array: One row per label string, post-padded with -1 up to the longest row.

    Raises:
    ValueError: If a label string contains a tag that is not in `tags`.
    """
    # Build the tag -> id table once instead of scanning `tags` for every token.
    # setdefault keeps the first position of a repeated tag, matching list.index.
    tag_to_id = {}
    for idx, tag in enumerate(tags):
        tag_to_id.setdefault(tag, idx)

    label_ids = []
    for l in lables:
        try:
            label_ids.append([tag_to_id[tag] for tag in l.split(' ')])
        except KeyError as err:
            # Same exception type list.index raised, but naming the offending tag.
            raise ValueError(f"Unknown tag {err.args[0]!r} in labels: {l!r}") from None

    # -1 marks padding positions; the loss and accuracy in this notebook skip that value.
    return tf.keras.preprocessing.sequence.pad_sequences(label_ids, padding='post', value=-1)

In [None]:
# Vectorize one real label string together with a one-tag string to show the -1 padding.
label_vectorizer([y_train[0], 'O'], tags)

array([[ 2, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  6,  5, 16, 16,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
       [16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]], dtype=int32)

# 3 - Building the Dataset

In [None]:
def generate_dataset(sentences, labels, sentence_vectorizer, label_vectorizer, tags):
    """
    Pair vectorized sentences with vectorized labels in a tf.data.Dataset.

    Parameters:
    sentences: Input passed to `sentence_vectorizer`.
    labels: Input passed to `label_vectorizer` together with `tags`.
    sentence_vectorizer: Callable mapping `sentences` to token ids.
    label_vectorizer: Callable mapping `(labels, tags)` to tag ids.
    tags: Tag inventory forwarded to `label_vectorizer`.

    Returns:
    tf.data.Dataset: Elements are (sentence ids, label ids) pairs sliced along the first axis.
    """
    features = sentence_vectorizer(sentences)
    targets = label_vectorizer(labels, tags)
    return tf.data.Dataset.from_tensor_slices((features, targets))

In [None]:
# Build one dataset per split, all sharing the vectorizer fitted on the training set
# and the same tag inventory.
ds_train = generate_dataset(X_train, y_train, sentence_vectorizer, label_vectorizer, tags)
ds_val = generate_dataset(X_val, y_val, sentence_vectorizer, label_vectorizer, tags)
ds_test = generate_dataset(X_test, y_test, sentence_vectorizer, label_vectorizer, tags)

In [None]:
# Print the first (token ids, tag ids) pair of the training dataset as a sanity check.
for x, y in ds_train.take(1):
    print(f"X: {x}")
    print(f"Y: {y}")

X: [  723    11   273    14 10143     8  1177  3763   944    23   761     5
    34  3601    16    93  4461  2413    94  1892     6   425  1853     5
    44     3     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
Y: [ 2 16 16 16 16 16 16 16 16 16 16 16  6  5 16 16 16 16 16 16 16 16 16 16
 16 16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1]


# 4 - Building the Model

In [None]:
def NER(vocab_size, emb_dim, tags_size):
    """
    Assemble the tagging network: Embedding -> LSTM -> Dense.

    Parameters:
    vocab_size (int): Vocabulary size; the embedding table gets vocab_size + 1 rows.
    emb_dim (int): Embedding width, also used as the number of LSTM units.
    tags_size (int): Number of output units of the final Dense layer.

    Returns:
    tf.keras.Sequential: The uncompiled model.
    """
    return tf.keras.Sequential([
        # mask_zero=True: id 0 is treated as padding and masked out downstream.
        # The table has one row more than vocab_size.
        # NOTE(review): if vocab_size is len(get_vocabulary()), the padding and OOV
        # entries are presumably already counted - confirm whether the +1 is needed.
        layers.Embedding(input_dim=vocab_size + 1, output_dim=emb_dim, mask_zero=True),
        # return_sequences=True: emit one output per time step, i.e. per token.
        layers.LSTM(units=emb_dim, return_sequences=True),
        # Per-token log-probabilities over the tags.
        layers.Dense(units=tags_size, activation=tf.nn.log_softmax),
    ])

In [None]:
def masked_loss(y_true, y_pred):
    """
    Sparse categorical cross-entropy that skips padded label positions.

    Parameters:
    y_true (tensor): True tag ids; positions equal to -1 are ignored.
    y_pred (tensor): Per-tag scores, treated as logits by the loss.

    Returns:
    loss (tensor): The loss value produced by the Keras loss object.
    """
    return tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        ignore_class=-1,
    )(y_true, y_pred)

In [None]:
def masked_accuracy(y_true, y_pred):
    """
    Calculate accuracy over the non-padded label positions only.

    Parameters:
    y_true (tensor): True tag ids; positions equal to -1 are padding and are excluded.
    y_pred (tensor): Per-tag scores; the predicted tag is the argmax over the last axis.

    Returns:
    accuracy (tensor): Correct predictions divided by the number of non-padded
        positions, or 0 when there are no non-padded positions.
    """
    y_true = tf.cast(y_true, tf.float32)

    # 1.0 where a real label is present, 0.0 on padding.
    mask = tf.cast(tf.not_equal(y_true, -1), tf.float32)

    # Cast to float32 so the predicted ids can be compared with y_true.
    y_pred_class = tf.cast(tf.math.argmax(y_pred, axis=-1), tf.float32)

    # Zero out the padded positions before counting matches.
    matches_true_pred = tf.cast(tf.equal(y_true, y_pred_class), tf.float32) * mask

    # divide_no_nan yields 0 instead of NaN when every position is padding.
    return tf.math.divide_no_nan(tf.reduce_sum(matches_true_pred), tf.reduce_sum(mask))

In [None]:
# Instantiate the network with 128 as the embedding width / LSTM size and one output per tag.
model = NER(len(vocab), 128, len(tags))

# Train with Adam (learning rate 0.01) using the padding-aware loss and accuracy defined above.
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss=masked_loss,
    metrics=[masked_accuracy]
)

# 5 - Training the Model

In [None]:
BATCH_SIZE = 64

# Model.fit ignores its `shuffle` argument when x is a tf.data.Dataset, so the
# shuffling is done on the dataset itself. The buffer covers the whole training
# set, and Dataset.shuffle reshuffles on every iteration (i.e. every epoch) by default.
train_batches = ds_train.shuffle(buffer_size=ds_train.cardinality()).batch(BATCH_SIZE)

history = model.fit(
    train_batches,
    validation_data=ds_val.batch(BATCH_SIZE),
    epochs=2
)

Epoch 1/2
[1m734/734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 308ms/step - loss: 0.3322 - masked_accuracy: 0.9192 - val_loss: 0.1161 - val_masked_accuracy: 0.9638
Epoch 2/2
[1m734/734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 302ms/step - loss: 0.1006 - masked_accuracy: 0.9671 - val_loss: 0.1119 - val_masked_accuracy: 0.9647


In [None]:
# Persist the trained weights to the working directory.
model.save_weights('NER.weights.h5')

# 6 - Evaluating the Model

In [None]:
# Restore the weights written by the save cell above.
model.load_weights('NER.weights.h5')

In [None]:
# Vectorize the held-out test sentences and their labels, then run the model on them.
test_sentences_ids = sentence_vectorizer(X_test)
y_true = label_vectorizer(y_test, tags)

y_pred = model.predict(test_sentences_ids)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step


In [None]:
# Accuracy over the non-padded test positions, printed with four decimals.
print(f"The model's accuracy in test set is: {masked_accuracy(y_true, y_pred).numpy():.4f}")

The model's accuracy in test set is: 0.9594


# 7 - Testing

In [None]:
def predict(sentence, model, sentence_vectorizer, tags):
    """
    Predict one tag per token of a single sentence.

    Parameters:
    sentence: Input passed to `sentence_vectorizer`.
    model: Object whose `predict` method returns per-token scores for a batch.
    sentence_vectorizer: Callable mapping the sentence to token ids.
    tags: Sequence indexed by predicted class id to obtain the tag name.

    Returns:
    list: The tag looked up in `tags` for each position of the sentence.
    """
    # Add a leading batch axis so the single sentence forms a batch of one.
    batch = tf.expand_dims(sentence_vectorizer(sentence), axis=0)
    scores = model.predict(batch)

    # Highest-scoring class per position, for the only item in the batch.
    best_ids = np.argmax(scores, axis=-1)[0]
    return [tags[tag_idx] for tag_idx in best_ids]

In [None]:
# Tag a hand-written example sentence and print the result as a word/tag table.
sentence = "Mustafa is going to Gaza (Palestine) next Ramadan with Mohamed to kill the Israel Army"

ner = predict(sentence, model, sentence_vectorizer, tags)

# Pair each space-separated word with the tag predicted at the same position.
table = [[word, tag] for word, tag in zip(sentence.split(' '), ner)]

print(f"\nSentence: {sentence}\n")
print(f"NER Tags: {' '.join(ner)}\n")
print(tabulate.tabulate(table, headers=['Word', 'Tag'], tablefmt='rounded_outline'))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step

Sentence: Mustafa is going to Gaza (Palestine) next Ramadan with Mohamed to kill the Israel Army

NER Tags: B-per O O O B-geo I-org O B-tim O B-per O O O B-geo I-org

╭─────────────┬───────╮
│ Word        │ Tag   │
├─────────────┼───────┤
│ Mustafa     │ B-per │
│ is          │ O     │
│ going       │ O     │
│ to          │ O     │
│ Gaza        │ B-geo │
│ (Palestine) │ I-org │
│ next        │ O     │
│ Ramadan     │ B-tim │
│ with        │ O     │
│ Mohamed     │ B-per │
│ to          │ O     │
│ kill        │ O     │
│ the         │ O     │
│ Israel      │ B-geo │
│ Army        │ I-org │
╰─────────────┴───────╯
