# importing the relavent libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

# importing the data

In [2]:
df = pd.read_csv('ner_dataset.csv', encoding="utf-8")

df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


## Essential info about tagged entities:

- geo = Geographical Entity
- org = Organization
- per = Person
- gpe = Geopolitical Entity
- tim = Time indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

# preprocessing

In [3]:
df['Tag'].value_counts()

Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [4]:
df.isnull().sum()

Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64

### Filling missing Values

In [5]:
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')
df['Word'] = df['Word'].fillna(value='UnKnown').astype(str)

  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


In [6]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


### Droping unwanted columns

In [7]:
df.drop('POS', axis=1, inplace=True)

df

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O
...,...,...,...
1048570,Sentence: 47959,they,O
1048571,Sentence: 47959,responded,O
1048572,Sentence: 47959,to,O
1048573,Sentence: 47959,the,O


### Calculating Maximum Sentence Length

In [8]:
df.groupby('Sentence #').agg('count')['Word'].sort_values(ascending=False)

Sentence #
Sentence: 22480    104
Sentence: 33481     81
Sentence: 40153     73
Sentence: 21167     72
Sentence: 21776     70
                  ... 
Sentence: 22065      2
Sentence: 37093      2
Sentence: 4810       2
Sentence: 38917      1
Sentence: 8412       1
Name: Word, Length: 47959, dtype: int64

### Encoding the target

In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Tag'] = le.fit_transform(df['Tag']).astype('O')

In [10]:
le.classes_

array(['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per',
       'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org',
       'I-per', 'I-tim', 'O'], dtype=object)

In [11]:
df['Tag'].value_counts()

Tag
16    887908
2      37644
7      20333
5      20143
14     17251
6      16990
13     16784
3      15870
10      7414
15      6528
0        402
1        308
8        297
9        253
4        201
11       198
12        51
Name: count, dtype: int64

# Data Preparation

In [74]:
from dataclasses import dataclass

@dataclass
class Args:
    max_length: int = 104
    batch_size: int = 32
    vocab_size: int = None
    n_classes: int = 18

In [75]:
# grouping by the Sentence column

data = df.groupby('Sentence #').agg(lambda x: list(x))

data.head()

Unnamed: 0_level_0,Word,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[16, 16, 7, 16, 16, 16, 16, 16, 2, 16, 16, 16,..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[2, 16, 16, 6, 14, 16, 7, 16, 2, 16, 3, 16, 3,..."


In [76]:
from keras.layers import TextVectorization

vect = TextVectorization(standardize='lower', 
                  split=None, 
                  output_sequence_length=Args.max_length)

text = df['Word']
vect.adapt(text.values)

In [77]:
vocab_size = len(vect.get_vocabulary())

In [78]:
vect(data['Word'].iloc[0])

<tf.Tensor: shape=(104,), dtype=int64, numpy=
array([ 254,    6,  973,   16, 1810,  238,  468,    7,  524,    2,  129,
          5,   61,    9,  576,    2,  833,    6,  186,   90,   22,   15,
         56,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int64)>

In [79]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['Word'], data['Tag'], test_size=.2)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=.5)

In [86]:
def transform(x, y):
    max_length = Args.max_length
    n = tf.shape(y)[0]
    if n < max_length:
        paddings = [[0, max_length - n]]
        y = tf.pad(y, paddings, constant_values=17)

    return vect(x), y

In [87]:
def train_generator():
    for word, tag in zip(X_train.tolist(), y_train.tolist()):
        yield (word, tag)

def test_generator():
    for word, tag in zip(X_test.tolist(), y_test.tolist()):
        yield (word, tag)

def val_generator():
    for word, tag in zip(X_val.tolist(), y_val.tolist()):
        yield (word, tag)

output_shapes = (tf.TensorShape([None]), tf.TensorShape([None]))
output_types = (tf.string, tf.int32)

train_ds = tf.data.Dataset.from_generator(train_generator, output_types=output_types, output_shapes=output_shapes)\
                          .map(transform)\
                          .batch(Args.batch_size)\
                          .prefetch(tf.data.AUTOTUNE)

test_ds = tf.data.Dataset.from_generator(test_generator, output_types=output_types, output_shapes=output_shapes)\
                          .map(transform)\
                          .batch(Args.batch_size)\
                          .prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_generator(val_generator, output_types=output_types, output_shapes=output_shapes)\
                          .map(transform)\
                          .batch(Args.batch_size)\
                          .prefetch(tf.data.AUTOTUNE)

In [88]:
next(iter(train_ds))

(<tf.Tensor: shape=(32, 104), dtype=int64, numpy=
 array([[    2,   510,    14, ...,     0,     0,     0],
        [    2,    41,   167, ...,     0,     0,     0],
        [    2,  3614,  5906, ...,     0,     0,     0],
        ...,
        [   62,     6,     2, ...,     0,     0,     0],
        [   36,    27,    41, ...,     0,     0,     0],
        [  176,  3116, 12335, ...,     0,     0,     0]], dtype=int64)>,
 <tf.Tensor: shape=(32, 104), dtype=int32, numpy=
 array([[16, 16, 16, ..., 17, 17, 17],
        [16, 16, 16, ..., 17, 17, 17],
        [16,  7, 16, ..., 17, 17, 17],
        ...,
        [16, 16, 16, ..., 17, 17, 17],
        [16, 16, 16, ..., 17, 17, 17],
        [ 5,  6, 14, ..., 17, 17, 17]])>)

# Build the model

In [89]:
from keras.layers import Embedding, LSTM, TimeDistributed, Input, Dense, Bidirectional
from keras import Model

inputs = Input(shape=(104,))
x = Embedding(vocab_size, 512)(inputs)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
outputs = TimeDistributed(Dense(Args.n_classes, activation='softmax'))(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
              metrics=['acc'])

model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 104)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 104, 512)          16288256  
                                                                 
 bidirectional_4 (Bidirectio  (None, 104, 100)         225200    
 nal)                                                            
                                                                 
 time_distributed_5 (TimeDis  (None, 104, 18)          1818      
 tributed)                                                       
                                                                 
Total params: 16,515,274
Trainable params: 16,515,274
Non-trainable params: 0
_________________________________________________________________


In [90]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(patience=1)

In [91]:
model.fit(train_ds, epochs=4, validation_data=val_ds, callbacks=[es])

Epoch 1/4
Epoch 2/4
Epoch 3/4


<keras.callbacks.History at 0x29acbcdf1c0>

# Evaluation

In [99]:
model.evaluate(test_ds)



[0.027907349169254303, 0.9916817545890808]

# Saving the model

In [101]:
model.save('saved_model')



INFO:tensorflow:Assets written to: saved_model\assets


INFO:tensorflow:Assets written to: saved_model\assets


# Prediction

In [104]:
my_model = tf.keras.models.load_model('saved_model')

In [109]:
import random

idx = random.randint(0, 100)

test = X_test.iloc[idx]
test = tf.expand_dims(vect(test), 0)

preds = my_model.predict(test)

outs = np.squeeze(np.argmax(preds, axis=-1), axis=0)
outs = [x for x in outs if x != 17]

print(outs)

[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 3, 16, 16, 16, 16, 13, 13, 16]


### true Label

In [110]:
print(y_test.iloc[idx])

[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 3, 16, 16, 16, 5, 13, 13, 16]
