In [1]:
import pandas as pd
import numpy as np
import math
from tensorflow import keras
from tqdm import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data_location = "E:/datasets/ner/ner_dataset.csv"
window_size = 7
num_cells = 64
input_length = (window_size * 2) + 1
embedding_size = 100

In [3]:
data = pd.read_csv(data_location, encoding="ISO-8859-1")

In [4]:
data = data.values.tolist()

In [5]:
outputs = []
inputs = []
ninputs = []
noutputs = []
for i in range(len(data)):
    if data[i][3] != "O":
        outputs.append(data[i][3])
        start_index = i - window_size
        end_index = i + window_size + 1
        if start_index < 0:
            start_index = 0
        if end_index > len(data) - 1:
            end_index = len(data) - 1
        input_window = []
        for j in range(start_index, i + 1):
            if type(data[j][0]) is float:
                input_window.append(data[j][1])
            else:
                input_window = ["UNK"] * len(input_window) + [data[j][1]]
        for j in range(i+1, end_index):
            if type(data[j][0]) is float:
                input_window.append(data[j][1])
            else:
                input_window = input_window + ((window_size * 2 + 1) - len(input_window)) * ["UNK"]
                break
        inputs.append(input_window)
    else:
        noutputs.append(data[i][3])
        start_index = i - window_size
        end_index = i + window_size + 1
        if start_index < 0:
            start_index = 0
        if end_index > len(data) - 1:
            end_index = len(data) - 1
        input_window = []
        for j in range(start_index, i + 1):
            if type(data[j][0]) is float:
                input_window.append(data[j][1])
            else:
                input_window = ["UNK"] * len(input_window) + [data[j][1]]
        for j in range(i+1, end_index):
            if type(data[j][0]) is float:
                input_window.append(data[j][1])
            else:
                input_window = input_window + ((window_size * 2 + 1) - len(input_window)) * ["UNK"]
                break
        ninputs.append(input_window)
outputs = np.array(outputs)
noutputs = np.array(noutputs)
inputs = np.array([" ".join(i) for i in inputs if len(i) == input_length])
ninputs = np.array([" ".join(i) for i in ninputs if len(i) == input_length])
nsample = np.arange(len(ninputs))
np.random.shuffle(nsample)
nsample = nsample[: len(inputs)]
ninputs = ninputs[nsample]
noutputs = noutputs[nsample]
inputs = np.concatenate([inputs, ninputs])
outputs = np.concatenate([outputs, noutputs])
sample = np.arange(len(inputs))
np.random.shuffle(sample)
inputs = inputs[sample]
outputs = outputs[sample]

In [6]:
tokenizer = keras.preprocessing.text.Tokenizer(filters="")
tokenizer.fit_on_texts(inputs)
model_inputs = tokenizer.texts_to_sequences(inputs)

In [7]:
unique_outputs = np.unique(outputs)
output_mapping = {j:i for i, j in enumerate(unique_outputs)}
outputs = [output_mapping[output] for output in outputs]

In [8]:
class NLPModel:
    def __init__(self, num_cells, embedding_size, vocab_size, input_length, output_size):
        self.num_cells = num_cells
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.embedding_layer = np.random.uniform(-0.05, 0.05, (vocab_size, embedding_size))
        self.output_size = output_size
        self.input_length = input_length
        self.W1 = np.random.uniform(-0.05, 0.05, (input_length * embedding_size, num_cells))
        self.B1 = np.zeros((num_cells))
        self.W2 = np.random.uniform(-0.05, 0.05, (num_cells, output_size))
        self.B2 = np.zeros((output_size))
    
    def relu(self, x):
        for i in range(len(x)):
            if x[i] < 0:
                x[i] = 0
        return x
    
    def softmax(self, x):
        x = np.exp(x - np.max(x))
        x = x/np.sum(x)
        return x
    
    
    def forward(self, input_data):
        self.input_data = input_data
        self.emb_out = []
        for i in input_data:
            self.emb_out.extend(self.embedding_layer[i])
        self.W1_dot_emb_out = np.dot(self.emb_out, self.W1)
        self.W1_dot_emb_out_bias = self.W1_dot_emb_out + self.B1
        self.h1 = self.relu(self.W1_dot_emb_out_bias)
        self.W2_dot_h1 = np.dot(self.h1, self.W2)
        self.W2_dot_h1_bias = self.W2_dot_h1 + self.B2
        self.out = self.softmax(self.W2_dot_h1_bias)
        return self.out
    
    def backward(self, output_data):
        self.delta_out = self.out - output_data
        self.delta_W2 = np.outer(self.h1, self.delta_out)
        self.delta_B2 = np.dot(self.delta_out, np.identity(self.output_size))
        temp = np.dot(self.delta_out, self.W2.transpose())
        self.delta_h1 = []
        for i in range(len(temp)):
            if (self.W1_dot_emb_out_bias[i] > 0):
                self.delta_h1.append(temp[i])
            else:
                self.delta_h1.append(0)
        self.delta_W1 = np.outer(self.emb_out, self.delta_h1)
        self.delta_B1 = np.dot(self.delta_h1, np.identity(self.num_cells))
        self.delta_emb_out = np.dot(self.delta_h1, self.W1.transpose())
    
    def apply_grad(self, lr):
        self.W2 -= lr * self.delta_W2
        self.B2 -= lr * self.delta_B2
        self.W1 -= lr * self.delta_W1
        self.B1 -= lr * self.delta_B1
        self.embedding_layer[self.input_data] -= lr * self.delta_emb_out.reshape(self.input_length, self.embedding_size)
    
    def fit(self, inputs, outputs, epochs, lr):
        for epoch in range(epochs):
            loss = 0.0
            t = tqdm(range(len(inputs)))
            for i in t:
                model_output = np.zeros((self.output_size))
                model_output[outputs[i]] = 1
                self.forward(inputs[i])
                self.backward(model_output)
                self.apply_grad(lr)
                loss += -1 * (np.log(np.sum(model_output * self.out)))
                t.set_postfix({"loss": loss/(i + 1), "epoch": str(epoch + 1)})
    
    def predict(self, input_sentence, tokenizer, window_size, output_mapping):
        input_sentence = "UNK " * window_size + input_sentence + " UNK" * window_size
        rev_output_mapping = {j: i for i, j in output_mapping.items()}
        sequences = tokenizer.texts_to_sequences([input_sentence])[0]
        rev_word_index = {j: i for i, j in tokenizer.word_index.items()}
        for i in range(window_size, len(sequences) - window_size):
            model_input = sequences[i - window_size: i + window_size + 1]
            out = self.forward(model_input)
            out_class = rev_output_mapping[np.argmax(out)]
            print(rev_word_index[sequences[i]], out_class)
    
    
            

In [9]:
model = NLPModel(num_cells, embedding_size, len(tokenizer.word_index) + 1, input_length, len(output_mapping))

In [10]:
model.fit(model_inputs, outputs, 10, 0.001)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 321330/321330 [28:15<00:00, 189.52it/s, loss=1.43, epoch=1]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 321330/321330 [28:31<00:00, 187.71it/s, loss=1.03, epoch=2]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 321330/321330 [27:06<00:00, 197.57it/s, loss=0.902, epoch=3]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 321330/321330 [27:32<00:00, 194.50it/s, loss=0.805, epoch=4]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 321330/321330 [27:25<00:00, 195.25it/s, loss=0.742, epoch=5]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 321330/321330 [27:51<00:00, 192.19it/s, loss=0.696, epoch=6]
100%

In [21]:
input_sentence = "International atomic agency is to hold second day of talks in vienna on wednesday on how to respond to iran 's resumption of low-level uranium conversion ."
model.predict(input_sentence, tokenizer, window_size, output_mapping)

international B-geo
atomic B-org
agency I-org
is O
to O
hold O
second I-org
day O
of O
talks O
in O
vienna I-tim
on O
wednesday B-geo
on O
how O
to O
respond O
to O
iran B-tim
's O
resumption B-geo
of O
low-level O
uranium O
conversion O
. O
