<a href="https://colab.research.google.com/github/Namanzz/AdvancedNER/blob/main/AdvancedNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import ast
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the NER corpus. The "POS" and "Tag" columns are stored in the CSV as
# string representations of Python lists, so each cell is parsed back into a
# real list with ast.literal_eval.
df = pd.read_csv("ner.csv").assign(
    POS=lambda frame: frame["POS"].apply(ast.literal_eval),
    Tag=lambda frame: frame["Tag"].apply(ast.literal_eval),
)
df

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."
...,...,...,...,...
47954,Sentence: 47955,Indian border security forces are accusing the...,"[JJ, NN, NN, NNS, VBP, VBG, PRP$, JJ, NNS, IN,...","[B-gpe, O, O, O, O, O, O, B-gpe, O, O, O, O, O..."
47955,Sentence: 47956,Indian officials said no one was injured in Sa...,"[JJ, NNS, VBD, DT, NN, VBD, VBN, IN, NNP, POS,...","[B-gpe, O, O, O, O, O, O, O, B-tim, O, O, O, O..."
47956,Sentence: 47957,Two more landed in fields belonging to a nearb...,"[CD, JJR, VBD, IN, NNS, VBG, TO, DT, JJ, NN, .]","[O, O, O, O, O, O, O, O, O, O, O]"
47957,Sentence: 47958,They say not all of the rockets exploded upon ...,"[PRP, VBP, RB, DT, IN, DT, NNS, VBD, IN, NN, .]","[O, O, O, O, O, O, O, O, O, O, O]"


In [None]:
# Word-level vocabulary built from the raw sentences.
# - lower=False / filters='' keep case and punctuation, so splitting a sentence
#   on spaces yields one token per entry of the "Tag" column.
# - oov_token reserves an index for words that were never seen while fitting.
#   Without it, texts_to_sequences() silently drops unknown words, which
#   shortens the sequence and shifts every following token out of alignment
#   with its word at prediction time.
tokenizer = Tokenizer(lower=False, filters='', oov_token="<OOV>")
tokenizer.fit_on_texts(df["Sentence"])
word_index = tokenizer.word_index

In [None]:
# Collect the distinct tags in a deterministic order. Iterating a set of
# strings depends on Python's per-process hash seed, so list(set(...)) would
# assign different indices to the tags on every kernel restart, making runs
# non-reproducible. The sort key places "O" first so that it receives index 0,
# which is the default fill value pad_sequences uses when padding label
# sequences; the remaining tags follow alphabetically.
tag_values = sorted(
    {tag for tags in df["Tag"] for tag in tags},
    key=lambda tag: (tag != "O", tag),
)
tag2idx = {tag: i for i, tag in enumerate(tag_values)}

# Integer-encode the inputs (word ids) and the targets (tag ids).
X = tokenizer.texts_to_sequences(df["Sentence"])
y = [[tag2idx[tag] for tag in tags] for tags in df["Tag"]]

In [None]:
# Pad every sentence to the length of the longest one so the data forms dense
# (n_sentences, max_len) arrays. Word ids are padded with 0, an index the
# tokenizer never assigns to a real word.
max_len = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_len, padding="post")
# Pad the labels explicitly with the "O" (outside) tag. The default fill value
# is 0, which would label padding with whichever tag happens to own index 0.
y = pad_sequences(y, maxlen=max_len, padding="post", value=tag2idx["O"])
# One-hot encode the tag ids -> (n_sentences, max_len, n_tags), the target
# layout expected by categorical cross-entropy.
y = np.array([to_categorical(seq, num_classes=len(tag2idx)) for seq in y])

In [None]:
# Width of the word vectors; presumably matches the "100d" GloVe file read
# below -- keep the two in sync if a different file is used.
embedding_dim = 100

# Parse the GloVe text file into {word: vector}. Each line is split on
# whitespace: the first field is the word, the remaining fields are its
# float32 coefficients.
with open("glove.6B.100d.txt", encoding="utf8") as glove_file:
    embeddings_index = {
        fields[0]: np.asarray(fields[1:], dtype='float32')
        for fields in map(str.split, glove_file)
    }

In [None]:
# Build the (vocab_size + 1, embedding_dim) lookup table for the Embedding
# layer. Row i holds the pre-trained vector of the word with index i; row 0
# (padding) and any word without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # The tokenizer preserves case, but the glove.6B vocabulary is
        # lowercase. Without this fallback every capitalised word -- which
        # includes most named entities -- would be left as a zero vector.
        embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Sequence tagger: frozen pre-trained embeddings -> bidirectional LSTM ->
# per-timestep softmax over the tag set.
model = Sequential()
# Embedding weights are initialised from embedding_matrix and not trained.
model.add(
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )
)
# return_sequences=True keeps one output per token, as needed for tagging.
model.add(
    Bidirectional(
        LSTM(units=100, return_sequences=True, recurrent_dropout=0.2)
    )
)
# The same dense softmax classifier is applied independently at every timestep.
model.add(
    TimeDistributed(
        Dense(len(tag2idx), activation="softmax")
    )
)



In [None]:
# Targets are one-hot encoded per token, hence categorical cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
# Train for 5 epochs, holding out 10% of the data for validation.
model.fit(
    X,
    y,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/5
[1m1349/1349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m707s[0m 511ms/step - accuracy: 0.9641 - loss: 0.2248 - val_accuracy: 0.9807 - val_loss: 0.0593
Epoch 2/5
[1m1349/1349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m721s[0m 496ms/step - accuracy: 0.9813 - loss: 0.0574 - val_accuracy: 0.9833 - val_loss: 0.0506
Epoch 3/5
[1m1349/1349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 487ms/step - accuracy: 0.9831 - loss: 0.0513 - val_accuracy: 0.9842 - val_loss: 0.0476
Epoch 4/5
[1m1349/1349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 487ms/step - accuracy: 0.9840 - loss: 0.0484 - val_accuracy: 0.9845 - val_loss: 0.0468
Epoch 5/5
[1m1349/1349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m700s[0m 501ms/step - accuracy: 0.9847 - loss: 0.0463 - val_accuracy: 0.9851 - val_loss: 0.0453


<keras.src.callbacks.history.History at 0x7ed603d82590>

In [None]:
def predict_entities(sentence):
    """Tag each whitespace-separated word of ``sentence`` with an entity label.

    Args:
        sentence: Raw input text; it is split on whitespace into words.

    Returns:
        A list of ``(word, tag)`` tuples, one per word, in sentence order.
        Only the first ``max_len`` words are tagged; an empty or
        whitespace-only sentence yields an empty list.
    """
    words = sentence.split()
    if not words:
        return []

    # Encode word by word instead of calling tokenizer.texts_to_sequences():
    # a tokenizer without an OOV token silently drops unknown words, which
    # would shift every later prediction onto the wrong word. Unknown words
    # map to the tokenizer's OOV index when one is configured, otherwise to 0.
    vocab = tokenizer.word_index
    unknown_index = vocab.get(tokenizer.oov_token, 0)
    sequence = [[vocab.get(word, unknown_index) for word in words]]

    # truncating="post" keeps the beginning of an over-long sentence, matching
    # the words the predictions are paired with below. The default ("pre")
    # would cut the start of the sentence while tags were still zipped with
    # the first words.
    padded_seq = pad_sequences(
        sequence, maxlen=max_len, padding="post", truncating="post"
    )
    predictions = model.predict(padded_seq)

    # Invert the tag mapping once rather than rebuilding a key list per token.
    idx2tag = {index: tag for tag, index in tag2idx.items()}
    predicted_ids = np.argmax(predictions[0], axis=-1)

    # zip stops at the shorter input, so padded positions are ignored.
    return [
        (word, idx2tag[int(tag_id)])
        for word, tag_id in zip(words, predicted_ids)
    ]

In [None]:
# Interactive demo: keep reading sentences and printing their predicted tags
# until the user types "exit" (case-insensitive).
prompt = "Enter a sentence (or type 'exit' to quit): "
while (user_input := input(prompt)).lower() != 'exit':
    print(predict_entities(user_input))

Enter a sentence (or type 'exit' to quit): An apple a day keeps the doctor away
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[('An', 'O'), ('apple', 'O'), ('a', 'O'), ('day', 'O'), ('keeps', 'O'), ('the', 'O'), ('doctor', 'O'), ('away', 'O')]
Enter a sentence (or type 'exit' to quit): exit
