<a href="https://colab.research.google.com/github/Muhammad-Gunzalas/Artificial_Neural_Network_with-_Tensorflow/blob/main/Named_Entity_Recognition_(NER).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd


In [None]:
data = pd.read_csv("/content/ner_dataset.csv",encoding= 'unicode_escape')

In [None]:
data

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [None]:
data.shape

(1048575, 4)

In [None]:
data.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

In [None]:
data['Tag'].value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [None]:
data['Tag'].value_counts().count()

17

In [None]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}

    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))

    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


In [None]:
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [None]:
print(token2idx)
print(idx2token)



In [None]:
print(tag2idx)
print(idx2tag)

{'B-art': 0, 'I-nat': 1, 'B-per': 2, 'B-geo': 3, 'O': 4, 'I-per': 5, 'I-tim': 6, 'I-org': 7, 'B-tim': 8, 'I-geo': 9, 'B-org': 10, 'I-art': 11, 'I-gpe': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'B-gpe': 16}
{0: 'B-art', 1: 'I-nat', 2: 'B-per', 3: 'B-geo', 4: 'O', 5: 'I-per', 6: 'I-tim', 7: 'I-org', 8: 'B-tim', 9: 'I-geo', 10: 'B-org', 11: 'I-art', 12: 'I-gpe', 13: 'B-nat', 14: 'B-eve', 15: 'I-eve', 16: 'B-gpe'}


In [None]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(


In [None]:
!pip install tensorflow
#!pip install keras==2.4.3



In [None]:
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [None]:
import numpy as np
import tensorflow
from keras import Sequential, Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [None]:

input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [None]:
from keras.layers import TimeDistributed
from keras.utils.vis_utils import plot_model
import numpy as np
# Now you can use the plot_model function


In [None]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [None]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_3 (Bidirectio  (None, 104, 128)         66048     
 nal)                                                            
                                                                 
 lstm_7 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed_3 (TimeDis  (None, 104, 17)          1105      
 tributed)                                                       
                                                                 
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________


In [None]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp('Hi, My name is Gunzalas \n I am from Pakistan \n I am a Student \n Steve Jobs is My Inspiration \n i have interest in Data Science \n i schedule a Meeting on 20 August 2023')
displacy.render(text, style = 'ent', jupyter=True)

In [None]:
test_tokens

array([[22811, 32217, 25802, ..., 35177, 35177, 35177],
       [ 7958,  2772, 21606, ..., 35177, 35177, 35177],
       [22811, 34991,  6061, ..., 35177, 35177, 35177],
       ...,
       [ 2316,  3817,  9745, ..., 35177, 35177, 35177],
       [28085,  3292,  2566, ..., 35177, 35177, 35177],
       [32907,  6686, 12778, ..., 35177, 35177, 35177]], dtype=int32)

In [None]:
prediction=model_bilstm_lstm.predict(test_tokens)



In [None]:
#test_tags

In [None]:
prediction=np.round(prediction)
#prediction

In [None]:
from sklearn.metrics import accuracy_score , confusion_matrix ,precision_score, recall_score, f1_score

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Assuming prediction and test_tags are the given arrays
predicted_labels = []
true_labels = []

for pred_seq, true_seq in zip(prediction, test_tags):
    # Flatten the sequences to 1D arrays
    pred_flat = np.argmax(pred_seq, axis=-1)  # Convert probabilities to labels
    true_flat = np.argmax(true_seq, axis=-1)  # Convert probabilities to labels

    predicted_labels.extend(pred_flat)
    true_labels.extend(true_flat)

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)

conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.7791629242317316
Precision: 0.9679113732928833
Recall: 0.7791629242317316
F1-Score: 0.8633068134391108
Confusion Matrix:
 [[    28      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0]
 [     6      0      0      0      1      0      0      0      0      0
       0      0      0      0      0      0      0]
 [  1693      0      0      0      4      0      0      0      0      0
       0      0      0      0      0      0      0]
 [  3682      0      0      0      8      0      0      0      0      0
       0      0      0      0      0      0      0]
 [ 94205      0      0      0 388606      0      0      0      0      0
       0      0      0      0      0      0      0]
 [  1712      0      0      0      3      0      0      0      0      0
       0      0      0      0      0      0      0]
 [   674      0      0      0      0      0      0      0      0      0
       0      0      0      0      0      0      0