In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
  for filename in filenames:
    print(os.path.join(dirname, filename))

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import ast

In [3]:
def loading_data(data_path):
  data = pd.read_csv(data_path)
  data.dropna(inplace=True)
  print('Number of rows: ', data.shape[0], ' and the number of columns: ', data.shape[1])
  return data

In [4]:
data = loading_data('/content/drive/MyDrive/NLP/ner.csv')

data.head()

Number of rows:  47959  and the number of columns:  4


Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [5]:
def preprocess_data(data):
  for i in range(len(data)):
    pos = ast.literal_eval(data['POS'][i])
    tags = ast.literal_eval(data['Tag'][i])
    data['POS'][i] = [str(word) for word in pos]
    data['Tag'][i] = [str(word.upper()) for word in tags]
  return data

In [6]:
data = preprocess_data(data)
data.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,..."


In [7]:
data['Tag'][0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GEO',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GEO',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-GPE',
 'O',
 'O',
 'O',
 'O',
 'O']

In [8]:
df_final = data[['Sentence','Tag']]

df_train, df_test = train_test_split(df_final, test_size=0.2, random_state=42)
len(df_train), len(df_test)

(38367, 9592)

In [9]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Bidirectional, LSTM, Embedding
from keras.models import Model
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping

In [10]:
train_targets = list(df_train.Tag.values)
test_targets = list(df_test.Tag.values)

tokenizer = Tokenizer(lower=False, oov_token='UNK')
tokenizer.fit_on_texts(df_train['Sentence'])

train_inputs = tokenizer.texts_to_sequences(df_train['Sentence'])
test_inputs = tokenizer.texts_to_sequences(df_test['Sentence'])

In [12]:
word2idx = tokenizer.word_index
V = len(word2idx)
print('Found %s unique tokens '%V)

Found 28761 unique tokens 


In [13]:
train_tags = set([val for sublist in train_targets for val in sublist])
test_tags = set([val for sublist in test_targets for val in sublist])

print("Unique NER tags in train set: ",train_tags)
print("Unique NER tags in test set: ",test_tags)

Unique NER tags in train set:  {'B-GPE', 'B-ORG', 'B-EVE', 'B-NAT', 'O', 'I-NAT', 'I-EVE', 'I-GPE', 'B-ART', 'B-PER', 'I-ORG', 'I-PER', 'B-GEO', 'I-ART', 'I-GEO', 'I-TIM', 'B-TIM'}
Unique NER tags in test set:  {'B-GPE', 'B-ORG', 'B-EVE', 'B-NAT', 'O', 'I-NAT', 'I-EVE', 'I-GPE', 'B-ART', 'B-PER', 'I-ORG', 'I-PER', 'B-GEO', 'I-ART', 'I-GEO', 'I-TIM', 'B-TIM'}


In [14]:
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_targets)
train_tgt_int = tag_tokenizer.texts_to_sequences(train_targets)
test_tgt_int = tag_tokenizer.texts_to_sequences(test_targets)

In [15]:
# Max length
max_length_train = max(len(sent) for sent in train_inputs)
max_length_test = max(len(sent) for sent in test_inputs)
max_length = max(max_length_train,max_length_test)

# Pad input sequences
train_inputs_final = pad_sequences(train_inputs,maxlen=max_length,padding="post")
print("Shape of train inputs: ",train_inputs_final.shape)

test_inputs_final = pad_sequences(test_inputs,maxlen=max_length,padding="post")
print("Shape of test inputs: ",test_inputs_final.shape)

train_targets_final = pad_sequences(train_tgt_int,maxlen=max_length,padding="post")
print("Shape of train targets: ",train_targets_final.shape)

test_targets_final = pad_sequences(test_tgt_int,maxlen=max_length,padding="post")
print("Shape of test targets: ",test_targets_final.shape)

Shape of train inputs:  (38367, 89)
Shape of test inputs:  (9592, 89)
Shape of train targets:  (38367, 89)
Shape of test targets:  (9592, 89)


In [16]:
K = len(tag_tokenizer.word_index) + 1
K

18

In [17]:
vector_size = 16

i = Input(shape=(max_length,))
x = Embedding(input_dim= V+1,output_dim=vector_size,mask_zero=True)(i)
x = Bidirectional(LSTM(32,return_sequences=True))(x)
x = Dense(K)(x)

model = Model(i,x)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 89)]              0         
                                                                 
 embedding (Embedding)       (None, 89, 16)            460192    
                                                                 
 bidirectional (Bidirection  (None, 89, 64)            12544     
 al)                                                             
                                                                 
 dense (Dense)               (None, 89, 18)            1170      
                                                                 
Total params: 473906 (1.81 MB)
Trainable params: 473906 (1.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Compile and fit

model.compile(optimizer="adam",loss=SparseCategoricalCrossentropy(from_logits=True),metrics=["accuracy"])
model.fit(train_inputs_final,
          train_targets_final,
          epochs=6,
          validation_data=(test_inputs_final,test_targets_final))

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.src.callbacks.History at 0x7dff1d92fd30>

In [19]:
sentence = "I am a student, i work at Google."
predictions = model.predict(pad_sequences(tokenizer.texts_to_sequences([sentence]),
                                          maxlen=max_length,
                                         padding="post"))

# predictions
prediction_ner = np.argmax(predictions,axis=-1)

NER_tags = [tag_tokenizer.index_word[num] for num in list(prediction_ner.flatten())]
NER_tags



['o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o']

In [None]:
prediction_ner = np.argmax(predictions,axis=-1)
prediction_ner

array([[8, 6, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [None]:
NER_tags = [tag_tokenizer.index_word[num] for num in list(prediction_ner.flatten())]
NER_tags

['b-gpe',
 'b-per',
 'o',
 'o',
 'i-per',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'b-geo',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o']