<a href="https://colab.research.google.com/github/Kunal2873/deep_learn/blob/main/basic_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Upload ner_dataset.csv through Colab's file picker so the read_csv call in the next cell can find it.
uploaded=files.upload()
import pandas as pd



Saving ner_dataset.csv to ner_dataset.csv


In [None]:
# Load the uploaded CSV, decoding it with the 'unicode_escape' codec.
data=pd.read_csv('ner_dataset.csv',encoding='unicode_escape')
# Preview the first rows (columns: Sentence #, Word, POS, Tag).
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In the data, each row holds a single word. The Word column will be our feature X, and the Tag column on the right will be our label Y.

**Preparing the dataset**

In [None]:
from itertools import chain
def get_dict_map(data,token_or_tag):
  """Build integer-id lookups for the vocabulary of words or of tags.

  Args:
    data: DataFrame with "Word" and "Tag" columns.
    token_or_tag: "token" to index the "Word" column; any other value
      indexes the "Tag" column.

  Returns:
    (tok2idx, idx2tok): a dict from each distinct value to its integer id,
    and the inverse dict from id back to value.
  """
  column="Word" if token_or_tag=="token" else "Tag"

  # dict.fromkeys de-duplicates while keeping first-appearance order, so the
  # ids are identical on every run. Going through set() made the ordering
  # depend on string hashing, which varies between interpreter sessions.
  vocab=list(dict.fromkeys(data[column].to_list()))

  idx2tok=dict(enumerate(vocab))
  tok2idx={tok:idx for idx,tok in idx2tok.items()}

  return tok2idx,idx2tok


# Word <-> id lookups, built from the "Word" column.
token2idx,idx2token=get_dict_map(data,"token")
# Tag <-> id lookups, built from the "Tag" column.
tag2idx,idx2tag=get_dict_map(data,"tag")



Transforming the columns in the data

In [None]:
# Encode every word and tag as its integer id.
data['Word_idx']=data['Word'].map(token2idx)
data['Tag_idx']=data['Tag'].map(tag2idx)

# "Sentence #" is only filled in on the first row of each sentence, so
# forward-fill to give every row its sentence label. DataFrame.ffill() is the
# replacement for the deprecated fillna(method="ffill").
data_fillna=data.ffill(axis=0)


# Collapse to one row per sentence, gathering each column's values into a list.
data_group=data_fillna.groupby(
    ["Sentence #"],as_index=False
)[["Word","POS","Tag","Word_idx","Tag_idx"]].agg(list)

  data_fillna=data.fillna(method="ffill",axis=0)


In [None]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input
from keras.utils import to_categorical

def get_pad_train_test_val(data_group,data):
  """Encode the sentences and split them into disjoint train/val/test sets.

  Args:
    data_group: one row per sentence, with list-valued "Word_idx" and
      "Tag_idx" columns.
    data: the token-level frame; only its "Word" column is read, to size the
      vocabulary.

  Returns:
    (train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags).
    The test pieces are padded to the longest sentence, with tags one-hot
    encoded. The train/val pieces are the raw, unpadded id lists, in the
    same format this function returned before, so code that pads them
    afterwards keeps working.

  Relies on the module-level ``tag2idx`` mapping. Prints the split sizes.
  """
  n_token=len(set(data["Word"].to_list()))

  tokens=data_group["Word_idx"].to_list()
  tags=data_group['Tag_idx'].to_list()
  maxlen=max(len(s) for s in tokens)

  # NOTE(review): n_token-1 looks like it is also the id of a real word, so
  # padding cannot be told apart from that word; confirm whether a
  # dedicated pad id is wanted.
  pad_tokens=pad_sequences(tokens,maxlen=maxlen,dtype="int32",padding='post',value=n_token-1)
  pad_tags=pad_sequences(tags,maxlen=maxlen,dtype="int32",padding='post',value=tag2idx["O"])

  n_tags=len(tag2idx)
  pad_tags=[to_categorical(i,num_classes=n_tags) for i in pad_tags ]

  # Split sentence positions, then carve train/val out of what is left after
  # the test hold-out. The previous code ran the second split on the full
  # data again, so train and val overlapped the test set.
  sentence_ids=list(range(len(tokens)))
  rest_ids,test_ids=train_test_split(sentence_ids,test_size=0.1,train_size=0.9,random_state=2025)
  train_ids,val_ids=train_test_split(rest_ids,test_size=0.25,train_size=0.75,random_state=2025)

  test_tokens=pad_tokens[test_ids]
  test_tags=[pad_tags[i] for i in test_ids]
  train_tokens=[tokens[i] for i in train_ids]
  train_tags=[tags[i] for i in train_ids]
  val_tokens=[tokens[i] for i in val_ids]
  val_tags=[tags[i] for i in val_ids]

  print(
      "train_tokens length:",len(train_tokens),
      "\ntrain_tags length:",len(train_tags),
      "\ntest_tokens length:",len(test_tokens),
      "\nval_tokens length:",len(val_tokens),
      "\nval_tags length:",len(val_tags),
  )
  return train_tokens,val_tokens,test_tokens,train_tags,val_tags,test_tags


# Build the splits once; the call also prints the size of each split.
train_tokens,val_tokens,test_tokens,train_tags,val_tags,test_tags=get_pad_train_test_val(data_group,data)


train_tokens length: 35969 
train_tokesn length: 35969 
test_tokens length: 4796 
val_tokens length: 11990 
val_tags length: 11990


Training the neural network

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM,Embedding,Dense,TimeDistributed,Dropout,Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
# Seed NumPy's and TensorFlow's random number generators.
seed(1)
tf.random.set_seed(2)

In [None]:
# Embedding vocabulary size: one slot per distinct word, plus one spare index.
input_dim=len(set(data['Word'].to_list()))+1
# Width of the embedding vectors (also reused as the LSTM unit count).
output_dim=64
# Model sequence length: the longest sentence, counted in tokens.
input_length=max(map(len,data_group["Word_idx"].to_list()))
# Number of distinct tags, i.e. the width of the output layer.
n_tags=len(tag2idx)

Creating a function that builds, compiles, and summarizes the neural network

In [None]:
from tensorflow.keras.optimizers import Adam
def get_bilstm_lstm_model():
  """Build and compile the BiLSTM + LSTM sequence tagger.

  Reads the module-level ``input_length``, ``input_dim``, ``output_dim`` and
  ``n_tags`` settings, and prints the model summary as a side effect.

  Returns:
    The compiled Sequential model, which emits a distribution over the
    ``n_tags`` tags for every time step.
  """
  model=Sequential()

  model.add(Input(shape=(input_length,)))

  model.add(Embedding(input_dim=input_dim,output_dim=output_dim))
  model.add(Bidirectional(LSTM(units=output_dim,return_sequences=True,dropout=0.2,recurrent_dropout=0.2),merge_mode='concat'))
  model.add(LSTM(units=output_dim,return_sequences=True,dropout=0.5,recurrent_dropout=0.5))
  # softmax rather than relu: categorical_crossentropy expects per-step class
  # probabilities. relu outputs are unbounded and can be all zero for a step,
  # and training with them reported the loss as nan.
  model.add(TimeDistributed(Dense(n_tags,activation='softmax')))

  optimizer = Adam(learning_rate=0.0001 )

  model.compile(loss="categorical_crossentropy",optimizer=optimizer ,metrics=['accuracy'])

  model.summary()

  return model


# def train_model(X,y,model):
#   loss=list()
#   for i in range(25):
#     hist=model.fit(X,y,batch_size=1000,verbose=1,epochs=1,validation_split=0.2)
#     loss.append(hist.history['loss'][0])
#   return loss

# results=pd.DataFrame()
# model_bilstm_lstm=get_bilstm_lstm_model()
# plot_model(model_bilstm_lstm)
# results['with_add_lstm']=train_model(train_tokens,np.array(train_tags),model_bilstm_lstm)



In [None]:
def train_model(X,y,model,epochs=25,batch_size=1000,validation_split=0.2):
  """Fit ``model`` on (X, y) and return the training loss of every epoch.

  Args:
    X: model inputs, passed straight to ``model.fit``.
    y: targets, passed straight to ``model.fit``.
    model: any object with a Keras-style ``fit`` method.
    epochs: number of training epochs (default 25, the previous fixed value).
    batch_size: samples per gradient step (default 1000).
    validation_split: fraction of the data held out for validation by
      ``fit`` (default 0.2).

  Returns:
    A list with one training-loss value per epoch.
  """
  # A single fit call covers all epochs and already records the per-epoch
  # loss, so there is no need to call fit once per epoch in a Python loop.
  hist=model.fit(X,y,batch_size=batch_size,verbose=1,epochs=epochs,validation_split=validation_split)
  return list(hist.history['loss'])

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
results=pd.DataFrame()

# 1) Pad your token inputs
# Use the same fill value get_pad_train_test_val applies to the test split
# (the last word id). The pad_sequences default of 0 would pad with
# whichever word happens to have id 0.
train_tokens_padded = pad_sequences(train_tokens,
                                    maxlen=input_length,
                                    padding='post',
                                    value=len(token2idx)-1)
# 2) Pad your tag sequences
# Pad labels with the "O" tag, again matching the test split, rather than
# with whichever tag happens to have id 0.
train_tags_padded = pad_sequences(train_tags,
                                  maxlen=input_length,
                                  padding='post',
                                  value=tag2idx["O"])

# 3) One‑hot encode tags
train_tags_cat = to_categorical(train_tags_padded,
                                num_classes=n_tags)

# 4) Now both X and y are real NumPy arrays; call training.
# Keep a handle on the model so it is still available after training.
model_bilstm_lstm = get_bilstm_lstm_model()
results['with_add_lstm'] = train_model(train_tokens_padded,
                                       train_tags_cat,
                                       model_bilstm_lstm)


[1m19/29[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m9s[0m 965ms/step - accuracy: 0.6494 - loss: nan 

KeyboardInterrupt: 

In [None]:
# results=pd.DataFrame()
# model_bilstm_lstm=get_bilstm_lstm_model()
# plot_model(model_bilstm_lstm)
# results['with_add_lstm']=train_model(train_tokens,np.array(train_tags),model_bilstm_lstm)

In [None]:
import spacy
from spacy import displacy
# Load spaCy's 'en_core_web_sm' pipeline.
nlp=spacy.load('en_core_web_sm')
# text = nlp('Hi, My name is Aman Kharwal \n I am from India \n I want to work with Google \n Steve Jobs is My Inspiration')

# Run the pipeline on a sample text and render the entities it finds inline in the notebook.
text=nlp('hello,My name is kunal \n i am from india \n i want to work in google ')
displacy.render(text,style='ent',jupyter=True)