In [5]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset, concatenate_datasets
from sklearn.metrics import classification_report
from transformers import AutoTokenizer

In [6]:
hiner = load_dataset('cfilt/HiNER-original')
train_data = hiner['train']
val_data = hiner['validation']
test_data = hiner['test']
combined_data = concatenate_datasets([train_data, val_data])

Downloading builder script:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Downloading and preparing dataset hi_ner_config/HiNER to /root/.cache/huggingface/datasets/cfilt___hi_ner_config/HiNER/0.0.2/c2bf095b51bde10ac392c9203c0fcdd1d7c47d2b03b6b455bf277f1afd7feed0...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.39M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset hi_ner_config downloaded and prepared to /root/.cache/huggingface/datasets/cfilt___hi_ner_config/HiNER/0.0.2/c2bf095b51bde10ac392c9203c0fcdd1d7c47d2b03b6b455bf277f1afd7feed0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
token_vocab = tokenizer.get_vocab()

Downloading (…)lve/main/config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [8]:
tag_vocab = sorted(set(tag for sublist in combined_data['ner_tags'] for tag in sublist))

In [9]:
token_to_id = token_vocab
tag_to_id = {tag: i for i, tag in enumerate(tag_vocab)}

In [6]:
# def preprocess_data(data, tokenizer, token_to_id, tag_to_id, max_length):
#     tokens = data['tokens']
#     ner_tags = data['ner_tags']
#     tokens_ids = [[token_to_id.get(token, token_to_id.get('[UNK]',)) for token in sublist] for sublist in tokens]
#     tags_ids = [[tag_to_id.get(tag) for tag in sublist] for sublist in ner_tags]
#     tokens_ids = pad_sequences(tokens_ids, maxlen=max_length, padding='post')
#     tags_ids = pad_sequences(tags_ids, maxlen=max_length, padding='post')
#     tags_ids = to_categorical(tags_ids, num_classes=len(tag_to_id))
#     return tokens_ids, tags_ids

# max_length = 128

In [7]:
# train_tokens_ids, train_tags_ids = preprocess_data(train_data, tokenizer, token_to_id, tag_to_id, max_length)
# val_tokens_ids, val_tags_ids = preprocess_data(val_data, tokenizer, token_to_id, tag_to_id, max_length)
# test_tokens_ids, test_tags_ids = preprocess_data(test_data, tokenizer, token_to_id, tag_to_id, max_length)

In [10]:
import numpy as np

def preprocess_data(data, tokenizer, tag_to_id, max_length):
    tokens = data['tokens']
    ner_tags = data['ner_tags']
    # Encoding the tokens using the BERT tokenizer
    tokens_ids = [tokenizer.encode(' '.join(sublist), truncation=True, max_length=max_length, padding='max_length') for sublist in tokens]
    tags_ids = [[tag_to_id[tag] for tag in sublist] for sublist in ner_tags]
    tags_ids = pad_sequences(tags_ids, maxlen=max_length, padding='post')
    tags_ids = to_categorical(tags_ids, num_classes=len(tag_to_id))
    return np.array(tokens_ids), np.array(tags_ids)

max_length = 128

In [11]:
train_tokens_ids, train_tags_ids = preprocess_data(train_data, tokenizer, tag_to_id, max_length)
val_tokens_ids, val_tags_ids = preprocess_data(val_data, tokenizer, tag_to_id, max_length)
test_tokens_ids, test_tags_ids = preprocess_data(test_data, tokenizer, tag_to_id, max_length)

In [12]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=tokenizer.vocab_size, output_dim=64),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.TimeDistributed(layers.Dense(64, activation='relu')),
    layers.TimeDistributed(layers.Dense(len(tag_to_id), activation='softmax'))
])

In [13]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [14]:
model.fit(train_tokens_ids, train_tags_ids, validation_data=(val_tokens_ids, val_tags_ids), epochs=4, batch_size=128)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7a2a7ca5a440>

In [15]:
model.save('INDIC-BERT_NER_BiLSTM.h5')

In [16]:
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_tokens_ids, test_tags_ids)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)
print('Test Precision:',test_precision)
print('Test Recall:',test_recall)

Test Loss: 0.07507924735546112
Test Accuracy: 0.9797554016113281
Test Precision: 0.984639585018158
Test Recall: 0.9758846759796143


In [17]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions on test data
predictions = model.predict(test_tokens_ids)

# Convert predictions from categorical to label encoded
pred_tags_ids = np.argmax(predictions, axis=-1)

# Convert test tags from one-hot encoded to label encoded
true_tags_ids = np.argmax(test_tags_ids, axis=-1)

# Flatten both arrays
pred_tags_ids_flat = pred_tags_ids.flatten()
true_tags_ids_flat = true_tags_ids.flatten()

# Use inverse_transform to map predicted and true labels back to their original form.
# You can skip this step if you want the classification report for the encoded labels.
pred_tags = [tag_vocab[i] for i in pred_tags_ids_flat]
true_tags = [tag_vocab[i] for i in true_tags_ids_flat]

# Generate classification report
report = classification_report(true_tags, pred_tags, output_dict=True)

report = classification_report(true_tags, pred_tags)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2329699
           1       0.00      0.00      0.00       369
           2       0.89      0.41      0.56      1190
           3       0.00      0.00      0.00       179
           4       0.84      0.69      0.76     40072
           5       0.40      0.00      0.00      1065
           6       0.21      0.00      0.01      4638
           7       0.66      0.05      0.10      5351
           8       0.57      0.11      0.19      7495
           9       0.00      0.00      0.00       230
          10       0.76      0.15      0.26      3645
          11       0.00      0.00      0.00        26
          12       0.00      0.00      0.00       231
          13       0.94      0.74      0.83       603
          14       0.00      0.00      0.00       171
          15       0.68      0.37      0.48      4731
          16       0.00      0.00      0.00        72
          17       0.49    

In [27]:
import random
def predict_tags(tokenizer, model, tag_to_id, max_length):
    # Tokenizing sentence
    sentence = random.choice(test_data['tokens'])
    sentence_tokens = tokenizer.encode(' '.join(sentence), truncation=True, padding='max_length', max_length=max_length)
    sentence_tokens_ids = np.array([sentence_tokens])

    # Predicting tags
    predicted_tags_ids = model.predict(sentence_tokens_ids)

    # Fetching tag names from IDs
    id_to_tag = {id: tag for tag, id in tag_to_id.items()}  # reverse the tag_to_id dictionary
    predicted_tags = [id_to_tag[id] for id in np.argmax(predicted_tags_ids, axis=-1)[0]]

    # Removing padding tokens ("O") if any
    actual_tags = test_data['ner_tags'][test_data['tokens'].index(sentence)][:len(sentence_tokens)]
    return predicted_tags[:len(sentence)], actual_tags

predicted_tags,actual_tags = predict_tags(tokenizer, model, tag_to_id, max_length)
print('Predicted Tags:', predicted_tags)
print('Actual Tags:',actual_tags)

Predicted Tags: [4, 4, 22, 22, 22, 4, 22, 22, 22, 22]
Actual Tags: [4, 4, 22, 4, 22, 4, 22, 22, 22, 22]
