In [1]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset, concatenate_datasets
from sklearn.metrics import classification_report
from transformers import AutoTokenizer

In [2]:
hiner = load_dataset('cfilt/HiNER-collapsed')
train_data = hiner['train']
val_data = hiner['validation']
test_data = hiner['test']
combined_data = concatenate_datasets([train_data, val_data])

Downloading builder script:   0%|          | 0.00/3.08k [00:00<?, ?B/s]

Downloading and preparing dataset hi_ner_collapsed_config/HiNER-Collapsed to /root/.cache/huggingface/datasets/cfilt___hi_ner_collapsed_config/HiNER-Collapsed/0.0.2/fa4c99b4cefed1144a9c7da5e3d85737ead950f0e8db723d1bf108fc7613b493...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/50.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.20M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset hi_ner_collapsed_config downloaded and prepared to /root/.cache/huggingface/datasets/cfilt___hi_ner_collapsed_config/HiNER-Collapsed/0.0.2/fa4c99b4cefed1144a9c7da5e3d85737ead950f0e8db723d1bf108fc7613b493. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
token_vocab = tokenizer.get_vocab()

Downloading (…)lve/main/config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [4]:
tag_vocab = sorted(set(tag for sublist in combined_data['ner_tags'] for tag in sublist))

In [5]:
token_to_id = token_vocab
tag_to_id = {tag: i for i, tag in enumerate(tag_vocab)}

In [6]:
# def preprocess_data(data, tokenizer, token_to_id, tag_to_id, max_length):
#     tokens = data['tokens']
#     ner_tags = data['ner_tags']
#     tokens_ids = [[token_to_id.get(token, token_to_id.get('[UNK]',)) for token in sublist] for sublist in tokens]
#     tags_ids = [[tag_to_id.get(tag) for tag in sublist] for sublist in ner_tags]
#     tokens_ids = pad_sequences(tokens_ids, maxlen=max_length, padding='post')
#     tags_ids = pad_sequences(tags_ids, maxlen=max_length, padding='post')
#     tags_ids = to_categorical(tags_ids, num_classes=len(tag_to_id))
#     return tokens_ids, tags_ids

# max_length = 128

In [7]:
# train_tokens_ids, train_tags_ids = preprocess_data(train_data, tokenizer, token_to_id, tag_to_id, max_length)
# val_tokens_ids, val_tags_ids = preprocess_data(val_data, tokenizer, token_to_id, tag_to_id, max_length)
# test_tokens_ids, test_tags_ids = preprocess_data(test_data, tokenizer, token_to_id, tag_to_id, max_length)

In [8]:
import numpy as np

def preprocess_data(data, tokenizer, tag_to_id, max_length):
    tokens = data['tokens']
    ner_tags = data['ner_tags']
    # Encoding the tokens using the BERT tokenizer
    tokens_ids = [tokenizer.encode(' '.join(sublist), truncation=True, max_length=max_length, padding='max_length') for sublist in tokens]
    tags_ids = [[tag_to_id[tag] for tag in sublist] for sublist in ner_tags]
    tags_ids = pad_sequences(tags_ids, maxlen=max_length, padding='post')
    tags_ids = to_categorical(tags_ids, num_classes=len(tag_to_id))
    return np.array(tokens_ids), np.array(tags_ids)

max_length = 128

In [9]:
train_tokens_ids, train_tags_ids = preprocess_data(train_data, tokenizer, tag_to_id, max_length)
val_tokens_ids, val_tags_ids = preprocess_data(val_data, tokenizer, tag_to_id, max_length)
test_tokens_ids, test_tags_ids = preprocess_data(test_data, tokenizer, tag_to_id, max_length)

In [10]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=tokenizer.vocab_size, output_dim=64),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.TimeDistributed(layers.Dense(64, activation='relu')),
    layers.TimeDistributed(layers.Dense(len(tag_to_id), activation='softmax'))
])

In [11]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [12]:
model.fit(train_tokens_ids, train_tags_ids, validation_data=(val_tokens_ids, val_tags_ids), epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x79cfa8954280>

In [13]:
model.save('INDIC-BERT_NER_BiLSTM_Collapsed.h5')

In [14]:
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_tokens_ids, test_tags_ids)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)
print('Test Precision:',test_precision)
print('Test Recall:',test_recall)

Test Loss: 0.04439732804894447
Test Accuracy: 0.9865538477897644
Test Precision: 0.9893122911453247
Test Recall: 0.9844586849212646


In [15]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions on test data
predictions = model.predict(test_tokens_ids)

# Convert predictions from categorical to label encoded
pred_tags_ids = np.argmax(predictions, axis=-1)

# Convert test tags from one-hot encoded to label encoded
true_tags_ids = np.argmax(test_tags_ids, axis=-1)

# Flatten both arrays
pred_tags_ids_flat = pred_tags_ids.flatten()
true_tags_ids_flat = true_tags_ids.flatten()

# Use inverse_transform to map predicted and true labels back to their original form.
# You can skip this step if you want the classification report for the encoded labels.
pred_tags = [tag_vocab[i] for i in pred_tags_ids_flat]
true_tags = [tag_vocab[i] for i in true_tags_ids_flat]

# Generate classification report
report = classification_report(true_tags, pred_tags, output_dict=True)

report = classification_report(true_tags, pred_tags)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00   2369732
           1       0.54      0.12      0.20      5351
           2       0.62      0.26      0.36      7495
           3       0.82      0.32      0.46      4731
           4       0.51      0.18      0.26      3849
           5       0.65      0.29      0.40      5488
           6       0.93      0.99      0.96    375450

    accuracy                           0.99   2772096
   macro avg       0.72      0.45      0.52   2772096
weighted avg       0.98      0.99      0.98   2772096



In [27]:
import random
def predict_tags(tokenizer, model, tag_to_id, max_length):
    # Tokenizing sentence
    sentence = random.choice(test_data['tokens'])
    sentence_tokens = tokenizer.encode(' '.join(sentence), truncation=True, padding='max_length', max_length=max_length)
    sentence_tokens_ids = np.array([sentence_tokens])

    # Predicting tags
    predicted_tags_ids = model.predict(sentence_tokens_ids)

    # Fetching tag names from IDs
    id_to_tag = {id: tag for tag, id in tag_to_id.items()}  # reverse the tag_to_id dictionary
    predicted_tags = [id_to_tag[id] for id in np.argmax(predicted_tags_ids, axis=-1)[0]]

    # Removing padding tokens ("O") if any
    actual_tags = test_data['ner_tags'][test_data['tokens'].index(sentence)][:len(sentence_tokens)]
    return predicted_tags[:len(sentence)], actual_tags

predicted_tags,actual_tags = predict_tags(tokenizer, model, tag_to_id, max_length)
print('Predicted Tags:', predicted_tags)
print('Actual Tags:',actual_tags)

Predicted Tags: [4, 4, 22, 22, 22, 4, 22, 22, 22, 22]
Actual Tags: [4, 4, 22, 4, 22, 4, 22, 22, 22, 22]
