In [None]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset, concatenate_datasets
from sklearn.metrics import classification_report

In [2]:
hiner = load_dataset('cfilt/HiNER-original')
train_data = hiner['train']
val_data = hiner['validation']
test_data = hiner['test']

Downloading builder script:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Downloading and preparing dataset hi_ner_config/HiNER to /root/.cache/huggingface/datasets/cfilt___hi_ner_config/HiNER/0.0.2/c2bf095b51bde10ac392c9203c0fcdd1d7c47d2b03b6b455bf277f1afd7feed0...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.39M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset hi_ner_config downloaded and prepared to /root/.cache/huggingface/datasets/cfilt___hi_ner_config/HiNER/0.0.2/c2bf095b51bde10ac392c9203c0fcdd1d7c47d2b03b6b455bf277f1afd7feed0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
combined_data = concatenate_datasets([train_data, val_data])

In [4]:
tokens = combined_data['tokens']
ner_tags = combined_data['ner_tags']

In [5]:
all_tokens = [token for sublist in tokens for token in sublist]
all_tags = [tag for sublist in ner_tags for tag in sublist]
token_vocab = sorted(set(all_tokens))
tag_vocab = sorted(set(all_tags))
# tag_vocab.append(int(23))

In [6]:
token_to_id = {token: i for i, token in enumerate(token_vocab)}
tag_to_id = {tag: i for i, tag in enumerate(tag_vocab)}

In [7]:
# Preprocess the training data
train_tokens = train_data['tokens']
train_tags = train_data['ner_tags']
train_tokens_ids = [[token_to_id[token] for token in sublist] for sublist in train_tokens]
train_tags_ids = [[tag_to_id[tag] for tag in sublist] for sublist in train_tags]
max_length = max(len(seq) for seq in train_tokens_ids)  # Calculate the maximum length
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=max_length, padding='post')
train_tags_ids = pad_sequences(train_tags_ids, maxlen=max_length, padding='post')
train_tags_ids = to_categorical(train_tags_ids, num_classes=len(tag_vocab))

In [8]:
# Preprocess the validation data
val_tokens = val_data['tokens']
val_tags = val_data['ner_tags']
val_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 22)) for token in sublist] for sublist in val_tokens]
val_tags_ids = [[tag_to_id.get(tag, tag_to_id.get('<unk>', 22)) for tag in sublist] for sublist in val_tags]
val_tokens_ids = pad_sequences(val_tokens_ids, maxlen=max_length, padding='post')
val_tags_ids = pad_sequences(val_tags_ids, maxlen=max_length, padding='post')
val_tags_ids = to_categorical(val_tags_ids, num_classes=len(tag_vocab))

In [9]:
# Preprocess the test data
test_tokens = test_data['tokens']
test_tags = test_data['ner_tags']
test_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 22)) for token in sublist] for sublist in test_tokens]
test_tags_ids = [[tag_to_id.get(tag, tag_to_id.get('<unk>', 22)) for tag in sublist] for sublist in test_tags]
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=max_length, padding='post')
test_tags_ids = pad_sequences(test_tags_ids, maxlen=max_length, padding='post')
test_tags_ids = to_categorical(test_tags_ids, num_classes=len(tag_vocab))

In [10]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=len(token_vocab), output_dim=64),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(tag_vocab), activation='softmax')
])

In [11]:
# import tensorflow_addons as tfa
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [12]:
model.fit(train_tokens_ids, train_tags_ids, validation_data=(val_tokens_ids, val_tags_ids), epochs=3, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7bae06f498d0>

In [15]:
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_tokens_ids, test_tags_ids)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)

Test Loss: 0.02360674738883972
Test Accuracy: 0.9942708015441895


In [16]:
print('Test Loss:', test_precision)
print('Test Recall:', test_recall)

Test Loss: 0.9954822063446045
Test Recall: 0.9933662414550781


In [17]:
model.save('NLP_NER_BiLSTM.h5')

In [29]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions on test data
predictions = model.predict(test_tokens_ids)

# Convert predictions from categorical to label encoded
pred_tags_ids = np.argmax(predictions, axis=-1)

# Convert test tags from one-hot encoded to label encoded
true_tags_ids = np.argmax(test_tags_ids, axis=-1)

# Flatten both arrays
pred_tags_ids_flat = pred_tags_ids.flatten()
true_tags_ids_flat = true_tags_ids.flatten()

# Use inverse_transform to map predicted and true labels back to their original form.
# You can skip this step if you want the classification report for the encoded labels.
pred_tags = [tag_vocab[i] for i in pred_tags_ids_flat]
true_tags = [tag_vocab[i] for i in true_tags_ids_flat]

# Generate classification report
report = classification_report(true_tags, pred_tags, output_dict=True)



In [30]:
report = classification_report(true_tags, pred_tags)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3455846
           1       0.00      0.00      0.00       369
           2       0.94      0.83      0.88      1190
           3       0.00      0.00      0.00       179
           4       0.95      0.91      0.93     40072
           5       0.70      0.04      0.08      1065
           6       0.75      0.52      0.61      4638
           7       0.83      0.61      0.70      5351
           8       0.80      0.79      0.80      7495
           9       0.00      0.00      0.00       230
          10       0.77      0.70      0.73      3645
          11       0.00      0.00      0.00        26
          12       0.00      0.00      0.00       231
          13       0.94      0.96      0.95       603
          14       0.00      0.00      0.00       171
          15       0.82      0.67      0.74      4731
          16       0.00      0.00      0.00        72
          17       0.85    