In [1]:
import random
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset, concatenate_datasets
from sklearn.metrics import classification_report

In [2]:
hiner = load_dataset('cfilt/HiNER-original')
train_data = hiner['train']
val_data = hiner['validation']
test_data = hiner['test']
combined_data = concatenate_datasets([train_data, val_data])
tokens = combined_data['tokens']
ner_tags = combined_data['ner_tags']
all_tokens = [token for sublist in tokens for token in sublist]
all_tags = [tag for sublist in ner_tags for tag in sublist]
token_vocab = sorted(set(all_tokens))
tag_vocab = sorted(set(all_tags))
token_to_id = {token: i for i, token in enumerate(token_vocab)}
tag_to_id = {tag: i for i, tag in enumerate(tag_vocab)}

Downloading builder script:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Downloading and preparing dataset hi_ner_config/HiNER to /root/.cache/huggingface/datasets/cfilt___hi_ner_config/HiNER/0.0.2/c2bf095b51bde10ac392c9203c0fcdd1d7c47d2b03b6b455bf277f1afd7feed0...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.39M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset hi_ner_config downloaded and prepared to /root/.cache/huggingface/datasets/cfilt___hi_ner_config/HiNER/0.0.2/c2bf095b51bde10ac392c9203c0fcdd1d7c47d2b03b6b455bf277f1afd7feed0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
len(train_data)*0.09

6824.429999999999

In [4]:
unlabeled_percentage = 0.09

# Randomly select a portion of the training data as unlabeled data
num_samples = len(train_data)
num_unlabeled_samples = int(num_samples * unlabeled_percentage)
unlabeled_indices = random.sample(range(num_samples), num_unlabeled_samples)
labeled_indices = [i for i in range(num_samples) if i not in unlabeled_indices]

In [5]:
labeled_train_data = train_data.select(labeled_indices)
train_tokens = labeled_train_data['tokens']
train_tags = labeled_train_data['ner_tags']
train_tokens_ids = [[token_to_id[token] for token in sublist] for sublist in train_tokens]
train_tags_ids = [[tag_to_id[tag] for tag in sublist] for sublist in train_tags]
max_length = max(len(seq) for seq in train_tokens_ids)  # Calculate the maximum length
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=max_length, padding='post')
train_tags_ids = pad_sequences(train_tags_ids, maxlen=max_length, padding='post')
train_tags_ids = to_categorical(train_tags_ids, num_classes=len(tag_vocab))

In [6]:
val_tokens = val_data['tokens']
val_tags = val_data['ner_tags']
val_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 22)) for token in sublist] for sublist in val_tokens]
val_tags_ids = [[tag_to_id.get(tag, tag_to_id.get('<unk>', 22)) for tag in sublist] for sublist in val_tags]
val_tokens_ids = pad_sequences(val_tokens_ids, maxlen=max_length, padding='post')
val_tags_ids = pad_sequences(val_tags_ids, maxlen=max_length, padding='post')
val_tags_ids = to_categorical(val_tags_ids, num_classes=len(tag_vocab))

In [7]:
test_tokens = test_data['tokens']
test_tags = test_data['ner_tags']
test_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 22)) for token in sublist] for sublist in test_tokens]
test_tags_ids = [[tag_to_id.get(tag, token_to_id.get('<unk>', 22)) for tag in sublist] for sublist in test_tags]
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=max_length, padding='post')
test_tags_ids = pad_sequences(test_tags_ids, maxlen=max_length, padding='post')
test_tags_ids = to_categorical(test_tags_ids, num_classes=len(tag_vocab))

In [8]:
unlabeled_train_data = train_data.select(unlabeled_indices)
unlabeled_tokens = unlabeled_train_data['tokens']
unlabeled_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 22)) for token in sublist] for sublist in unlabeled_tokens]
unlabeled_tokens_ids = pad_sequences(unlabeled_tokens_ids, maxlen=max_length, padding='post')

In [9]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=len(token_vocab), output_dim=64),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(tag_vocab), activation='softmax')
])

In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [12]:
model.fit(train_tokens_ids, train_tags_ids, validation_data=(val_tokens_ids, val_tags_ids), epochs=3, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x78b92c157670>

In [13]:
unlabeled_predictions = model.predict(unlabeled_tokens_ids)
unlabeled_tags_ids = tf.argmax(unlabeled_predictions, axis=-1)
unlabeled_tags_ids = to_categorical(unlabeled_tags_ids, num_classes=len(tag_vocab))



In [14]:
combined_tokens_ids = tf.concat([train_tokens_ids, unlabeled_tokens_ids], axis=0)
combined_tags_ids = tf.concat([train_tags_ids, unlabeled_tags_ids], axis=0)

In [15]:
model.fit(combined_tokens_ids, combined_tags_ids, validation_data=(val_tokens_ids, val_tags_ids), epochs=2, batch_size=128)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x78b90d146ce0>

In [16]:
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_tokens_ids, test_tags_ids)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)
print('Test Precision:',test_precision)
print('Test Recall:',test_recall)

Test Loss: 0.036444783210754395
Test Accuracy: 0.9928078651428223
Test Precision: 0.9941133856773376
Test Recall: 0.9921398162841797


In [23]:
import random
def predict_sentence():
    sentence = random.choice(test_data['tokens'])
    sentence_tokens = sentence
    sentence_token_ids = [token_to_id.get(token, token_to_id.get('<unk>', 0)) for token in sentence_tokens]
    padded_token_ids = pad_sequences([sentence_token_ids], maxlen=max_length, padding='post')
    predicted_tags_ids = model.predict(padded_token_ids)
    predicted_tags = [tag_vocab[tag_id.argmax()] for tag_id in predicted_tags_ids[0]]
    print('Sentence:', sentence)
    print('Predicted Tags:', predicted_tags[:len(sentence_tokens)])
    print('Actual Tags:', test_data['ner_tags'][test_data['tokens'].index(sentence_tokens)][:len(sentence_tokens)])

    # print ratio of correct tags to total tags without using loops
    correct = sum([1 for i in range(len(sentence_tokens)) if predicted_tags[i] == test_data['ner_tags'][test_data['tokens'].index(sentence_tokens)][i]])
    print('Predicted Tags ratio', correct,":",len(sentence_tokens))

In [24]:
predict_sentence()

Sentence: ['स्यूनानी,', 'जैंती', 'तहसील', 'में', 'भारत', 'के', 'उत्तराखण्ड', 'राज्य', 'के', 'अन्तर्गत', 'कुमाऊँ', 'मण्डल', 'के', 'अल्मोड़ा', 'जिले', 'का', 'एक', 'गाँव', 'है।']
Predicted Tags: [4, 4, 22, 22, 4, 22, 4, 22, 22, 22, 4, 22, 22, 4, 22, 22, 22, 22, 22]
Actual Tags: [4, 4, 22, 22, 4, 22, 4, 22, 22, 22, 4, 22, 22, 4, 22, 22, 22, 22, 22]
Predicted Tags ratio 19 : 19


In [26]:
predict_sentence()

Sentence: ['अलवर', 'ग्रामीण', 'विधानसभा', 'क्षेत्र', 'राजस्थान', 'का', 'एक', 'विधानसभा', 'क्षेत्र', 'है।']
Predicted Tags: [4, 15, 18, 22, 4, 22, 6, 7, 22, 22]
Actual Tags: [4, 15, 15, 22, 4, 22, 6, 7, 22, 22]
Predicted Tags ratio 9 : 10


In [17]:
model.save('BiLSTM_SemiSupervised.h5')

In [18]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions on test data
predictions = model.predict(test_tokens_ids)

# Convert predictions from categorical to label encoded
pred_tags_ids = np.argmax(predictions, axis=-1)

# Convert test tags from one-hot encoded to label encoded
true_tags_ids = np.argmax(test_tags_ids, axis=-1)

# Flatten both arrays
pred_tags_ids_flat = pred_tags_ids.flatten()
true_tags_ids_flat = true_tags_ids.flatten()

# Use inverse_transform to map predicted and true labels back to their original form.
# You can skip this step if you want the classification report for the encoded labels.
pred_tags = [tag_vocab[i] for i in pred_tags_ids_flat]
true_tags = [tag_vocab[i] for i in true_tags_ids_flat]

# Generate classification report
report = classification_report(true_tags, pred_tags, output_dict=True)

report = classification_report(true_tags, pred_tags)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3174305
           1       0.41      0.05      0.08       369
           2       0.95      0.87      0.91      1190
           3       1.00      0.01      0.01       179
           4       0.96      0.87      0.91     40072
           5       0.53      0.46      0.49      1065
           6       0.43      0.60      0.50      4638
           7       0.80      0.70      0.75      5351
           8       0.89      0.70      0.78      7495
           9       0.69      0.38      0.49       230
          10       0.85      0.67      0.75      3645
          11       0.00      0.00      0.00        26
          12       0.54      0.06      0.12       231
          13       0.93      0.97      0.95       603
          14       0.67      0.01      0.02       171
          15       0.86      0.58      0.69      4731
          16       0.00      0.00      0.00        72
          17       0.62    