In [1]:
pip install tf2crf

Installing collected packages: tf2crf
Successfully installed tf2crf-0.1.33
[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset, concatenate_datasets
from sklearn.metrics import classification_report
from tf2crf import CRF, ModelWithCRFLoss

In [16]:
hiner = load_dataset('cfilt/HiNER-collapsed')
train_data = hiner['train']
val_data = hiner['validation']
test_data = hiner['test']

Downloading builder script:   0%|          | 0.00/3.08k [00:00<?, ?B/s]

Downloading and preparing dataset hi_ner_collapsed_config/HiNER-Collapsed to /root/.cache/huggingface/datasets/cfilt___hi_ner_collapsed_config/HiNER-Collapsed/0.0.2/fa4c99b4cefed1144a9c7da5e3d85737ead950f0e8db723d1bf108fc7613b493...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/50.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.20M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset hi_ner_collapsed_config downloaded and prepared to /root/.cache/huggingface/datasets/cfilt___hi_ner_collapsed_config/HiNER-Collapsed/0.0.2/fa4c99b4cefed1144a9c7da5e3d85737ead950f0e8db723d1bf108fc7613b493. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
combined_data = concatenate_datasets([train_data, val_data])

In [18]:
tokens = combined_data['tokens']
ner_tags = combined_data['ner_tags']

In [19]:
all_tokens = [token for sublist in tokens for token in sublist]
all_tags = [tag for sublist in ner_tags for tag in sublist]
token_vocab = sorted(set(all_tokens))
tag_vocab = sorted(set(all_tags))
# tag_vocab.append(int(23))

In [20]:
token_to_id = {token: i for i, token in enumerate(token_vocab)}
tag_to_id = {tag: i for i, tag in enumerate(tag_vocab)}

In [21]:
# Preprocess the training data
train_tokens = train_data['tokens']
train_tags = train_data['ner_tags']
train_tokens_ids = [[token_to_id[token] for token in sublist] for sublist in train_tokens]
train_tags_ids = [[tag_to_id[tag] for tag in sublist] for sublist in train_tags]
max_length = max(len(seq) for seq in train_tokens_ids)  # Calculate the maximum length
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=max_length, padding='post')
train_tags_ids = pad_sequences(train_tags_ids, maxlen=max_length, padding='post')
# train_tags_ids = to_categorical(train_tags_ids, num_classes=len(tag_vocab))

In [22]:
# Preprocess the validation data
val_tokens = val_data['tokens']
val_tags = val_data['ner_tags']
val_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 6)) for token in sublist] for sublist in val_tokens]
val_tags_ids = [[tag_to_id.get(tag, tag_to_id.get('<unk>', 6)) for tag in sublist] for sublist in val_tags]
val_tokens_ids = pad_sequences(val_tokens_ids, maxlen=max_length, padding='post')
val_tags_ids = pad_sequences(val_tags_ids, maxlen=max_length, padding='post')
# val_tags_ids = to_categorical(val_tags_ids, num_classes=len(tag_vocab))

In [23]:
# Preprocess the test data
test_tokens = test_data['tokens']
test_tags = test_data['ner_tags']
test_tokens_ids = [[token_to_id.get(token, token_to_id.get('<unk>', 6)) for token in sublist] for sublist in test_tokens]
test_tags_ids = [[tag_to_id.get(tag, tag_to_id.get('<unk>', 6)) for tag in sublist] for sublist in test_tags]
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=max_length, padding='post')
test_tags_ids = pad_sequences(test_tags_ids, maxlen=max_length, padding='post')
# test_tags_ids = to_categorical(test_tags_ids, num_classes=len(tag_vocab))

In [24]:
word_embedding = layers.Embedding(input_dim=len(token_vocab), output_dim=64)
bilstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))
bilstm_2 = layers.Bidirectional(layers.LSTM(64, return_sequences=True))
dense = layers.Dense(len(tag_vocab))
crf = CRF(dtype=tf.float32)

inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = word_embedding(inputs)
x = bilstm(x)
x = bilstm_2(x)
x = dense(x)
outputs = crf(x)

base_model = tf.keras.Model(inputs, outputs)

model = ModelWithCRFLoss(base_model)

In [25]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [26]:
model.fit(train_tokens_ids, train_tags_ids, validation_data=(val_tokens_ids, val_tags_ids), epochs=3, batch_size=128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7a963bf31750>

In [29]:
test_loss, test_accuracy = model.evaluate(test_tokens_ids, test_tags_ids)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)

Test Loss: 1.8837071657180786
Test Accuracy: 0.9970746040344238


In [30]:
model.save('NLP_NER_BiLSTM-CRF_Collapsed',save_format="tf")

In [31]:
from sklearn.metrics import classification_report
import numpy as np

# Get predictions on test data
predictions = model.predict(test_tokens_ids)

# Convert predictions from categorical to label encoded
pred_tags_ids = np.argmax(predictions, axis=-1)

# Convert test tags from one-hot encoded to label encoded
true_tags_ids = np.argmax(test_tags_ids, axis=-1)

# Flatten both arrays
pred_tags_ids_flat = pred_tags_ids.flatten()
true_tags_ids_flat = true_tags_ids.flatten()

# Use inverse_transform to map predicted and true labels back to their original form.
# You can skip this step if you want the classification report for the encoded labels.
pred_tags = [tag_vocab[i] for i in pred_tags_ids_flat]
true_tags = [tag_vocab[i] for i in true_tags_ids_flat]

# Generate classification report
report = classification_report(true_tags, pred_tags, output_dict=True)

report = classification_report(true_tags, pred_tags)
print(report)



IndexError: list index out of range

In [32]:
!zip -r NLP_NER_BiLSTM-CRF_Collapsed NLP_NER_BiLSTM-CRF_Collapsed/

  adding: NLP_NER_BiLSTM-CRF_Collapsed/ (stored 0%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/keras_metadata.pb (deflated 92%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/variables/ (stored 0%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/variables/variables.index (deflated 66%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/variables/variables.data-00000-of-00001 (deflated 10%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/saved_model.pb (deflated 90%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/assets/ (stored 0%)
  adding: NLP_NER_BiLSTM-CRF_Collapsed/fingerprint.pb (stored 0%)
