In [None]:
!pip install datasets
!pip install sklearn



In [None]:
import numpy as np
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense

# Fetch the "ncbi_disease" dataset and pull out its three predefined splits
dataset = load_dataset("ncbi_disease")
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Vocabulary builders: one for the words, one for the tag ids.
# filters='' and lower=False disable character filtering and lower-casing;
# the word tokenizer reserves an '<UNK>' entry for out-of-vocabulary words.
tag_tokenizer = Tokenizer(filters='', lower=False)
word_tokenizer = Tokenizer(filters='', lower=False, oov_token='<UNK>')

# Both vocabularies are learned from the training split only
tag_tokenizer.fit_on_texts(train_data["ner_tags"])
word_tokenizer.fit_on_texts(train_data["tokens"])

# Map every split to integer id sequences using those vocabularies
X_train, X_val, X_test = (
    word_tokenizer.texts_to_sequences(split["tokens"])
    for split in (train_data, val_data, test_data)
)
y_train, y_val, y_test = (
    tag_tokenizer.texts_to_sequences(split["ner_tags"])
    for split in (train_data, val_data, test_data)
)

# Right-pad every split to the length of the longest training sentence
# (an arbitrary fixed length would work here as well).
max_seq_len = max(map(len, X_train))
X_train, y_train, X_val, y_val, X_test, y_test = (
    pad_sequences(seqs, maxlen=max_seq_len, padding='post')
    for seqs in (X_train, y_train, X_val, y_val, X_test, y_test)
)

# One-hot encode labels
# Fix the one-hot width from the tag vocabulary (+1 for the padding id 0) instead of
# letting to_categorical infer it per array: otherwise a split that happens to lack
# the highest tag id would get a narrower last dimension than the model's output.
n_label_classes = len(tag_tokenizer.word_index) + 1
y_train = to_categorical(y_train, num_classes=n_label_classes)
y_val = to_categorical(y_val, num_classes=n_label_classes)
y_test = to_categorical(y_test, num_classes=n_label_classes)

# Build LSTM model
# +1 on both sizes accounts for id 0, which the tokenizers never assign and which
# pad_sequences uses as the padding value.
vocab_size = len(word_tokenizer.word_index) + 1
num_tags = len(tag_tokenizer.word_index) + 1

# Embedding -> LSTM (one output per timestep) -> per-timestep softmax over tag ids
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len),
    LSTM(units=256, return_sequences=True),
    TimeDistributed(Dense(units=num_tags, activation='softmax'))
])

# 'accuracy' is registered under weighted_metrics so that it honours sample_weight
# when one is supplied; without a sample_weight it behaves like a plain metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', weighted_metrics=['accuracy'])

# Train model
# NOTE: no sample_weight is passed here, so the loss/accuracy shown during fitting
# (training and validation) are still averaged over padded positions as well.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=5)

# Evaluate model
# Weight padded positions (word id 0) with 0 so the reported test loss/accuracy are
# computed over real tokens only; otherwise the trivially-correct padding inflates them.
test_token_mask = (X_test != 0).astype('float32')
test_loss, test_acc = model.evaluate(X_test, y_test, sample_weight=test_token_mask, batch_size=32)
print(f"Test accuracy: {test_acc}")



  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.990936815738678


In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

# Per-token class probabilities for the padded test set
y_test_pred = model.predict(X_test)

# Keep only real tokens: padded positions carry word id 0 and would otherwise show up
# as a dominant, trivially-correct class 0 that inflates accuracy and weighted averages.
real_token = X_test != 0
y_tst = np.argmax(y_test, axis=2)[real_token]
y_tst_pred = np.argmax(y_test_pred, axis=2)[real_token]

# Class ids are the tag tokenizer's indices (0 is the padding id), not the dataset's
# raw tag ids. The report is restricted to the real tags; the confusion matrix keeps
# column 0 so real tokens mispredicted as padding remain visible.
n_classes = y_test.shape[-1]
print(classification_report(y_tst, y_tst_pred, labels=list(range(1, n_classes))))
print(confusion_matrix(y_tst, y_tst_pred, labels=list(range(n_classes))))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     91246
           1       0.98      0.98      0.98     22450
           2       0.68      0.77      0.72      1087
           3       0.80      0.60      0.68       960

    accuracy                           0.99    115743
   macro avg       0.86      0.84      0.85    115743
weighted avg       0.99      0.99      0.99    115743

[[91243     0     3     0]
 [    4 22044   280   122]
 [    1   227   834    25]
 [    7   272   108   573]]
