In [69]:
import pickle
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
# Read the raw dataset (free-text descriptions plus a binary severity label).
train_df = pd.read_csv('./data/medical_dataset_temp.csv')

# frac=1 returns every row in random order; the fixed seed keeps that order
# stable between runs.
train_df_shuffled = train_df.sample(random_state=42, frac=1)
train_df_shuffled.head()

Unnamed: 0,description,severity
3290,Split-thickness skin grafting a total area of ...,1
3053,Open reduction and internal plate and screw fi...,1
599,Dilatation and curettage (D&C) and Laparoscopi...,0
2221,"Repeat low-transverse C-section, lysis of omen...",0
1161,"Excision of abscess, removal of foreign body. ...",1


In [4]:
# Class-balance check: number of rows per severity label.
train_df["severity"].value_counts()

1    1907
0    1906
Name: severity, dtype: int64

In [5]:
# Hold out 10% of the shuffled rows for validation; the seed fixes which rows
# end up in each split.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["description"].to_numpy(),
    train_df_shuffled["severity"].to_numpy(),
    random_state=42,
    test_size=0.1,
)

In [7]:
max_vocab_length = 10000  # cap on the vocabulary size given to the vectorizer
max_length = 15           # length every vectorized sequence is padded/truncated to

# Turns raw strings into fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=max_length,
)

In [8]:
# Fit the vectorizer on the training split only; the validation sentences are
# not passed in here.
text_vectorizer.adapt(train_sentences)

In [9]:
# Sanity check: vectorize one hand-written sentence and inspect the token ids.
# The result has one row of length `max_length`.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 1,  6,  1, 13,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int64)>

In [13]:
def calculate_results(y_true, y_pred):
  """Summarise classification quality for a set of predictions.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned element-wise with `y_true`.

  Returns:
    A dict with the keys "accuracy", "precision", "recall" and "f1".
    Note the mixed scales: accuracy is multiplied by 100 (a percentage),
    whereas precision, recall and f1 are returned exactly as scikit-learn
    computes them with `average="weighted"`.
  """
  # Accuracy is the only metric rescaled to a percentage.
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth element returned (support) is not needed for the summary.
  precision, recall, f1, _support = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")
  return {
      "accuracy": accuracy_pct,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [23]:
# Seed TensorFlow's global random generator before any layer is created so that
# weight initialisation (and the rest of the run) is repeatable after a kernel
# restart. The shuffle and the train/validation split already use seed 42, but
# without this line the model itself started from different weights every run.
# NOTE: some GPU kernels may still introduce small run-to-run differences.
tf.random.set_seed(42)

# Trainable embedding: one 128-dimensional vector per token id produced by the
# text vectorizer (vocabulary capped at `max_vocab_length`).
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             input_length=max_length,
                             name="embedding")

# The model takes raw strings: vectorization happens inside the graph, so
# callers pass text directly to `fit` / `predict`.
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)                       # strings -> integer token ids
x = embedding(x)                                  # token ids -> dense vectors
x = layers.Bidirectional(layers.LSTM(64))(x)      # read the sequence in both directions
outputs = layers.Dense(1, activation="sigmoid")(x)  # single probability output
model = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [24]:
# One sigmoid output unit, hence binary cross-entropy; Adam is created with its
# default arguments and accuracy is tracked during training.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [26]:
# Train for 15 epochs on the raw training sentences, evaluating on the held-out
# validation split after each epoch. The returned History object is kept in
# `model_history`.
model_history = model.fit(
    train_sentences,
    train_labels,
    validation_data=(val_sentences, val_labels),
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [70]:
# Score a single free-text query. The string is handed to the model as-is: the
# text vectorizer is the model's first layer, so no manual tokenising is needed
# (the previous stand-alone `text_vectorizer([text])` call produced a value that
# nothing read, and the commented-out debug print has been dropped).
text = "surgery"

# One input string -> one row holding the sigmoid output for that query.
model_pred_probs = model.predict([text])
model_pred_probs



array([[0.45843607]], dtype=float32)

In [78]:
# Persist the trained model under the relative path "Miner_Consensus"
# (the log output shows assets being written to a directory of that name).
model.save("Miner_Consensus")



INFO:tensorflow:Assets written to: Miner_Consensus\assets


INFO:tensorflow:Assets written to: Miner_Consensus\assets


In [80]:
# Round-trip check: reload the saved model from disk and score the same query
# (`text` comes from the earlier prediction cell). The output should match the
# in-memory model's prediction.
loaded_model = tf.keras.models.load_model("Miner_Consensus")
loaded_model.predict([text])



array([[0.45843607]], dtype=float32)