In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
# Load and preprocess data
input_file = "DiseaseAndSymptoms.csv"
data = pd.read_csv(input_file)
data.columns = [col.replace("_", " ") for col in data.columns]
data = data.apply(lambda x: x.str.replace("_", " ") if x.dtype == "object" else x)
data["Disease"] = data["Disease"].replace("Peptic ulcer diseae", "Peptic ulcer disease")
data["Disease"] = data["Disease"].replace("Dimorphic hemmorhoids(piles)", "Dimorphic hemorrhoids (piles)")

# Clean and combine symptoms
symptom_cols = [col for col in data.columns if "Symptom" in col]
data["Symptoms"] = data[symptom_cols].apply(
    lambda row: " ".join(sorted(set([s.strip() for s in row if pd.notna(s)]))), axis=1
)

# Verify data
print(f"Number of rows: {len(data)}")
disease_list = sorted(data["Disease"].unique())
num_classes = len(disease_list)
print(f"Number of classes: {num_classes}")
print("Class distribution:\n", data["Disease"].value_counts())

Number of rows: 4920
Number of classes: 41
Class distribution:
 Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemorrhoids (piles)              120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                        

In [3]:
# Split data (stratified)
X_train, X_val, y_train_labels, y_val_labels = train_test_split(
    data["Symptoms"], data["Disease"], test_size=0.2, random_state=42, stratify=data["Disease"]
)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

Training samples: 3936, Validation samples: 984


In [4]:
# Convert labels to categorical
y_train = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_train_labels], num_classes=num_classes)
y_val = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_val_labels], num_classes=num_classes)
print(f"Training labels shape: {y_train.shape}, Validation labels shape: {y_val.shape}")

Training labels shape: (3936, 41), Validation labels shape: (984, 41)


In [5]:
# Load BioWordVec extrinsic embeddings
word2vec_path = r"C:\Users\ACER\Downloads\bio_embedding_extrinsic.bin"  # Update with your path
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Function to get average embedding
def get_symptom_embedding(text, wv):
    words = text.split()
    vectors = [wv[word] for word in words if word in wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(wv.vector_size)

# Generate embeddings
X_train_emb = np.array([get_symptom_embedding(text, word_vectors) for text in X_train])
X_val_emb = np.array([get_symptom_embedding(text, word_vectors) for text in X_val])

# Verify shapes
print(f"Training embeddings shape: {X_train_emb.shape}")  # (3936, 200)
print(f"Validation embeddings shape: {X_val_emb.shape}")  # (984, 200)

# Build classifier
inputs = Input(shape=(word_vectors.vector_size,))  # 200D
x = Dense(256, activation="relu")(inputs)
x = Dropout(0.3)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.3)(x)
outputs = Dense(num_classes, activation="softmax")(x)
model = Model(inputs, outputs)

# Compile
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Training embeddings shape: (3936, 200)
Validation embeddings shape: (984, 200)


In [6]:
# Callbacks
checkpoint = ModelCheckpoint("biowordvec_diagnosis_model.h5", monitor="val_accuracy", save_best_only=True, mode="max", verbose=1)
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1)

# Train
history = model.fit(
    X_train_emb, y_train,
    validation_data=(X_val_emb, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[checkpoint, early_stopping],
    verbose=1
)

Epoch 1/20
[1m119/123[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.1994 - loss: 3.2893
Epoch 1: val_accuracy improved from -inf to 0.91362, saving model to biowordvec_diagnosis_model.h5




[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.2071 - loss: 3.2635 - val_accuracy: 0.9136 - val_loss: 0.9805
Epoch 2/20
[1m118/123[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8039 - loss: 0.9192
Epoch 2: val_accuracy improved from 0.91362 to 0.99695, saving model to biowordvec_diagnosis_model.h5




[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8064 - loss: 0.9084 - val_accuracy: 0.9970 - val_loss: 0.1727
Epoch 3/20
[1m120/123[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.9466 - loss: 0.2952
Epoch 3: val_accuracy improved from 0.99695 to 0.99898, saving model to biowordvec_diagnosis_model.h5




[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9468 - loss: 0.2938 - val_accuracy: 0.9990 - val_loss: 0.0629
Epoch 4/20
[1m122/123[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.9765 - loss: 0.1468
Epoch 4: val_accuracy improved from 0.99898 to 1.00000, saving model to biowordvec_diagnosis_model.h5




[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9765 - loss: 0.1466 - val_accuracy: 1.0000 - val_loss: 0.0298
Epoch 5/20
[1m 94/123[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.9866 - loss: 0.0927
Epoch 5: val_accuracy did not improve from 1.00000
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9869 - loss: 0.0901 - val_accuracy: 1.0000 - val_loss: 0.0162
Epoch 6/20
[1m119/123[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.9873 - loss: 0.0661
Epoch 6: val_accuracy did not improve from 1.00000
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9875 - loss: 0.0658 - val_accuracy: 1.0000 - val_loss: 0.0084
Epoch 7/20
[1m113/123[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 2ms/step - accuracy: 0.9928 - loss: 0.0460
Epoch 7: val_accuracy did not improve from 1.00000
[1m123/123[0m [32m━━━━━━━━━━━━━

In [7]:
# Generate embeddings
symptom_texts = data["Symptoms"].tolist()
embeddings = np.array([get_symptom_embedding(text, word_vectors) for text in symptom_texts])

# Save embeddings to .npy file
embedding_dict = {"symptoms": symptom_texts, "embeddings": embeddings}
npy_path = "symptom_embeddings.npy"
np.save(npy_path, embedding_dict)
print(f"Saved symptom embeddings to {npy_path}")

Saved symptom embeddings to symptom_embeddings.npy
