In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
# Load and preprocess data
input_file = "DiseaseAndSymptoms.csv"
data = pd.read_csv(input_file)
data.columns = [col.replace("_", " ") for col in data.columns]
data = data.apply(lambda x: x.str.replace("_", " ") if x.dtype == "object" else x)
data["Disease"] = data["Disease"].replace("Peptic ulcer diseae", "Peptic ulcer disease")
data["Disease"] = data["Disease"].replace("Dimorphic hemmorhoids(piles)", "Dimorphic hemorrhoids (piles)")

# Clean and combine symptoms
symptom_cols = [col for col in data.columns if "Symptom" in col]
data["Symptoms"] = data[symptom_cols].apply(
    lambda row: " ".join(sorted(set([s.strip() for s in row if pd.notna(s)]))), axis=1
)

# Verify data
print(f"Number of rows: {len(data)}")
disease_list = sorted(data["Disease"].unique())
num_classes = len(disease_list)
print(f"Number of classes: {num_classes}")
print("Class distribution:\n", data["Disease"].value_counts())

Number of rows: 4920
Number of classes: 41
Class distribution:
 Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemorrhoids (piles)              120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                        

In [3]:
# Split data (stratified)
X_train, X_val, y_train_labels, y_val_labels = train_test_split(
    data["Symptoms"], data["Disease"], test_size=0.2, random_state=42, stratify=data["Disease"]
)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

Training samples: 3936, Validation samples: 984


In [4]:
# Convert labels to categorical
y_train = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_train_labels], num_classes=num_classes)
y_val = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_val_labels], num_classes=num_classes)
print(f"Training labels shape: {y_train.shape}, Validation labels shape: {y_val.shape}")

Training labels shape: (3936, 41), Validation labels shape: (984, 41)


In [8]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import KeyedVectors

# Load BioWordVec embeddings (already loaded in Cell 5)
word2vec_path = r"C:\Users\ACER\Downloads\bio_embedding_extrinsic.bin"
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Function to get symptom embedding (already defined in Cell 5)
def get_symptom_embedding(text, wv):
    words = text.split()
    vectors = [wv[word] for word in words if word in wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(wv.vector_size)

# Function to generate partial symptom combinations
def generate_partial_symptoms(symptom_text, min_symptoms=2):
    symptoms = symptom_text.split()
    partial_combinations = []
    for n in range(min_symptoms, len(symptoms) + 1):
        for i in range(5):  # Generate 5 partial combinations per length
            partial = " ".join(sorted(random.sample(symptoms, n)))
            partial_combinations.append(partial)
    return partial_combinations

# Augment training data
augmented_data = []
for idx, row in data.iterrows():
    symptom_text = row["Symptoms"]
    disease = row["Disease"]
    # Add the full symptom combination
    augmented_data.append((symptom_text, disease))
    # Add partial symptom combinations
    partial_symptoms = generate_partial_symptoms(symptom_text)
    for partial in partial_symptoms:
        augmented_data.append((partial, disease))

# Create augmented DataFrame
augmented_df = pd.DataFrame(augmented_data, columns=["Symptoms", "Disease"])

# Split augmented data
from sklearn.model_selection import train_test_split
X_train_aug, X_val_aug, y_train_labels_aug, y_val_labels_aug = train_test_split(
    augmented_df["Symptoms"], augmented_df["Disease"], test_size=0.2, random_state=42, stratify=augmented_df["Disease"]
)
print(f"Augmented training samples: {len(X_train_aug)}, Validation samples: {len(X_val_aug)}")

# Generate embeddings for augmented data
X_train_emb_aug = np.array([get_symptom_embedding(text, word_vectors) for text in X_train_aug])
X_val_emb_aug = np.array([get_symptom_embedding(text, word_vectors) for text in X_val_aug])
print(f"Augmented training embeddings shape: {X_train_emb_aug.shape}")
print(f"Augmented validation embeddings shape: {X_val_emb_aug.shape}")

# Convert labels to categorical
disease_list = sorted(data["Disease"].unique())
num_classes = len(disease_list)
y_train_aug = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_train_labels_aug], num_classes=num_classes)
y_val_aug = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_val_labels_aug], num_classes=num_classes)
print(f"Augmented training labels shape: {y_train_aug.shape}, Validation labels shape: {y_val_aug.shape}")

Augmented training samples: 256584, Validation samples: 64146
Augmented training embeddings shape: (256584, 200)
Augmented validation embeddings shape: (64146, 200)
Augmented training labels shape: (256584, 41), Validation labels shape: (64146, 41)


In [9]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Callbacks
checkpoint = ModelCheckpoint("biowordvec_diagnosis_model.keras", monitor="val_accuracy", save_best_only=True, mode="max", verbose=1)
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1)

# Train
history = model.fit(
    X_train_emb_aug, y_train_aug,
    validation_data=(X_val_emb_aug, y_val_aug),
    epochs=20,
    batch_size=32,
    callbacks=[checkpoint, early_stopping],
    verbose=1
)

Epoch 1/20
[1m8019/8019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9173 - loss: 0.3061
Epoch 1: val_accuracy improved from -inf to 0.95686, saving model to biowordvec_diagnosis_model.keras
[1m8019/8019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.9173 - loss: 0.3061 - val_accuracy: 0.9569 - val_loss: 0.1270
Epoch 2/20
[1m8009/8019[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.9461 - loss: 0.1744
Epoch 2: val_accuracy did not improve from 0.95686
[1m8019/8019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9461 - loss: 0.1743 - val_accuracy: 0.9564 - val_loss: 0.1234
Epoch 3/20
[1m8019/8019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9492 - loss: 0.1552
Epoch 3: val_accuracy improved from 0.95686 to 0.95736, saving model to biowordvec_diagnosis_model.keras
[1m8019/8019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m

In [None]:
# Generate embeddings
# symptom_texts = data["Symptoms"].tolist()
# embeddings = np.array([get_symptom_embedding(text, word_vectors) for text in symptom_texts])

# # Save embeddings to .npy file
# embedding_dict = {"symptoms": symptom_texts, "embeddings": embeddings}
# npy_path = "symptom_embeddings.npy"
# np.save(npy_path, embedding_dict)
# print(f"Saved symptom embeddings to {npy_path}")

Saved symptom embeddings to symptom_embeddings.npy
