In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [5]:
# Load and preprocess data
input_file = "DiseaseAndSymptoms.csv"
data = pd.read_csv(input_file)
data.columns = [col.replace("_", " ") for col in data.columns]
data = data.apply(lambda x: x.str.replace("_", " ") if x.dtype == "object" else x)
data["Disease"] = data["Disease"].replace("Peptic ulcer diseae", "Peptic ulcer disease")
data["Disease"] = data["Disease"].replace("Dimorphic hemmorhoids(piles)", "Dimorphic hemorrhoids (piles)")

# Clean and combine symptoms
symptom_cols = [col for col in data.columns if "Symptom" in col]
data["Symptoms"] = data[symptom_cols].apply(
    lambda row: " ".join(sorted(set([s.strip() for s in row if pd.notna(s)]))), axis=1
)

# Verify data
print(f"Number of rows: {len(data)}")
disease_list = sorted(data["Disease"].unique())
num_classes = len(disease_list)
print(f"Number of classes: {num_classes}")
print("Class distribution:\n", data["Disease"].value_counts())

Number of rows: 4920
Number of classes: 41
Class distribution:
 Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemorrhoids (piles)              120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                        

In [6]:
# Split data (stratified)
X_train, X_val, y_train_labels, y_val_labels = train_test_split(
    data["Symptoms"], data["Disease"], test_size=0.2, random_state=42, stratify=data["Disease"]
)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

Training samples: 3936, Validation samples: 984


In [7]:
# Convert labels to categorical
y_train = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_train_labels], num_classes=num_classes)
y_val = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_val_labels], num_classes=num_classes)
print(f"Training labels shape: {y_train.shape}, Validation labels shape: {y_val.shape}")

Training labels shape: (3936, 41), Validation labels shape: (984, 41)


In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import KeyedVectors
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers

# Load BioWordVec embeddings
word2vec_path = r"C:\Users\ACER\Downloads\bio_embedding_extrinsic.bin"
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Load and preprocess data
data = pd.read_csv("DiseaseAndSymptoms.csv")
data.columns = [col.replace("_", " ") for col in data.columns]
data = data.apply(lambda x: x.str.replace("_", " ") if x.dtype == "object" else x)
symptom_cols = [col for col in data.columns if "Symptom" in col]
data["Symptoms"] = data[symptom_cols].apply(
    lambda row: " ".join(sorted(set([s.strip() for s in row if pd.notna(s)]))), axis=1
)

# Augment data with partial symptom combinations (balanced)
import random

def generate_partial_symptoms(symptom_text, min_symptoms=2, max_combinations=10):
    symptoms = symptom_text.split()
    partial_combinations = []
    num_symptoms = len(symptoms)
    
    # Generate combinations for each length
    for n in range(min_symptoms, num_symptoms + 1):
        for i in range(max_combinations):
            try:
                partial = " ".join(sorted(random.sample(symptoms, n)))
                partial_combinations.append(partial)
            except ValueError:
                break
    
    # If we have more than max_combinations, randomly sample to balance
    if len(partial_combinations) > max_combinations:
        partial_combinations = random.sample(partial_combinations, max_combinations)
    
    return partial_combinations

augmented_data = []
for idx, row in data.iterrows():
    symptom_text = row["Symptoms"]
    disease = row["Disease"]
    augmented_data.append((symptom_text, disease))
    partial_symptoms = generate_partial_symptoms(symptom_text, max_combinations=10)
    for partial in partial_symptoms:
        augmented_data.append((partial, disease))

augmented_df = pd.DataFrame(augmented_data, columns=["Symptoms", "Disease"])

# Split augmented data
X_train, X_val, y_train_labels, y_val_labels = train_test_split(
    augmented_df["Symptoms"], augmented_df["Disease"], test_size=0.2, random_state=42, stratify=augmented_df["Disease"]
)
print(f"Augmented training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# Tokenize symptoms
tokenizer = Tokenizer()
tokenizer.fit_on_texts(augmented_df["Symptoms"])
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# Pad sequences
max_len = max(len(seq) for seq in X_train_seq)
X_train_seq = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_val_seq = pad_sequences(X_val_seq, maxlen=max_len, padding="post")
print(f"Max sequence length: {max_len}")

# Convert labels to categorical
disease_list = sorted(data["Disease"].unique())
num_classes = len(disease_list)
y_train = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_train_labels], num_classes=num_classes)
y_val = tf.keras.utils.to_categorical([disease_list.index(d) for d in y_val_labels], num_classes=num_classes)
print(f"Training labels shape: {y_train.shape}, Validation labels shape: {y_val.shape}")

# Create embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = word_vectors.vector_size  # 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Build simpler model with regularization
inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True)(inputs)
x = GlobalAveragePooling1D()(x)
x = Dense(256, activation="relu", kernel_regularizer=regularizers.l2(0.01))(x)  # Add L2 regularization
x = Dropout(0.5)(x)  # Increase dropout to 0.5
x = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.5)(x)
outputs = Dense(num_classes, activation="softmax")(x)
model = Model(inputs, outputs)

# Compile with label smoothing for better calibration
model.compile(optimizer="adam", loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1), metrics=["accuracy"])
model.summary()

Augmented training samples: 43296, Validation samples: 10824
Max sequence length: 30
Training labels shape: (43296, 41), Validation labels shape: (10824, 41)




In [9]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Callbacks
checkpoint = ModelCheckpoint("biowordvec_diagnosis_model.keras", monitor="val_accuracy", save_best_only=True, mode="max", verbose=1)
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1)

# Train
history = model.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[checkpoint, early_stopping],
    verbose=1
)

Epoch 1/20
[1m1351/1353[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.1453 - loss: 3.8502
Epoch 1: val_accuracy improved from -inf to 0.66297, saving model to biowordvec_diagnosis_model.keras
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.1456 - loss: 3.8485 - val_accuracy: 0.6630 - val_loss: 2.2726
Epoch 2/20
[1m1351/1353[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.5861 - loss: 2.3431
Epoch 2: val_accuracy improved from 0.66297 to 0.84433, saving model to biowordvec_diagnosis_model.keras
[1m1353/1353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5862 - loss: 2.3428 - val_accuracy: 0.8443 - val_loss: 1.8712
Epoch 3/20
[1m1350/1353[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.7490 - loss: 2.0216
Epoch 3: val_accuracy improved from 0.84433 to 0.86789, saving model to biowordvec_diagnosis_model.keras
[1m1353/1353[0

In [10]:
import pickle

# Save the tokenizer for inference
with open("symptom_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Saved tokenizer to symptom_tokenizer.pkl")

Saved tokenizer to symptom_tokenizer.pkl
