In [19]:
# ## Cell 1: Import Libraries and Define Paths
#
# In this cell, we import the necessary libraries and define the file paths for the dataset, BioWordVec embeddings, and output files.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pickle
import os
from gensim.models import KeyedVectors

# Paths to files (adjust as needed)
DATA_PATH = "DiseaseAndSymptoms.csv"
# Path to BioWordVec embeddings in .bin format. The default below is an absolute
# path on one specific machine, so it can be overridden by setting the
# BIOWORDVEC_PATH environment variable instead of editing this cell.
BIOWORDVEC_PATH = os.environ.get(
    "BIOWORDVEC_PATH",
    r"C:\Users\ACER\Downloads\BioWordVec_PubMed_MIMICIII_d200.vec.bin",
)
MODEL_PATH = "biowordvec_diagnosis_model.keras"
TOKENIZER_PATH = "symptom_tokenizer.pkl"
LABEL_ENCODER_PATH = "label_encoder.pkl"

In [8]:
# ## Cell 2: Load and Preprocess the Dataset
#
# This cell loads the dataset (`DiseaseAndSymptoms.csv`), preprocesses it, and splits it into train/test sets.

def load_and_preprocess_data(data_path, test_size=0.2, random_state=42):
    """
    Load the dataset, preprocess symptoms, and split into train/test sets.

    Steps: read the CSV, normalize two disease-name spellings, replace
    underscores with spaces in column names and in object-dtype columns, join
    each row's distinct symptoms (sorted) into one space-separated string, drop
    rows without symptoms, de-duplicate on (Symptoms, Disease), then split.

    Args:
        data_path: Path of the CSV file. It must contain a "Disease" column and
            the symptom columns are those whose name contains "Symptom".
        test_size: Forwarded to train_test_split (default 0.2).
        random_state: Seed forwarded to train_test_split (default 42).

    Returns:
        (train_data, test_data): two DataFrames restricted to the "Disease"
        and "Symptoms" columns.
    """
    def _join_symptoms(row):
        # Strip every non-missing cell and de-duplicate; cells that are empty
        # after stripping are discarded so they cannot inject a stray leading
        # space into the joined string (which would also defeat deduplication).
        cleaned = {s.strip() for s in row if pd.notna(s)}
        cleaned.discard("")
        return " ".join(sorted(cleaned))

    data = pd.read_csv(data_path)
    # Normalize two disease-name spellings
    data["Disease"] = data["Disease"].replace("Peptic ulcer diseae", "Peptic ulcer disease")
    data["Disease"] = data["Disease"].replace("Dimorphic hemmorhoids(piles)", "Dimorphic hemorrhoids (piles)")

    # Replace underscores with spaces in column names and data
    data.columns = [col.replace("_", " ") for col in data.columns]
    data = data.apply(lambda x: x.str.replace("_", " ") if x.dtype == "object" else x)

    # Combine all symptoms into a single string per row
    symptom_cols = [col for col in data.columns if "Symptom" in col]
    data["Symptoms"] = data[symptom_cols].apply(_join_symptoms, axis=1)

    # Drop rows with empty symptoms
    data = data[data["Symptoms"].str.strip() != ""]

    data = data.drop_duplicates(subset=["Symptoms", "Disease"])
    print(f"Dataset size after deduplication: {len(data)}")

    # Split into train and test sets (before augmentation)
    # NOTE(review): the split is not stratified by disease; confirm that every
    # disease in the test split also occurs in the training split.
    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    return train_data[["Disease", "Symptoms"]], test_data[["Disease", "Symptoms"]]

# Run the loading/preprocessing step on DATA_PATH and report the size of each split.
print("Loading and preprocessing data...")
train_data, test_data = load_and_preprocess_data(DATA_PATH)
print(f"Training dataset size: {len(train_data)}, Test dataset size: {len(test_data)}")

Loading and preprocessing data...
Dataset size after deduplication: 304
Training dataset size: 243, Test dataset size: 61


In [7]:
# Sanity check: count the symptom strings that occur in both the training and
# the test split, and show up to five of them if any exist.
train_symptoms = set(train_data["Symptoms"])
test_symptoms = set(test_data["Symptoms"])
overlap = train_symptoms & test_symptoms
print(f"Number of overlapping symptom strings between train and test: {len(overlap)}")
if len(overlap) > 0:
    print("Sample overlapping symptoms:", list(overlap)[:5])

Number of overlapping symptom strings between train and test: 0


In [13]:
def augment_data(data, samples_per_disease=20, random_state=None):
    """
    Augment the dataset by creating synthetic symptom combinations with co-occurrence patterns.

    For every disease, the whitespace-separated tokens of its "Symptoms" strings
    form a candidate pool. Each synthetic sample starts from a random token and
    is extended by following per-disease "token A is followed by token B" counts
    (normalized to probabilities); when the current token has no follower
    statistics, a not-yet-selected token is drawn uniformly instead.

    Args:
        data: DataFrame with "Disease" and "Symptoms" columns.
        samples_per_disease: Number of synthetic rows generated per disease.
        random_state: Optional seed. When given, a private
            np.random.RandomState is used; when None, NumPy's global random
            state is used (so a prior np.random.seed() call still applies).

    Returns:
        A new DataFrame containing the rows of `data` followed by the synthetic
        rows, with a fresh integer index. `data` itself is not modified.
    """
    # np.random (module) and RandomState expose the same randint/choice API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Collect synthetic rows in a list and concatenate once at the end;
    # concatenating inside the loop copies the whole frame for every new row.
    records = []

    for disease in data["Disease"].unique():
        disease_rows = data[data["Disease"] == disease]
        symptom_lists = [symptoms.split() for symptoms in disease_rows["Symptoms"]]

        # Sorted so the candidate order does not depend on set iteration order.
        all_symptoms = sorted({s for symptom_list in symptom_lists for s in symptom_list})
        if not all_symptoms:
            # Nothing to sample from for this disease.
            continue

        # Calculate symptom co-occurrence (simple frequency-based probability):
        # for each token, count the tokens that appear after it in the same row.
        symptom_counts = {}
        for symptom_list in symptom_lists:
            for i, s1 in enumerate(symptom_list):
                followers = symptom_counts.setdefault(s1, {})
                for s2 in symptom_list[i + 1:]:
                    followers[s2] = followers.get(s2, 0) + 1

        # Normalize to get probabilities
        symptom_probs = {}
        for s1, followers in symptom_counts.items():
            total = sum(followers.values())
            if total > 0:
                symptom_probs[s1] = {s2: count / total for s2, count in followers.items()}

        # A sample targets between 2 and 5 tokens, capped by the pool size.
        max_symptoms = min(len(all_symptoms), 5)

        # Generate synthetic samples
        for _ in range(samples_per_disease):
            if max_symptoms < 2:
                # Single-token pool: randint(2, 2) would raise ValueError.
                num_symptoms = max_symptoms
            else:
                num_symptoms = rng.randint(2, max_symptoms + 1)

            # Start with a random symptom
            current_symptom = rng.choice(all_symptoms)
            selected_symptoms = [current_symptom]

            # Add symptoms based on co-occurrence probabilities
            for _step in range(num_symptoms - 1):
                next_symptom_probs = symptom_probs.get(current_symptom)
                if next_symptom_probs:
                    next_symptom = rng.choice(
                        list(next_symptom_probs.keys()),
                        p=list(next_symptom_probs.values())
                    )
                else:
                    # Fallback to random selection if no co-occurrence data
                    remaining_symptoms = [s for s in all_symptoms if s not in selected_symptoms]
                    if not remaining_symptoms:
                        # State cannot change any more, so later steps would be no-ops.
                        break
                    next_symptom = rng.choice(remaining_symptoms)
                selected_symptoms.append(next_symptom)
                current_symptom = next_symptom

            # The walk may revisit a token; duplicates are collapsed here.
            records.append({
                "Disease": disease,
                "Symptoms": " ".join(sorted(set(selected_symptoms)))
            })

    if not records:
        return data.copy()

    synthetic = pd.DataFrame(records, columns=["Disease", "Symptoms"])
    return pd.concat([data, synthetic], ignore_index=True)
print("Augmenting training data...")
# Seed NumPy's global random state, which augment_data draws from by default,
# so that re-running this cell draws the same random sequence.
np.random.seed(42)
augmented_train_data = augment_data(train_data, samples_per_disease=20)
print(f"Original training dataset size: {len(train_data)}, Augmented training dataset size: {len(augmented_train_data)}")

Augmenting training data...
Original training dataset size: 243, Augmented training dataset size: 1063


In [14]:
# ## Cell 4: Prepare Data for Training
#
# This cell tokenizes the symptoms, encodes the diseases, and prepares the training, validation, and test sets.

def prepare_data(train_data, test_data, max_len=20):
    """
    Turn the symptom/disease frames into arrays ready for model training.

    The tokenizer and the label encoder are fitted on `train_data` only and then
    applied to `test_data`. The encoded training data is further split 80/20
    (random_state=42) into training and validation parts.

    Args:
        train_data: DataFrame with "Symptoms" and "Disease" columns.
        test_data: DataFrame with the same two columns.
        max_len: Length every token sequence is padded (at the end) or cut to.

    Returns:
        A 10-tuple: X_train, X_val, y_train, y_val, test sequences, test labels,
        the fitted tokenizer, the fitted label encoder, the vocabulary size
        (len(word_index) + 1), and a dict mapping class index to the square
        root of its "balanced" class weight.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_data["Symptoms"])

    def to_padded(texts):
        # Texts -> integer sequences -> fixed-length, post-padded array
        return pad_sequences(
            tokenizer.texts_to_sequences(texts), maxlen=max_len, padding="post"
        )

    train_sequences = to_padded(train_data["Symptoms"])
    test_sequences = to_padded(test_data["Symptoms"])

    # Integer-encode the disease names
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_data["Disease"])
    test_labels = label_encoder.transform(test_data["Disease"])

    # Hold out part of the (augmented) training data for validation
    X_train, X_val, y_train, y_val = train_test_split(
        train_sequences, train_labels, test_size=0.2, random_state=42
    )

    # Balanced class weights over all training labels, softened with a square root
    balanced_weights = compute_class_weight(
        "balanced", classes=np.unique(train_labels), y=train_labels
    )
    class_weight_dict = {
        class_idx: np.sqrt(weight) for class_idx, weight in enumerate(balanced_weights)
    }

    vocab_size = len(tokenizer.word_index) + 1
    return (
        X_train, X_val, y_train, y_val,
        test_sequences, test_labels,
        tokenizer, label_encoder,
        vocab_size, class_weight_dict,
    )

# Build the train/validation/test arrays from the augmented training data and
# the untouched test data, then report their sizes.
print("Preparing data for training...")
(
    X_train, X_val, y_train, y_val,
    X_test, y_test,
    tokenizer, label_encoder,
    vocab_size, class_weight_dict,
) = prepare_data(augmented_train_data, test_data)
num_classes = len(label_encoder.classes_)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}, Test samples: {len(X_test)}")
print(f"Vocabulary size: {vocab_size}, Number of classes: {num_classes}")

Preparing data for training...
Training samples: 850, Validation samples: 213, Test samples: 61
Vocabulary size: 209, Number of classes: 41


In [15]:
# ## Cell 5: Load BioWordVec Embeddings
#
# This cell loads the BioWordVec embeddings from the .bin file using gensim's KeyedVectors.

def load_biowordvec_embeddings(biowordvec_path):
    """
    Read BioWordVec vectors stored in binary word2vec format.

    Args:
        biowordvec_path: Location of the .bin embeddings file.

    Returns:
        The object returned by gensim's KeyedVectors.load_word2vec_format.
    """
    print("Loading BioWordVec embeddings from .bin file...")
    return KeyedVectors.load_word2vec_format(biowordvec_path, binary=True)

# Load the embeddings from BIOWORDVEC_PATH and report how many vectors were read.
print("Loading BioWordVec embeddings...")
embeddings_index = load_biowordvec_embeddings(BIOWORDVEC_PATH)
print(f"Loaded {len(embeddings_index)} word vectors.")

Loading BioWordVec embeddings...
Loading BioWordVec embeddings from .bin file...
Loaded 16545452 word vectors.


In [16]:
# ## Cell 6: Create Embedding Matrix
#
# This cell creates an embedding matrix using the BioWordVec embeddings for the tokenized vocabulary.

def create_embedding_matrix(tokenizer, embeddings_index, vocab_size, embedding_dim=200):
    """
    Build the (vocab_size, embedding_dim) weight matrix for the embedding layer.

    Row i holds the pretrained vector of the word whose tokenizer index is i.
    Rows stay all-zero for index 0, for words missing from `embeddings_index`,
    and indices at or beyond `vocab_size` are skipped.

    Args:
        tokenizer: Object exposing a `word_index` mapping of word -> integer index.
        embeddings_index: Mapping-like lookup of word -> vector; a missing word
            must raise KeyError.
        vocab_size: Number of rows of the matrix.
        embedding_dim: Number of columns of the matrix.

    Returns:
        A NumPy array of shape (vocab_size, embedding_dim).
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    in_range = (
        (token, row) for token, row in tokenizer.word_index.items() if row < vocab_size
    )
    for token, row in in_range:
        try:
            vector = embeddings_index[token]
        except KeyError:
            # No pretrained vector for this word: keep the zero row
            continue
        matrix[row] = vector
    return matrix

# Build the embedding matrix for the fitted tokenizer's vocabulary and report its shape.
print("Creating embedding matrix...")
embedding_matrix = create_embedding_matrix(tokenizer, embeddings_index, vocab_size)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Creating embedding matrix...
Embedding matrix shape: (209, 200)


In [20]:
# ## Cell 7: Build and Train the Model
#
# This cell builds a small dense (feed-forward) classifier on top of the BioWordVec embeddings and trains it on the augmented dataset.

def build_and_train_model(X_train, X_val, y_train, y_val, vocab_size, embedding_matrix, num_classes, class_weight_dict, max_len=20, embedding_dim=200):
    """
    Build and train a small dense classifier on top of BioWordVec embeddings.

    Architecture: Embedding (initialized from `embedding_matrix`, trainable) ->
    Flatten -> Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu) -> Dropout(0.5)
    -> Dense(num_classes, softmax). Trained with Adam (learning rate 1e-4) and
    sparse categorical cross-entropy for up to 50 epochs, with early stopping
    on validation loss.

    Args:
        X_train, y_train: Training sequences and integer labels.
        X_val, y_val: Validation sequences and integer labels.
        vocab_size: Input dimension of the embedding layer.
        embedding_matrix: Initial weights of the embedding layer.
        num_classes: Number of output classes.
        class_weight_dict: Per-class weights passed to `fit`.
        max_len: Not used by this function; kept so existing callers keep working.
        embedding_dim: Output dimension of the embedding layer.

    Returns:
        (model, history): the trained model and the object returned by `fit`.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True),  # Fine-tune embeddings
        Flatten(),
        Dense(64, activation="relu"),
        Dropout(0.5),
        Dense(32, activation="relu"),
        Dropout(0.5),
        Dense(num_classes, activation="softmax")
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.summary()

    # Single early-stopping callback. Previously a patience=5 instance was
    # created but never used while a separate patience=10 instance was passed
    # to fit(); only the effective configuration (patience=10) is kept.
    early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

    # Train the model with class weights
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping],
        class_weight=class_weight_dict,
        verbose=1
    )

    return model, history

# Train on the prepared arrays; max_len and embedding_dim are left at their defaults.
print("Building and training model...")
model, history = build_and_train_model(
    X_train, X_val, y_train, y_val, vocab_size, embedding_matrix, num_classes, class_weight_dict
)

Building and training model...


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.0166 - loss: 3.7647 - val_accuracy: 0.0423 - val_loss: 3.6919
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0328 - loss: 3.7326 - val_accuracy: 0.0563 - val_loss: 3.6778
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0367 - loss: 3.6801 - val_accuracy: 0.0704 - val_loss: 3.6656
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0513 - loss: 3.6495 - val_accuracy: 0.0845 - val_loss: 3.6415
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0570 - loss: 3.6371 - val_accuracy: 0.0986 - val_loss: 3.6220
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0556 - loss: 3.6195 - val_accuracy: 0.1174 - val_loss: 3.5948
Epoch 7/50
[1m27/27[0m [32m━━━━━━━━━

In [18]:
# ## Cell 8: Evaluate on Test Set
#
# This cell evaluates the model on the separate test set.

# Score the trained model on the held-out test arrays; evaluate() returns the
# loss followed by the compiled metric (accuracy).
print("Evaluating on test set...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Evaluating on test set...
Test Loss: 1.5290, Test Accuracy: 0.7213


In [24]:
# ## Cell 9: Save the Model, Tokenizer, and Label Encoder
#
# This cell saves the trained model, tokenizer, and label encoder for use in the inference script (`main.py`).

print("Saving model, tokenizer, and label encoder...")
model.save(MODEL_PATH)
# Pickle the tokenizer and the label encoder, each to its own file
for artifact, artifact_path in (
    (tokenizer, TOKENIZER_PATH),
    (label_encoder, LABEL_ENCODER_PATH),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Training complete!")

Saving model, tokenizer, and label encoder...
Training complete!
