In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import pickle
import os

# Paths to files (adjust as needed)
# All four paths are relative, so they resolve against the kernel's working directory.
# Input: CSV holding a "Disease" column plus one or more "Symptom*" columns.
DATA_PATH = "DiseaseAndSymptoms.csv"
# Input: pre-trained BioWordVec word vectors.
# NOTE(review): the .txt extension suggests a text-format file; confirm it matches
# the format the embedding loader expects.
BIOWORDVEC_PATH = "BioWordVec_PubMed_MIMICIII_d200.txt"  # Path to BioWordVec embeddings
# Output: trained Keras model written by the final cell.
MODEL_PATH = "biowordvec_diagnosis_model.keras"
# Output: pickled Keras Tokenizer written by the final cell.
TOKENIZER_PATH = "symptom_tokenizer.pkl"

In [None]:
# ## Cell 2: Load and Preprocess the Dataset
#
# This cell loads the dataset (`DiseaseAndSymptoms.csv`) and preprocesses it by combining symptoms into a single string per row and fixing disease name typos.

def load_and_preprocess_data(data_path):
    """
    Load the disease/symptom CSV and collapse each row's symptoms into one string.

    Steps:
      1. Read the CSV at ``data_path``.
      2. Replace underscores with spaces in the column names, the ``Disease``
         values and every symptom value.
      3. Fix two known misspellings in the ``Disease`` column.
      4. Build a ``Symptoms`` column: the row's distinct, stripped, non-blank
         symptom strings, sorted and joined with single spaces.
      5. Drop rows that end up with no symptoms at all.

    Args:
        data_path: Path (or anything ``pd.read_csv`` accepts) of a CSV with a
            ``Disease`` column and one or more columns whose name contains
            ``"Symptom"``.

    Returns:
        DataFrame with exactly two columns, ``Disease`` and ``Symptoms``. The
        index is the original row index, so it has gaps where rows were dropped.

    Note:
        Symptoms are joined with spaces, so the boundary between multi-word
        symptoms (e.g. "skin rash") is not recoverable from ``Symptoms``.
    """
    # Known misspellings in the source data, keyed by their space-normalised form.
    disease_typo_fixes = {
        "Peptic ulcer diseae": "Peptic ulcer disease",
        "Dimorphic hemmorhoids(piles)": "Dimorphic hemorrhoids (piles)",
    }

    def _underscores_to_spaces(value):
        # Work per element instead of gating on ``dtype == "object"``: that gate
        # silently skips columns pandas loaded with a dedicated string dtype.
        # Non-strings (NaN for missing cells) pass through untouched.
        return value.replace("_", " ") if isinstance(value, str) else value

    def _combine_symptoms(row):
        # Distinct, stripped symptom strings of one row; missing cells are skipped.
        symptoms = {_underscores_to_spaces(s).strip() for s in row if isinstance(s, str)}
        # A whitespace-only cell would otherwise contribute an empty token and
        # leave a stray leading space in the joined string.
        symptoms.discard("")
        return " ".join(sorted(symptoms))

    data = pd.read_csv(data_path)

    # Replace underscores with spaces in column names
    data.columns = [col.replace("_", " ") for col in data.columns]

    # Normalise underscores *before* fixing typos so that both the
    # "Peptic ulcer diseae" and "Peptic_ulcer_diseae" spellings are corrected.
    data["Disease"] = data["Disease"].map(_underscores_to_spaces).replace(disease_typo_fixes)

    # Combine all symptoms into a single string per row
    symptom_cols = [col for col in data.columns if "Symptom" in col]
    data["Symptoms"] = data[symptom_cols].apply(_combine_symptoms, axis=1)

    # Drop rows with empty symptoms
    data = data[data["Symptoms"] != ""]

    return data[["Disease", "Symptoms"]]

# Run the loader on the configured CSV; `data` (columns "Disease" and "Symptoms")
# is the input to the augmentation cell below.
print("Loading and preprocessing data...")
data = load_and_preprocess_data(DATA_PATH)
# Row count after rows without any symptoms were dropped by the loader.
print(f"Loaded dataset with {len(data)} rows.")

In [None]:
# ## Cell 3: Augment the Dataset
#
# This cell augments the dataset by creating synthetic symptom combinations for each disease to increase data diversity.

def augment_data(data, samples_per_disease=5, seed=None):
    """
    Augment the dataset by creating synthetic symptom combinations.

    For every distinct disease, the whitespace-separated tokens of all its
    ``Symptoms`` strings are pooled. ``samples_per_disease`` synthetic rows are
    then generated, each made of 2-5 distinct tokens drawn from that pool
    (fewer when the pool itself is smaller), sorted and joined with spaces.

    Args:
        data: DataFrame with string columns ``Disease`` and ``Symptoms``.
            It is not modified.
        samples_per_disease: Number of synthetic rows to create per disease.
        seed: Optional integer seed. ``None`` (the default) draws from NumPy's
            global random state, as before; an integer makes the output
            reproducible.

    Returns:
        A new DataFrame holding the original rows followed by the synthetic
        rows, with a fresh 0..n-1 index. If no synthetic row could be created,
        a plain copy of ``data`` is returned.

    Note:
        Tokens are single words because the pool is built with ``str.split()``;
        a multi-word symptom such as "skin rash" is sampled as separate words.
    """
    # Keep using the global NumPy state unless the caller asks for determinism.
    rng = np.random if seed is None else np.random.RandomState(seed)

    synthetic_rows = []
    for disease in data["Disease"].unique():
        disease_symptoms = data.loc[data["Disease"] == disease, "Symptoms"]
        # Sorted so the pool order (and hence a seeded draw) does not depend on
        # set iteration order, which varies with string hash randomisation.
        token_pool = sorted({token for symptoms in disease_symptoms for token in symptoms.split()})
        if not token_pool:
            # Nothing to sample from (e.g. a missing disease label matches no rows).
            continue

        for _ in range(samples_per_disease):
            # Randomly select 2-5 symptoms, capped at the pool size: sampling
            # without replacement cannot return more items than the pool holds.
            num_symptoms = min(rng.randint(2, 6), len(token_pool))
            selected = rng.choice(token_pool, size=num_symptoms, replace=False)
            synthetic_rows.append({
                "Disease": disease,
                "Symptoms": " ".join(sorted(selected)),
            })

    if not synthetic_rows:
        return data.copy()

    # Build the synthetic frame once; concatenating inside the loop re-copies
    # the whole accumulated frame on every iteration.
    synthetic = pd.DataFrame(synthetic_rows, columns=["Disease", "Symptoms"])
    return pd.concat([data, synthetic], ignore_index=True)

# Append synthetic rows to the loaded data; `augmented_data` is what the
# tokenisation/split cell consumes. `data` itself is left as loaded.
print("Augmenting data...")
augmented_data = augment_data(data)
# Side-by-side sizes show how many synthetic rows were added.
print(f"Original dataset size: {len(data)}, Augmented dataset size: {len(augmented_data)}")

In [None]:
# ## Cell 4: Prepare Data for Training
#
# This cell tokenizes the symptoms, encodes the diseases, and splits the data into training and validation sets.

def prepare_data(data, max_len=20):
    """
    Turn the (Disease, Symptoms) frame into padded sequences, integer labels and a split.

    Args:
        data: DataFrame with a ``Symptoms`` text column and a ``Disease`` label column.
        max_len: Length every tokenised symptom sequence is padded (post) to.

    Returns:
        A 7-tuple ``(X_train, X_val, y_train, y_val, tokenizer, label_encoder,
        vocab_size)`` where ``vocab_size`` is ``len(tokenizer.word_index) + 1``.
        The split is 80/20 with ``random_state=42``.
    """
    symptom_texts = data["Symptoms"]

    # Integer-encode the disease names.
    encoder = LabelEncoder()
    encoded_diseases = encoder.fit_transform(data["Disease"])

    # Fit the tokenizer on the symptom texts, then map each text to a
    # fixed-length, post-padded index sequence.
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(symptom_texts)
    padded_sequences = pad_sequences(
        text_tokenizer.texts_to_sequences(symptom_texts),
        maxlen=max_len,
        padding="post",
    )

    # train_test_split yields [X_train, X_val, y_train, y_val] in that order.
    split_parts = train_test_split(
        padded_sequences, encoded_diseases, test_size=0.2, random_state=42
    )

    # +1 because tokenizer indices start at 1; index 0 is the padding value.
    vocabulary_size = len(text_tokenizer.word_index) + 1
    return (*split_parts, text_tokenizer, encoder, vocabulary_size)

# Tokenise, label-encode and split the augmented data. max_len is left at the
# prepare_data default (20).
print("Preparing data for training...")
X_train, X_val, y_train, y_val, tokenizer, label_encoder, vocab_size = prepare_data(augmented_data)
# One output unit per distinct disease label seen by the encoder.
num_classes = len(label_encoder.classes_)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")
print(f"Vocabulary size: {vocab_size}, Number of classes: {num_classes}")

In [None]:
# ## Cell 5: Load BioWordVec Embeddings
#
# This cell loads the pre-trained BioWordVec word vectors from the file at `BIOWORDVEC_PATH` using gensim's `KeyedVectors`.

def load_biowordvec_embeddings(biowordvec_path, binary=None):
    """
    Load BioWordVec embeddings stored in word2vec format using gensim.

    Args:
        biowordvec_path: Path to the word2vec-format embeddings file.
        binary: ``True`` for the binary word2vec format, ``False`` for the text
            format. ``None`` (the default) infers it from the file name: paths
            ending in ``.txt`` or ``.vec`` are read as text, anything else
            (e.g. ``.bin``) as binary.

    Returns:
        The gensim ``KeyedVectors`` object returned by
        ``KeyedVectors.load_word2vec_format``.
    """
    # Imported here because the notebook's import cell never brings
    # KeyedVectors into scope, which made this function raise NameError.
    from gensim.models import KeyedVectors

    if binary is None:
        # Reading a text-format file with binary=True fails, so follow the extension.
        binary = not str(biowordvec_path).lower().endswith((".txt", ".vec"))

    file_format = "binary" if binary else "text"
    print(f"Loading BioWordVec embeddings from {file_format} word2vec file...")
    embeddings = KeyedVectors.load_word2vec_format(biowordvec_path, binary=binary)
    return embeddings

# Load the pre-trained vectors once; `embeddings_index` is looked up by word
# when the embedding matrix is built in the next cell.
print("Loading BioWordVec embeddings...")
embeddings_index = load_biowordvec_embeddings(BIOWORDVEC_PATH)
# Relies on the loaded object supporting len() to report how many vectors it holds.
print(f"Loaded {len(embeddings_index)} word vectors.")

In [None]:
# ## Cell 6: Create Embedding Matrix
#
# This cell creates an embedding matrix using the BioWordVec embeddings for the tokenized vocabulary.

def create_embedding_matrix(tokenizer, embeddings_index, vocab_size, embedding_dim=200):
    """
    Build the (vocab_size, embedding_dim) weight matrix for the embedding layer.

    Row ``i`` receives the pre-trained vector of the word the tokenizer maps to
    index ``i``. Rows stay all-zero for words that ``embeddings_index`` does not
    know (lookup raises ``KeyError``) and for indices no word maps to; words
    whose index is ``>= vocab_size`` are ignored.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> int index.
        embeddings_index: Mapping-like object; ``embeddings_index[word]`` returns
            the word's vector or raises ``KeyError``.
        vocab_size: Number of rows in the matrix.
        embedding_dim: Number of columns in the matrix.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)``.
    """
    matrix = np.zeros((vocab_size, embedding_dim))

    # Only words whose index fits inside the matrix are considered.
    indexed_words = (
        (token, row) for token, row in tokenizer.word_index.items() if row < vocab_size
    )
    for token, row in indexed_words:
        try:
            matrix[row] = embeddings_index[token]
        except KeyError:
            # Word not in BioWordVec vocabulary, leave as zero vector
            continue

    return matrix

# Build the embedding-layer weights from the tokenizer vocabulary; embedding_dim
# is left at the create_embedding_matrix default (200).
print("Creating embedding matrix...")
embedding_matrix = create_embedding_matrix(tokenizer, embeddings_index, vocab_size)
# Expected to be (vocab_size, 200) given the default embedding_dim.
print(f"Embedding matrix shape: {embedding_matrix.shape}")

In [None]:
# ## Cell 7: Build and Train the Model
#
# This cell builds an LSTM model with BioWordVec embeddings and trains it on the augmented dataset.

def build_and_train_model(X_train, X_val, y_train, y_val, vocab_size, embedding_matrix, num_classes, max_len=20, embedding_dim=200, epochs=20, batch_size=32):
    """
    Build and train an LSTM classifier on top of frozen BioWordVec embeddings.

    Architecture: Embedding (frozen, initialised from ``embedding_matrix``) ->
    LSTM(128, returning sequences) -> Dropout(0.3) -> LSTM(64) -> Dropout(0.3)
    -> Dense(64, relu) -> Dropout(0.3) -> Dense(num_classes, softmax).
    Compiled with Adam and sparse categorical cross-entropy, so ``y_train`` /
    ``y_val`` are integer class indices.

    Args:
        X_train, X_val: Integer token-index sequences of length ``max_len``.
        y_train, y_val: Integer class labels.
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Array of shape ``(vocab_size, embedding_dim)`` used as
            the embedding weights.
        num_classes: Number of output classes.
        max_len: Sequence length the model expects.
        embedding_dim: Width of each embedding vector.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Returns:
        ``(model, history)``: the trained Keras model and the ``History`` object
        returned by ``model.fit``.
    """
    # Local import: the notebook's import cell does not bring Input into scope.
    from tensorflow.keras.layers import Input

    # The legacy ``Embedding(weights=[...], input_length=...)`` arguments are
    # avoided: ``input_length`` is deprecated in Keras 3 and ``weights=`` is not
    # accepted by every Keras 3 release. An explicit Input layer fixes the
    # sequence length (and builds the model so ``summary()`` shows shapes), and
    # the pre-trained vectors are loaded with ``set_weights`` below.
    embedding_layer = Embedding(vocab_size, embedding_dim, trainable=False)

    model = Sequential([
        Input(shape=(max_len,)),
        embedding_layer,
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(64),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(num_classes, activation="softmax")
    ])

    # The layer is built at this point, so its single weight tensor can be
    # replaced by the pre-trained matrix; trainable=False keeps it frozen.
    embedding_layer.set_weights([embedding_matrix])

    # NOTE(review): sequences are zero-padded, but the embedding does not mask
    # index 0, so the LSTMs also process padding steps. Consider mask_zero=True.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.summary()

    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1
    )

    return model, history

# Train on the split produced by prepare_data. max_len and embedding_dim are left
# at their defaults (20 and 200), matching the defaults used by prepare_data and
# create_embedding_matrix above.
print("Building and training model...")
model, history = build_and_train_model(
    X_train, X_val, y_train, y_val, vocab_size, embedding_matrix, num_classes
)

In [None]:
# ## Cell 8: Save the Model and Tokenizer
#
# This cell saves the trained model and tokenizer for use in the inference script (`main.py`).

print("Saving model and tokenizer...")
model.save(MODEL_PATH)
with open(TOKENIZER_PATH, "wb") as f:
    pickle.dump(tokenizer, f)

# The model predicts integer class indices; without the fitted LabelEncoder an
# inference script cannot map them back to disease names. Persist it next to
# the tokenizer (additional artifact; the two files above are unchanged).
LABEL_ENCODER_PATH = "disease_label_encoder.pkl"
with open(LABEL_ENCODER_PATH, "wb") as f:
    pickle.dump(label_encoder, f)

print("Training complete!")