# Simplified Disease Prediction from Symptoms
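This notebook demonstrates disease prediction from free-text patient symptoms using machine learning. Symptom descriptions are converted to TF-IDF features and classified with Naive Bayes, Random Forest and, when TensorFlow is installed, a small neural network. The bundled sample dataset is tiny and intended for demonstration only.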

## Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Deep learning libraries
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout
    from tensorflow.keras.utils import to_categorical
    tensorflow_available = True
except ImportError:
    print("TensorFlow not available. Deep learning model will be disabled.")
    tensorflow_available = False

# Set the global plot style used by every figure in this notebook.
# NOTE(review): sns.set() runs after the matplotlib 'ggplot' style and applies
# seaborn's own rc settings, so it presumably overrides much of the ggplot
# look — confirm which of the two styles is actually intended.
plt.style.use('ggplot')
sns.set(style='whitegrid')



## Stopwords and Text Preprocessing

In [2]:
# English stopwords (simplified, hand-maintained so NLTK is not required).
# Contractions are spelled with a straight apostrophe; preprocess_text keeps
# apostrophes inside words long enough for these entries to match.
STOPWORDS = {
    'a', 'an', 'the', 'and', 'but', 'or', 'if', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
    'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'i',
    'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'would', 'could', 'ought', "i'm", "you're", "he's",
    "she's", "it's", "we're", "they're", "i've", "you've", "we've", "they've",
    "i'd", "you'd", "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll",
    "she'll", "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't", "hasn't",
    "haven't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't",
    "shan't", "shouldn't", "can't", 'cannot', "couldn't", "mustn't", "let's",
    "that's", "who's", "what's", "here's", "there's", "when's", "where's", "why's",
    "how's"
}

# Matches every character that is not a word character, whitespace or an
# apostrophe. Apostrophes are handled per token so contractions survive long
# enough to be compared against STOPWORDS.
_PUNCTUATION_RE = re.compile(r"[^\w\s']")


# Simple text preprocessing function
def preprocess_text(text):
    """Normalise free text into a space-separated string of content tokens.

    Steps: lowercase, replace punctuation with spaces, split on whitespace,
    drop stopwords, re-join with single spaces.

    Contractions listed in STOPWORDS ("i'm", "isn't", ...) are removed as a
    whole. Any other token containing apostrophes is split on them and the
    pieces are filtered individually, so "parkinson's" yields "parkinson".

    Args:
        text: Input string. ``None`` and float NaN (pandas' marker for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The processed text; an empty string when nothing remains.
    """
    # Missing values must not crash the pipeline (NaN is the only float that
    # differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Treat the typographic apostrophe like the ASCII one so that "don’t"
    # matches the "don't" stopword entry.
    text = text.lower().replace('\u2019', "'")
    text = _PUNCTUATION_RE.sub(' ', text)

    tokens = []
    for raw_token in text.split():
        # Apostrophes at the edges are quoting punctuation, not part of the word.
        word = raw_token.strip("'")
        if not word or word in STOPWORDS:
            continue
        # Remaining inner apostrophes act as separators, e.g. possessive 's.
        tokens.extend(
            part for part in word.split("'")
            if part and part not in STOPWORDS
        )

    return ' '.join(tokens)

## Dataset Creation and Loading

In [3]:
# Create sample dataset
def create_sample_dataset(path='data/symptom_disease.csv'):
    """Write the built-in 30-row symptom/disease sample CSV to ``path``.

    The file has a ``Symptoms,Disease`` header followed by one row per
    disease. Any missing parent directory of ``path`` is created first, and
    an existing file at ``path`` is overwritten.

    Args:
        path: Destination CSV file. Defaults to the location the rest of the
            notebook reads from.
    """
    target_dir = os.path.dirname(path)
    # A bare file name has no directory component; os.makedirs('') would raise.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    
    data = """Symptoms,Disease
fever headache cough,Common Cold
high fever severe headache stiff neck light sensitivity,Meningitis
chest pain shortness of breath sweating,Heart Attack
fatigue weight loss night sweats cough,Tuberculosis
abdominal pain diarrhea nausea vomiting,Gastroenteritis
high fever fatigue sore throat swollen lymph glands,Mononucleosis
fever rash joint pain muscle pain,Dengue
frequent urination excessive thirst hunger weight loss,Diabetes
wheezing shortness of breath chest tightness coughing,Asthma
pain numbness tingling in hands feet,Peripheral Neuropathy
dry mouth blurred vision frequent urination,Type 2 Diabetes
headache nausea vomiting dizziness,Migraine
severe joint pain swelling stiffness,Rheumatoid Arthritis
fever headache fatigue muscle aches,Influenza
sore throat difficulty swallowing fever,Strep Throat
rash fever fatigue headache,Measles
cough mucus shortness of breath wheezing,Bronchitis
itchy eyes runny nose sneezing congestion,Allergic Rhinitis
painful urination urgency frequency,Urinary Tract Infection
abdominal pain bloating cramping diarrhea,Irritable Bowel Syndrome
jaundice abdominal pain dark urine,Hepatitis
fatigue muscle weakness numbness tingling,Multiple Sclerosis
tremor stiffness slow movement,Parkinson's Disease
recurring headaches seizures vision problems,Brain Tumor
chest discomfort pain sweating nausea,Angina
vision loss eye pain redness,Glaucoma
trouble sleeping mood changes anxiety,Depression
joint pain swelling warmth redness,Gout
dizziness vertigo hearing loss tinnitus,Meniere's Disease
lower back pain numbness tingling in legs,Herniated Disc"""
    
    # Explicit encoding keeps the file identical regardless of platform locale.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(data)
    
    print(f"Sample dataset created at {path}")

# Load and process dataset
def load_and_process_data():
    """Load the symptom/disease CSV and add a ``Processed_Symptoms`` column.

    Reads ``data/symptom_disease.csv``; when the file does not exist the
    sample dataset is created first and then read. Rows with a missing
    ``Symptoms`` or ``Disease`` value are dropped (with a notice) because
    they cannot be preprocessed or used as training labels. Progress and
    previews are printed along the way.

    Returns:
        pandas.DataFrame with the original columns plus
        ``Processed_Symptoms`` (output of ``preprocess_text`` per row).

    Raises:
        KeyError: if the CSV lacks a ``Symptoms`` or ``Disease`` column.
    """
    csv_path = 'data/symptom_disease.csv'

    # Check if the data file exists, if not create it
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print("Dataset not found, creating sample data...")
        create_sample_dataset()
        df = pd.read_csv(csv_path)
    
    print("Dataset Shape:", df.shape)
    print("\nFirst few rows:")
    print(df.head())
    
    # Empty cells are read as NaN; they would break text preprocessing and
    # label encoding further down, so remove such rows up front.
    incomplete = df[['Symptoms', 'Disease']].isna().any(axis=1)
    if incomplete.any():
        print(f"\nDropping {int(incomplete.sum())} row(s) with missing Symptoms or Disease")
        df = df.loc[~incomplete].reset_index(drop=True)
    
    # Process the symptoms with our simplified approach. astype(str) guards
    # against cells that pandas parsed as numbers.
    df['Processed_Symptoms'] = df['Symptoms'].astype(str).apply(preprocess_text)
    print("\nProcessed symptoms (first 5 rows):")
    print(df[['Symptoms', 'Processed_Symptoms']].head())
    
    return df

## Visualization Functions

In [4]:
# Plot disease distribution
def plot_disease_distribution(df):
    """Show a bar chart of the number of rows per value of ``df['Disease']``."""
    rows_per_disease = df['Disease'].value_counts()

    fig, ax = plt.subplots(figsize=(12, 6))
    rows_per_disease.plot(kind='bar', ax=ax)

    ax.set_title('Disease Distribution')
    ax.set_xlabel('Disease')
    ax.set_ylabel('Count')
    # Disease names are long; turn them vertical so they do not overlap.
    plt.xticks(rotation=90)

    fig.tight_layout()
    plt.show()

## Feature Preparation

In [5]:
# Prepare features and split data
def prepare_features(df, test_size=0.2, random_state=42, max_features=5000):
    """Encode labels, split into train/test and build TF-IDF features.

    Side effect: adds (or overwrites) a ``Disease_Encoded`` column on ``df``.

    Args:
        df: DataFrame with ``Disease`` and ``Processed_Symptoms`` columns.
        test_size: Fraction (or count) of rows held out for testing, passed
            to ``train_test_split``.
        random_state: Seed for the split, passed to ``train_test_split``.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        Tuple ``(X_train_tfidf, X_test_tfidf, y_train, y_test,
        tfidf_vectorizer, le)``. The vectorizer is fitted on the training
        texts only; the label encoder is fitted on every disease in ``df``.
    """
    # Create a label encoder for the disease classes
    le = LabelEncoder()
    df['Disease_Encoded'] = le.fit_transform(df['Disease'])
    
    # Split the data
    X = df['Processed_Symptoms']
    y = df['Disease_Encoded']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    
    # A model can never predict a class it was not trained on, so test rows
    # whose class is absent from the training split always count as errors.
    # This happens for every test row when each disease has a single example.
    unseen_labels = set(y_test) - set(y_train)
    if unseen_labels:
        print(f"Warning: {len(unseen_labels)} disease class(es) in the test set "
              "never appear in the training set; test accuracy will be pessimistic.")
    
    # Convert text to TF-IDF vectors (fit on training data only to avoid
    # leaking test vocabulary into the features).
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    print(f"Training set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")
    
    return X_train_tfidf, X_test_tfidf, y_train, y_test, tfidf_vectorizer, le

## Machine Learning Models

In [None]:
# Train machine learning models
def train_ml_models(X_train_tfidf, y_train, X_test_tfidf, y_test, le):
    """Fit Naive Bayes and Random Forest classifiers and report test accuracy.

    Args:
        X_train_tfidf, y_train: Training features and encoded labels.
        X_test_tfidf, y_test: Held-out features and encoded labels.
        le: Label encoder; accepted for signature parity with the other
            training function but not used here.

    Returns:
        Tuple ``(nb_model, rf_model, accuracy_nb, accuracy_rf)``.
    """
    def _fit_and_score(estimator):
        # Train on the training split, then measure accuracy on the test split.
        estimator.fit(X_train_tfidf, y_train)
        return accuracy_score(y_test, estimator.predict(X_test_tfidf))

    nb_model = MultinomialNB()
    accuracy_nb = _fit_and_score(nb_model)
    print(f"Naive Bayes Accuracy: {accuracy_nb:.4f}")

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    accuracy_rf = _fit_and_score(rf_model)
    print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

    return nb_model, rf_model, accuracy_nb, accuracy_rf

## Deep Learning Model

In [None]:
# Plot the accuracy and loss curves recorded during training
def _plot_training_curves(history):
    """Show train/validation accuracy and loss per epoch side by side."""
    recorded = history.history
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
    )

    plt.figure(figsize=(12, 4))
    for position, train_key, val_key, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(recorded[train_key])
        plt.plot(recorded[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'])

    plt.tight_layout()
    plt.show()


# Train deep learning model (if TensorFlow is available)
def train_deep_learning_model(X_train_tfidf, y_train, X_test_tfidf, y_test, le):
    """Train and evaluate a small dense network on the TF-IDF features.

    When TensorFlow could not be imported the function prints a notice and
    returns ``(None, 0.0, n_features)`` without training anything.

    Args:
        X_train_tfidf, X_test_tfidf: Sparse TF-IDF matrices.
        y_train, y_test: Encoded class labels.
        le: Fitted label encoder; its class count sets the output layer size.

    Returns:
        Tuple ``(model, accuracy_dl, n_features)`` where ``n_features`` is the
        number of TF-IDF columns.
    """
    n_features = X_train_tfidf.shape[1]

    if not tensorflow_available:
        print("TensorFlow not available - skipping deep learning model")
        return None, 0.0, n_features

    # The network consumes dense arrays and one-hot encoded targets.
    train_matrix = X_train_tfidf.toarray()
    test_matrix = X_test_tfidf.toarray()

    n_classes = len(le.classes_)
    train_targets = to_categorical(y_train, num_classes=n_classes)
    test_targets = to_categorical(y_test, num_classes=n_classes)

    # Two hidden layers with dropout, softmax over all known diseases.
    network_layers = [
        Dense(128, activation='relu', input_shape=(n_features,)),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(n_classes, activation='softmax'),
    ]
    model = Sequential(network_layers)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()

    history = model.fit(
        train_matrix,
        train_targets,
        epochs=20,
        batch_size=8,
        validation_split=0.1,
        verbose=1,
    )
    _plot_training_curves(history)

    # evaluate() returns [loss, accuracy] given the metrics compiled above.
    _, accuracy_dl = model.evaluate(test_matrix, test_targets, verbose=0)
    print(f"Deep Learning Model Accuracy: {accuracy_dl:.4f}")

    return model, accuracy_dl, n_features

## Model Comparison

In [None]:
# Compare model performance
def compare_models(nb_accuracy, rf_accuracy, dl_accuracy):
    """Show a bar chart comparing the three models' accuracies (0-1 scale)."""
    accuracy_by_model = {
        'Naive Bayes': nb_accuracy,
        'Random Forest': rf_accuracy,
        'Deep Learning': dl_accuracy,
    }
    model_names = list(accuracy_by_model.keys())
    scores = list(accuracy_by_model.values())

    plt.figure(figsize=(10, 6))
    plt.bar(model_names, scores, color=['blue', 'green', 'red'])
    plt.title('Model Accuracy Comparison')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1.0)

    # Write each accuracy just above its bar.
    for position, score in enumerate(scores):
        plt.text(position, score + 0.01, f'{score:.4f}', ha='center')

    plt.tight_layout()
    plt.show()

## Prediction Functions

In [None]:
# Prediction function
def predict_disease(symptoms_text, tfidf_vectorizer, nb_model, rf_model, dl_model, le, model_type='rf'):
    """Predict the most likely disease(s) for a free-text symptom description.

    Args:
        symptoms_text: Raw symptom description.
        tfidf_vectorizer: Fitted vectorizer used to encode the text.
        nb_model, rf_model: Fitted scikit-learn classifiers.
        dl_model: Trained Keras model, or ``None`` when unavailable.
        le: Fitted label encoder mapping encoded labels to disease names.
        model_type: ``'nb'``, ``'rf'`` or ``'dl'``. ``'dl'`` falls back to
            Random Forest (with a notice) when the deep learning model is not
            available; any other value silently uses Random Forest.

    Returns:
        dict with keys ``predicted_disease`` (name), ``top_diseases`` (up to
        three ``(name, probability)`` pairs, most probable first),
        ``input_symptoms`` and ``processed_symptoms``.
    """
    # Preprocess and vectorize the input symptoms
    processed_symptoms = preprocess_text(symptoms_text)
    symptoms_tfidf = tfidf_vectorizer.transform([processed_symptoms])
    
    use_deep_learning = (
        model_type == 'dl' and tensorflow_available and dl_model is not None
    )
    
    if use_deep_learning:
        class_probabilities = np.asarray(dl_model.predict(symptoms_tfidf.toarray())[0])
        # The network is trained on one-hot targets sized by len(le.classes_),
        # so output position i is encoded label i.
        class_labels = np.arange(len(class_probabilities))
        predicted_label = class_labels[np.argmax(class_probabilities)]
    else:
        if model_type == 'dl':
            print("Deep learning model not available, using Random Forest instead")
        classifier = nb_model if model_type == 'nb' else rf_model
        class_probabilities = classifier.predict_proba(symptoms_tfidf)[0]
        # predict_proba columns follow classifier.classes_, which holds only
        # the labels seen during fit. When the training split lacks some
        # diseases, column position != encoded label, so positions must be
        # translated through classes_ before decoding.
        class_labels = classifier.classes_
        predicted_label = classifier.predict(symptoms_tfidf)[0]
    
    # Get the predicted disease name
    predicted_disease = le.inverse_transform([predicted_label])[0]
    
    # Get the top 3 predictions with probabilities (highest first)
    top_positions = np.argsort(class_probabilities)[-3:][::-1]
    top_probabilities = class_probabilities[top_positions]
    top_diseases = le.inverse_transform(class_labels[top_positions])
    
    return {
        'predicted_disease': predicted_disease,
        'top_diseases': list(zip(top_diseases, top_probabilities)),
        'input_symptoms': symptoms_text,
        'processed_symptoms': processed_symptoms
    }

## Prediction Visualization

In [None]:
# Visualize prediction results
def visualize_prediction(result):
    """Show a bar chart of the top predicted diseases with their probabilities.

    Args:
        result: dict whose ``'top_diseases'`` entry is a sequence of
            ``(disease, probability)`` pairs with probabilities on a 0-1 scale.
    """
    disease_names = []
    percentages = []
    for disease, prob in result['top_diseases']:
        disease_names.append(disease)
        percentages.append(prob*100)
    positions = range(len(disease_names))

    plt.figure(figsize=(10, 6))
    bars = plt.bar(positions, percentages, color='skyblue')
    plt.xlabel('Disease')
    plt.ylabel('Probability (%)')
    plt.title('Disease Prediction Results')
    plt.xticks(positions, disease_names, rotation=30)
    plt.ylim(0, 100)

    # Print the percentage centred just above each bar.
    for bar, percentage in zip(bars, percentages):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 1
        plt.text(label_x, label_y, f'{percentage:.2f}%', ha='center')

    plt.tight_layout()
    plt.show()

## Interactive Prediction

In [None]:
# Interactive prediction function
def predict_from_input(tfidf_vectorizer, nb_model, rf_model, dl_model, le):
    """Prompt for symptoms and a model choice, then print and plot the result.

    Reads two lines from standard input: the symptom text and a menu choice
    (1 = Naive Bayes, 2 = Random Forest, 3 = Deep Learning). Any other choice
    selects Random Forest.

    Returns:
        The result dict produced by ``predict_disease``.
    """
    print("DISEASE PREDICTION SYSTEM")
    print("-" * 25)

    symptoms = input("Enter symptoms (separated by spaces): ")

    print("\nSelect model:\n1. Naive Bayes\n2. Random Forest\n3. Deep Learning")
    choice = input("Enter choice (1-3): ")

    # Anything other than '1' or '3' (including '2') means Random Forest.
    if choice == '1':
        model_type = 'nb'
    elif choice == '3':
        model_type = 'dl'
    else:
        model_type = 'rf'

    result = predict_disease(
        symptoms, tfidf_vectorizer, nb_model, rf_model, dl_model, le, model_type
    )

    print(f"\nBased on symptoms: '{symptoms}'")
    print(f"Processed symptoms: '{result['processed_symptoms']}'")
    print(f"Predicted Disease: {result['predicted_disease']}")

    print("\nTop 3 Possible Diseases:")
    for disease_name, probability in result['top_diseases']:
        print(f"- {disease_name}: {probability*100:.2f}%")

    visualize_prediction(result)

    return result

## Main Function

In [None]:
# Main function
def main():
    """Run the whole demo: load data, train and compare models, then predict.

    After the models are trained and compared, the user is asked on standard
    input whether to make interactive predictions; answering ``y`` starts a
    loop that repeats until any other answer is given.
    """
    print("===== SIMPLIFIED DISEASE PREDICTION SYSTEM =====")
    print("This version uses simplified text processing to avoid NLTK issues.")

    print("\nLoading and processing data...")
    df = load_and_process_data()

    print("\nPlotting disease distribution...")
    plot_disease_distribution(df)

    print("\nPreparing features...")
    features = prepare_features(df)
    X_train_tfidf, X_test_tfidf, y_train, y_test, tfidf_vectorizer, le = features

    print("\nTraining machine learning models...")
    nb_model, rf_model, nb_accuracy, rf_accuracy = train_ml_models(
        X_train_tfidf, y_train, X_test_tfidf, y_test, le
    )

    print("\nTraining deep learning model...")
    # The third return value (feature count) is not needed here.
    dl_model, dl_accuracy, _ = train_deep_learning_model(
        X_train_tfidf, y_train, X_test_tfidf, y_test, le
    )

    print("\nComparing model performance...")
    compare_models(nb_accuracy, rf_accuracy, dl_accuracy)

    # Interactive prediction loop: continue only while the answer is 'y'.
    print("\nWould you like to make predictions? (y/n)")
    keep_predicting = input().lower() == 'y'
    while keep_predicting:
        predict_from_input(tfidf_vectorizer, nb_model, rf_model, dl_model, le)

        print("\nTry another prediction? (y/n)")
        keep_predicting = input().lower() == 'y'

    print("\nDisease Prediction demo completed.")

## Run Application

In [None]:
# Run the full demo when this cell/script is executed directly; the guard
# keeps main() from running if the code is imported as a module instead.
if __name__ == "__main__":
    main()







