# In [1]:
#  REALISTIC DISEASE DIAGNOSIS MODEL
# Model: Logistic Regression
# Dataset: Clustered variations (5 per disease)
# Input: List of symptoms
# Output: Predicted Disease & Probability

# --- 1. Import Required Libraries ---
import random

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder

# --- 2. Define Disease & Symptom Profiles (Realistic Variations) ---
# Each disease has a list of 5 slightly different symptom profiles (variations).
# Sampling from these variations produces closely similar, but not identical, patient rows.

# Canonical symptom vocabulary. The order matters: it fixes the column order
# of the generated dataset and of every feature vector built from this list.
all_symptoms = [
    'fever',
    'cough',
    'fatigue',
    'headache',
    'nausea',
    'vomiting',
    'diarrhea',
    'sore_throat',
    'runny_nose',
    'body_ache',
    'shortness_of_breath',
    'loss_of_smell',
    'loss_of_taste',
    'jaundice',
    'abdominal_pain',
    'joint_pain',
    'rash',
    'chills',
]

# Chance that a symptom which is NOT listed in a disease profile still shows
# up for a generated patient (background "noise").
BACKGROUND_SYMPTOM_PROB = 0.01

# Symptom-probability profiles used to synthesise patients.
# Structure: {disease name: [profile, ...]} with five profiles per disease.
# Each profile maps a symptom name from `all_symptoms` to the probability
# (0.0-1.0) that a patient generated from that variation shows the symptom.
# A symptom omitted from a profile is drawn at BACKGROUND_SYMPTOM_PROB by the
# dataset-generation step, whereas an explicit 0.0 means the symptom never
# occurs for that variation (and 1.0 means it always occurs).
disease_variations = {
    'COVID-19': [
        # Var 1: Classic
        {'fever': 0.7, 'cough': 0.8, 'fatigue': 0.8, 'loss_of_smell': 0.7, 'loss_of_taste': 0.7, 'body_ache': 0.5, 'headache': 0.4},
        # Var 2: Mild / Cold-like
        {'fever': 0.2, 'cough': 0.6, 'fatigue': 0.3, 'headache': 0.2, 'sore_throat': 0.4, 'runny_nose': 0.5},
        # Var 3: Severe
        {'fever': 0.9, 'cough': 0.9, 'fatigue': 0.9, 'shortness_of_breath': 0.7, 'body_ache': 0.8, 'chills': 0.6},
        # Var 4: GI-Focused
        {'fever': 0.4, 'cough': 0.2, 'fatigue': 0.6, 'nausea': 0.7, 'vomiting': 0.5, 'diarrhea': 0.7, 'abdominal_pain': 0.5},
        # Var 5: Fatigue-Heavy
        {'fever': 0.1, 'cough': 0.3, 'fatigue': 1.0, 'body_ache': 0.7, 'headache': 0.5, 'joint_pain': 0.4}
    ],
    'Influenza (Flu)': [
        # Var 1: Classic Severe
        {'fever': 0.9, 'cough': 0.8, 'fatigue': 0.9, 'headache': 0.8, 'sore_throat': 0.5, 'body_ache': 0.9, 'chills': 0.9},
        # Var 2: Milder Flu
        {'fever': 0.7, 'cough': 0.7, 'fatigue': 0.7, 'headache': 0.5, 'body_ache': 0.6, 'chills': 0.5},
        # Var 3: Flu with GI
        {'fever': 0.8, 'cough': 0.7, 'fatigue': 0.8, 'nausea': 0.5, 'vomiting': 0.3, 'diarrhea': 0.3, 'body_ache': 0.8},
        # Var 4: Quick Onset
        {'fever': 1.0, 'cough': 0.5, 'fatigue': 0.9, 'headache': 0.9, 'body_ache': 1.0, 'chills': 1.0},
        # Var 5: Lingering Cough
        {'fever': 0.6, 'cough': 0.9, 'fatigue': 0.6, 'sore_throat': 0.4, 'body_ache': 0.5}
    ],
    'Common Cold': [
        # Var 1: Head Cold
        {'fever': 0.0, 'cough': 0.3, 'fatigue': 0.1, 'headache': 0.2, 'sore_throat': 0.5, 'runny_nose': 0.9},
        # Var 2: Chest Cold
        {'fever': 0.1, 'cough': 0.8, 'fatigue': 0.3, 'sore_throat': 0.4, 'runny_nose': 0.5, 'body_ache': 0.2},
        # Var 3: All-around Mild
        {'fever': 0.0, 'cough': 0.4, 'fatigue': 0.2, 'headache': 0.1, 'sore_throat': 0.3, 'runny_nose': 0.7},
        # Var 4: Sinus-y
        {'fever': 0.0, 'cough': 0.2, 'headache': 0.4, 'sore_throat': 0.2, 'runny_nose': 0.9},
        # Var 5: Scratchy Throat
        {'fever': 0.0, 'cough': 0.1, 'fatigue': 0.1, 'sore_throat': 0.8, 'runny_nose': 0.4}
    ],
    'Gastroenteritis (Stomach Flu)': [
        # Var 1: Vomiting-heavy
        {'fever': 0.3, 'nausea': 0.9, 'vomiting': 0.9, 'diarrhea': 0.5, 'abdominal_pain': 0.7, 'fatigue': 0.6},
        # Var 2: Diarrhea-heavy
        {'fever': 0.2, 'nausea': 0.6, 'vomiting': 0.3, 'diarrhea': 0.9, 'abdominal_pain': 0.6, 'fatigue': 0.5},
        # Var 3: Mild Case
        {'fever': 0.1, 'nausea': 0.5, 'vomiting': 0.2, 'diarrhea': 0.5, 'abdominal_pain': 0.3},
        # Var 4: With Fever
        {'fever': 0.6, 'nausea': 0.8, 'vomiting': 0.7, 'diarrhea': 0.7, 'abdominal_pain': 0.7, 'body_ache': 0.4},
        # Var 5: Nausea only
        {'fever': 0.1, 'nausea': 0.9, 'vomiting': 0.1, 'abdominal_pain': 0.4, 'headache': 0.3}
    ],
    'Migraine': [
        # Var 1: Classic
        {'fever': 0.0, 'headache': 1.0, 'nausea': 0.7, 'vomiting': 0.3, 'fatigue': 0.4},
        # Var 2: Headache Only
        {'fever': 0.0, 'headache': 1.0, 'nausea': 0.1, 'fatigue': 0.2},
        # Var 3: Severe
        {'fever': 0.0, 'headache': 1.0, 'nausea': 0.9, 'vomiting': 0.7, 'fatigue': 0.6},
        # Var 4: With Body Ache
        {'fever': 0.0, 'headache': 0.9, 'nausea': 0.5, 'body_ache': 0.4},
        # Var 5: Fatigue trigger
        {'fever': 0.0, 'headache': 0.8, 'nausea': 0.3, 'fatigue': 0.8}
    ],
    'Allergies': [
        # Var 1: Classic Hay Fever
        {'fever': 0.0, 'cough': 0.2, 'headache': 0.2, 'sore_throat': 0.1, 'runny_nose': 0.9, 'rash': 0.1},
        # Var 2: Skin Allergy
        {'fever': 0.0, 'cough': 0.0, 'runny_nose': 0.2, 'rash': 0.8},
        # Var 3: Sinus Pressure
        {'fever': 0.0, 'cough': 0.1, 'headache': 0.6, 'runny_nose': 0.7, 'sore_throat': 0.2},
        # Var 4: All-around
        {'fever': 0.0, 'cough': 0.3, 'headache': 0.3, 'sore_throat': 0.3, 'runny_nose': 0.8, 'rash': 0.2},
        # Var 5: Mild
        {'fever': 0.0, 'cough': 0.0, 'runny_nose': 0.5}
    ]
}

# --- 3. Generate Synthetic Dataset ---
# Dedicated, seeded generators make the synthetic data identical from run to
# run (in line with the random_state=42 used for the split and the model)
# without mutating the global `random` / `np.random` state.
RANDOM_SEED = 42
num_records = 6000 # Increased size
disease_list = list(disease_variations.keys())

_choice_rng = random.Random(RANDOM_SEED)         # picks disease + variation
_draw_rng = np.random.default_rng(RANDOM_SEED)   # draws the 0/1 symptoms

data = []
for _ in range(num_records):
    # 1. Pick a random disease, then one of its profile variations.
    disease = _choice_rng.choice(disease_list)
    profile = _choice_rng.choice(disease_variations[disease])

    # 2. Per-symptom probability: the profile's value when listed, otherwise
    #    the low background probability.
    probs = [profile.get(symptom, BACKGROUND_SYMPTOM_PROB) for symptom in all_symptoms]

    # 3. One uniform draw per symptom; the symptom is present (1) when the
    #    draw falls below its probability, so 0.0 never fires and 1.0 always does.
    symptoms_vector = (_draw_rng.random(len(all_symptoms)) < probs).astype(int).tolist()

    # 4. Append the disease label as the final column.
    data.append(symptoms_vector + [disease])

# Create DataFrame: one binary column per symptom plus the 'Disease' label.
columns = all_symptoms + ['Disease']
df = pd.DataFrame(data, columns=columns)

print(f"✅ Realistic dataset generated with {num_records} records.")
print("--- Sample of Generated Data (showing 10 rows) ---")
# Seed the preview as well so the displayed rows are stable between runs.
display(df.sample(10, random_state=RANDOM_SEED)) # Show 10 random rows to see variation

# --- 4. Preprocessing and Feature Engineering ---
# Separate the label column from the binary symptom matrix.
y = df['Disease']    # Target
X = df[all_symptoms] # Features

# Fit the encoder on the disease names, then map every label to its integer id.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Keep the fitted class names: the position in this array is the encoded id,
# which is what predictions are decoded with later.
disease_names = label_encoder.classes_
print("\nDisease classes encoded:")
for i, name in enumerate(label_encoder.classes_):
    print(f"{name}  ->  {i}")

# --- 5. Split and Train the Model (Logistic Regression) ---
# Hold out 20% of the rows for evaluation; stratifying on the encoded label
# keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"\nTraining model on {len(X_train)} samples...")

# Initialize and train the model.
# One-vs-rest is expressed with an explicit OneVsRestClassifier wrapper rather
# than LogisticRegression(multi_class='ovr'): the `multi_class` parameter (and
# multiclass use of the liblinear solver) is deprecated in recent scikit-learn
# releases, while the wrapper trains the same one-binary-model-per-class scheme
# and still exposes predict / predict_proba / classes_.
model = OneVsRestClassifier(
    LogisticRegression(solver='liblinear', random_state=42, C=1.0)
)
model.fit(X_train, y_train)

print("✅ Model trained successfully.")

# --- 6. Model Evaluation ---
# Predict the held-out split and report the fraction of exact label matches.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(
    "\n--- Model Evaluation ---",
    f"Model Accuracy on Test Set: {accuracy * 100:.2f}%",
    "-" * 30,
    sep="\n",
)


# --- 7. Interactive Prediction Function ---

def predict_disease(symptom_string: str) -> None:
    """Predict the most likely disease from a comma-separated symptom list.

    Each comma-separated token is normalised (lower-cased, with hyphens and
    runs of whitespace collapsed to single underscores) and matched against
    the module-level ``all_symptoms``. Recognised symptoms are one-hot encoded
    in ``all_symptoms`` column order and scored with the module-level
    ``model``. The top prediction and the three most probable diseases are
    printed.

    Args:
        symptom_string: Free text such as ``"fever, cough, loss of smell"``.
            Blank tokens are skipped and repeated symptoms are counted once.

    Returns:
        None. Everything is written to stdout. When no token matches a known
        symptom, a warning listing the valid names is printed and no
        prediction is made.
    """
    # 1. Clean and process the input string.
    #    split()/join collapses repeated or stray whitespace, so
    #    "loss  of smell" and "loss-of-smell" both become "loss_of_smell".
    user_symptoms = [
        '_'.join(token.lower().replace('-', ' ').split())
        for token in symptom_string.split(',')
    ]

    # 2. Create the 0/1 feature vector.
    #    A name -> column map gives one dictionary lookup per token instead of
    #    an `in` scan followed by `.index()` on the list.
    symptom_index = {name: pos for pos, name in enumerate(all_symptoms)}
    input_vector = [0] * len(all_symptoms)
    found_symptoms = []
    unknown_symptoms = []

    for symptom in user_symptoms:
        if not symptom:
            continue  # blank token, e.g. from a trailing comma
        idx = symptom_index.get(symptom)
        if idx is None:
            if symptom not in unknown_symptoms:
                unknown_symptoms.append(symptom)
        elif not input_vector[idx]:
            # Only record a symptom the first time it appears.
            input_vector[idx] = 1
            found_symptoms.append(symptom)

    if not found_symptoms:
        print("\nWarning: No recognized symptoms were entered. Please check spelling.")
        print(f"Available symptoms are: {', '.join(all_symptoms)}")
        return

    print(f"\nRecognized Symptoms: {', '.join(found_symptoms)}")
    if unknown_symptoms:
        print(f"Unrecognized Symptoms: {', '.join(unknown_symptoms)} (These will be ignored)")

    # 3. Use the model to predict probabilities.
    #    The model is fitted on a DataFrame, so pass a one-row DataFrame with
    #    the same column names; a bare list triggers scikit-learn's
    #    "X does not have valid feature names" warning.
    features = pd.DataFrame([input_vector], columns=all_symptoms)
    probabilities = model.predict_proba(features)[0]

    # predict_proba columns follow model.classes_ (the encoded ids), so decode
    # through it instead of assuming column i is always class id i.
    class_labels = [disease_names[class_id] for class_id in model.classes_]

    # 4. Get the top prediction.
    top_prediction_index = int(np.argmax(probabilities))
    top_probability = probabilities[top_prediction_index]
    top_disease_name = class_labels[top_prediction_index]

    # 5. Display the result.
    print("\n--- 🩺 Diagnosis Result ---")
    print(f"Predicted Disease: {top_disease_name}")
    print(f"Probability:         {top_probability * 100:.2f}%")

    print("\n--- Top 3 Possibilities ---")
    # Pad to the longest class name (never less than the previous width of 25)
    # so the percentage column stays aligned for long names.
    label_width = max(25, max(len(str(label)) for label in class_labels))
    top_3_indices = np.argsort(probabilities)[-3:][::-1] # Get indices of top 3
    for i in top_3_indices:
        disease = class_labels[i]
        prob = probabilities[i]
        # Show a small bar for visualization (one block per 5 percentage points).
        bar = "█" * int(prob * 20)
        print(f"  - {disease:<{label_width}} ({prob*100:5.2f}%) {bar}")
    print("-" * 30)

# --- 8. Run Interactive Prediction ---
# Print usage help, then keep reading symptom lists from stdin and printing a
# prediction for each until the user quits.
print("\n\n" + "="*40)
print("🩺 REALISTIC DISEASE DIAGNOSIS PREDICTOR")
print("="*40)
print("Enter your symptoms, separated by commas.")
print("Examples:")
print("  fever, cough, loss of smell")
print("  headache, nausea")
print("  runny nose, sore throat, cough")
# The exit commands were previously undocumented; tell the user how to stop.
print("Type 'exit' or 'quit' to stop.")

try:
    while True:
        symptoms_input = input("\n➡️ Your Symptoms: ")
        # Strip before comparing so input such as "exit " or " Quit" still
        # ends the loop instead of being treated as an unknown symptom.
        if symptoms_input.strip().lower() in ('exit', 'quit'):
            print("Exiting.")
            break
        predict_disease(symptoms_input)

# Ctrl-C, or end of input when stdin is not interactive, ends the session
# quietly instead of surfacing a traceback.
except (KeyboardInterrupt, EOFError):
    print("\n\nPrediction loop stopped.")

