In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load both source tables in one pass:
#   df_symptoms - Dataset 1: symptoms and diseases
#   df_doctors  - Dataset 2: diseases and doctor info
df_symptoms, df_doctors = (
    pd.read_csv(csv_name)
    for csv_name in ("stomach_disease_dataset.csv", "doctor_dataset.csv")
)

In [None]:
# Preview the first rows of the symptoms/disease table to confirm it loaded as expected.
df_symptoms.head()

In [None]:
# Preview the first rows of the disease/doctor table to confirm it loaded as expected.
df_doctors.head()

In [None]:
# Column dtypes and non-null counts for the symptoms table; the non-null counts
# show whether any column contains missing values.
df_symptoms.info()

In [None]:
# Column dtypes and non-null counts for the doctor table.
df_doctors.info()

In [None]:
# Summary statistics for the symptoms table (pandas' default describe() output).
df_symptoms.describe()

In [None]:
# Summary statistics for the doctor table (pandas' default describe() output).
df_doctors.describe()

In [None]:
# Combine 6 symptoms into a list per row
symptom_cols = ['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6']


def _clean_symptom_row(row):
    """Return the usable symptom labels from one row of raw cell values.

    Missing cells (NaN/None) and blank strings are dropped so they never become
    a class of the encoder. Remaining labels are trimmed and lower-cased, which
    matches the lower-casing applied to user-typed symptoms at prediction time.
    """
    labels = []
    for value in row:
        if pd.isna(value):
            continue
        label = str(value).strip().lower()
        if label:
            labels.append(label)
    return labels


# Build the per-row symptom list. Wrapping in a Series with the frame's own
# index keeps row alignment explicit even when the lists have different lengths.
df_symptoms['symptom_list'] = pd.Series(
    [_clean_symptom_row(row) for row in df_symptoms[symptom_cols].values.tolist()],
    index=df_symptoms.index,
)

# Encode symptom lists using MultiLabelBinarizer:
# one 0/1 column per distinct symptom label seen in the data.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df_symptoms['symptom_list'])

# Target variable: Disease
y = df_symptoms['Disease']

In [None]:
# Train, Validation, Test Split (60/20/20)
# Stage 1: keep 60% for training and set aside the remaining 40%,
# stratified on the disease label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
# Stage 2: cut the held-out 40% in half -> 20% validation, 20% test,
# again stratified and with the same seed.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [None]:
# Train a random forest (fixed seed) on the training split. fit() returns the
# estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Leave the fitted estimator as the cell's final expression so it is displayed.
model

In [None]:
# Evaluate on the validation split: per-class precision / recall / F1.
print("\n--- Validation Set ---")
val_preds = model.predict(X_val)
val_report = classification_report(y_true=y_val, y_pred=val_preds)
print(val_report)

In [None]:
# Evaluate on the held-out test split: per-class precision / recall / F1.
print("\n--- Test Set ---")
test_preds = model.predict(X_test)
test_report = classification_report(y_true=y_test, y_pred=test_preds)
print(test_report)

In [None]:
# Confusion matrix of the test-set predictions.
# Pin the row/column order to model.classes_: without `labels=`, confusion_matrix
# orders (and sizes) the matrix by the labels that happen to occur in
# y_test/test_preds, which would not line up with the tick labels below if any
# class were missing from both.
class_labels = model.classes_
test_cm = confusion_matrix(y_test, test_preds, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    test_cm,
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cmap='Blues',
    ax=ax,
)
ax.set_title("Confusion Matrix on Test Set")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
# Rotate the class names on both axes.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Function to predict disease
def _normalize_symptom(text):
    """Return ``text`` as a trimmed, lower-cased string for case-insensitive matching."""
    return str(text).strip().lower()


def predict_disease_from_user_input():
    """Prompt for symptoms, predict a disease, and print a matching doctor.

    Uses three objects from the notebook namespace: the fitted ``mlb`` encoder,
    the trained ``model``, and the ``df_doctors`` table (columns ``Disease``,
    ``Doctor_Name``, ``Doctor_Specialization``, ``Doctor_Contact``).

    Input handling:
      * symptoms are comma-separated; surrounding whitespace and letter case
        are ignored when matching against the encoder's labels;
      * empty entries (e.g. from a trailing or doubled comma) are skipped;
      * repeated symptoms are used once;
      * an unknown symptom, or no symptom at all, prints a message and stops
        without predicting.

    Returns:
        None. All results are printed.
    """
    # Normalized spelling -> exact label the encoder was fitted on. Matching on
    # the normalized key while transforming with the fitted label makes lookup
    # work whatever casing the dataset uses. If two fitted labels normalize to
    # the same key, the later one wins.
    known_symptoms = {_normalize_symptom(label): label for label in mlb.classes_}

    print("\n🩺 Available Symptoms:")
    print(", ".join(sorted(mlb.classes_)))

    input_str = input("\nEnter symptoms separated by commas (e.g., nausea,vomiting,bloating):\n")
    # Drop empty tokens so "nausea," or "a,,b" is not reported as invalid symptom ''.
    entered = [token for token in map(_normalize_symptom, input_str.split(",")) if token]
    if not entered:
        print("\n❌ No symptoms entered.")
        print("Please choose symptoms only from the available list.")
        return

    # Validate input and translate each entry to the encoder's own label.
    user_symptoms = []
    for token in entered:
        if token not in known_symptoms:
            print(f"\n❌ Invalid symptom: '{token}'")
            print("Please choose symptoms only from the available list.")
            return
        label = known_symptoms[token]
        if label not in user_symptoms:  # keep first occurrence only
            user_symptoms.append(label)

    # Transform input and predict (a single-row batch, hence the [0]).
    input_encoded = mlb.transform([user_symptoms])
    predicted_disease = model.predict(input_encoded)[0]

    # Retrieve doctor info
    matching_doctor = df_doctors[df_doctors['Disease'] == predicted_disease]
    if matching_doctor.empty:
        print(f"\nPredicted Disease: {predicted_disease}")
        print("⚠️ No doctor found for this disease in the dataset.")
    else:
        # Several doctors may be listed for one disease; report the first row.
        doc = matching_doctor.iloc[0]
        print("\n✅ Prediction Result:")
        print(f"Symptoms You Entered: {user_symptoms}")
        print(f"Predicted Disease: {predicted_disease}")
        print(f"Doctor Name: {doc['Doctor_Name']}")
        print(f"Specialization: {doc['Doctor_Specialization']}")
        print(f"Contact: {doc['Doctor_Contact']}")
# Run the prediction interaction.
# NOTE: the function reads from input(), so executing this cell (including via
# "Run All") pauses until symptoms are typed in.
predict_disease_from_user_input()