In [9]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
import difflib

# Read the symptom/disease table from disk
df = pd.read_csv("/content/health_symptom_checker_dataset_expanded.csv")

# Every column whose name begins with "Symptom_" holds one symptom slot
symptom_cols = [name for name in df.columns if name.startswith("Symptom_")]


def _normalise_symptoms(raw_values):
    """Keep only the string entries of one row, trimmed and lower-cased."""
    kept = []
    for value in raw_values:
        # Non-string cells (e.g. missing values) are dropped
        if isinstance(value, str):
            kept.append(value.strip().lower())
    return kept


# Collapse the symptom columns into one list per row, then normalise each list
df['Symptoms'] = df[symptom_cols].values.tolist()
df['Symptoms'] = df['Symptoms'].apply(_normalise_symptoms)

# Target labels, and a multi-hot feature matrix with one column per symptom
y = df['Disease']
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['Symptoms'])

# Fit the random forest; the fixed seed keeps runs reproducible
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X, y)

# Lookup from a readable name (lower-case, underscores shown as spaces)
# back to the exact label the binarizer was fitted on
symptom_name_map = {}
for label in mlb.classes_:
    readable = label.lower().replace("_", " ")
    symptom_name_map[readable] = label

# Fuzzy matcher for typos
def get_closest_symptom(symptom, name_map=None, cutoff=0.8):
    """Return the known symptom whose readable name best matches *symptom*.

    Args:
        symptom: Text typed by the user; surrounding whitespace and letter
            case are ignored.
        name_map: Mapping of readable symptom name -> canonical label.
            Defaults to the module-level ``symptom_name_map``.
        cutoff: Minimum ``difflib`` similarity ratio (0..1) a candidate must
            reach to count as a match. Defaults to 0.8.

    Returns:
        The canonical label of the single best match, or ``None`` when the
        input is empty / not a string or no candidate reaches *cutoff*.
    """
    if name_map is None:
        name_map = symptom_name_map

    # Empty or non-string input can never match anything
    query = symptom.strip().lower() if isinstance(symptom, str) else ""
    if not query:
        return None

    matches = difflib.get_close_matches(query, list(name_map), n=1, cutoff=cutoff)
    return name_map[matches[0]] if matches else None

# Predict disease from user symptoms
def predict_disease(user_symptoms):
    """Predict a disease label from an iterable of user-typed symptom strings.

    Each entry is trimmed and lower-cased, then resolved to a known symptom:
    first by exact lookup in ``symptom_name_map``, then by fuzzy matching via
    ``get_closest_symptom``. Entries that resolve to nothing are ignored.

    Returns the first prediction from ``model`` for the resolved symptoms, or
    a fixed explanatory message when no entry could be resolved.
    """
    resolved = []
    for raw in user_symptoms:
        token = raw.strip().lower()

        # Exact lookup first; fall back to the typo-tolerant matcher
        canonical = symptom_name_map.get(token)
        if not canonical:
            canonical = get_closest_symptom(token)
        if not canonical:
            continue

        resolved.append(canonical)

    if resolved:
        # Encode the symptoms as a single-row multi-hot matrix and classify
        features = mlb.transform([resolved])
        return model.predict(features)[0]

    return "No known symptoms entered. Please try again with valid symptoms."

# Show known symptoms (underscores rendered as spaces, alphabetical order)
print("Known symptoms are:")
print(", ".join(sorted(sym.replace("_", " ") for sym in mlb.classes_)))

# User input loop: keep prompting until the user types 'exit' or input ends
while True:
    try:
        user_input = input("\nEnter symptoms separated by commas (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted: leave cleanly instead of
        # surfacing a traceback
        print("\nGoodbye!")
        break

    # Strip before comparing so "exit " / " Exit" also quit
    if user_input.strip().lower() == 'exit':
        print("Goodbye!")
        break

    user_symptoms = user_input.split(",")
    result = predict_disease(user_symptoms)

    print(f"\nPredicted Disease: {result}")

Known symptoms are:
abdominal bloating, abdominal pain, acne, back pain, blurred vision, breathlessness, chest pain, chills, confusion, congestion, constipation, cough, cramps, dehydration, diarrhoea, dizziness, dry cough, dry skin, eye pain, fatigue, fever, hair loss, headache, irregular periods, irritability, itching, itchy eyes, joint pain, joint stiffness, loss of appetite, loss of balance, malaise, memory loss, mood swings, muscle pain, muscle weakness, nausea, night sweats, numbness, palpitations, pimples, rash, redness, runny nose, scaling, slurred speech, sneezing, sore throat, stiffness, stomach pain, sweating, swelling, thirst, vomiting, watering eyes, weakness, weight gain, weight loss

Enter symptoms separated by commas (or type 'exit' to quit): runny nose, cough

Predicted Disease: common_cold

Enter symptoms separated by commas (or type 'exit' to quit): irregular periods, weight gain

Predicted Disease: hormonal_imbalance

Enter symptoms separated by commas (or type 'exit