<a href="https://colab.research.google.com/github/IHK-hk/Project/blob/main/Health_Symptom_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler  # For imbalance if needed

# Load the dataset (replace with your path)
data = pd.read_csv('/content/sample_data/dataset.csv')

# Clean symptom columns: strip spaces and lowercase
symptom_columns = [col for col in data.columns if 'Symptom_' in col]
for col in symptom_columns:
    data[col] = data[col].str.strip().str.lower() if data[col].dtype == 'object' else data[col]

# Get all unique symptoms
all_symptoms = pd.unique(data[symptom_columns].values.ravel())
all_symptoms = [s for s in all_symptoms if pd.notna(s)]  # Remove NaNs

# Create one-hot encoded features for symptoms
one_hot_df = pd.get_dummies(data[symptom_columns].stack()).groupby(level=0).sum()
data = pd.concat([data['Disease'], one_hot_df], axis=1)

# Encode the target (diseases)
encoder = LabelEncoder()
data['Disease'] = encoder.fit_transform(data['Disease'])

# Features (X) and target (y)
X = data.drop('Disease', axis=1)
y = data['Disease']

# Optional: Handle imbalance with oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [2]:
# Train models
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

svm_model = SVC(probability=True)  # Enable probabilities for potential thresholding
svm_model.fit(X_train, y_train)

# Predict on test set and evaluate
rf_preds = rf_model.predict(X_test)
nb_preds = nb_model.predict(X_test)
svm_preds = svm_model.predict(X_test)

# Ensemble: Majority vote
from scipy.stats import mode
ensemble_preds = mode([rf_preds, nb_preds, svm_preds], axis=0)[0].flatten()

print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_preds):.4f}")
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, nb_preds):.4f}")
print(f"SVM Accuracy: {accuracy_score(y_test, svm_preds):.4f}")
print(f"Ensemble Accuracy: {accuracy_score(y_test, ensemble_preds):.4f}")

Random Forest Accuracy: 1.0000
Naive Bayes Accuracy: 1.0000
SVM Accuracy: 1.0000
Ensemble Accuracy: 1.0000


In [4]:
import numpy as np

# Symptom checker function
def predict_disease(input_symptoms_str):
    input_symptoms = [s.strip().lower() for s in input_symptoms_str.split(',')]
    # Create a DataFrame with the same columns as training data
    input_data = pd.DataFrame(np.zeros((1, len(X.columns))), columns=X.columns)

    # Set 1 for present symptoms
    for symptom in input_symptoms:
        if symptom in X.columns:
            input_data[symptom] = 1

    # Predict with each model
    rf_pred = encoder.inverse_transform(rf_model.predict(input_data))[0]
    nb_pred = encoder.inverse_transform(nb_model.predict(input_data))[0]
    svm_pred = encoder.inverse_transform(svm_model.predict(input_data))[0]

    # Ensemble: Majority vote using np.unique
    predictions = [rf_pred, nb_pred, svm_pred]
    unique, counts = np.unique(predictions, return_counts=True)
    final_pred = unique[np.argmax(counts)]  # Pick the most frequent prediction

    return {
        "Random Forest Prediction": rf_pred,
        "Naive Bayes Prediction": nb_pred,
        "SVM Prediction": svm_pred,
        "Final Prediction": final_pred
    }

# Example usage
print(predict_disease("itching,skin_rash,nodal_skin_eruptions"))

{'Random Forest Prediction': 'Fungal infection', 'Naive Bayes Prediction': 'Fungal infection', 'SVM Prediction': 'Fungal infection', 'Final Prediction': np.str_('Fungal infection')}


In [17]:
# More Example usage
print(predict_disease("itching,skin_rash,nodal_skin_eruptions,dischromic_patches"))
print(predict_disease("continuous_sneezing,shivering,chills,watering_from_eyes"))
print(predict_disease("stomach_pain,acidity,chest_pain,cough"))
print(predict_disease("fatigue,weight_loss,polyuria,increased_appetite"))
print(predict_disease("headache,dizziness,chest_pain,loss_of_balance"))
print(predict_disease("fever"))
print(predict_disease("congestion,runny_nose,cough,high_fever"))
print(predict_disease("joint_pain,stiffness,swelling_joints,painful_walking"))
print(predict_disease("nausea,vomiting,diarrhoea,abdominal_pain"))
#Invalid symptom test (to check error handling)
print(predict_disease("invalid_symptom,skin_rash,fever"))

{'Random Forest': 'Fungal infection', 'Naive Bayes': 'Fungal infection', 'SVM': 'Fungal infection', 'Ensemble': np.str_('Fungal infection')}
{'Random Forest': 'Allergy', 'Naive Bayes': 'Allergy', 'SVM': 'Allergy', 'Ensemble': np.str_('Allergy')}
{'Random Forest': 'GERD', 'Naive Bayes': 'GERD', 'SVM': 'GERD', 'Ensemble': np.str_('GERD')}
{'Random Forest': 'Diabetes ', 'Naive Bayes': 'Diabetes ', 'SVM': 'Diabetes ', 'Ensemble': np.str_('Diabetes ')}
{'Random Forest': 'Hypertension ', 'Naive Bayes': 'Hypertension ', 'SVM': 'Hypertension ', 'Ensemble': np.str_('Hypertension ')}
{'Random Forest': 'Arthritis', 'Naive Bayes': 'Fungal infection', 'SVM': 'AIDS', 'Ensemble': np.str_('AIDS')}
{'Random Forest': 'Bronchial Asthma', 'Naive Bayes': 'Bronchial Asthma', 'SVM': 'AIDS', 'Ensemble': np.str_('Bronchial Asthma')}
{'Random Forest': 'Osteoarthristis', 'Naive Bayes': 'Osteoarthristis', 'SVM': 'Osteoarthristis', 'Ensemble': np.str_('Osteoarthristis')}
{'Random Forest': 'Gastroenteritis', 'Naive

In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define models and their predictions
models = {
    "Random Forest": rf_preds,
    "Naive Bayes": nb_preds,
    "SVM": svm_preds,
    "Ensemble": ensemble_preds
}

# Calculate metrics for each model
metrics = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": []
}

for name, preds in models.items():
    metrics["Model"].append(name)
    metrics["Accuracy"].append(accuracy_score(y_test, preds))
    metrics["Precision"].append(precision_score(y_test, preds, average='weighted', zero_division=0))
    metrics["Recall"].append(recall_score(y_test, preds, average='weighted', zero_division=0))
    metrics["F1-Score"].append(f1_score(y_test, preds, average='weighted', zero_division=0))

# Create and display DataFrame
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

# Save to CSV for reference
metrics_df.to_csv('model_metrics.csv', index=False)

           Model  Accuracy  Precision  Recall  F1-Score
0  Random Forest       1.0        1.0     1.0       1.0
1    Naive Bayes       1.0        1.0     1.0       1.0
2            SVM       1.0        1.0     1.0       1.0
3       Ensemble       1.0        1.0     1.0       1.0


In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Define models and their predictions
models = {
    "Random Forest": rf_preds,
    "Naive Bayes": nb_preds,
    "SVM": svm_preds,
    "Ensemble": ensemble_preds
}

# 1. Plot Confusion Matrix Heatmaps
plt.figure(figsize=(20, 15))
for i, (name, preds) in enumerate(models.items(), 1):
    cm = confusion_matrix(y_test, preds)
    plt.subplot(2, 2, i)
    sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
    plt.title(f'Confusion Matrix: {name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('confusion_matrices.png')
plt.close()

# 2. Compute Metrics for Bar Chart
metrics = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": []
}

for name, preds in models.items():
    metrics["Model"].append(name)
    metrics["Accuracy"].append(accuracy_score(y_test, preds))
    metrics["Precision"].append(precision_score(y_test, preds, average='weighted', zero_division=0))
    metrics["Recall"].append(recall_score(y_test, preds, average='weighted', zero_division=0))
    metrics["F1-Score"].append(f1_score(y_test, preds, average='weighted', zero_division=0))

metrics_df = pd.DataFrame(metrics)

# 3. Plot Bar Chart for Metrics Comparison
metrics_df.set_index('Model').plot(kind='bar', figsize=(10, 6), color=['#4CAF50', '#2196F3', '#FF9800', '#9C27B0'])
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.legend(title='Metrics')
plt.tight_layout()
plt.savefig('metrics_bar_chart.png')
plt.close()
# 4. Save Classification Report for Ensemble Model
report = classification_report(y_test, ensemble_preds, target_names=encoder.classes_, output_dict=False)
with open('ensemble_classification_report.txt', 'w') as f:
    f.write(report)
