In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode
import joblib

# Read data from the CSV file
df = pd.read_csv('/content/symptoms_disease.csv')

# Fill any missing symptom columns with empty strings (for rows with fewer symptoms)
df = df.fillna('')

# Combine symptom columns into a single 'Symptoms' column
df['Symptoms'] = df['Symptom 1'] + ', ' + df['Symptom 2'] + ', ' + df['Symptom 3']

# Split Symptoms into individual features (based on commas)
df['Symptoms'] = df['Symptoms'].apply(lambda x: x.split(', '))

# One-hot encoding: Transform each symptom into a binary feature
all_symptoms = set(symptom for sublist in df['Symptoms'] for symptom in sublist if symptom)
for symptom in all_symptoms:
    df[symptom] = df['Symptoms'].apply(lambda x: 1 if symptom in x else 0)

# Drop the original Symptoms columns as they are no longer needed
df.drop(['Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptoms'], axis=1, inplace=True)

# Prepare X (features) and y (target)
X = df.drop('Disease', axis=1)
y = df['Disease']

# Encode target variable (Disease) into numerical labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Initialize the three models
random_forest_model = RandomForestClassifier(random_state=42)
naive_bayes_model = GaussianNB()
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train each model
random_forest_model.fit(X_train, y_train)
naive_bayes_model.fit(X_train, y_train)
decision_tree_model.fit(X_train, y_train)

# Test each model
rf_pred = random_forest_model.predict(X_test)
nb_pred = naive_bayes_model.predict(X_test)
dt_pred = decision_tree_model.predict(X_test)

# Combine predictions from all models
final_predictions = np.array([rf_pred, nb_pred, dt_pred])
# Use majority voting to make the final prediction
final_majority_vote = mode(final_predictions, axis=0)[0].flatten()

# Calculate accuracy for each model
rf_accuracy = accuracy_score(y_test, rf_pred)
nb_accuracy = accuracy_score(y_test, nb_pred)
dt_accuracy = accuracy_score(y_test, dt_pred)
majority_vote_accuracy = accuracy_score(y_test, final_majority_vote)
# print(rf_accuracy," ",nb_accuracy," ",dt_accuracy," ",majority_vote_accuracy)

# Save the models and LabelEncoder for later use in the app
joblib.dump(random_forest_model, 'random_forest_model.pkl')
joblib.dump(naive_bayes_model, 'naive_bayes_model.pkl')
joblib.dump(decision_tree_model, 'decision_tree_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [2]:
import numpy as np
from scipy.stats import mode

# Load the saved models and LabelEncoder
random_forest_model = joblib.load('random_forest_model.pkl')
naive_bayes_model = joblib.load('naive_bayes_model.pkl')
decision_tree_model = joblib.load('decision_tree_model.pkl')
le = joblib.load('label_encoder.pkl')

# List of all symptoms (same set as in training)
all_symptoms = [
    'Abdominal pain', 'Belly pain', 'Nausea',
    'Fever', 'Cough', 'Difficulty breathing',
    'Joint pain', 'Fatigue', 'Muscle weakness',
    'Chest pain', 'Fast heart rate', 'Shortness of breath',
    'Blurred vision', 'Loss of consciousness',
    'Bladder discomfort', 'Painful urination', 'Foul smell of urine',
    'Jaundice', 'Yellowing of eyes',
    'Excessive hunger', 'Unexplained weight loss',
    'Acne', 'Blackheads', 'Oily skin',
    'Depression', 'Irritability', 'Appetite loss',
    'Abdominal pain', 'Diarrhea', 'Vomiting',
    'Fatigue', 'Fever', 'Sweating',
    'Headache', 'Nausea', 'Sensitivity to light',
    'Back pain', 'Leg pain', 'Weakness',
    'Ear pain', 'Sore throat', 'Swollen lymph nodes',
    'Weight gain', 'Cold intolerance', 'Constipation',
    'Rash', 'Itching', 'Swelling',
    'Abnormal heart rhythm', 'Dizziness', 'Fainting',
    'Dry mouth', 'Increased thirst', 'Frequent urination',
    'Knee pain', 'Swelling', 'Stiffness',
    'Vomiting', 'Abdominal cramps', 'Bloating',
    'Persistent cough', 'Chest tightness', 'Wheezing',
    'Fatigue', 'Hair loss', 'Sensitivity to cold',
    'Weakness', 'Pale skin', 'Shortness of breath',
    'Nausea', 'Vomiting', 'Abdominal swelling',
    'Painful joints', 'Skin rash', 'Sun sensitivity',
    'Loss of appetite', 'Fatigue', 'Weight loss',
    'Frequent urination', 'Increased thirst', 'Blurred vision',
    'Fever', 'Red skin', 'Pus discharge',
    'Nausea', 'Vomiting', 'Severe headache',
    'Redness of eyes', 'Watery discharge', 'Itching',
    'Hoarseness', 'Difficulty swallowing', 'Neck swelling'
]

# Save the all_symptoms list after training
joblib.dump(list(all_symptoms), 'all_symptoms_list.pkl')

# Function to predict disease based on user-selected symptoms
def predict_disease(chosen_symptoms, all_symptoms, X_columns):
    # Prepare the input vector for prediction
    input_vector = [1 if symptom in chosen_symptoms else 0 for symptom in X_columns]
    print(input_vector)
    # Predict using the loaded models
    rf_pred_new = random_forest_model.predict([input_vector])
    nb_pred_new = naive_bayes_model.predict([input_vector])
    dt_pred_new = decision_tree_model.predict([input_vector])

    # Combine predictions using majority voting
    new_final_predictions = np.array([rf_pred_new, nb_pred_new, dt_pred_new])
    new_final_majority_vote = mode(new_final_predictions, axis=0)[0].flatten()

    # Decode the predicted label back to the disease name
    final_disease = le.inverse_transform(new_final_majority_vote)
    return final_disease[0]

# Example usage in your app:
# Let the user choose 3 symptoms
chosen_symptoms = ['Abdominal pain','Belly pain','Nausea']  # This would come from user input in your app

# Predict the disease based on the symptoms
predicted_disease = predict_disease(chosen_symptoms, all_symptoms, X.columns)
print(f"Predicted Disease: {predicted_disease}")

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted Disease: Gastroenteritis


