In [3]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import joblib

# 1. Load Dataset
# The CSV location and the row cap are named here so they can be changed in
# one place instead of being buried inside the calls below.
DATA_PATH = r'C:\Users\sondh\Desktop\healthCare_LLM\HealthCare-Prediction-Langchain-LLM-\models\symptoms_disease_cleaned.csv'
MAX_ROWS = 700  # only the first MAX_ROWS rows of the CSV are kept

df = pd.read_csv(DATA_PATH).head(MAX_ROWS)

# 2. Combine and encode symptoms (multi-hot)
symptom_cols = ['Symptom 1', 'Symptom 2', 'Symptom 3']
# Vocabulary: every distinct string found in the symptom columns, sorted so the
# feature order is deterministic. Non-string cells are skipped by the
# isinstance check.
all_symptoms = sorted({symptom for symptom in df[symptom_cols].values.ravel() if isinstance(symptom, str)})
print(all_symptoms)
# Multi-hot encode each row
def encode_row(row):
    """Turn one dataset row into a multi-hot vector over ``all_symptoms``.

    Reads the row's value for every column named in ``symptom_cols`` and
    returns a list with one entry per vocabulary symptom: 1 when that symptom
    is among the row's values, otherwise 0.
    """
    present = [row[column] for column in symptom_cols]
    return [int(candidate in present) for candidate in all_symptoms]

# Multi-hot vector for every row, also kept on the frame as a list-valued column
encoded_rows = df.apply(encode_row, axis=1)
df['features'] = encoded_rows

# One 0/1 column per symptom, in vocabulary order
X = pd.DataFrame(encoded_rows.tolist(), columns=all_symptoms)

# 3. Encode disease labels
le = LabelEncoder()
y = le.fit_transform(df['Disease'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

def report_metrics(title, y_true, y_pred, accuracy_suffix=" %"):
    """Print accuracy and weighted precision/recall/F1 for one set of predictions.

    Args:
        title: Text shown inside the ``=== ... ===`` section header.
        y_true: Ground-truth encoded labels.
        y_pred: Predicted encoded labels, aligned with ``y_true``.
        accuracy_suffix: Text printed after the accuracy value. It exists only
            so each section keeps exactly the text it printed before this
            helper was introduced (the Decision Tree section used "%").
    """
    # zero_division=0 is the value scikit-learn already substitutes when a
    # class has no predicted (or no true) samples; passing it explicitly
    # keeps the numbers and avoids an UndefinedMetricWarning per call.
    weighted = dict(average='weighted', zero_division=0)
    print(f"\n=== {title} ===")
    print("Accuracy: ", accuracy_score(y_true, y_pred) * 100, accuracy_suffix)
    print("Precision      :", precision_score(y_true, y_pred, **weighted) * 100, "%")
    print("Recall         :", recall_score(y_true, y_pred, **weighted) * 100, "%")
    print("F1 Score       :", f1_score(y_true, y_pred, **weighted) * 100, "%")


# === Decision Tree ===
dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
# Most frequently predicted class over the test split; not read again in this cell.
dt_ans = mode(dt_pred, axis=0)[0].flatten()[0]
report_metrics("Decision Tree", y_test, dt_pred, accuracy_suffix="%")


# === Random Forest ===
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
# Most frequently predicted class over the test split; not read again in this cell.
rf_ans = mode(rf_pred, axis=0)[0].flatten()[0]
report_metrics("Random Forest", y_test, rf_pred)


# === Naive Bayes ===
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
# Most frequently predicted class over the test split; not read again in this cell.
nb_ans = mode(nb_pred, axis=0)[0].flatten()[0]
report_metrics("Naive Bayes", y_test, nb_pred)

# === Majority ===
# Stack the three prediction vectors (one row per model) and take the most
# common label per test sample.
# NOTE(review): when all three models disagree there is no majority; mode()
# still returns a single label (the smallest encoded label in current SciPy
# releases -- confirm for the installed version), so such samples are decided
# by label order rather than by any model's confidence.
final_predictions = np.array([rf_pred, nb_pred, dt_pred])
final_majority_vote = mode(final_predictions, axis=0)[0].flatten()
report_metrics("Mode", y_test, final_majority_vote)

# Persist the three fitted classifiers and the label encoder
fitted_artifacts = (
    ('random_forest_model.pkl', rf_model),
    ('naive_bayes_model.pkl', nb_model),
    ('decision_tree_model.pkl', dt_model),
    ('label_encoder.pkl', le),
)
for artifact_path, artifact in fitted_artifacts:
    joblib.dump(artifact, artifact_path)

# Left as the cell's final bare expression so the notebook still displays the
# call's return value (the written filename list) as the cell output.
joblib.dump(list(all_symptoms), 'all_symptoms_list.pkl')




['Abdominal cramps', 'Abdominal pain', 'Abdominal swelling', 'Abnormal heart rhythm', 'Acne', 'Anxiety', 'Appetite loss', 'Back pain', 'Belly pain', 'Blackheads', 'Bladder discomfort', 'Bleeding gums', 'Bloating', 'Blurred vision', 'Body aches', 'Burning urination', 'Butterfly rash', 'Chest pain', 'Chest tightness', 'Cold intolerance', 'Confusion', 'Constipation', 'Cough', 'Dark urine', 'Dehydration', 'Depression', 'Diarrhea', 'Difficulty breathing', 'Difficulty swallowing', 'Dizziness', 'Dry mouth', 'Dry skin', 'Ear pain', 'Excessive hunger', 'Excessive sweating', 'Fainting', 'Fast heart rate', 'Fatigue', 'Fever', 'Foul smell of urine', 'Frequent urination', 'Hair loss', 'Headache', 'High fever', 'Hoarseness', 'Increased thirst', 'Irritability', 'Itching', 'Itchy eyes', 'Jaundice', 'Joint pain', 'Joint stiffness', 'Knee pain', 'Leg pain', 'Loss of appetite', 'Loss of consciousness', 'Lower abdominal pain', 'Morning pain', 'Muscle pain', 'Muscle weakness', 'Nausea', 'Neck swelling', 'N

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Random Forest ===
Accuracy:  54.77707006369427  %
Precision      : 55.96968318305899 %
Recall         : 54.77707006369427 %
F1 Score       : 51.97823661705265 %

=== Naive Bayes ===
Accuracy:  52.22929936305732  %
Precision      : 56.58965072228467 %
Recall         : 52.22929936305732 %
F1 Score       : 50.16332570472698 %

=== Mode ===
Accuracy:  54.77707006369427  %
Precision      : 57.96353328200461 %
Recall         : 54.77707006369427 %
F1 Score       : 52.82460612070018 %


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['all_symptoms_list.pkl']

In [5]:
import numpy as np
from scipy.stats import mode

# Restore the saved artifacts: the three classifiers, the LabelEncoder and the
# symptom vocabulary, in that order.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
(
    random_forest_model,
    naive_bayes_model,
    decision_tree_model,
    le,
    all_symptoms,
) = [
    joblib.load(artifact_path)
    for artifact_path in (
        'random_forest_model.pkl',
        'naive_bayes_model.pkl',
        'decision_tree_model.pkl',
        'label_encoder.pkl',
        'all_symptoms_list.pkl',
    )
]
# Function to predict disease based on user-selected symptoms
def predict_disease(chosen_symptoms, all_symptoms, X_columns=None):
    """Predict a disease name from the chosen symptoms by majority vote.

    Builds a multi-hot vector over the feature names, asks the three loaded
    models (random forest, naive Bayes, decision tree) for a prediction and
    decodes the most common predicted label.

    Args:
        chosen_symptoms: Symptom names selected by the user.
        all_symptoms: Symptom vocabulary; used as the feature order when
            ``X_columns`` is not given.
        X_columns: Feature names in the order the input vector should follow.
            Defaults to ``all_symptoms``.

    Returns:
        The first element of ``le.inverse_transform`` applied to the voted label.

    Warns:
        UserWarning: if a chosen symptom is not one of the feature names. Such
            a symptom sets no feature and therefore cannot affect the result.
    """
    import warnings

    feature_names = list(all_symptoms if X_columns is None else X_columns)

    # Previously an unknown or misspelled symptom was dropped silently, which
    # could leave an all-zero vector that the models still classify.
    unrecognized = [symptom for symptom in chosen_symptoms if symptom not in feature_names]
    if unrecognized:
        warnings.warn(
            f"Ignoring symptoms that are not model features: {unrecognized}",
            UserWarning,
            stacklevel=2,
        )

    # Prepare the input vector for prediction
    input_vector = [1 if symptom in chosen_symptoms else 0 for symptom in feature_names]
    print(input_vector)

    # Predict using the loaded models (each call gets a single-row batch)
    rf_pred_new = random_forest_model.predict([input_vector])
    nb_pred_new = naive_bayes_model.predict([input_vector])
    dt_pred_new = decision_tree_model.predict([input_vector])

    # Combine predictions using majority voting: one row per model, so the
    # mode along axis 0 is the most common label for the single sample.
    # NOTE(review): if all three models disagree, mode() still returns one
    # label (the smallest encoded label in current SciPy releases -- confirm).
    new_final_predictions = np.array([rf_pred_new, nb_pred_new, dt_pred_new])
    new_final_majority_vote = mode(new_final_predictions, axis=0)[0].flatten()

    # Decode the predicted label back to the disease name
    final_disease = le.inverse_transform(new_final_majority_vote)
    return final_disease[0]

# The user chooses three symptoms
chosen_symptoms = ['Bladder discomfort','Foul smell of urine','Nausea']

# Prediction
# The loaded vocabulary doubles as the feature order: the training matrix was
# built with columns=all_symptoms, so this is the same sequence as X.columns
# but does not require the training cell's X to still be in the kernel.
predicted_disease = predict_disease(chosen_symptoms, all_symptoms, all_symptoms)
print(f"Predicted Disease: {predicted_disease}")

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted Disease: Urinary Tract Infection (UTI)


