In [1]:
# all necessary imports
import warnings
from decimal import Decimal
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
# from xgboost import XGBClassifier
# import matplotlib.pyplot as plt
# ignore warnings generated due to usage of old version of tensorflow
warnings.simplefilter("ignore")

### Load Dataset

In [2]:
# Load Dataset scraped from NHP (https://www.nhp.gov.in/disease-a-z) & Wikipedia
# Scrapping and creation of dataset csv is done in a separate program
df_comb = pd.read_csv("dis_sym_dataset_comb.csv")

# creation of features and label for training the models
X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

In [3]:
# splitting data for training the classifiers and testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10)

In [4]:
# lists used for accuracy plots
accuracy_list = []
cross_accuracy_list = []
model_list = []

### Naive Bayes Classifier 

In [5]:
# Multinomial NB Classifier
mnb = MultinomialNB()
mnb = mnb.fit(X, Y)
# prediction of labels for the test data
mnb_pred = mnb.predict(x_test)
# calculation of accuracy score based on predictions performed
# converting to Decimal as rounding with float is inaccurate
acc_mnb = round(Decimal(accuracy_score(y_test, mnb_pred) * 100), 2)
accuracy_list.append(acc_mnb)
model_list.append("MNB")
print(f"Accuracy (MNB) : {acc_mnb}%")

# Cross Validation Accuracy MNB
# performing cross validation with 5 different splits
scores_mnb = cross_val_score(mnb, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_mnb.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (MNB): {score}%")

Accuracy (MNB) : 83.82%
Cross Validation Accuracy (MNB): 84.50%


### Random Forest Classifier

In [6]:
# RF Classifier
rf = RandomForestClassifier(n_estimators=10, criterion='entropy')
rf = rf.fit(X, Y)
# prediction of labels for the test data
rf_pred = rf.predict(x_test)
acc_rf = round(Decimal(accuracy_score(y_test, rf_pred) * 100), 2)
accuracy_list.append(acc_rf)
model_list.append("RF")
print(f"Accuracy (RF) : {acc_rf}%")

# Cross Validation Accuracy RF
# performing cross validation with 5 different splits
scores_rf = cross_val_score(rf, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_rf.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (RF): {score}%")

Accuracy (RF) : 91.52%
Cross Validation Accuracy (RF): 86.84%


### Multi-Layer Perceptron

In [42]:
# MLP Classifier
mlp = MLPClassifier(hidden_layer_sizes=(32, 32, 32), activation='relu', solver='adam', max_iter=50)
mlp = mlp.fit(X, Y)
# prediction of labels for the test data
mlp_pred = mlp.predict(x_test)
acc_mlp = round(Decimal(accuracy_score(y_test, mlp_pred) * 100), 2)
accuracy_list.append(acc_mlp)
model_list.append("MLP")
print(f"Accuracy (MLP) : {acc_mlp}%")

# Cross Validation Accuracy MLP
# performing cross validation with 5 different splits
scores_mlp = cross_val_score(mlp, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_mlp.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (MLP): {score}%")

Accuracy (MLP) : 91.97%
Cross Validation Accuracy (MLP): 87.09%


### Bayesian Inference Approach for Dynamic Disease Prediction

In [47]:
diseases = list(df_comb['label_dis'].unique())
diseases[:5]

['Abscess',
 'Acquired Capillary Haemangioma of Eyelid',
 'Acquired Immuno Deficiency Syndrome',
 'Acute encephalitis syndrome',
 'Adult Inclusion Conjunctivitis']

In [53]:
def calculate_probabilities(symptoms):
    probabilities = {}

    # Filter dataframe based on symptom presence (symptom columns == 1)
    df_filtered = df_comb.copy()  # Start with full dataframe
    
    for symptom in symptoms:
        df_filtered = df_filtered[df_filtered[symptom] == 1]  # Only rows where symptom == 1
    
    # Calculate probabilities for each disease based on filtered dataset
    if not df_filtered.empty:  # Proceed if the filtered dataframe has rows
        for disease in diseases:
            df_disease = df_filtered[df_filtered['label_dis'] == disease]
            probabilities[disease] = len(df_disease) / len(df_filtered)
    else:
        # If no rows match the symptoms, return equal probabilities or 0
        for disease in diseases:
            probabilities[disease] = 0

    return probabilities


In [56]:
probs = calculate_probabilities(['fatigue'])
print([(disease, probs[disease]) for disease in diseases if probs[disease] > 0])

[('Coronavirus disease 2019 (COVID-19)', 0.4), ('Dehydration', 0.2), ('Mouth Breathing', 0.4)]


### Using Random Forest predict_proba method

In [64]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df_comb = pd.read_csv('dis_sym_dataset_comb.csv')

# Define features and target
X = df_comb.drop(columns=['label_dis'])
y = df_comb['label_dis']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

# Function to predict disease probabilities based on input symptoms
def predict_disease(symptoms):
    # Create a sample input row with 0s for all symptoms
    input_data = pd.DataFrame([0] * len(X.columns), index=X.columns).T

    # Set the symptoms provided by the user to 1
    for symptom in symptoms:
        if symptom in input_data.columns:
            input_data[symptom] = 1
        else:
            print(f"Warning: Symptom '{symptom}' not found in dataset.")

    # Predict disease probabilities
    probabilities = rf_model.predict_proba(input_data)[0]

    # Map probabilities to disease names
    disease_probabilities = dict(zip(rf_model.classes_, probabilities))

    # Sort and print the diseases by probability
    sorted_disease_probabilities = dict(sorted(disease_probabilities.items(), key=lambda item: item[1], reverse=True))
    return sorted_disease_probabilities

# Example usage
symptoms_list = ['fatigue']  # Replace with symptoms of interest
probs = predict_disease(symptoms_list)
probs

Accuracy on test data: 87.95%
                                                         precision    recall  f1-score   support

                                                Abscess       0.00      0.00      0.00         1
                    Acquired Immuno Deficiency Syndrome       0.00      0.00      0.00         1
                            Acute encephalitis syndrome       0.67      0.67      0.67         6
                         Adult Inclusion Conjunctivitis       1.00      1.00      1.00         1
                           Alcohol Abuse and Alcoholism       1.00      1.00      1.00         9
                                              Alzheimer       1.00      0.83      0.91         6
                                             Amoebiasis       0.00      0.00      0.00         1
                                                Anaemia       0.00      0.00      0.00         4
                                          Anisometropia       0.00      0.00      0.00         1

{'Mouth Breathing': np.float64(0.72),
 'Coronavirus disease 2019 (COVID-19)': np.float64(0.12),
 'Dehydration': np.float64(0.08),
 'Alcohol Abuse and Alcoholism': np.float64(0.01),
 'Calculi': np.float64(0.01),
 'GERD': np.float64(0.01),
 'Herpes Simplex': np.float64(0.01),
 'Jaundice': np.float64(0.01),
 'Myocardial Infarction (Heart Attack)': np.float64(0.01),
 'Stroke': np.float64(0.01),
 'Multiple myeloma': np.float64(0.005),
 'Rickets': np.float64(0.005),
 'Abscess': np.float64(0.0),
 'Acquired Capillary Haemangioma of Eyelid': np.float64(0.0),
 'Acute encephalitis syndrome': np.float64(0.0),
 'Adult Inclusion Conjunctivitis': np.float64(0.0),
 'Alopecia (hair loss)': np.float64(0.0),
 'Alzheimer': np.float64(0.0),
 'Amaurosis Fugax': np.float64(0.0),
 'Amblyopia': np.float64(0.0),
 'Amoebiasis': np.float64(0.0),
 'Anaemia': np.float64(0.0),
 'Aniseikonia': np.float64(0.0),
 'Antepartum hemorrhage (Bleeding in late pregnancy)': np.float64(0.0),
 'Anthrax': np.float64(0.0),
 'Anxie