In [58]:
import math
import urllib.parse

import numpy as np
import pandas as pd

import psycopg2
from sqlalchemy import create_engine

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential

import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys

# config.py sits in the grandparent folder of the working directory;
# put that folder on sys.path so the module can be imported
sys.path.append(os.path.abspath('../..'))
from config import db_password

In [2]:
# Seaborn 'darkgrid' style with dotted grid lines of width 1.0
sns.set_style(
    style='darkgrid',
    rc={'grid.linestyle': ':', 'grid.linewidth': 1.0},
)

# Matplotlib's dark background style sheet, applied after the seaborn style
plt.style.use('dark_background')

### Load Data from Provisional Database

In [3]:
# Store SQL connection string.
# The password is percent-encoded with urllib.parse.quote before being
# embedded in the URL. Adjacent literals are concatenated at compile time,
# so the resulting string is identical to a single-line URL.
db_string = (
    f"postgresql://user:{urllib.parse.quote(db_password)}"
    "@127.0.0.1:5432/disease_prediction"
)

# Keep a reference to the Engine itself so its connection pool can be
# released explicitly once the table has been read
engine = create_engine(db_string)
try:
    # 'with' lets Python's context manager call Connection.close(), which
    # hands the connection back to the Engine's pool
    with engine.connect() as connection:
        # Load table from database into DataFrame
        data_df = pd.read_sql('dataset_clean', connection)
finally:
    # Connection.close() alone leaves the pooled DBAPI connection open;
    # dispose() closes it so nothing lingers for the life of the kernel
    engine.dispose()

In [73]:
# Offline alternative for testing: read the cleaned CSV instead of the database
# data_df = pd.read_csv('./Data/Cleaned/dataset_clean.csv')

# Preview the first five rows
data_df.head(n=5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,fungal infection,itching,skin rash,nodal skin eruptions,mottling skin,,,,,,,,,,,,,
1,fungal infection,skin rash,nodal skin eruptions,mottling skin,,,,,,,,,,,,,,
2,fungal infection,itching,nodal skin eruptions,mottling skin,,,,,,,,,,,,,,
3,fungal infection,itching,skin rash,mottling skin,,,,,,,,,,,,,,
4,fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


### Transform DataFrame to Boolean per Symptom

In [6]:
# List of symptom column names from data_df
# (every column after the first; assumes the first column is 'Disease')
symptom_cols = data_df.columns.to_list()[1:]

# Get values from symptom columns as a 2-D array, missing entries left as-is
symptom_values = data_df[symptom_cols].to_numpy()

# Sorted list of the unique symptoms.
# Missing entries are filtered with pd.notna rather than by stringifying and
# removing 'nan': that approach raised ValueError when no entry stringified
# to 'nan', and kept 'None' as a bogus symptom when missing values arrived
# as None instead of NaN.
symptom_list = sorted({str(symptom)
                       for symptom in symptom_values.ravel()
                       if pd.notna(symptom)})

In [51]:
# Create boolean DataFrame: the 'Disease' column followed by one column per
# symptom, True where that symptom appears in any symptom column of the row.
# The symptom sub-frame is selected once (it is identical for every symptom)
# and each symptom is tested with a single vectorized comparison instead of
# one np.isin call per row.
symptom_frame = data_df[symptom_cols]
symptom_flags = pd.DataFrame(
    {symptom: symptom_frame.eq(symptom).any(axis=1)
     for symptom in symptom_list},
    index=data_df.index,
)
bool_df = pd.concat([data_df[['Disease']], symptom_flags], axis=1)

# Check that no NaN's remain
print(bool_df.isnull().any().any())
bool_df

False


Unnamed: 0,Disease,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered mental state,anxiety,back pain,belly pain,blackheads,...,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow oozing scabs,yellow urine,yellowing of eyes,yellowish skin
0,fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,fungal infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,paroxysmal positional vertigo,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4916,acne,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4917,urinary tract infection,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4918,psoriasis,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [52]:
# Persist the boolean DataFrame as CSV in the cleaned-data folder
bool_df.to_csv(path_or_buf='./Data/Cleaned/dataset_bool.csv')

### Split Data into Training and Testing

In [56]:
# Feature array: every column except the 'Disease' label
X = bool_df.drop('Disease', axis=1).values
# Target array: the 'Disease' label of each row
y = bool_df.loc[:, 'Disease'].values

In [70]:
# Hold out a test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2_718_281_828,  # leading digits of the math constant e
)

### Compile Machine Learning Model

<!-- ##### Particularly in high-dimensional spaces, data can more easily be separated linearly and the simplicity of classifiers such as naive Bayes and linear SVMs might lead to better generalization than is achieved by other classifiers. -->

In [None]:
# Random Forest Classifier
# Seeded with the same value as the train/test split so that the forest's
# random sampling, and therefore its predictions, are repeatable across runs
rfc = RandomForestClassifier(random_state=2718281828)
rfc.fit(X_train, y_train)
rfc.predict(X_test)

In [None]:
# Support Vector Classification with the RBF kernel (scikit-learn's default,
# spelled out here to match the variable name)
rbf_svm = SVC(kernel='rbf').fit(X_train, y_train)
rbf_svm.predict(X_test)

In [None]:
# sklearn neural network
# Seeded with the same value as the train/test split so that weight
# initialisation, and therefore the fitted model, is repeatable across runs
mlp = MLPClassifier(random_state=2718281828)
mlp.fit(X_train, y_train)
mlp.predict(X_test)

In [None]:
# Keras neural network
# NOTE(review): this cell looks like an unfinished stub -- the model is created
# empty, no layers are added and compile() is never called before fit().
nn = Sequential()

# Neural network layers
# TODO: add the layers and call nn.compile(...) here; Keras is expected to
# reject fit() on an uncompiled model -- confirm.

# NOTE(review): y_train appears to hold the 'Disease' values as strings; they
# would presumably need a numeric encoding before fitting -- confirm.
nn.fit(X_train, y_train)
nn.predict(X_test)

In [None]:
# Output labels for input data
y_pred_rfc = rfc.predict(X_test)

y_pred_svm = rbf_svm.predict(X_test)

y_pred_mlp = mlp.predict(X_test)

# Sequential.predict_classes() was removed in TensorFlow 2.6. Taking the
# index of the highest score in each prediction row is the documented
# replacement for a multi-class output and also works on older versions.
# NOTE(review): this yields integer class indices, whereas the sklearn
# predictions above are label values -- map the indices back to labels
# before comparing against y_test.
y_pred_nn = np.argmax(nn.predict(X_test), axis=-1)

In [None]:
# Generate confusion matrix for the random forest predictions
# (requires `from sklearn.metrics import confusion_matrix` in the import cell;
# without it this line raises NameError)
rfc_conf_mat = confusion_matrix(y_test, y_pred_rfc)