# Diagnosis Prediction Tool: ETL Notebook

## Overview

#### This notebook contains the Python code to populate a database that contains three tables: 1. diseases; 2. diagnosis_samples; 3. weighted_diagnosis_samples.  The data for these tables comes from a Kaggle dataset containing close to 5,000 samples of symptoms and diagnoses.

* The diseases data comes from two csv files (symptom_Description.csv and symptom_precaution.csv) found at:  https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset

* The diagnosis_sample data comes from two csv files (dataset.csv and Symptom-severity.csv) found at:  https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset

In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from sqlalchemy import Column, Integer, String, Float, Boolean, ForeignKey, UniqueConstraint, ForeignKeyConstraint
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.ext.declarative import declarative_base
import config
import numpy as np


## Extract

* Each of the 4 csv files is loaded into its own dataframe
    * symptom_Description.csv to disease_desc_df
    * symptom_precaution.csv to disease_treat_df
    * dataset.csv to samples_df
    * Symptom-severity.csv to symp_sever_df

In [None]:
# Read symptom_Description.csv into disease_desc_df and preview the first rows
filename = '../Resources/symptom_Description.csv'
disease_desc_df = pd.read_csv(filepath_or_buffer=filename)
disease_desc_df.head(n=5)

In [None]:
# Read symptom_precaution.csv into disease_treat_df and preview the first rows
filename = '../Resources/symptom_precaution.csv'
disease_treat_df = pd.read_csv(filepath_or_buffer=filename)
disease_treat_df.head(n=5)

In [None]:
# Read dataset.csv into samples_df and preview the first rows
filename = '../Resources/dataset.csv'
samples_df = pd.read_csv(filepath_or_buffer=filename)
samples_df.head(n=5)

In [None]:
# Read Symptom-severity.csv into symp_sever_df and preview the first rows
filename = '../Resources/Symptom-severity.csv'
symp_sever_df = pd.read_csv(filepath_or_buffer=filename)
symp_sever_df.head(n=5)

## Transform

### 1. Transform Disease Data

In [None]:
# Put the Disease columns of the description and treatment dataframes side by
# side and flag, row by row, whether the two raw values are identical.
disease_compare_df = disease_desc_df['Disease'].to_frame(name='desc_disease')
disease_compare_df['treat_disease'] = disease_treat_df['Disease']
disease_compare_df['Check'] = disease_compare_df['treat_disease'].eq(disease_compare_df['desc_disease'])
disease_compare_df

In [None]:
# Trim surrounding whitespace from the Disease key in both source dataframes
disease_desc_df['Disease'] = disease_desc_df['Disease'].str.strip()
disease_treat_df['Disease'] = disease_treat_df['Disease'].str.strip()

# Correct the misspelled disease name in the treatment file so it matches the
# spelling used as the merge key
disease_treat_df['Disease'] = disease_treat_df['Disease'].replace(
    {'Dimorphic hemmorhoids(piles)': 'Dimorphic hemorrhoids(piles)'}
)

# Join descriptions with their precautions on the Disease key
precaution_columns = ['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']
disease_df = pd.merge(
    disease_desc_df,
    disease_treat_df[['Disease'] + precaution_columns],
    on='Disease',
)

# Lower-case the headings and present the precautions as treatments
disease_column_names = {
    "Disease": "disease",
    "Description": "description",
    "Precaution_1": "treatment1",
    "Precaution_2": "treatment2",
    "Precaution_3": "treatment3",
    "Precaution_4": "treatment4",
}
disease_df = disease_df.rename(columns=disease_column_names)

# TODO: add a classification column for disease severity (immediacy of seeking
# help, based on whether the text says to go to hospital or consult a doctor).
# Nothing in this cell implements it yet.

disease_df

### 2. Transform Diagnosis Sample Data

In [None]:
# Number the samples 0..n-1 so each row keeps an identifier after reshaping
samples_df['sample_id'] = np.arange(len(samples_df))

# Replace missing symptom slots (NaN) with empty strings
samples_df = samples_df.fillna(value='')
samples_df

In [None]:
# Reshape from wide (one column per symptom slot) to long (one row per
# sample/slot pair) so the unique symptoms can be counted and listed
symptom_slots = [
    'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6',
    'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12',
    'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17',
]
symptom_df = samples_df.melt(id_vars=['sample_id', 'Disease'], value_vars=symptom_slots)
symptom_df

In [None]:
# Count how many times each symptom value occurs across all samples
symptom_df.loc[:, 'value'].value_counts()

In [None]:
# Assign an equal weight to each noted symptom when indicated
symptom_df['weight'] = 1

# Remove leading and trailing spaces from disease first, so the spelling fix
# below also matches names that carry stray whitespace (same order as the
# disease description/treatment cleanup)
symptom_df['Disease'] = symptom_df['Disease'].str.strip()

# Fix misspelling in file so the name matches the diseases data
symptom_df.loc[symptom_df['Disease'] == 'Dimorphic hemmorhoids(piles)', 'Disease'] = 'Dimorphic hemorrhoids(piles)'

# Remove spaces and parentheses from symptoms in a single literal pass;
# str.translate never interprets '(' or ')' as regular-expression syntax,
# whatever the installed pandas version's str.replace default is
symptom_df['value'] = symptom_df['value'].str.translate(str.maketrans('', '', ' ()'))

# Reduce the columns to only those needed
symptom_df = symptom_df[['sample_id', 'Disease', 'value', 'weight']]

# Drop the rows for unused symptom slots (empty symptom value); the original
# row labels of the remaining rows are kept
symptom_df = symptom_df[symptom_df['value'] != '']

# Rename column headings to lower case
symptom_df = symptom_df.rename(columns={"Disease": "disease"})
symptom_df

In [None]:
# Show every column when displaying the wide, one-column-per-symptom table
pd.set_option("display.max_columns", None)

# Pivot long -> wide: one row per sample_id, one column per symptom value,
# holding that symptom's weight; symptoms a sample does not have become 0
samples_transformed_df = (
    symptom_df.reset_index()
    .groupby(['sample_id', 'value'])['weight']
    .agg('first')
    .unstack(level='value')
    .fillna(0)
)
samples_transformed_df.head()

In [None]:
# Add the diagnosed disease into the dataset.
# samples_transformed_df is indexed by sample_id, whereas symptom_df still
# carries the row labels produced by the melt, so assigning the column directly
# would align on unrelated labels. Build a sample_id -> disease lookup (every
# row of a sample carries the same disease, so the first row is enough) and let
# pandas align on sample_id.
disease_by_sample = symptom_df.drop_duplicates(subset='sample_id').set_index('sample_id')['disease']
samples_transformed_df['disease'] = disease_by_sample
samples_transformed_df

In [None]:
# Strip every space and parenthesis from the symptom names in one pass so they
# use the same spelling as the cleaned symptom values of the samples
symp_sever_df['Symptom'] = symp_sever_df['Symptom'].str.translate(str.maketrans('', '', ' ()'))
symp_sever_df

In [None]:
# Apply the symptom weight values to the dataframe
weighted_samples_df = samples_transformed_df.copy()

# Build the symptom -> weight lookup once instead of scanning symp_sever_df for
# every column. When a symptom is listed more than once the first row is used.
symptom_weights = symp_sever_df.drop_duplicates(subset='Symptom').set_index('Symptom')['weight']

# Every column except the diagnosis label is a symptom indicator
symptom_columns = [column for column in weighted_samples_df.columns if column != 'disease']

# Fail with a message that names the offending symptoms rather than with an
# opaque positional lookup error
missing_symptoms = [column for column in symptom_columns if column not in symptom_weights.index]
if missing_symptoms:
    raise KeyError(f"No severity weight found for symptoms: {missing_symptoms}")

# Scale each indicator column by the weight of its symptom
for column in symptom_columns:
    weighted_samples_df[column] = weighted_samples_df[column] * symptom_weights[column]

weighted_samples_df

# Load 
##### Create the diseases, diagnosis_samples and weighted_diagnosis_samples tables and load the corresponding dataframes into the database.

In [None]:
##### Create Database Connection #####
from urllib.parse import quote_plus

protocol = 'postgresql'
username = config.myusername
password = config.mypassword
host = 'localhost'
port = config.myport_number
database_name = 'diagnosis_db'

# Percent-encode the credentials before placing them in the URL: characters
# such as '@', ':' or '/' in a username or password would otherwise be parsed
# as URL delimiters and produce a wrong connection target.
rds_connection_string = (
    f'{protocol}://{quote_plus(str(username))}:{quote_plus(str(password))}'
    f'@{host}:{port}/{database_name}'
)

# create the database if it doesn't already exist
engine = create_engine(rds_connection_string, echo = False)
if not database_exists(engine.url):
    create_database(engine.url)

# connect to the database
conn = engine.connect()

In [None]:
##### Create Tables #####

# instantiate the base
Base = declarative_base()

# Names of the symptom columns shared by both sample tables, in table order.
# Keeping them in one place stops the two table definitions drifting apart.
SYMPTOM_COLUMNS = (
    "fatigue", "vomiting", "high_fever", "loss_of_appetite", "nausea",
    "headache", "abdominal_pain", "yellowish_skin", "yellowing_of_eyes",
    "chills", "skin_rash", "malaise", "chest_pain", "joint_pain", "sweating",
    "itching", "dark_urine", "cough", "diarrhoea", "muscle_pain",
    "irritability", "excessive_hunger", "lethargy", "weight_loss",
    "breathlessness", "mild_fever", "phlegm", "swelled_lymph_nodes",
    "blurred_and_distorted_vision", "loss_of_balance", "dizziness",
    "abnormal_menstruation", "fast_heart_rate", "depression",
    "muscle_weakness", "red_spots_over_body", "family_history",
    "painful_walking", "swelling_joints", "obesity", "neck_pain",
    "mood_swings", "restlessness", "constipation", "back_pain", "stiff_neck",
    "indigestion", "acidity", "stomach_pain", "continuous_sneezing",
    "burning_micturition", "throat_irritation", "pain_behind_the_eyes",
    "increased_appetite", "polyuria", "brittle_nails", "enlarged_thyroid",
    "swollen_extremeties", "loss_of_smell", "slurred_speech",
    "receiving_unsterile_injections", "blood_in_sputum", "congestion",
    "runny_nose", "stomach_bleeding", "sinus_pressure", "coma",
    "palpitations", "rusty_sputum", "redness_of_eyes",
    "receiving_blood_transfusion", "drying_and_tingling_lips",
    "internal_itching", "history_of_alcohol_consumption", "unsteadiness",
    "inflammatory_nails", "fluid_overload", "puffy_face_and_eyes",
    "prominent_veins_on_calf", "visual_disturbances", "toxic_look_typhos",
    "acute_liver_failure", "yellow_urine", "belly_pain", "mucoid_sputum",
    "yellow_crust_ooze", "movement_stiffness", "irregular_sugar_level",
    "skin_peeling", "altered_sensorium", "swelling_of_stomach",
    "bloody_stool", "hip_joint_pain", "small_dents_in_nails",
    "bladder_discomfort", "knee_pain", "continuous_feel_of_urine",
    "silver_like_dusting", "red_sore_around_nose", "anxiety",
    "passage_of_gases", "cold_hands_and_feets", "bruising", "weight_gain",
    "lack_of_concentration", "cramps", "pain_during_bowel_movements",
    "pain_in_anal_region", "distention_of_abdomen", "irritation_in_anus",
    "swollen_legs", "blister", "dischromic_patches", "nodal_skin_eruptions",
    "ulcers_on_tongue", "blackheads", "dehydration", "watering_from_eyes",
    "shivering", "extra_marital_contacts", "pus_filled_pimples",
    "spinning_movements", "scurring", "spotting_urination", "muscle_wasting",
    "patches_in_throat", "sunken_eyes", "weakness_in_limbs",
    "swollen_blood_vessels", "weakness_of_one_body_side",
    "foul_smell_ofurine",
)


# define tables
class diseases(Base):
    """Disease reference table: a description and up to four treatments,
    keyed by the disease name."""
    __tablename__ = "diseases"
    disease = Column(String(250), nullable = False, unique = True, primary_key = True)
    description = Column(String(1000), nullable = False)
    treatment1 = Column(String(250), nullable = False)
    treatment2 = Column(String(250), nullable = True)
    treatment3 = Column(String(250), nullable = True)
    treatment4 = Column(String(250), nullable = True)


class diagnosis_samples(Base):
    """Diagnosis sample table: a disease label plus one Integer column per
    symptom in SYMPTOM_COLUMNS (attached below)."""
    __tablename__ = "diagnosis_samples"
    uid = Column(Integer, nullable = False, unique = True, primary_key = True)
    # The column-level ForeignKey fully declares the link to diseases.disease;
    # no additional table-level constraint is needed.
    disease = Column(String(250), ForeignKey("diseases.disease"), nullable = False)


class weighted_diagnosis_samples(Base):
    """Weighted diagnosis sample table: same layout as diagnosis_samples, a
    disease label plus one Integer column per symptom in SYMPTOM_COLUMNS."""
    __tablename__ = "weighted_diagnosis_samples"
    uid = Column(Integer, nullable = False, unique = True, primary_key = True)
    # The column-level ForeignKey fully declares the link to diseases.disease;
    # no additional table-level constraint is needed.
    disease = Column(String(250), ForeignKey("diseases.disease"), nullable = False)


# Attach one NOT NULL Integer column per symptom to both sample tables.
# Assigning a Column to an already-declared declarative class appends it to the
# class's table; each table needs its own Column object, hence a fresh Column
# per assignment. This must run before create_all below.
for _sample_table in (diagnosis_samples, weighted_diagnosis_samples):
    for _symptom in SYMPTOM_COLUMNS:
        setattr(_sample_table, _symptom, Column(Integer, nullable = False))

# create the tables
Base.metadata.create_all(engine)

In [None]:
##### Load Data into the Tables #####

# List the tables currently present in the database to confirm they were created
inspector = inspect(engine)
existing_tables = inspector.get_table_names()
existing_tables

In [None]:
# Append the disease rows to the diseases table (the dataframe index is not written)
disease_df.to_sql('diseases', engine, index=False, if_exists='append')

In [None]:
# Append the unweighted samples to the diagnosis_samples table (the dataframe
# index is not written)
samples_transformed_df.to_sql('diagnosis_samples', engine, index=False, if_exists='append')

In [None]:
# Load Weighted Diagnosis Samples to weighted_diagnosis_samples table.
# The source must be weighted_samples_df (the severity-weighted copy), not
# samples_transformed_df, otherwise this table is just a duplicate of
# diagnosis_samples.
weighted_samples_df.to_sql(name='weighted_diagnosis_samples', con=engine, if_exists='append', index=False)