# Diagnosis Prediction Tool: ETL Notebook

## Overview

#### This notebook contains the Python code to populate a database that contains two tables: 1. diseases; 2. diagnosis_samples. The data for these tables comes from a Kaggle dataset containing close to 5,000 samples of symptoms and their diagnoses.

* The diseases data comes from two csv files (symptom_Description.csv and symptom_precaution.csv) found at:  https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset

* The diagnosis_sample data comes from two csv files (dataset.csv and Symptom-severity.csv) found at:  https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset

In [None]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from sqlalchemy import Column, Integer, String, Float, Boolean, ForeignKey, UniqueConstraint, ForeignKeyConstraint
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.ext.declarative import declarative_base
import config
import numpy as np


## Extract

* Each of the 4 CSV files is loaded into its own dataframe:
    * symptom_Description.csv to disease_desc_df
    * symptom_precaution.csv to disease_treat_df
    * dataset.csv to samples_df
    * Symptom-severity.csv to symp_sever_df

In [None]:
# Read the disease descriptions (symptom_Description.csv) into disease_desc_df
filename = '../Resources/symptom_Description.csv'
disease_desc_df = pd.read_csv(filepath_or_buffer=filename)
# Preview the first five rows to confirm the load
disease_desc_df.head(5)

In [None]:
# Read the disease precautions (symptom_precaution.csv) into disease_treat_df
filename = '../Resources/symptom_precaution.csv'
disease_treat_df = pd.read_csv(filepath_or_buffer=filename)
# Preview the first five rows to confirm the load
disease_treat_df.head(5)

In [None]:
# Read the symptom/diagnosis samples (dataset.csv) into samples_df
filename = '../Resources/dataset.csv'
samples_df = pd.read_csv(filepath_or_buffer=filename)
# Preview the first five rows to confirm the load
samples_df.head(5)

In [None]:
# Read the symptom severity weights (Symptom-severity.csv) into symp_sever_df
filename = '../Resources/Symptom-severity.csv'
symp_sever_df = pd.read_csv(filepath_or_buffer=filename)
# Preview the first five rows to confirm the load
symp_sever_df.head(5)

## Transform

### 1. Transform Disease Data

In [None]:
# Put the Disease columns of the description and treatment dataframes side by
# side (row by row) and flag, per row, whether the two names are identical.
disease_compare_df = (
    disease_desc_df['Disease']
    .to_frame(name='desc_disease')
    .assign(treat_disease=disease_treat_df['Disease'])
)
disease_compare_df['Check'] = disease_compare_df['treat_disease'].eq(
    disease_compare_df['desc_disease']
)
disease_compare_df

In [None]:
# Trim leading/trailing whitespace from the Disease names in both dataframes
# so the join key below matches exactly.
for frame in (disease_desc_df, disease_treat_df):
    frame['Disease'] = frame['Disease'].str.strip()

# The treatment file spells this disease differently from the description
# file; normalise it so the row is not lost in the merge.
disease_treat_df['Disease'] = disease_treat_df['Disease'].replace(
    'Dimorphic hemmorhoids(piles)', 'Dimorphic hemorrhoids(piles)'
)

# Join the descriptions with the four precaution columns on the Disease name
precaution_columns = ['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']
disease_df = pd.merge(
    disease_desc_df,
    disease_treat_df[['Disease'] + precaution_columns],
    on=['Disease'],
)

# Lower-case the headings and present the precautions as treatments
column_names = {
    "Disease": "disease",
    "Description": "description",
    "Precaution_1": "treatment1",
    "Precaution_2": "treatment2",
    "Precaution_3": "treatment3",
    "Precaution_4": "treatment4",
}
disease_df = disease_df.rename(columns=column_names)

# TODO: add a classification column for disease severity (immediacy of seeking
# help, based on whether a treatment says to go to hospital or consult a
# doctor). Not implemented yet.

disease_df

### 2. Transform Diagnosis Sample Data

In [None]:
# Give every sample row a sequential integer identifier (0, 1, 2, ...)
row_count = len(samples_df)
samples_df['sample_id'] = np.arange(row_count)

# Replace missing values with empty strings
samples_df = samples_df.fillna(value='')
samples_df

In [None]:
# Reshape the wide Symptom_1 .. Symptom_17 columns into one long 'value'
# column (one row per sample/symptom slot) so unique symptoms can be listed.
symptom_columns = [f'Symptom_{slot}' for slot in range(1, 18)]
symptom_df = samples_df.melt(
    id_vars=['sample_id', 'Disease'],
    value_vars=symptom_columns,
)
symptom_df

In [None]:
# Count how often each symptom value occurs across all samples
symptom_counts = symptom_df['value'].value_counts()
symptom_counts

In [None]:
# Assign an equal weight to each noted symptom when indicated
symptom_df['weight'] = 1

# Remove leading and trailing spaces from disease. This is done BEFORE the
# misspelling fix so that padding around the name cannot defeat the
# exact-match comparison below.
symptom_df['Disease'] = symptom_df['Disease'].str.strip()

# Fix misspelling in file
symptom_df.loc[symptom_df['Disease'] == 'Dimorphic hemmorhoids(piles)', 'Disease'] = 'Dimorphic hemorrhoids(piles)'

# Remove spaces and parentheses from symptoms. regex=False makes the literal
# match explicit: the default of `regex` differs between pandas versions and
# a lone '(' or ')' is not a valid regular expression.
for unwanted_char in (' ', ')', '('):
    symptom_df['value'] = symptom_df['value'].str.replace(unwanted_char, '', regex=False)

# Reduce the columns to only those needed
symptom_df = symptom_df[['sample_id', 'Disease', 'value', 'weight']]

# Drop the rows for unused symptom slots (empty symptom value); the original
# row labels of the remaining rows are kept.
symptom_df = symptom_df.drop(symptom_df[symptom_df['value'] == ''].index)

# Rename column headings to lower case
symptom_df = symptom_df.rename(columns={"Disease": "disease"})
symptom_df

In [None]:
# Show every column when displaying the wide, pivoted dataframe
pd.set_option("display.max_columns", None)

# Pivot the long symptom rows into one row per sample_id and one column per
# symptom, holding the symptom's weight where the sample reports it.
weight_by_sample_and_symptom = (
    symptom_df
    .groupby(['sample_id', 'value'])['weight']
    .first()
)
samples_transformed_df = pd.DataFrame(weight_by_sample_and_symptom.unstack())

# Symptoms a sample does not report come out of the pivot as NaN; store 0
samples_transformed_df = samples_transformed_df.fillna(0)
samples_transformed_df.head()

In [None]:
# Add the diagnosed disease into the dataset.
# samples_transformed_df is indexed by sample_id, while symptom_df still
# carries the row labels produced by the melt. Assigning symptom_df['disease']
# directly would align those two unrelated indexes and only gives the right
# answer when the first len(samples) melted rows all survive the cleanup.
# Look the disease up by sample_id instead so the alignment is explicit.
disease_by_sample = symptom_df.groupby('sample_id')['disease'].first()
samples_transformed_df['disease'] = disease_by_sample
samples_transformed_df

In [None]:
# Remove spaces and parentheses from the severity table's symptom names so
# they match the cleaned symptom names used as column headings.
# regex=False makes the literal match explicit: the default of `regex` differs
# between pandas versions and a lone '(' or ')' is not a valid regular
# expression.
for unwanted_char in (' ', ')', '('):
    symp_sever_df['Symptom'] = symp_sever_df['Symptom'].str.replace(unwanted_char, '', regex=False)
symp_sever_df

In [None]:
# Apply the symptom weight values to the dataframe
weighted_samples_df = samples_transformed_df.copy()

# Build the symptom -> weight lookup once instead of re-scanning the severity
# table for every column. When a symptom is listed more than once, the first
# listed weight is used.
severity_by_symptom = (
    symp_sever_df
    .drop_duplicates(subset='Symptom')
    .set_index('Symptom')['weight']
)

# Every column except the diagnosis label is a symptom indicator
symptom_columns = [column for column in weighted_samples_df.columns if column != 'disease']

# Fail with a clear message naming the offending symptoms rather than an
# opaque positional IndexError part-way through the loop.
missing_symptoms = [column for column in symptom_columns if column not in severity_by_symptom.index]
if missing_symptoms:
    raise KeyError(f"No severity weight found for symptoms: {missing_symptoms}")

# Scale each indicator column by its symptom's severity weight
for column in symptom_columns:
    weighted_samples_df[column] = weighted_samples_df[column] * severity_by_symptom[column]

weighted_samples_df

# Load 
##### Save the new dataframes to csv files for the flask server to consume

In [None]:
# Write the merged disease descriptions/treatments to csv
disease_csv_path = "../Resources/disease_df.csv"
disease_df.to_csv(path_or_buf=disease_csv_path)

In [None]:
# Write the severity-weighted samples (with their diagnosis) to csv
weighted_samples_csv_path = "../Resources/weighted_samples_df.csv"
weighted_samples_df.to_csv(path_or_buf=weighted_samples_csv_path)