In [None]:
import numpy as np
import pandas as pd

### Main Dataset

In [None]:
# Load the raw (uncleaned) main dataset and preview its first five rows
data_df = pd.read_csv(filepath_or_buffer="./Data/Dirty/dataset.csv")
data_df.head(n=5)

In [None]:
# Count the distinct non-null values of the target column 'Disease'
data_df['Disease'].nunique(dropna=True)

In [None]:
# Row counts for the five most frequent diseases
data_df['Disease'].value_counts().head(5)

In [None]:
# Count how many distinct per-disease sample sizes exist; a result of 1
# means every disease has the same number of rows (the original note
# records that size as n=120)
disease_counts = data_df['Disease'].value_counts()
disease_counts.nunique()

In [None]:
# Frequency of each non-null value in 'Symptom_17' (noted by the author as
# the last column); a non-empty result shows the column holds real values
data_df['Symptom_17'].value_counts(dropna=True)

In [None]:
# A more rigorous confirmation that no symptom columns contain only NaN:
# a column made up entirely of NaN produces an empty .value_counts()
empty_cols = [
    name for name in data_df.columns[1:]
    if data_df[name].value_counts().empty
]

# Report every offending column by name
for name in empty_cols:
    print(name)

# True when every checked column holds at least one non-NaN value
empty_bool = not empty_cols

if empty_bool:
    print('No columns are empty.')

In [None]:
# Sanity check for the method used to detect empty columns:
# .value_counts() skips NaN, so an all-NaN column yields an empty result
test_df = pd.DataFrame({'column': [np.nan, np.nan, np.nan]})
print(test_df)

test_df['column'].value_counts().empty

In [None]:
# List of symptom column names from data_df (every column after the first)
symptom_cols = data_df.columns.to_list()[1:]

# Get values from symptom columns as an array of strings
# (astype(str) turns missing values into the literal text 'nan')
symptom_values = data_df[symptom_cols].values.astype(str)

# Get a sorted list of unique symptom names.
# Each value is stripped before de-duplicating because data_df has not been
# whitespace-cleaned yet at this point in the notebook; without the strip, a
# padded and an unpadded spelling of the same symptom would be counted twice.
# The 'nan' placeholder is discarded with a set difference, so the cell does
# not fail when the data happens to contain no missing values.
symptom_list = sorted(
    {value.strip() for value in symptom_values.ravel()} - {'nan'}
)
symptom_list[:5]

In [None]:
# Sorted unique 'Disease' values from data_df, compared as strings
disease_values = data_df['Disease'].values.astype(str)
disease_data_list = [*np.unique(disease_values)]
disease_data_list[:5]

In [None]:
# Print every disease name in data_df whose first or last character is a space
padded_diseases = [
    name for name in disease_data_list
    if name[:1] == " " or name[-1:] == " "
]

for name in padded_diseases:
    print(name)

In [None]:
# Strip all values in data_df of leading or trailing whitespace.
# Every column is overwritten in place with its stripped version; missing
# values stay missing because the .str accessor propagates NaN.
# NOTE(review): .str only works on string-like columns — this assumes no
# column of data_df was parsed as numeric; confirm against the raw file.
for col in data_df.columns:
    data_df[col] = data_df[col].str.strip()

In [None]:
# Verify the strip: print the name of any column that still holds a value
# beginning or ending with a space
padded_cols = []

for col in data_df.columns:
    text = data_df[col].str
    still_padded = text.startswith(" ").any() or text.endswith(" ").any()
    if still_padded:
        print(col)
        padded_cols.append(col)

# True when no column has leading/trailing spaces left
strip_bool = len(padded_cols) == 0

if strip_bool:
    print('No leading or trailing whitespace found.')

### Symptom Severity

In [None]:
# Load the raw symptom severity table and preview its last five rows
severity_df = pd.read_csv(filepath_or_buffer='./Data/Dirty/symptom_severity.csv')
severity_df.tail(n=5)

In [None]:
# Print the number of unique symptoms found in the dataset, then the number
# of rows in severity_df's 'Symptom' column, one per line
symptom_totals = (len(symptom_list), len(severity_df['Symptom']))

for total in symptom_totals:
    print(total)

In [None]:
# Show every row of severity_df whose symptom already appeared in an
# earlier row (the first occurrence is not flagged)
is_repeat = severity_df['Symptom'].duplicated(keep='first')
severity_df.loc[is_repeat]

In [None]:
# Display all records for the duplicated symptom 'fluid_overload'
severity_df[severity_df['Symptom'].eq('fluid_overload')]

In [None]:
# Keep only the first record for each symptom, then overwrite the weight of
# the remaining 'fluid_overload' record with 5 (per the original author's
# note, the average of the two duplicate records' weights).
severity_df = severity_df.drop_duplicates(subset='Symptom')

# Boolean mask selecting the surviving 'fluid_overload' row
fluid_exp = severity_df['Symptom'] == 'fluid_overload'

# Assign the integer 5, not the string '5': writing a string into a numeric
# column silently upcasts the whole column to object dtype, which pandas has
# deprecated (FutureWarning since 2.1). An integer is also still valid if the
# column was read as text, because the later int conversion accepts it.
severity_df.loc[fluid_exp, 'weight'] = 5
severity_df.loc[fluid_exp]

In [None]:
# Cast the 'weight' column to integer dtype and show the resulting dtypes
severity_df = severity_df.astype({'weight': 'int'})
severity_df.dtypes

In [None]:
# Order symptoms from heaviest to lightest weight and renumber the rows
# from 0 so the index follows the new order
severity_df = (
    severity_df
    .sort_values(by='weight', ascending=False)
    .reset_index(drop=True)
)
severity_df

In [None]:
# Distribution of symptom severity weights: count how many symptoms have each
# weight, ordered by weight, and draw the counts as a line plot.
# (Author's observation: the weights roughly follow a normal distribution.)
weight_counts = severity_df['weight'].value_counts()
weight_freq = weight_counts.sort_index()
weight_freq.plot(kind='line')

### Disease Descriptions

In [None]:
# Load the raw disease description table and preview its last five rows
desc_df = pd.read_csv(filepath_or_buffer='./Data/Dirty/disease_description.csv')
desc_df.tail(n=5)

### Disease Precautions

In [None]:
# Load the raw disease precaution table and preview its last five rows
precaution_df = pd.read_csv(filepath_or_buffer='./Data/Dirty/disease_precaution.csv')
precaution_df.tail(n=5)

In [None]:
# Names of the precaution columns: every column of precaution_df after the first
precaution_cols = list(precaution_df.columns[1:])

# All precaution cells as one array of strings
# (astype(str) turns missing values into the literal text 'nan')
precaution_values = precaution_df[precaution_cols].values.astype(str)

# Sorted unique precaution strings
precaution_list = [*np.unique(precaution_values)]
precaution_list[:5]

### Replace Values in all DataFrames

In [None]:
# Replace precaution values
# Maps each raw precaution string (typos, inconsistent casing/wording) to its
# cleaned form. Keys must match the raw cell text exactly, including the
# trailing space in "use neem in bathing ".
precaution_replace = {
    "anti itch medicine": "anti-itch medicine",
    "antiboitic therapy": "antibiotics",
    "apply calamine": "apply calamine lotion",
    "avoid abrupt head movment": "avoid abrupt head movement",
    # Replacement text itself was misspelled ("vegeterian") and would have
    # introduced a new typo into the cleaned data
    "avoid non veg food": "avoid non-vegetarian food",
    "avoid sudden change in body": "avoid sudden body movements",
    "avoid too many products": "avoid too many skincare products",
    "bath twice": "bathe twice",
    "check in pulse": "check pulse",
    "chew or swallow asprin": "chew or swallow aspirin",
    "Consult nearest hospital": "consult nearest hospital",
    "consume alovera juice": "consume aloe vera juice",
    "consume milk thistle": "drink vitamin c rich drinks",
    "dont stand still for long": "don't stand still for long",
    "eat fruits and high fiberous food": "eat fruits and foods high in fiber",
    "eat high calorie vegitables": "eat high-calorie vegetables",
    "follow up": "follow up with doctor",
    "keep mosquitos out": "keep mosquitos away",
    "salt baths": "warm bath with epsom salt",
    "switch to loose cloothing": "loosen clothing",
    # Missing value -> precaution for heart attack. `np.nan` is used because
    # the `np.NaN` alias was removed in NumPy 2.0.
    # NOTE(review): this key fills EVERY missing cell in the frame it is
    # applied to — confirm heart attack is the only row with a missing value.
    np.nan: "loosen clothing",
    "take otc pain reliver": "take OTC pain reliever",
    "take vapour": "steam inhalation",
    "use detol or neem in bathing water": "use oil of tea tree or peppermint in bathwater",
    "use neem in bathing ": "use oil of tea tree or peppermint in bathwater",
    "use oinments": "use ointments",
    "use poloroid glasses in sun": "use polarized glasses in sun",
    "wash hands through": "wash hands thoroughly",
    "wash hands with warm soapy water": "wash hands thoroughly",
    "wear ppe if possible": "wear PPE if possible"
}

# Apply the whole-cell replacements from precaution_replace to every column
precaution_df = precaution_df.replace(to_replace=precaution_replace)

In [None]:
# Clean the symptom names in data_df and severity_df.
# Keys are raw symptom names after all spaces have been removed; values are
# the corrected names, still using "_" as the word separator.
symptom_replace = {
    "scurring": "scarring",
    "diarrhoea": "diarrhea",
    "cold_hands_and_feets": "cold_hands_and_feet",
    "swollen_extremeties": "swollen_extremities",
    "foul_smell_ofurine": "foul_smell_of_urine",
    "altered_sensorium": "altered_mental_state",
    "dischromic_patches": "mottling_skin",
    "polyuria": "excessive_urination",
    "burning_micturition": "painful_urination",
    "silver_like_dusting": "blue-gray_complexion_(argyria)",
    "pus_filled_pimples": "pus-filled_pimples",
    "toxic_look_(typhos)": "toxic_look_(toxemia)",
    "yellow_crust_ooze": "yellow_oozing_scabs"
}

for frame in (data_df, severity_df):
    # Every column whose name contains "Symptom"
    symptom_cols = [name for name in frame.columns if "Symptom" in name]

    # Three passes, in this order:
    #   1. delete every space inside a symptom name
    #   2. swap whole values according to symptom_replace
    #   3. turn each "_" into a space
    frame[symptom_cols] = (
        frame[symptom_cols]
        .replace(' ', '', regex=True)
        .replace(symptom_replace)
        .replace('_', ' ', regex=True)
    )

In [None]:
# Replace 'Disease' in data_df, desc_df, precaution_df
# Lowercase the FIRST letter of every disease except the acronyms AIDS & GERD
disease_replace = {
    "Dimorphic hemorrhoids(piles)": "Hemorrhoids",
    "Dimorphic hemmorhoids(piles)": "Hemorrhoids",
    "Bronchial Asthma": "Bronchial asthma",
    "Common Cold": "Common cold",
    "Drug Reaction": "Drug reaction",
    "Paralysis (brain hemorrhage)": "Brain hemorrhage",
    "(vertigo) Paroymsal  Positional Vertigo": "Paroxysmal positional vertigo",
    "Osteoarthristis": "Osteoarthritis",
    "Typhoid": "Typhoid fever"
}


def _lowercase_first(name):
    """Return `name` with its first character lowercased.

    'GERD' and 'AIDS' are returned unchanged so the acronyms keep their
    capitals. Non-string values (e.g. NaN) and empty strings are also
    returned unchanged instead of raising on `name[0]`.
    """
    if not isinstance(name, str) or not name or name in ('GERD', 'AIDS'):
        return name
    return name[0].lower() + name[1:]


for df in [data_df, desc_df, precaution_df]:
    # Strip any preceding and trailing whitespace from this frame's own
    # 'Disease' column. (Previously this line stripped data_df[col] using a
    # loop variable left over from an earlier cell, so the 'Disease' column
    # of desc_df and precaution_df was never stripped.)
    df['Disease'] = df['Disease'].str.strip()

    # Replace values according to dictionary (whole-value matches)
    df['Disease'] = df['Disease'].replace(disease_replace)

    # Lowercase first letter of disease if not acronym
    df['Disease'] = df['Disease'].apply(_lowercase_first)

### Export Cleaned Data

In [None]:
# Write every cleaned DataFrame to ./Data/Cleaned/<name>_clean.csv,
# leaving the row index out of the file
file_names = {
    'dataset': data_df,
    'disease_description': desc_df,
    'disease_precaution': precaution_df,
    'symptom_severity': severity_df
}

for filename, frame in file_names.items():
    frame.to_csv(f"./Data/Cleaned/{filename}_clean.csv", index=False)