In [76]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

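This notebook cleans the healthcare stroke dataset so that it is ready for machine learning: it loads the CSV, drops duplicate records, removes children and rows with missing numeric values, and fills in missing smoking-status values.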


In [77]:
def load_dataset(path: str):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    The strings "Unknown" and "N/A" are parsed as missing values (NaN), in
    addition to the markers pandas already treats as missing by default.

    Args:
        path (str): location of the CSV file to read.

    Returns:
        dataframe: the parsed data.
    """
    missing_markers = ["Unknown", "N/A"]
    return pd.read_csv(path, na_values=missing_markers)

def check_drop_duplicates(df: pd.DataFrame):
    """Drop rows that repeat an earlier row's ("id", "age") combination.

    Args:
        df (pd.DataFrame): frame containing at least the "id" and "age" columns.

    Returns:
        dataframe: a new frame that keeps the first occurrence of every
        ("id", "age") pair. The surviving rows keep their original index
        labels, and the input frame is not modified.

    Raises:
        KeyError: if "id" or "age" is not a column of ``df``.
    """
    # drop_duplicates performs its own duplicate detection, so a separate
    # duplicated() pass whose result is never used is unnecessary.
    return df.drop_duplicates(subset=["id", "age"])
                                        

In [None]:
def remove_missing_numeric_values(df: pd.DataFrame):
    """Keep only adult rows that have no missing numeric values.

    Two filters are applied in order:
      1. rows whose "age" is not greater than 17 (children) are removed;
         a missing age compares as False and is removed as well;
      2. rows with NaN in any numeric column are removed.

    Author's note on the data: only bmi has missing values (about 4 %).

    Args:
        df (pd.DataFrame): frame containing at least an "age" column.

    Returns:
        dataframe: the filtered rows, with their original index labels.
    """
    # the numeric columns are determined from the incoming frame, before filtering
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    adults_only = df.loc[df["age"] > 17]
    return adults_only.dropna(subset=numeric_cols)
    
    

In [None]:
def remove_missing_categorical_values(df: pd.DataFrame):
    """Fill missing "smoking_status" entries with "never smoked".

    Author's note on the data: among the text columns only smoking status has
    missing values, but about 20 % of them are missing. That is too high a
    proportion to simply drop, so for now they are replaced with the mode of
    the column, "never smoked".

    Args:
        df (pd.DataFrame): frame containing a "smoking_status" column.

    Returns:
        dataframe: a new frame with the same columns in the same order, where
        missing "smoking_status" values are filled in. The input frame is
        left unmodified.

    Raises:
        KeyError: if ``df`` has no "smoking_status" column.
    """
    filled_status = df["smoking_status"].fillna("never smoked")
    # assign() builds a new frame instead of writing into the caller's
    # DataFrame, which avoids hidden in-place mutation (and the
    # SettingWithCopyWarning that writing into a filtered frame can raise).
    return df.assign(smoking_status=filled_status)
    
    

In [None]:
def clean_data(path: str):
    """Load the dataset at ``path`` and run every cleaning step on it.

    The steps run in this order: load_dataset, check_drop_duplicates,
    remove_missing_numeric_values, remove_missing_categorical_values.

    Args:
        path (str): relative path to dataset

    Returns:
        dataframe: pandas dataframe of cleaned data
    """
    return (
        load_dataset(path)
        .pipe(check_drop_duplicates)
        .pipe(remove_missing_numeric_values)
        .pipe(remove_missing_categorical_values)
    )

In [None]:
if __name__ == "__main__":
    # Input CSV, given as a path relative to the current working directory.
    data_path = "healthcare-dataset-stroke-data.csv"
    # Run the whole cleaning pipeline (see clean_data) on that file.
    cleaned_data = clean_data(path=data_path)
    
    # Save the cleaned DataFrame as a pickle file ("cleaned_data.pkl"), also
    # relative to the current working directory.
    cleaned_data.to_pickle("cleaned_data.pkl")
