In [3]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import wrangle as w

# Acquire

* Data has been acquired from Kaggle at https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
* The data set contained 319,795 observations and 18 features before cleaning
* Each observation represents one person in the study
* Each feature represents an aspect of each person's health
* The data set is imbalanced: under 9% of respondents report having had coronary heart disease or myocardial infarction

In [4]:
# Load the three splits through the project's wrangle module (imported as w).
# NOTE(review): get_my_data is defined outside this notebook; presumably it returns
# the already prepared train/validate/test frames — confirm against wrangle.py.
train, validate, test = w.get_my_data()

In [5]:
# Display the training split returned by w.get_my_data().
train

Unnamed: 0,heart_disease,bmi,smoker,heavy_drinker,stroke,poor_physical_health_days,poor_mental_health_days,difficulty_walking,sex,age_category,...,age_category_80 or older,general_health_Excellent,general_health_Fair,general_health_Good,general_health_Poor,general_health_Very good,bmi_scaled,sleep_hours_scaled,poor_physical_health_days_scaled,poor_mental_health_days_scaled
0,No,25.94,No,No,No,30.0,30.0,No,Female,35-39,...,0,0,0,0,1,0,-0.187246,-1.0,15.0,10.000000
1,No,21.62,Yes,Yes,No,1.0,3.0,No,Male,55-59,...,0,0,0,0,0,1,-0.773406,0.5,0.5,1.000000
2,No,33.23,No,No,No,0.0,0.0,No,Male,45-49,...,0,1,0,0,0,0,0.801900,0.5,0.0,0.000000
3,No,27.20,No,No,No,0.0,0.0,No,Male,45-49,...,0,1,0,0,0,0,-0.016282,0.0,0.0,0.000000
4,No,18.16,Yes,No,No,2.0,0.0,No,Male,18-24,...,0,1,0,0,0,0,-1.242877,0.5,1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204663,No,25.39,No,No,No,0.0,0.0,No,Female,65-69,...,0,0,0,1,0,0,-0.261872,0.5,0.0,0.000000
204664,No,22.15,Yes,No,No,0.0,1.0,No,Female,30-34,...,0,1,0,0,0,0,-0.701493,0.0,0.0,0.333333
204665,No,27.41,No,No,No,0.0,0.0,No,Male,50-54,...,0,0,0,0,0,1,0.012212,0.0,0.0,0.000000
204666,No,22.89,No,No,No,0.0,0.0,No,Male,50-54,...,0,0,0,1,0,0,-0.601085,0.5,0.0,0.000000


In [None]:
# Read the raw data set; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('heart_disease_2020.csv')

In [None]:
# List the column names as they appear in the raw file (before renaming).
df.columns

# Prepare

* There were no null values in this data set
* Features were renamed for convention and readability
* Dummy variables were created for categorical features for use in modeling
* Data was split into train, validate, and test dataframes (approximately 64% / 16% / 20%), stratifying on heart disease
* Scaled versions of numeric variables were added to the dataframe for use in modeling

In [None]:
# Raw column name -> snake_case name used throughout the rest of the notebook.
column_name_map = {
    'HeartDisease': 'heart_disease',
    'BMI': 'bmi',
    'Smoking': 'smoker',
    'AlcoholDrinking': 'heavy_drinker',
    'Stroke': 'stroke',
    'PhysicalHealth': 'poor_physical_health_days',
    'MentalHealth': 'poor_mental_health_days',
    'DiffWalking': 'difficulty_walking',
    'Sex': 'sex',
    'AgeCategory': 'age_category',
    'Race': 'race',
    'Diabetic': 'diabetic',
    'PhysicalActivity': 'physical_activity',
    'GenHealth': 'general_health',
    'SleepTime': 'sleep_hours',
    'Asthma': 'asthma',
    'KidneyDisease': 'kidney_disease',
    'SkinCancer': 'skin_cancer',
}

# rename returns a new frame; labels missing from df are ignored by default
df = df.rename(columns=column_name_map)

In [None]:
# Categorical columns to one-hot encode for modeling.
# NOTE(review): 'stroke' and 'sex' hold string values in the preview earlier in
# this notebook but are not encoded here (nor are 'race' / 'asthma') —
# confirm the omission is intentional.
cat_cols = ['heart_disease',
            'smoker',
            'heavy_drinker',
            'difficulty_walking',
            'diabetic',
            'physical_activity',
            'kidney_disease',
            'skin_cancer',
            'age_category',
            'general_health']

# Pin the 0/1 integer encoding: pandas >= 2.0 defaults to bool dummies, older
# versions defaulted to uint8. Being explicit gives the same columns on both.
dummies = pd.get_dummies(df[cat_cols], dtype='uint8')

# Drop dummy columns left over from an earlier run of this cell before joining,
# so re-running the cell does not fail with "columns overlap". The original
# categorical columns stay in df, so the dummies can always be rebuilt.
df = df.drop(columns=dummies.columns, errors='ignore').join(dummies)

In [None]:
def split_my_data(df, target='heart_disease', test_size=.2, validate_size=.2, random_state=123):
    '''Splits data and returns a train, validate, and test dataframe.

    The frame is first split into train+validate and test, then train+validate
    is split again into train and validate. Both splits are stratified on the
    `target` column. With the default sizes this yields roughly
    64% / 16% / 20% of the rows.

    Parameters
    ----------
    df : DataFrame
        Data to split; must contain the `target` column.
    target : str, default 'heart_disease'
        Column whose class proportions are preserved in every split.
    test_size : float or int, default .2
        Forwarded to train_test_split for the first split (df -> test).
    validate_size : float or int, default .2
        Forwarded to train_test_split for the second split
        (train+validate -> validate).
    random_state : int, default 123
        Seed used for both splits so the result is reproducible.

    Returns
    -------
    tuple of DataFrame
        (train, validate, test), each with a fresh 0..n-1 index.
        `df` itself is not modified.
    '''

    # split df into train_validate and test
    train_validate, test = train_test_split(df,
                                            test_size=test_size,
                                            random_state=random_state,
                                            stratify=df[target])

    # split train_validate into train and validate
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=random_state,
                                       stratify=train_validate[target])

    # return re-indexed copies instead of mutating the split frames in place
    return (train.reset_index(drop=True),
            validate.reset_index(drop=True),
            test.reset_index(drop=True))

In [None]:
# Split the prepared frame; this rebinds train/validate/test, replacing the
# frames loaded from w.get_my_data() earlier in the notebook.
train, validate, test = split_my_data(df)

In [None]:
def scale_data(train, validate, test):
    """Adds scaled columns to split data.

    A RobustScaler is fitted on the training split only and then used to
    transform all three splits. For every scaled column a new
    '<name>_scaled' column is added next to the untouched original.

    Parameters
    ----------
    train, validate, test : DataFrame
        Splits containing 'bmi', 'sleep_hours', 'poor_physical_health_days'
        and 'poor_mental_health_days'.

    Returns
    -------
    tuple of DataFrame
        (train, validate, test) as new frames with the four '_scaled'
        columns added. The input frames are not modified.
    """

    # Scaling continuous variables
    cols_to_scale = ['bmi',
                     'sleep_hours',
                     'poor_physical_health_days',
                     'poor_mental_health_days']
    scaled_names = [col + "_scaled" for col in cols_to_scale]

    # fit on the training split only; validate and test reuse the fitted scaler
    scaler = RobustScaler().fit(train[cols_to_scale])

    def _add_scaled(frame):
        # One split in, same split out with its '_scaled' columns attached.
        scaled = np.asarray(scaler.transform(frame[cols_to_scale]))
        # assign() places the arrays positionally, so rows stay aligned whatever
        # the frame's index looks like; it also overwrites any '_scaled' columns
        # left from a previous call, which keeps re-running the cell safe.
        return frame.assign(**dict(zip(scaled_names, scaled.T)))

    return _add_scaled(train), _add_scaled(validate), _add_scaled(test)

In [None]:
# Rebind the three splits to the versions that carry the added *_scaled columns.
train, validate, test = scale_data(train, validate, test)

In [None]:
# Display the training split after scaling.
train

In [None]:
# Display the validation split after scaling.
validate

In [None]:
# Display the test split after scaling.
test

In [None]:
# Share of all rows in train; two successive 80/20 splits give about 0.64.
len(train)/len(df)

In [None]:
# Share of all rows in validate; 20% of the remaining 80% gives about 0.16.
len(validate)/len(df)

In [None]:
# Share of all rows in test; the first split holds out about 0.20.
len(test)/len(df)