In [1]:
import pandas as pd 
import numpy as np

In [39]:
# acquire dataset: read the stroke CSV (relative path, resolved against the
# current working directory) into the working dataframe `df`
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [45]:
# overview of data: preview the first rows of df
# NOTE(review): the saved output already shows lowercased column names and
# values, which suggests this cell was last run after the lowercasing cells
# had been executed in an earlier pass - confirm with Restart & Run All
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,male,67.0,0,1,Yes,private,urban,228.69,36.6,formerly smoked,1
1,51676,female,61.0,0,0,Yes,self-employed,rural,202.21,,never smoked,1
2,31112,male,80.0,0,1,Yes,private,rural,105.92,32.5,never smoked,1
3,60182,female,49.0,0,0,Yes,private,urban,171.23,34.4,smokes,1
4,1665,female,79.0,1,0,Yes,self-employed,rural,174.12,24.0,never smoked,1


In [46]:
# lowercase the column names only (e.g. a capitalised header becomes lowercase);
# the cell values themselves are not touched by this statement
df.columns = df.columns.str.lower()

In [47]:
# lowercase the values of the text columns
# .str.lower() is used instead of .apply(str.lower) because it leaves missing
# values as NaN, whereas str.lower raises a TypeError when handed a NaN float
for text_col in ["residence_type", "work_type", "gender"]:
    df[text_col] = df[text_col].str.lower()

In [48]:
# finding nulls: in the saved output bmi is the only column with missing
# values (4909 non-null out of 5110 entries, i.e. 201 nulls)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [49]:
# summary statistics for the numeric columns; in the saved output the mean
# bmi is 28.893, i.e. about 28.9 when rounded to one decimal
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [50]:
# replace nulls with the (rounded) average bmi of 28.9
# The result is assigned back to the column instead of calling
# df.bmi.fillna(..., inplace=True): the in-place form acts on a temporary
# Series, emits a chained-assignment warning on pandas 2.x and does not
# update df at all once copy-on-write is enabled.
df["bmi"] = df["bmi"].fillna(28.9)

In [51]:
# encode ever_married as Yes -> 1 and No -> 0 in a single pass
# An explicit mapping is used instead of two chained .replace() calls, whose
# integer result relied on implicit object->int downcasting (deprecated in
# recent pandas). Any value other than "Yes"/"No" is passed through unchanged,
# so re-running the cell on already-encoded data is harmless.
married_codes = {"Yes": 1, "No": 0}
df["ever_married"] = df["ever_married"].map(
    lambda answer: married_codes.get(answer, answer)
)

In [52]:
# re-check the first rows after the bmi fill and the ever_married encoding
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,male,67.0,0,1,1,private,urban,228.69,36.6,formerly smoked,1
1,51676,female,61.0,0,0,1,self-employed,rural,202.21,28.9,never smoked,1
2,31112,male,80.0,0,1,1,private,rural,105.92,32.5,never smoked,1
3,60182,female,49.0,0,0,1,private,urban,171.23,34.4,smokes,1
4,1665,female,79.0,1,0,1,self-employed,rural,174.12,24.0,never smoked,1


In [62]:
# gender counts. The raw data had one extra "other" row (5110 entries vs the
# 5109 counted here); it no longer appears in this output because the drop
# cell below was executed before this cell (execution counts 61 and 62).
df.gender.value_counts()

female    2994
male      2115
Name: gender, dtype: int64

In [61]:
# remove every row whose gender is "other" and rebind df to the result
df = df.drop(index=df.index[df["gender"] == "other"])

In [73]:
# one-hot encode the categorical columns; drop_first=True drops the first
# level of each column so it acts as the baseline category
categorical_cols = ["gender", "work_type", "residence_type", "smoking_status"]
dummy_df = pd.get_dummies(df[categorical_cols], drop_first=True)

# columns to keep from original df (numeric features plus the stroke target)
numeric_cols = ["age", "hypertension", "heart_disease", "ever_married",
                "avg_glucose_level", "bmi", "stroke"]
num_df = df[numeric_cols]

# model_df: numeric columns + dummy columns only
model_df = pd.concat([num_df, dummy_df], axis=1)

# df: original columns + dummy columns. Dummy columns left behind by a
# previous run of this cell are dropped first, so re-running the cell does
# not keep appending duplicate columns to df.
df = pd.concat(
    [df.drop(columns=dummy_df.columns, errors="ignore"), dummy_df],
    axis=1,
)
 

In [75]:
def clean_df(path="healthcare-dataset-stroke-data.csv", bmi_fill=28.9):
    """
    Load the stroke dataset and return a cleaned frame plus a model-ready frame.

    Steps: lowercase the column names and the residence_type / work_type /
    gender values, fill missing bmi, encode ever_married as 1/0, drop rows
    whose gender is "other", and one-hot encode the categorical columns.

    Parameters
    ----------
    path : str, default "healthcare-dataset-stroke-data.csv"
        CSV file to read.
    bmi_fill : float, default 28.9
        Value written into missing bmi entries.

    Returns
    -------
    df : pandas.DataFrame
        All original columns (cleaned) followed by the dummy columns.
    model_df : pandas.DataFrame
        age, hypertension, heart_disease, ever_married, avg_glucose_level,
        bmi and stroke, followed by the dummy columns.
    """
    # acquire df
    df = pd.read_csv(path)

    # lowercase column names
    df.columns = df.columns.str.lower()

    # lowercase the text values; .str.lower() keeps missing values as NaN,
    # whereas apply(str.lower) raises on them
    for text_col in ("residence_type", "work_type", "gender"):
        df[text_col] = df[text_col].str.lower()

    # fill missing bmi. Assigning back (rather than df.bmi.fillna(...,
    # inplace=True)) guarantees df is updated; the chained in-place form
    # acts on a temporary Series and is a no-op under copy-on-write.
    df["bmi"] = df["bmi"].fillna(bmi_fill)

    # Yes -> 1, No -> 0 in one pass; other values are passed through unchanged
    married_codes = {"Yes": 1, "No": 0}
    df["ever_married"] = df["ever_married"].map(
        lambda answer: married_codes.get(answer, answer)
    )

    # drop "other" gender rows
    df = df.drop(index=df.index[df["gender"] == "other"])

    # one-hot encode the categorical columns (first level of each is dropped)
    categorical_cols = ["gender", "work_type", "residence_type", "smoking_status"]
    dummy_df = pd.get_dummies(df[categorical_cols], drop_first=True)

    # columns to keep from the original df for modelling
    numeric_cols = ["age", "hypertension", "heart_disease", "ever_married",
                    "avg_glucose_level", "bmi", "stroke"]
    num_df = df[numeric_cols]

    # join the dummy columns onto both frames
    model_df = pd.concat([num_df, dummy_df], axis=1)
    df = pd.concat([df, dummy_df], axis=1)

    return df, model_df
    

In [81]:
# split the data function
from sklearn.model_selection import train_test_split
def split_data(df, target_variable):
    '''
    Split a dataframe into train, validate and test subsets, stratified on a target column.

    df: the dataframe to split.
    target_variable: name of the column (as a string) to stratify both splits on.

    Returns train, validate, test in that order. The test set is 20% of df and
    the validate set is 25% of the remainder (roughly a 60/20/20 split overall);
    both splits use random_state=123.
    '''
    seed = 123

    # first split: hold out the test set
    train_validate, test = train_test_split(
        df,
        test_size=.2,
        random_state=seed,
        stratify=df[target_variable],
    )

    # second split: carve the validate set out of what is left
    train, validate = train_test_split(
        train_validate,
        test_size=.25,
        random_state=seed,
        stratify=train_validate[target_variable],
    )

    return train, validate, test

In [83]:
# split into train, validate and test, stratified on the "stroke" column
# NOTE(review): this splits df (original text columns plus dummy columns),
# not model_df - confirm that is the intended input for modelling
train, validate, test = split_data(df, "stroke")