In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np

In [2]:
# Load the stroke dataset (healthcare-dataset-stroke-data.csv) and preview the first rows.
# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not re-imported here.
stroke_df = pd.read_csv("data/healthcare-dataset-stroke-data.csv")
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## EDA & Preprocessing

In [3]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [13]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
# NOTE(review): this cell was executed out of order (execution count 13). Its saved
# output shows 5109 rows, no `id` column and a complete `bmi` count, i.e. it reflects
# the frame AFTER the cleaning cells further down. On a fresh Restart & Run All it
# would describe the raw data instead (5110 rows, `id` present, 4909 bmi values).
stroke_df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0
mean,43.229986,0.097475,0.054022,106.140399,28.919553,0.048738
std,22.613575,0.296633,0.226084,45.285004,7.7299,0.21534
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.24,23.7,0.0
50%,45.0,0.0,0.0,91.88,28.3,0.0
75%,61.0,0.0,0.0,114.09,32.8,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [4]:
# Count the missing (NaN) entries in every column
stroke_df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
# Mean BMI for each distinct age value, as a Series indexed by age
avg_bmi_by_age = (
    stroke_df
    .groupby("age")["bmi"]
    .mean()
)
avg_bmi_by_age

age
0.08     15.500000
0.16     14.766667
0.24     17.400000
0.32     18.080000
0.40     15.850000
           ...    
78.00    27.929032
79.00    27.862338
80.00    28.783582
81.00    27.951667
82.00    28.029091
Name: bmi, Length: 104, dtype: float64

In [6]:
# replace NaNs in BMI column with "avg_bmi_by_age" for the corresponding age
def replace_bmi(row):
    """Return the row's BMI, falling back to the mean BMI of the row's age.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys "bmi" and "age".

    Returns
    -------
    ``row["bmi"]`` unchanged when it is not missing; otherwise the entry of the
    module-level ``avg_bmi_by_age`` looked up by ``row["age"]``.
    """
    bmi_value = row["bmi"]
    # Guard clause: a BMI that is present is passed through untouched.
    if not pd.isna(bmi_value):
        return bmi_value
    return avg_bmi_by_age[row["age"]]
    
# Fill each missing BMI with the mean BMI of that row's age.
# Vectorised form of stroke_df.apply(replace_bmi, axis=1): map() looks up
# avg_bmi_by_age by age, and fillna() only touches rows whose BMI is NaN.
stroke_df["bmi"] = stroke_df["bmi"].fillna(stroke_df["age"].map(avg_bmi_by_age))

In [7]:
# Re-count missing values per column to confirm the BMI column has no NaNs left
stroke_df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
# Drop the id column, as it has no impact on the data analysis.
# errors="ignore" keeps this cell re-runnable: without it, a second run raises
# KeyError because stroke_df is reassigned and "id" is already gone.
stroke_df = stroke_df.drop(columns=["id"], errors="ignore")
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.19,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
# Number of distinct values in each column
stroke_df.nunique()

gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   487
smoking_status          4
stroke                  2
dtype: int64

In [10]:
# How many records fall into each gender category
stroke_df["gender"].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [11]:
# Remove the rows whose gender is "Other" (value_counts shows just 1 such record)
stroke_df = stroke_df.drop(
    index=stroke_df.loc[stroke_df["gender"] == "Other"].index
)

In [12]:
# Check that the "Other" value has been dropped from the gender column
stroke_df["gender"].unique()

array(['Male', 'Female'], dtype=object)

In [None]:
# Plot histograms of the frame's columns to check how the data is distributed;
# tight_layout() adjusts subplot spacing so the panels do not overlap
stroke_df.hist()
plt.tight_layout()