# Importing necessary modules 

In [119]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Reading the data

In [120]:
strokes = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Let's see the first five rows with head() method

In [121]:
strokes.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# The shape of the data
### the data consists of 5110 rows and twelve columns, in pandas terminology, 12 serieses with 5110 elements each.

In [122]:
strokes.shape

(5110, 12)

# Now let's see some basic statstics of our data

In [123]:
strokes.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


# And the data details, regarding type and wheter it is a null or not

In [124]:
strokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


# I probably don't need the id column

In [125]:
strokes = strokes.drop(columns = ['id'])
strokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


# Let's see what columns have null values
### The BMI is the columns that has null values, indicated by having true next to it.

In [126]:
strokes.isnull().any()

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status       False
stroke               False
dtype: bool

# Removing null values
### I will use the simple `dropna()` from pandas to drop the rows with the null values.
### I will not drop the whole column, for sure, since the null values are only about 200 of 5000
### I will drop the null values with `inplace = False` so I can show how to substitute the missing data later.

In [127]:
strokes.dropna(inplace = False)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# The rows still exist which can be indicated by the shape

In [128]:
strokes.shape

(5110, 11)

# Replacing the null values
### I was going to use `fillna(method = 'bfill')`, but checking the documentation, it says that this has been dperecated, and `bfill()` can be used istead.

# With `fillna()`

In [129]:
strokes.fillna(method = 'bfill', inplace = False)

  strokes.fillna(method = 'bfill', inplace = False)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,32.5,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,40.0,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Using `bfill()`

In [130]:
strokes.bfill(inplace = True)

# Let's check for the null values
### No more null values :)

In [131]:
strokes.isnull().any()

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool

# Duplicate values
### We have no duplicate values
### so no need to remove them with `drop_duplicates()`

In [132]:
strokes.duplicated().any()

np.False_

# Normaliziation

In [133]:
for ser in strokes:
    if strokes[ser].dtype != 'object':
        strokes[[ser]] = MinMaxScaler(feature_range = (0, 1)).fit_transform(strokes[[ser]])

strokes.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,0.816895,0.0,1.0,Yes,Private,Urban,0.801265,0.30126,formerly smoked,1.0
1,Female,0.743652,0.0,0.0,Yes,Self-employed,Rural,0.679023,0.254296,never smoked,1.0
2,Male,0.975586,0.0,1.0,Yes,Private,Rural,0.234512,0.254296,never smoked,1.0
3,Female,0.597168,0.0,0.0,Yes,Private,Urban,0.536008,0.27606,smokes,1.0
4,Female,0.963379,1.0,0.0,Yes,Self-employed,Rural,0.549349,0.15693,never smoked,1.0


# Encoding the data with label encoder

In [134]:
for ser in strokes:
    if strokes[ser].dtype == 'object':
        strokes[ser] = LabelEncoder().fit_transform(strokes[ser])

# Now All the data is labeled :)

In [135]:
strokes.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,0.816895,0.0,1.0,1,2,1,0.801265,0.30126,1,1.0
1,0,0.743652,0.0,0.0,1,3,0,0.679023,0.254296,2,1.0
2,1,0.975586,0.0,1.0,1,2,0,0.234512,0.254296,2,1.0
3,0,0.597168,0.0,0.0,1,2,1,0.536008,0.27606,3,1.0
4,0,0.963379,1.0,0.0,1,3,0,0.549349,0.15693,2,1.0
