In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('..\Data\stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


## Basic Pre-Processing

In [4]:
# checking for null values
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
# replacing the NA values in BMI with mean values
df['bmi'].fillna(float(df['bmi'].mean()), inplace=True)
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
# dropping the id column since it has no effect in stroke
df = df.drop(columns='id', axis='columns')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
# describing dataframe
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.698018,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.8,0.0
50%,45.0,0.0,0.0,91.885,28.4,0.0
75%,61.0,0.0,0.0,114.09,32.8,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


## Advanced Preprocessing

In [8]:
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [9]:
# removing 'Other' from Gender column because it's only 1
df.drop(df.loc[df['gender']=='Other'].index, inplace=True)
df = df.dropna()
df['gender'].value_counts()

Female    2994
Male      2115
Name: gender, dtype: int64

In [10]:
df.nunique()

gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3978
bmi                   419
smoking_status          4
stroke                  2
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5109 non-null   object 
 1   age                5109 non-null   float64
 2   hypertension       5109 non-null   int64  
 3   heart_disease      5109 non-null   int64  
 4   ever_married       5109 non-null   object 
 5   work_type          5109 non-null   object 
 6   Residence_type     5109 non-null   object 
 7   avg_glucose_level  5109 non-null   float64
 8   bmi                5109 non-null   float64
 9   smoking_status     5109 non-null   object 
 10  stroke             5109 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 479.0+ KB


## Changing object to numerical values

In [12]:
objCol = df.dtypes[df.dtypes == "object"].index.values
for col in objCol:
    tempDic = {}
    t = df[col].unique().tolist()
    
    for val,key in enumerate(t): tempDic[key]=val
    print(f'{col}\t{tempDic}')
    df[col]=df[col].map(tempDic)

gender	{'Male': 0, 'Female': 1}
ever_married	{'Yes': 0, 'No': 1}
work_type	{'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4}
Residence_type	{'Urban': 0, 'Rural': 1}
smoking_status	{'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3}


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5109 non-null   int64  
 1   age                5109 non-null   float64
 2   hypertension       5109 non-null   int64  
 3   heart_disease      5109 non-null   int64  
 4   ever_married       5109 non-null   int64  
 5   work_type          5109 non-null   int64  
 6   Residence_type     5109 non-null   int64  
 7   avg_glucose_level  5109 non-null   float64
 8   bmi                5109 non-null   float64
 9   smoking_status     5109 non-null   int64  
 10  stroke             5109 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 479.0 KB


In [14]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,0,0,228.69,36.6,0,1
1,1,61.0,0,0,0,1,1,202.21,28.893237,1,1
2,0,80.0,0,1,0,0,1,105.92,32.5,1,1
3,1,49.0,0,0,0,0,0,171.23,34.4,2,1
4,1,79.0,1,0,0,1,1,174.12,24.0,1,1


## Saving the cleaned data

In [15]:
df.to_csv('../data/final-stroke-data.csv', index=False)

## Train Test Validation Split

In [16]:
X = df.drop(['stroke'], axis='columns')
X.shape

(5109, 10)

In [17]:
y = df['stroke']
y.shape

(5109,)

In [18]:
# train splt from validation and test
x_train, x_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.30, random_state=10)

In [19]:
# test validation split
x_validation, x_test, y_validation, y_test = train_test_split(x_val_test, y_val_test, test_size=0.50, random_state=10)

In [20]:
print(x_train.shape, x_test.shape, x_validation, y_train.shape, y_test.shape, y_validation)

(3576, 10) (767, 10)       gender   age  hypertension  heart_disease  ever_married  work_type  \
1190       1  31.0             0              0             1          1   
4305       1  54.0             0              0             0          2   
1961       1  53.0             0              0             0          1   
2066       1  30.0             0              0             0          0   
5088       1  64.0             1              0             0          1   
...      ...   ...           ...            ...           ...        ...   
403        1  52.0             0              0             0          2   
4295       1  75.0             0              0             0          2   
873        1  28.0             0              0             0          0   
3257       1  71.0             0              0             0          0   
2304       0   5.0             0              0             1          3   

      Residence_type  avg_glucose_level   bmi  smoking_status  
11

### Saving the split data

In [21]:
x_train.to_csv('../data/train-data/X_train.csv', index=False)
x_test.to_csv('../data/test-data/X_test.csv', index=False)
x_validation.to_csv('../data/validation-data/X_validation.csv', index=False)
y_train.to_csv('../data/train-data/y_train.csv', index=False)
y_test.to_csv('../data/test-data/y_test.csv', index=False)
y_validation.to_csv('../data/validation-data/y_validation.csv', index=False)