# Import necessary libraries 

In [124]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot

In [125]:
import warnings

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Switch matplotlib figures to Seaborn's default styling.
sb.set()

In [126]:
from sklearn.preprocessing import RobustScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [127]:
# Read the raw stroke dataset from the CSV file in the working directory.
stroke_data = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")
# Print column dtypes and non-null counts to spot missing values.
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


### Clarifying the variables that have special cases

#### Gender:

In [128]:
# Number of rows in each gender category.
stroke_data.gender.value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [129]:
# Remove the rows whose gender is 'Other'.
# An explicit `!=` comparison replaces unary minus on a boolean mask: negating
# booleans with `-` is not a supported NumPy operation and only works on a
# pandas Series through special-casing. `.copy()` makes the result an
# independent frame, so the column assignments and in-place drops performed in
# later cells act on it directly rather than on a slice of the loaded data.
stroke_data = stroke_data[stroke_data['gender'] != 'Other'].copy()

In [130]:
# Re-check the gender categories after the 'Other' row was removed.
stroke_data.gender.value_counts()

Female    2994
Male      2115
Name: gender, dtype: int64

Since 'Other' has only one record, we removed that record from the data set.

#### Age:

In [131]:
# Number of rows for each distinct age value.
stroke_data.age.value_counts()

78.00    102
57.00     95
52.00     90
54.00     87
51.00     86
        ... 
1.40       3
0.48       3
0.16       3
0.40       2
0.08       2
Name: age, Length: 104, dtype: int64

Since there is a range of age, we need to categorize according to:
- 0-20: Youth
- 20-40: Adulthood
- 40-60: Middle Age
- 60-80: Senior
- 80-100: oldest old

#### Hypertension


In [132]:
# Number of rows for each hypertension flag value.
stroke_data.hypertension.value_counts()

0    4611
1     498
Name: hypertension, dtype: int64

This variable does not need resampling.

#### Heart disease

In [134]:
# Number of rows for each heart_disease flag value.
stroke_data.heart_disease.value_counts()

0    4833
1     276
Name: heart_disease, dtype: int64

This variable does not need resampling as they either have or do not have it.

#### Ever Married:

In [135]:
# Number of rows for each ever_married answer.
stroke_data.ever_married.value_counts()

Yes    3353
No     1756
Name: ever_married, dtype: int64

This variable does not need resampling, as each person is either married or not.

#### Work Type:

In [136]:
# Number of rows in each work_type category.
stroke_data.work_type.value_counts()

Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

There are no special categories that we need to consider.

#### Residence Type:

In [None]:
# Number of rows in each Residence_type category.
stroke_data.Residence_type.value_counts()

There are 2 types of residence areas.

#### Glucose level:

In [137]:
# Number of rows for each distinct average glucose reading.
stroke_data.avg_glucose_level.value_counts()

93.88     6
91.68     5
91.85     5
83.16     5
73.00     5
         ..
94.07     1
111.93    1
94.40     1
95.57     1
85.28     1
Name: avg_glucose_level, Length: 3978, dtype: int64

Since there is a range of glucose levels, we need to categorize according to:
- less than 100
- 100-150
- 150-200
- 200-250
- 250+

#### BMI:

In [139]:
# Number of rows for each distinct BMI value (missing values are not counted).
stroke_data.bmi.value_counts()

28.7    41
28.4    38
26.7    37
27.6    37
26.1    37
        ..
48.7     1
49.2     1
51.0     1
49.4     1
14.9     1
Name: bmi, Length: 418, dtype: int64

Since there is a range of BMI values, we need to categorize according to:
- Underweight: below 18.5 
- Normal: between 18.5 and 24.9
- Overweight: between 25 and 29.9
- Obese: between 30 and 65
- ExtremeObese: above 65

#### Smoking:

In [142]:
# Number of rows in each smoking_status category.
stroke_data.smoking_status.value_counts()

never smoked       1892
Unknown            1544
formerly smoked     884
smokes              789
Name: smoking_status, dtype: int64

There are no special categories that we need to handle, as the 'Unknown' category is large enough to be treated as a category of its own that will affect the dataset.

## Variables that need resampling:

In [143]:
# Preview the first five rows before any column is dropped or binned.
stroke_data.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [144]:
# Drop the id column since it is not needed for analysis.
# The frame is reassigned instead of dropped in place, and errors='ignore'
# lets this cell be re-run after the column is already gone.
stroke_data = stroke_data.drop(columns='id', errors='ignore')

# Replace missing BMI values with the column mean.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on stroke_data['bmi'] modifies an intermediate Series
# and is not guaranteed to update the frame (it is a no-op under pandas
# Copy-on-Write).
stroke_data['bmi'] = stroke_data['bmi'].fillna(stroke_data['bmi'].mean())

In [145]:
# Preview the first five rows after dropping id and filling BMI.
stroke_data.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.89456,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [146]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
stroke_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0
mean,43.229986,0.097475,0.054022,106.140399,28.89456,0.048738
std,22.613575,0.296633,0.226084,45.285004,7.698235,0.21534
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.24,23.8,0.0
50%,45.0,0.0,0.0,91.88,28.4,0.0
75%,61.0,0.0,0.0,114.09,32.8,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


### Age group

In [147]:
# Bucket age into 20-year bands. right=False makes each band left-closed:
# [0, 20), [20, 40), [40, 60), [60, 80) and [80, 1000) for everything above.
bins = [0, 20, 40, 60, 80, 1000]
labels = ['0-20', '20-40', '40-60', '60-80', '80+']
stroke_data['AgeGroup'] = pd.cut(
    x=stroke_data['age'],
    bins=bins,
    labels=labels,
    right=False,
)
# Show how many rows landed in each band.
stroke_data.AgeGroup.value_counts()

40-60    1564
20-40    1203
60-80    1190
0-20      966
80+       186
Name: AgeGroup, dtype: int64

### Sugar level

In [148]:
# Bucket the average glucose level into left-closed ranges:
# [0, 100), [100, 150), [150, 200), [200, 250) and [250, 1000).
bins = [0, 100, 150, 200, 250, 1000]
labels = ['< 100', '100-150', '150-200', '200-250', '250+']
stroke_data['GlucoseLevelRange'] = pd.cut(
    x=stroke_data['avg_glucose_level'],
    bins=bins,
    labels=labels,
    right=False,
)
# The continuous column is replaced by its binned version.
stroke_data.drop(columns='avg_glucose_level', inplace=True)
stroke_data['GlucoseLevelRange'].value_counts()

< 100      3131
100-150    1247
200-250     409
150-200     297
250+         25
Name: GlucoseLevelRange, dtype: int64

### BMI

In [149]:
# Replace any missing BMI values with the column mean so every row gets a group.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on stroke_data['bmi'] modifies an intermediate Series
# and is not guaranteed to update the frame (it is a no-op under pandas
# Copy-on-Write), which would leave missing values without a BMIGroup.
stroke_data['bmi'] = stroke_data['bmi'].fillna(stroke_data['bmi'].mean())

# Bucket BMI into left-closed ranges:
# [0, 19), [19, 25), [25, 30), [30, 40) and [40, 1000).
# NOTE(review): the notebook text describes cut points of 18.5 (Underweight)
# and 65 (ExtremeObese), while these bins use 19 and 40 - confirm which set is
# intended. The edges are left unchanged here so the recorded counts still apply.
bins = [0, 19, 25, 30, 40, 1000]
labels = ['Underweight', 'Normal', 'Overweight', 'Obese', 'ExtremeObese']
stroke_data['BMIGroup'] = pd.cut(stroke_data['bmi'], bins=bins, labels=labels, right=False)
# The continuous column is replaced by its binned version.
stroke_data = stroke_data.drop(columns='bmi')
stroke_data['BMIGroup'].value_counts()

Overweight      1610
Obese           1506
Normal          1175
ExtremeObese     414
Underweight      404
Name: BMIGroup, dtype: int64

## The resampled data:

In [150]:
# Preview the first five rows with the three new group columns.
stroke_data.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,stroke,AgeGroup,GlucoseLevelRange,BMIGroup
0,Male,67.0,0,1,Yes,Private,Urban,formerly smoked,1,60-80,200-250,Obese
1,Female,61.0,0,0,Yes,Self-employed,Rural,never smoked,1,60-80,200-250,Overweight
2,Male,80.0,0,1,Yes,Private,Rural,never smoked,1,80+,100-150,Obese
3,Female,49.0,0,0,Yes,Private,Urban,smokes,1,40-60,150-200,Obese
4,Female,79.0,1,0,Yes,Self-employed,Rural,never smoked,1,60-80,150-200,Normal


In [155]:
# Save the binned frame to disk. The row index is written as the first,
# unnamed CSV column (index=True is the default, spelled out here), so reading
# the file back without index_col yields an extra 'Unnamed: 0' column.
# NOTE(review): pass index=False if that extra column is not wanted downstream.
stroke_data.to_csv('resampled_data.csv', index=True)

In [156]:
# Read the saved CSV back into a separate frame.
resampledata = pd.read_csv(filepath_or_buffer="resampled_data.csv")

In [157]:
# Preview the first five rows of the reloaded frame.
resampledata.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,smoking_status,stroke,AgeGroup,GlucoseLevelRange,BMIGroup
0,0,Male,67.0,0,1,Yes,Private,Urban,formerly smoked,1,60-80,200-250,Obese
1,1,Female,61.0,0,0,Yes,Self-employed,Rural,never smoked,1,60-80,200-250,Overweight
2,2,Male,80.0,0,1,Yes,Private,Rural,never smoked,1,80+,100-150,Obese
3,3,Female,49.0,0,0,Yes,Private,Urban,smokes,1,40-60,150-200,Obese
4,4,Female,79.0,1,0,Yes,Self-employed,Rural,never smoked,1,60-80,150-200,Normal
