In [411]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import skew

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler

In [347]:
# Load the stroke dataset from a CSV path relative to the working directory.
df = pd.read_csv("csv_files/healthcare-dataset-stroke-data.csv")

In [348]:
# Normalise all column names to lower case so later cells can use one spelling.
df.columns = df.columns.str.lower()

In [349]:
# Exclude the single row whose gender is recorded as 'Other'.
is_other_gender = df['gender'].eq('Other')
df = df.loc[~is_other_gender]


In [350]:
# Preview the frame after filtering out the 'Other' gender row.
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [351]:
# Check dtypes and non-null counts per column to locate missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5109 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5109 non-null   int64  
 1   gender             5109 non-null   object 
 2   age                5109 non-null   float64
 3   hypertension       5109 non-null   int64  
 4   heart_disease      5109 non-null   int64  
 5   ever_married       5109 non-null   object 
 6   work_type          5109 non-null   object 
 7   residence_type     5109 non-null   object 
 8   avg_glucose_level  5109 non-null   float64
 9   bmi                4908 non-null   float64
 10  smoking_status     5109 non-null   object 
 11  stroke             5109 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 518.9+ KB


## Simple ML


### Categorical Columns to Numeric - For Simple ML Models

In [352]:
# Work on an independent copy so the encoding steps below leave `df` untouched.
dfv1 = df.copy()

In [353]:
# Preview the working copy before any column is transformed.
dfv1

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [354]:
# List the distinct gender labels that need a numeric code.
dfv1['gender'].unique()


array(['Male', 'Female'], dtype=object)

In [355]:
# List the distinct work_type labels that need a numeric encoding.
dfv1['work_type'].unique()


array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [356]:
# List the distinct residence_type labels that need a numeric code.
dfv1['residence_type'].unique()


array(['Urban', 'Rural'], dtype=object)

In [357]:
# List the distinct smoking_status labels that need a numeric encoding.
dfv1['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [358]:
# Encoding plan for each column, kept as a bare string so the cell echoes it.
# NOTE(review): under work_type and smoking_status every category is listed as
# "= 1"; the cells further down implement this as one 0/1 indicator column per
# category rather than a single shared code.
"""
Gender collumn :
    Male = 1
    Female = 0

Age Collumn:
    Its fine

Hypertension:
    Its fine

Heart Disease:
    Its fine

ever_married:
    Yes = 1
    No = 0

work_type:
    Private = 1
    Self-employed = 1
    Govt_job = 1
    children = 1
    Never_worked = 1

residence_type:
    Urban = 1
    Rural = 0

avg_glucose_level:
    Its fine

bmi:
    delete missing values

smoking_status:
    never smoked = 1
    formerly smoked = 1
    smokes = 1
    Unknown = 1

"""
    
    

'\nGender collumn :\n    Male = 1\n    Female = 0\n\nAge Collumn:\n    Its fine\n\nHypertension:\n    Its fine\n\nHeart Disease:\n    Its fine\n\never_married:\n    Yes = 1\n    No = 0\n\nwork_type:\n    Private = 1\n    Self-employed = 1\n    Govt_job = 1\n    children = 1\n    Never_worked = 1\n\nresidence_type:\n    Urban = 1\n    Rural = 0\n\navg_glucose_level:\n    Its fine\n\nbmi:\n    delete missing values\n\nsmoking_status:\n    never smoked = 1\n    formerly smoked = 1\n    smokes = 1\n    Unknown = 1\n\n'

In [359]:
# Drop rows with a missing bmi, in place. The df.info() output above shows bmi
# as the only column with nulls, so dfv1 has no missing values after this step.
dfv1.dropna(subset=['bmi'], inplace=True)

In [360]:
# convert gender to numeric

In [361]:
# Encode gender as a 0/1 flag (Male -> 1, Female -> 0).
# The source column is read from dfv1 itself rather than the parent `df`, so
# the result does not depend on implicit index alignment between two frames
# that no longer have the same rows (dfv1 has had its missing-bmi rows dropped).
dfv1['gender_numeric'] = dfv1['gender'].map({'Male': 1, 'Female': 0})


In [362]:
# The text column is redundant once gender_numeric exists; remove it in place.
del dfv1['gender']

In [363]:
# Encode marital history as a 0/1 flag (Yes -> 1, No -> 0).
# The source column is read from dfv1 itself rather than the parent `df`, so
# the result does not depend on implicit index alignment between two frames
# that no longer have the same rows.
dfv1['ever_married_numeric'] = dfv1['ever_married'].map({'Yes': 1, 'No': 0})

In [394]:
# The text column is redundant once ever_married_numeric exists; remove it in place.
del dfv1['ever_married']

In [364]:
# Re-check the work_type categories right before building one indicator per category.
dfv1['work_type'].unique()


array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [365]:
# One-hot encode work_type: one 0/1 indicator column per category.
# Labels are compared in lower case, so the match ignores capitalisation.
work_type_lowered = dfv1['work_type'].str.lower()
work_type_indicator_labels = {
    'work_type_private': 'private',
    'work_type_self_employed': 'self-employed',
    'work_type_govt_job': 'govt_job',
    'work_type_children': 'children',
    'work_type_never_worked': 'never_worked',
}
for indicator_column, label in work_type_indicator_labels.items():
    dfv1[indicator_column] = (work_type_lowered == label).astype(int)

In [366]:
# The text column is redundant once the work_type_* indicators exist; remove it in place.
del dfv1['work_type']

In [367]:
# Encode residence type as a 0/1 flag (Rural -> 0, Urban -> 1).
# The source column is read from dfv1 itself rather than the parent `df`, so
# the result does not depend on implicit index alignment between two frames
# that no longer have the same rows.
dfv1['residence_type_numeric'] = dfv1['residence_type'].map({'Rural': 0, 'Urban': 1})

In [398]:
# The text column is redundant once residence_type_numeric exists; remove it in place.
del dfv1['residence_type']

In [369]:
# Re-check the smoking_status categories right before building one indicator per category.
dfv1['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [371]:
# One-hot encode smoking_status: one 0/1 indicator column per category.
# Labels are compared in lower case, so the match ignores capitalisation.
smoking_status_lowered = dfv1['smoking_status'].str.lower()
smoke_status_indicator_labels = {
    'smoke_status_formerly_smoked': 'formerly smoked',
    'smoke_status_never_smoked': 'never smoked',
    'smoke_status_smokes': 'smokes',
    'smoke_status_unknown': 'unknown',
}
for indicator_column, label in smoke_status_indicator_labels.items():
    dfv1[indicator_column] = (smoking_status_lowered == label).astype(int)

In [399]:
# The text column is redundant once the smoke_status_* indicators exist; remove it in place.
del dfv1['smoking_status']

### Normalization & Scaling

In [400]:
# Confirm that only numeric columns remain after the encoding steps.
dfv1.columns

Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi', 'stroke', 'gender_numeric', 'ever_married_numeric',
       'work_type_private', 'work_type_self_employed', 'work_type_govt_job',
       'work_type_children', 'work_type_never_worked',
       'residence_type_numeric', 'smoke_status_formerly_smoked',
       'smoke_status_never_smoked', 'smoke_status_smokes',
       'smoke_status_unknown'],
      dtype='object')

In [None]:
# Plot each continuous feature in its own panel. Passing the whole frame to
# histplot would overlay all 19 columns (including `id`) on one shared x-axis,
# which hides the shape of every individual distribution.
continuous_features = ['age', 'avg_glucose_level', 'bmi']
fig, axes = plt.subplots(1, len(continuous_features), figsize=(15, 4))
for ax, feature in zip(axes, continuous_features):
    sns.histplot(dfv1[feature], kde=True, ax=ax)
    ax.set(title=f"Distribution of {feature}", xlabel=feature, ylabel="Count")
fig.tight_layout()
plt.show()

In [None]:
# One-off Kolmogorov–Smirnov normality check on a single continuous feature.
# This dataset has no 'SalePrice' column, so the check runs on
# avg_glucose_level from the encoded frame instead.
glucose = dfv1['avg_glucose_level']

# z-score the column so it can be compared with the standard normal.
standardized_glucose = (glucose - glucose.mean()) / glucose.std()
ks_test_statistic, ks_p_value = stats.kstest(standardized_glucose, 'norm')

ks_test_statistic, ks_p_value

In [417]:
columns_to_test = [
    'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
    'bmi', 'stroke', 'gender_numeric', 'ever_married_numeric',
    'work_type_private', 'work_type_self_employed', 'work_type_govt_job',
    'work_type_children', 'work_type_never_worked',
    'residence_type_numeric', 'smoke_status_formerly_smoked',
    'smoke_status_never_smoked', 'smoke_status_smokes',
    'smoke_status_unknown',
]


def _ks_against_standard_normal(values):
    """Z-score `values` and run a Kolmogorov–Smirnov test against N(0, 1).

    Returns a dict with the keys 'ks_statistic' and 'p_value'.
    """
    z_scores = (values - values.mean()) / values.std()
    statistic, p_value = stats.kstest(z_scores, 'norm')
    return {'ks_statistic': statistic, 'p_value': p_value}


# One result entry per tested column, in the order listed above.
ks_results = {name: _ks_against_standard_normal(dfv1[name]) for name in columns_to_test}

# Report the statistic (4 decimals) and the raw p-value for every column.
for name, outcome in ks_results.items():
    print(f"{name}: KS Statistic = {outcome['ks_statistic']:.4f}, p-value = {outcome['p_value']}")


age: KS Statistic = 0.0501, p-value = 3.845529119596773e-11
hypertension: KS Statistic = 0.5329, p-value = 0.0
heart_disease: KS Statistic = 0.5407, p-value = 0.0
avg_glucose_level: KS Statistic = 0.1779, p-value = 2.1733756468196902e-136
bmi: KS Statistic = 0.0588, p-value = 3.4963563106914987e-15
stroke: KS Statistic = 0.5409, p-value = 0.0
gender_numeric: KS Statistic = 0.3879, p-value = 0.0
ever_married_numeric: KS Statistic = 0.4199, p-value = 0.0
work_type_private: KS Statistic = 0.3787, p-value = 0.0
work_type_self_employed: KS Statistic = 0.5096, p-value = 0.0
work_type_govt_job: KS Statistic = 0.5210, p-value = 0.0
work_type_children: KS Statistic = 0.5179, p-value = 0.0
work_type_never_worked: KS Statistic = 0.5223, p-value = 0.0
residence_type_numeric: KS Statistic = 0.3451, p-value = 0.0
smoke_status_formerly_smoked: KS Statistic = 0.5044, p-value = 0.0
smoke_status_never_smoked: KS Statistic = 0.4045, p-value = 0.0
smoke_status_smokes: KS Statistic = 0.5127, p-value = 0.0


In [408]:
# Preview the fully encoded frame.
dfv1

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_numeric,ever_married_numeric,work_type_private,work_type_self_employed,work_type_govt_job,work_type_children,work_type_never_worked,residence_type_numeric,smoke_status_formerly_smoked,smoke_status_never_smoked,smoke_status_smokes,smoke_status_unknown
0,9046,67.0,0,1,228.69,36.6,1,1,1,1,0,0,0,0,1,1,0,0,0
2,31112,80.0,0,1,105.92,32.5,1,1,1,1,0,0,0,0,0,0,1,0,0
3,60182,49.0,0,0,171.23,34.4,1,0,1,1,0,0,0,0,1,0,0,1,0
4,1665,79.0,1,0,174.12,24.0,1,0,1,0,1,0,0,0,0,0,1,0,0
5,56669,81.0,0,0,186.21,29.0,1,1,1,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,13.0,0,0,103.08,18.6,0,0,0,0,0,0,1,0,0,0,0,0,1
5106,44873,81.0,0,0,125.20,40.0,0,0,1,0,1,0,0,0,1,0,1,0,0
5107,19723,35.0,0,0,82.99,30.6,0,0,1,0,1,0,0,0,0,0,1,0,0
5108,37544,51.0,0,0,166.29,25.6,0,1,1,1,0,0,0,0,0,1,0,0,0


In [None]:
# Note to self, kept as a bare string: the float-valued columns shortlisted
# for normalization.
"""
 Columns to normalize:
- age
- avg_glucose_level
- bmi

"""

In [412]:
columns_to_check = ['age', 'avg_glucose_level', 'bmi']

# Sample skewness per continuous feature. Missing values are dropped explicitly
# because scipy's skew() returns nan when any input is NaN (its default
# nan_policy is 'propagate').
skew_results = {col: skew(dfv1[col].dropna()) for col in columns_to_check}

# Display results
for col, value in skew_results.items():
    print(f"{col}: skewness = {value:.4f}")

age: skewness = -0.1194
avg_glucose_level: skewness = 1.6141
bmi: skewness = 1.0547


In [None]:
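# Planned transformations, based on the skewness values printed above:
# bmi - square root transformation (moderate right skew)
# avg_glucose_level - log transformation (strong right skew)
# age - approximately symmetric; no transformation planned (TODO confirm)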