In [2]:
import numpy as np
import pandas as pd
import os

In [5]:
data = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

In [6]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
for dirname, _, filenames in os.walk('/content/healthcare-dataset-stroke-data.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [9]:
avg = data['bmi'].mean()
data.bmi=(data.bmi.fillna(avg))
data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [10]:
def preprocess_inputs(df):
    df = df.copy()
    le = LabelEncoder()
    df['gender'] = le.fit_transform(df['gender'])
    df['ever_married'] = le.fit_transform(df['ever_married'])
    df['work_type'] = le.fit_transform(df['work_type'])
    df['Residence_type'] = le.fit_transform(df['Residence_type'])
    df['smoking_status'] = le.fit_transform(df['smoking_status'])
    return df

In [11]:
df = preprocess_inputs(data)

In [12]:
df=df.drop(columns='id')

In [13]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [14]:
features = ['age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'gender',
 'work_type',
 'smoking_status']

label = ['stroke']

X = df[features]
y = df[label]

In [15]:
train_X, val_X, train_y, val_y = train_test_split(X, y,test_size=0.2,random_state=100)

In [16]:
sc=StandardScaler()
X_std= sc.fit_transform(X)
train_X_std = sc.fit_transform(train_X)
val_X_std = sc.transform(val_X)

In [17]:
model_accuracy = pd.DataFrame(columns=['Model','Accuracy'])
models = {"LR": LogisticRegression(),
          "KNN" : KNeighborsClassifier(),
          "DT" : DecisionTreeClassifier(),
          'RFC' : RandomForestClassifier(),
          'BGC' : BaggingClassifier(),
          'ABC' : AdaBoostClassifier(),
          'DTC' : DecisionTreeClassifier(),
          }


for model_name, model in models.items():
    model.fit(train_X_std, train_y.values.ravel())
    pred = model.predict(val_X_std)
    ac = accuracy_score(val_y,pred)
    print( model_name + ' Accuracy scores')
    print(ac)
    model_accuracy = model_accuracy.append({'Model': model_name, 'Accuracy': ac}, ignore_index=True)

LR Accuracy scores
0.9549902152641878
KNN Accuracy scores
0.9559686888454012
DT Accuracy scores
0.9129158512720157
RFC Accuracy scores
0.9549902152641878
BGC Accuracy scores
0.9500978473581213
ABC Accuracy scores
0.9530332681017613
DTC Accuracy scores
0.9207436399217221


In [18]:
model_accuracy.sort_values('Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy
1,KNN,0.955969
0,LR,0.95499
3,RFC,0.95499
5,ABC,0.953033
4,BGC,0.950098
6,DTC,0.920744
2,DT,0.912916


In [19]:
final_model=KNeighborsClassifier()
final_model.fit(X_std,y.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
p= final_model.predict(val_X_std)
acc = accuracy_score(val_y,p)
print(' Accuracy scores')
print(acc)

 Accuracy scores
0.9569471624266145
