In [1]:
#install extreme gradient boost package
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1


In [2]:
#install imbalanced learning package
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
Collecting scikit-learn>=1.1.0
  Using cached scikit_learn-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
Installing collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
Successfully installed imbalanced-learn-0.9.1 scikit-learn-1.1.1


In [3]:
#import the necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [4]:
#load csv file into a dataframe & preview the dataframe
df = pd.read_csv('Stroke.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
#check the number of null values in the dataset
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
#drop all the rows with null values
df.dropna(axis=0,inplace=True)
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
#check data types of features
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [8]:
#convert float data type to integer data type for certain features
df['age'] = df['age'].astype(int)
df['bmi'] = df['bmi'].astype(int)
df['avg_glucose_level'] = df['avg_glucose_level'].astype(int)

In [9]:
#check if data type changes carried through
df.dtypes

id                    int64
gender               object
age                   int64
hypertension          int64
heart_disease         int64
ever_married         object
work_type            object
Residence_type       object
avg_glucose_level     int64
bmi                   int64
smoking_status       object
stroke                int64
dtype: object

In [10]:
#profile the data
for column in df:
    print(f'{column} : {df[column].unique()}')

id : [ 9046 31112 60182 ... 19723 37544 44679]
gender : ['Male' 'Female' 'Other']
age : [67 80 49 79 81 74 69 78 61 54 50 64 75 60 71 52 82 65 57 42 48 72 58 76
 39 77 63 73 56 45 70 59 66 43 68 47 53 38 55 46 32 51 14  3  8 37 40 35
 20 44 25 27 23 17 13  4 16 22 30 29 11 21 18 33 24 36  0 34 41  5 26 31
  7 12 62  2  9 15 28 10  1 19  6]
hypertension : [0 1]
heart_disease : [1 0]
ever_married : ['Yes' 'No']
work_type : ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type : ['Urban' 'Rural']
avg_glucose_level : [228 105 171 174 186  70  94  58  80 120 104 214 167 191 221  89 193 233
 208 102 100 195 212  83 196 252  84 219  74  92  60  78  71 144 213 243
 107  99 127 124  59 194 180 185  61  93 113  86  72 179 116  96  66 240
 110 143  88  79 111  98 226  68  64 235  76  82 190 231  73 129 224 216
  62 259 249 131 200 130 182 206 263 140 207 199 103 151  67 239 223  77
 203 133 162  91  97  56 112 137 215 205 118 271 242 175  90 109  87 106
 134  95 210 250 

In [11]:
#drop irrelevant features
df.drop('id',axis=1,inplace=True)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Yes,Private,Urban,228,36,formerly smoked,1
2,Male,80,0,1,Yes,Private,Rural,105,32,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171,34,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174,24,never smoked,1
5,Male,81,0,0,Yes,Private,Urban,186,29,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13,0,0,No,children,Rural,103,18,Unknown,0
5106,Female,81,0,0,Yes,Self-employed,Urban,125,40,never smoked,0
5107,Female,35,0,0,Yes,Self-employed,Rural,82,30,never smoked,0
5108,Male,51,0,0,Yes,Private,Rural,166,25,formerly smoked,0


In [12]:
#convert categorical data into numerical
hot_data = pd.get_dummies(df)
hot_data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67,0,1,228,36,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80,0,1,105,32,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49,0,0,171,34,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79,1,0,174,24,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81,0,0,186,29,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13,0,0,103,18,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,81,0,0,125,40,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,35,0,0,82,30,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,51,0,0,166,25,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [13]:
#normalize the data to minimize the effects of large numbers
cols_to_scale = ['age','avg_glucose_level','bmi']

scaler = MinMaxScaler()

hot_data[cols_to_scale] = scaler.fit_transform(hot_data[cols_to_scale])
hot_data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.817073,0,1,0.800926,0.298851,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,0.975610,0,1,0.231481,0.252874,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,0.597561,0,0,0.537037,0.275862,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,0.963415,1,0,0.550926,0.160920,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,0.987805,0,0,0.606481,0.218391,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,0.158537,0,0,0.222222,0.091954,0,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,0.987805,0,0,0.324074,0.344828,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,0.426829,0,0,0.125000,0.229885,0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,0.621951,0,0,0.513889,0.172414,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [14]:
#check to see if  the dataset is balanced
df['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [15]:
#split dependent variables from independent
X = hot_data.drop('stroke',axis=1)
y = hot_data['stroke']

In [16]:
#balance the dataset using over-sampling method to create data for under-sampled
smote = SMOTE(sampling_strategy='minority')
features, target = smote.fit_resample(X,y)

target.value_counts()

1    4700
0    4700
Name: stroke, dtype: int64

In [17]:
#split the dataset into training and testing (80/20)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

In [18]:
#convert training and testing data into sets for xgb purposes
training_data = xgb.DMatrix(X_train, label=y_train)
testing_data = xgb.DMatrix(X_test, label=y_test)

In [19]:
#specify xgb model parameters
param = {
    'booster': 'gbtree',
    'max_depth': 3,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': 2
}
epochs = 10

In [20]:
#train the model
model = xgb.train(param,training_data,epochs)

In [21]:
#get predictions using the model
predictions = model.predict(testing_data)

In [22]:
#score the model
accuracy_score(y_test,predictions)

0.8531914893617021