# Machine learning development 

## data exploration

In [24]:
import pandas as pd

In [25]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv", encoding='utf8')

data = pd.DataFrame(data)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [26]:
data.shape

(5110, 12)

In [27]:
data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [28]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [29]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [30]:
data.stroke.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [31]:
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

## Data Cleaning

In [32]:
data.drop('id', inplace=True, axis=1)
data = data.dropna()
data = data.drop_duplicates()
data.drop(data[data['gender'] == 'Other'].index,inplace=True)
data.shape

(4908, 11)

## Data Preprocessing

In [33]:
for col in data.select_dtypes(exclude=['float64', 'int64']).columns:
    print(col, data[col].unique())

gender ['Male' 'Female']
ever_married ['Yes' 'No']
work_type ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type ['Urban' 'Rural']
smoking_status ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [34]:
from imblearn.over_sampling import *
from imblearn.under_sampling import *

In [35]:
X = data.copy()
y = X.pop('stroke')


In [36]:
oversampler = RandomOverSampler()
undersampler = RandomUnderSampler()
X,y = oversampler.fit_resample(X,y)

In [37]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import make_column_transformer
import numpy as np


In [38]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [39]:
ord_col = ['gender', 'ever_married', 'Residence_type']
nom_col = ['work_type', 'smoking_status']

In [40]:
ord = OrdinalEncoder(categories='auto', dtype=np.int64)
hot = OneHotEncoder(sparse=False,  dtype=np.int64)
column_transformer = make_column_transformer(
    (ord, ord_col), (hot, nom_col), remainder='passthrough'
)

In [41]:
column_transformer.fit_transform(X)

array([[  1.  ,   1.  ,   1.  , ...,   1.  , 228.69,  36.6 ],
       [  1.  ,   1.  ,   0.  , ...,   1.  , 105.92,  32.5 ],
       [  0.  ,   1.  ,   1.  , ...,   0.  , 171.23,  34.4 ],
       ...,
       [  0.  ,   1.  ,   1.  , ...,   0.  ,  97.43,  26.4 ],
       [  0.  ,   1.  ,   1.  , ...,   0.  ,  74.11,  20.5 ],
       [  0.  ,   1.  ,   0.  , ...,   0.  ,  72.67,  28.9 ]])

## ML Model Development

In [42]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix


from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from matplotlib import pyplot

In [43]:
models = {
    'SVM': SVC(),
    'Gradient Boosting' : GradientBoostingClassifier(),
    'Random Forest' : RandomForestClassifier(),
    'Stochastic Gradient Descent' : SGDClassifier(),
    'logistics Regression' : LogisticRegression(), 
    'Naive bayes' : GaussianNB(),
    'KNeighbors' : KNeighborsClassifier()
}


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
for k, v in models.items():
    pipe = make_pipeline(column_transformer, v)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(confusion_matrix(y_test,y_pred))
    print(k, cross_val_score(pipe,X,y,cv=5, scoring='accuracy').mean())
    print()


[[671 251]
 [170 788]]
SVM 0.7599495544257358

[[730 192]
 [ 72 886]]
Gradient Boosting 0.8616736494060897

[[910  12]
 [  0 958]]
Random Forest 0.9921259044534778

[[734 188]
 [284 674]]
Stochastic Gradient Descent 0.6945199461008006

[[704 218]
 [169 789]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

logistics Regression 0.7751658306251628

[[330 592]
 [ 16 942]]
Naive bayes 0.6795065279177471

[[818 104]
 [  0 958]]
KNeighbors 0.9390284556067623



In [46]:
raw_data = pd.DataFrame({'gender': ['Male'],
    'age': [67],
    'hypertension': [0],
    'heart_disease': [1],
    'ever_married': ['Yes'],
    'work_type': ['Private'],
    'Residence_type': ['Urban'],
    'avg_glucose_level': [228.69],
    'bmi': [36.6],
    'smoking_status':['formerly smoked'],
})
print(raw_data)
best_model = make_pipeline(column_transformer, RandomForestClassifier())
best_model.fit(X_train, y_train)
y_pred = best_model.predict(raw_data)
print(y_pred)


  gender  age  hypertension  heart_disease ever_married work_type  \
0   Male   67             0              1          Yes   Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  
0          Urban             228.69  36.6  formerly smoked  
[1]


In [47]:
pipe = make_pipeline(column_transformer, RandomForestClassifier())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy :", cross_val_score(pipe,X,y,cv=5, scoring='accuracy').mean())

[[911  11]
 [  0 958]]
Accuracy : 0.993509222877719


In [48]:
import pickle
with open('model.pkl', 'wb') as model_file:
  pickle.dump(best_model, model_file)