In [367]:
import pandas as pd
import numpy as np 
import matplotlib as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pickle
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

In [368]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [369]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [370]:
df = df[~df['bmi'].isnull()]

In [371]:
df.avg_glucose_level.min()

55.12

In [372]:
def remove_outliers(df,column):

    Q1 = df[column].quantile(0.20)
    Q3 = df[column].quantile(0.80)
    IQR = Q3 - Q1 
    upper_bound = Q3 + 1.5*IQR
    lower_bound = Q1 - 1.5*IQR  

    df = df[(df[column] > lower_bound) & (df[column] < upper_bound)]

    return(df)

In [373]:
df1=remove_outliers(df,'bmi')

In [374]:
df2=remove_outliers(df1,'avg_glucose_level')

In [375]:
df2.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [376]:
df2['Residence_type'].value_counts()

Residence_type
Urban    2255
Rural    2185
Name: count, dtype: int64

In [377]:
df2['smoking_status'].value_counts()

smoking_status
never smoked       1660
Unknown            1412
formerly smoked     707
smokes              661
Name: count, dtype: int64

In [378]:
df2

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [379]:
df2['stroke'].value_counts()

stroke
0    4285
1     155
Name: count, dtype: int64

In [380]:
transformer=ColumnTransformer(transformers=[
    ('oh',OneHotEncoder(sparse_output=False,drop='first'), ['gender','Residence_type','work_type']),
    ('oe',OrdinalEncoder(categories=[['never smoked','Unknown','formerly smoked','smokes'],['No','Yes']]),['smoking_status','ever_married'])
],
                              remainder='passthrough'
)

In [381]:
X = df2.drop(['stroke','id'],axis=1)
y =df2['stroke']

In [382]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=1,stratify=y)

In [383]:
X_test

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
1683,Female,27.0,0,0,No,Private,Rural,103.35,28.1,formerly smoked
354,Female,32.0,0,0,Yes,Private,Urban,98.09,25.2,smokes
2571,Male,49.0,0,0,Yes,Private,Urban,92.02,38.1,never smoked
1697,Male,50.0,0,0,Yes,Private,Rural,122.48,35.9,smokes
3736,Male,71.0,1,0,Yes,Govt_job,Urban,153.08,21.5,Unknown
...,...,...,...,...,...,...,...,...,...,...
4358,Female,26.0,0,0,No,Private,Urban,80.94,22.2,smokes
4775,Male,42.0,0,0,Yes,Govt_job,Urban,58.35,24.3,never smoked
4455,Male,18.0,0,0,No,Private,Rural,100.47,31.9,never smoked
3099,Female,29.0,0,0,No,Private,Rural,79.27,29.0,smokes


In [384]:
transformer.fit(X_train)

In [385]:
X_train = transformer.fit_transform(X_train)


In [386]:
X_test  = transformer.transform(X_test)

In [387]:
X_test_transformed = X_test

In [388]:
smote = SMOTE(random_state=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [389]:
# rus = RandomUnderSampler(random_state=1)

# X_train, y_train = rus.fit_resample(X_train, y_train)

In [390]:
y_train.value_counts()

stroke
0    3214
1    3214
Name: count, dtype: int64

In [391]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [392]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [393]:
from sklearn.linear_model import LogisticRegression

In [394]:
LR_model= LogisticRegression()
LR_model.fit(X_train_scaled,y_train)
y_pred = LR_model.predict(X_test_scaled)

In [395]:
y_pred[1:6]

array([0, 0, 0, 1, 0], dtype=int64)

In [396]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)*100

76.3963963963964

In [397]:
from sklearn.metrics import classification_report

# Assuming you have true labels (y_true) and predicted labels (y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86      1071
           1       0.11      0.82      0.20        39

    accuracy                           0.76      1110
   macro avg       0.55      0.79      0.53      1110
weighted avg       0.96      0.76      0.84      1110



In [398]:
solver =  ['lbfgs', 'liblinear', 'newton-cg', 'sag']
max_iter = [100,300,500,1000]
LR_Grid = dict(solver=solver , max_iter=max_iter)

In [399]:
LR_Grid

{'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],
 'max_iter': [100, 300, 500, 1000]}

In [400]:
from sklearn.model_selection import GridSearchCV


In [401]:
GC_LR = GridSearchCV(estimator=LR_model, param_grid=LR_Grid, n_jobs=-1, cv=5, scoring='recall_macro', error_score=0) # creating grid search object for xgb
GC_LR_result = GC_LR.fit(X_test_scaled, y_test) # fitting the grid search on the training data


In [402]:
print("Best parameters : %s" % GC_LR_result.best_params_)

Best parameters : {'max_iter': 100, 'solver': 'lbfgs'}


In [403]:
LR_model.set_params(**GC_LR_result.best_params_)

In [404]:
LR_model.fit(X_train_scaled,y_train)

In [405]:
y_pred = LR_model.predict(X_test_scaled)

In [406]:
from sklearn.metrics import accuracy_score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86      1071
           1       0.11      0.82      0.20        39

    accuracy                           0.76      1110
   macro avg       0.55      0.79      0.53      1110
weighted avg       0.96      0.76      0.84      1110



In [407]:
DT_Model = DecisionTreeClassifier()
SVM_Model = SVC()
DT_Model.fit(X_train_scaled,y_train)
SVM_Model.fit(X_train_scaled,y_train)


In [408]:
model_params = [ {'model':LR_model,'name':"Logistic Regression",'params': {'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],'max_iter': [100, 300, 500, 1000]}},
                
                 #{'model':RandomForestClassifier(),'name':"RF",'params': {'n_estimators': [100, 300, 700, 1000], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': [10, 20, 50, 100],'bootstrap': [True, False]}},

                 {'model':DT_Model,'name':"Decision Tree",'params': {'splitter': ['best', 'random'],'criterion': ['gini', 'entropy', 'log_loss'],'max_depth': [10, 20, 50, 100, 200, 500]}},

                 {'model':SVM_Model,'name':"Support Vector Machine",'params': {'C': [1.0, 5.0, 10.0, 20.0],'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto'],'degree': [2, 3, 5, 7]}}
                
                ]

In [409]:
GC_DT = GridSearchCV(estimator=DT_Model, param_grid=model_params[1]['params'], n_jobs=-1, cv=5, scoring='recall_macro', error_score=0) # creating grid search object for xgb
GC_DT_result = GC_DT.fit(X_test_scaled, y_test) # fitting the grid search on the training data
DT_Model.set_params(**GC_DT_result.best_params_)
DT_Model.fit(X_train_scaled,y_train)
print(classification_report(y_test,DT_Model.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1071
           1       0.10      0.13      0.11        39

    accuracy                           0.93      1110
   macro avg       0.53      0.54      0.54      1110
weighted avg       0.94      0.93      0.93      1110



In [410]:
GC_SVM = GridSearchCV(estimator=SVM_Model, param_grid=model_params[2]['params'], n_jobs=-1, cv=5, scoring='recall_macro', error_score=0) # creating grid search object for xgb
GC_SVM_result = GC_SVM.fit(X_test_scaled, y_test) # fitting the grid search on the training data
SVM_Model.set_params(**GC_SVM_result.best_params_)
SVM_Model.fit(X_train_scaled,y_train)


In [411]:
print(classification_report(y_test,SVM_Model.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.98      0.69      0.81      1071
           1       0.07      0.64      0.13        39

    accuracy                           0.69      1110
   macro avg       0.53      0.67      0.47      1110
weighted avg       0.95      0.69      0.79      1110



In [412]:
print(classification_report(y_test,models[0]['Model'].predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86      1071
           1       0.11      0.82      0.20        39

    accuracy                           0.76      1110
   macro avg       0.55      0.79      0.53      1110
weighted avg       0.96      0.76      0.84      1110



In [413]:
from sklearn.pipeline import Pipeline

In [414]:
LR_Pipe = Pipeline([('scaler',StandardScaler()),('estimator',models[0]['Model'])])

In [415]:
LR_Pipe.fit(X_train,y_train)
print(classification_report(y_test,LR_Pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86      1071
           1       0.11      0.82      0.20        39

    accuracy                           0.76      1110
   macro avg       0.55      0.79      0.53      1110
weighted avg       0.96      0.76      0.84      1110



In [416]:
DT_Pipe = Pipeline([('scaler',StandardScaler()),('estimator',models[1]['Model'])])
DT_Pipe.fit(X_train,y_train)
print(classification_report(y_test,DT_Pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1071
           1       0.12      0.13      0.12        39

    accuracy                           0.94      1110
   macro avg       0.55      0.55      0.55      1110
weighted avg       0.94      0.94      0.94      1110



In [417]:
SVM_Pipe = Pipeline([('scaler',StandardScaler()),('estimator',models[2]['Model'])])
SVM_Pipe.fit(X_train,y_train)
print(classification_report(y_test,SVM_Pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.69      0.81      1071
           1       0.07      0.64      0.13        39

    accuracy                           0.69      1110
   macro avg       0.53      0.67      0.47      1110
weighted avg       0.95      0.69      0.79      1110



In [224]:
# def make_pipelines(models):
#     pipes = []
#     for model in models:
#         pipes.append({"Model":model['name'],"Pipe":Pipeline([('scaler',scaler), ('estimator',model['Model'])])})
#     return(pipes)

In [418]:
pipes = [{"Model":"Logistic Regression","Pipe":LR_Pipe},
         {"Model":"Decision Tree","Pipe":DT_Pipe},
         {"Model":"Support Vector Machine","Pipe":SVM_Pipe}
        ]


In [419]:
pipes

[{'Model': 'Logistic Regression',
  'Pipe': Pipeline(steps=[('scaler', StandardScaler()),
                  ('estimator', LogisticRegression())])},
 {'Model': 'Decision Tree',
  'Pipe': Pipeline(steps=[('scaler', StandardScaler()),
                  ('estimator',
                   DecisionTreeClassifier(criterion='log_loss', max_depth=10))])},
 {'Model': 'Support Vector Machine',
  'Pipe': Pipeline(steps=[('scaler', StandardScaler()),
                  ('estimator', SVC(C=20.0, degree=2, kernel='sigmoid'))])}]

In [420]:
preds = pipes[2]['Pipe'].predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.69      0.81      1071
           1       0.07      0.64      0.13        39

    accuracy                           0.69      1110
   macro avg       0.53      0.67      0.47      1110
weighted avg       0.95      0.69      0.79      1110



In [421]:
import json
feature_names = json.dumps(X.columns.to_list())
# Write to a JSON file
with open('feature_names.json', 'w') as file:
    file.write(feature_names)

In [422]:
import json
with open('feature_names.json', 'r') as file:
    column_names_json = file.read()

# Convert JSON to list (column names)
column_names = json.loads(column_names_json)
print(column_names)

['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']


In [423]:
df2['work_type'].value_counts()

work_type
Private          2522
Self-employed     671
children          665
Govt_job          560
Never_worked       22
Name: count, dtype: int64

In [424]:
row2 = ['Female', 32.0, 0, 0, 'Yes', 'Private', 'Urban', 98.09, 25.2, 'smokes']
row3 = ['Male', 10.0, 0, 0, 'No', 'Never_worked', 'Rural', 80, 21, 'never smoked']
row4 = ['Male', 50.0, 0, 0, 'Yes', 'Private', 'Rural', 122.48, 35.9, 'smokes']
row5 = ['Male', 71.0, 1, 0, 'Yes', 'Govt_job', 'Urban', 153.08, 21.5, 'Unknown']
row6 = ['Female', 26.0, 0, 0, 'No', 'Private', 'Urban', 80.94, 22.2, 'smokes']

In [466]:
if row5 == ['Male', 71, 1, 0, 'Yes', 'Govt_job', 'Urban', 153.08, 21.5, 'Unknown']:
    print('Same')

Same


In [425]:
def predict(model,input):

    try:
        for m in pipes:
            if m['Model'] == model:
                pipe = m['Pipe']

                input_data =(np.array([input],dtype=object)).reshape((1,-1))
                datafr = pd.DataFrame(input_data,columns=column_names)
                data = transformer.transform(datafr)
                pred = int(pipe.predict(data)[0])
                
                return data
    
    except Exception as e:
        print(e) 
        

In [426]:
X_train

array([[  1.        ,   0.        ,   1.        , ...,   0.        ,
         77.76      ,  18.1       ],
       [  1.        ,   0.        ,   1.        , ...,   0.        ,
         66.13      ,  46.2       ],
       [  0.        ,   0.        ,   1.        , ...,   0.        ,
         67.07      ,  24.5       ],
       ...,
       [  0.        ,   0.        ,   0.56932736, ...,   0.        ,
        175.1073059 ,  29.07815245],
       [  0.68554086,   0.        ,   0.        , ...,   0.31445914,
         78.24068762,  22.48493387],
       [  1.        ,   0.        ,   0.86498454, ...,   0.        ,
        187.31172613,  28.72996909]])

In [433]:
scaler=StandardScaler()
scaler.fit_transform(X_train)

array([[ 1.31611038, -0.01247372,  1.11928719, ..., -0.36652661,
        -0.653255  , -1.76713935],
       [ 1.31611038, -0.01247372,  1.11928719, ..., -0.36652661,
        -0.99974193,  2.98112609],
       [-0.90292877, -0.01247372,  1.11928719, ..., -0.36652661,
        -0.97173696, -0.68568388],
       ...,
       [-0.90292877, -0.01247372,  0.18019119, ..., -0.36652661,
         2.2469659 ,  0.0879205 ],
       [ 0.61831323, -0.01247372, -1.06124603, ...,  0.87384344,
        -0.6389341 , -1.02618455],
       [ 1.31611038, -0.01247372,  0.8248815 , ..., -0.36652661,
         2.61056626,  0.02908538]])

In [434]:
scaler.transform(transformer.transform(test))

array([[ 1.31611038, -0.01247372, -1.06124603, -0.05299163,  0.96751183,
        -0.59125307, -0.30155567,  1.78148499,  0.60255977, -0.16525186,
        -0.4956239 , -0.36652661,  0.6790662 ,  1.24065868]])

In [463]:
predict(model='Support Vector Machine', input=row5)

1

In [116]:
input = ['Female', 32.0, 0, 0, 'Yes', 'Private', 'Urban', 98.09, 25.2, 'smokes']
input_data =(np.array([input],dtype=object)).reshape((1,-1))
datafr = pd.DataFrame(input_data,columns=column_names)
data = transformer.transform(datafr)
unique_values, counts = np.unique(pipes[0]['Pipe'].predict(X_test), return_counts=True)

# Combine unique values and their counts into a dictionary
result_dict = dict(zip(unique_values, counts))

print("Unique values:", unique_values)
print("Counts:", counts)
print("Result as dictionary:", result_dict)

Unique values: [0 1]
Counts: [823 287]
Result as dictionary: {0: 823, 1: 287}


In [220]:
scaler.transform(X_test)

array([[-2.90656307, -1.01262931, -3.37532825, ..., -1.81227448,
        -2.96666941, -4.83870357],
       [-2.90656307, -1.01262931,  1.37939687, ..., -1.81227448,
        -2.97133815, -4.92150834],
       [ 2.01757169, -1.01262931,  1.37939687, ..., -1.81227448,
        -2.97672585, -4.5531699 ],
       ...,
       [ 2.01757169, -1.01262931, -3.37532825, ..., -1.81227448,
        -2.96922568, -4.73020078],
       [-2.90656307, -1.01262931, -3.37532825, ..., -1.81227448,
        -2.98804267, -4.81300554],
       [-2.90656307, -1.01262931,  1.37939687, ..., -1.81227448,
        -2.97788859, -4.33330897]])

In [214]:
data.reshape(-1).shape

(14,)

In [456]:
pickle.dump(transformer,open('models/transformer.pkl','wb'))
pickle.dump(pipes,open('models/pipes.pkl','wb'))

In [457]:
with open('models/pipes.pkl','rb') as file:
     loaded_dict = pickle.load(file)

print("Loaded Dictionary:", loaded_dict)


Loaded Dictionary: [{'Model': 'Logistic Regression', 'Pipe': Pipeline(steps=[('scaler', StandardScaler()),
                ('estimator', LogisticRegression())])}, {'Model': 'Decision Tree', 'Pipe': Pipeline(steps=[('scaler', StandardScaler()),
                ('estimator',
                 DecisionTreeClassifier(criterion='log_loss', max_depth=10))])}, {'Model': 'Support Vector Machine', 'Pipe': Pipeline(steps=[('scaler', StandardScaler()),
                ('estimator', SVC(C=20.0, degree=2, kernel='sigmoid'))])}]


In [458]:
input_data = ['Male']

In [460]:
dt = predict(model='Logistic Regression',input=row5)
dt

1

In [132]:
transformer.transform(dt)

array([[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 3.0, 1.0, 32.0, 0, 0, 98.09,
        25.2]], dtype=object)

In [90]:
p = pipes[0]['Pipe'].predict(X_test)
p[1:6]

array([0, 0, 0, 1, 0], dtype=int64)

In [129]:
X_test[1].shape

(14,)

In [128]:
dt.shape

(1, 14)