Proportion Of Different Smoking Categories Among Stroke Population

In [654]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import plotly
import plotly.express as px

# import warnings
import warnings
# filter warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
import os

In [655]:
# Build the dataset path from its components instead of hard-coding Windows
# backslashes, so the notebook also runs on macOS/Linux (where '\' is not a
# path separator and the original literal would not resolve).
dataset_path = os.path.join('..', 'Dataset', 'strokeDataset.csv')
origin_data = pd.read_csv(dataset_path)

# Work on a copy so the frame as loaded from disk stays untouched in `origin_data`.
data = origin_data.copy()

In [656]:
# Remove the 'id' column; it is treated here as an identifier that is not
# related to the target variable.
data = data.drop(columns=['id'])

In [657]:
# Replace missing BMI values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `data['bmi']` selection: that chained in-place
# form is deprecated and does not modify `data` when pandas Copy-on-Write is
# active, and the warning would be hidden by the global warnings filter.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [658]:
# Preview the first rows of the working frame as a quick sanity check.
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [659]:
# Print the column names, non-null counts and dtypes of the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [660]:
# Encode gender, residence type and marital status as 0/1 integer flags.
# Each column becomes 1 for the named category and 0 for every other value
# (so any gender value that is not exactly 'Male' is encoded as 0).
data['gender'] = data['gender'].eq('Male').astype('int64')
data["Residence_type"] = data["Residence_type"].eq("Urban").astype('int64')
data["ever_married"] = data["ever_married"].eq("Yes").astype('int64')


In [661]:
# Keep only the rows whose smoking status is known.
has_known_smoking = data['smoking_status'] != 'Unknown'
data = data[has_known_smoking]

# One-hot encode the two multi-category columns into a separate frame, then
# remove the original text columns from `data`.
categorical_columns = ['smoking_status', 'work_type']
data_dummies = pd.get_dummies(data[categorical_columns])
data = data.drop(columns=categorical_columns)

In [662]:
# Separate the target column from the features, then attach the one-hot
# columns to the remaining features by matching on the row index.
data_stroke = data.pop('stroke')
data = data.join(data_dummies, how='left')


In [663]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    data_stroke,
    test_size=0.2,
    random_state=10,
)

In [664]:
# Resample the training split with SMOTE (the test split is left untouched).
sm = SMOTE(random_state=4)
# np.asarray(...).ravel() replaces Series.ravel(), which pandas has deprecated.
# It yields the same 1-D label array and also works if this cell is re-run
# after Y_train has already become a NumPy array.
X_train, Y_train = sm.fit_resample(X_train, np.asarray(Y_train).ravel())

In [665]:
# Candidate classifiers, keyed by the display name used in the report below.
class_models = {
    # Seeded so the fitted tree, and therefore its score, is identical on
    # every Restart & Run All (an unseeded tree can differ between runs).
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(criterion='log_loss', random_state=0,verbose=0),
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(var_smoothing=1e-09,),
}

# Maps model name -> score on the held-out test split; used afterwards to
# select the best-scoring model.
class_scores = {}

for class_name, model in class_models.items():
    # Fit on the (resampled) training data.
    model.fit(X_train, Y_train)

    # For these classifiers, .score() returns the mean accuracy on the data given.
    class_scores[class_name] = model.score(X_test, Y_test)

    # Report the model and its score.
    print(f"Model Name :  {class_name}")
    print(f"Model Score :  {class_scores[class_name]}")
    print("")

Model Name :  Decision Tree
Model Score :  0.8809523809523809



Model Name :  Random Forest
Model Score :  0.907563025210084

Model Name :  Logistic Regression
Model Score :  0.896358543417367

Model Name :  KNN
Model Score :  0.7829131652661064

Model Name :  SVM
Model Score :  0.6904761904761905

Model Name :  Naive Bayes
Model Score :  0.19327731092436976



In [666]:
# One patient's feature values. They must be listed in the same column order
# as the frame the models were fitted on.
input_data =(1,64,0,0,1,0,113.68,24.2,0,1,0,1,0,0,0,0)
# Alternative samples to try:
#input_data =(0,49,0,0,1,1,301.23,34.4,0,0,1,0,0,1,0,0)
#input_data=(1,67,0,1,1,1,228.69,36.6,1,0,0,0,0,1,0,0)

# No scaler is applied to the input: the training cells fit the models on
# unscaled features, so the raw values are passed through as-is.

# Select the model with the highest recorded score.
best_class=class_models[max(class_scores, key=class_scores.get)]
input_data_as_numpy_array = np.asarray(input_data)
# Reshape to a single row, since we are predicting for one instance.
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# When the model was fitted on a DataFrame it records the column names; label
# the input with them so scikit-learn can check the features line up instead
# of receiving an unnamed array (which raises a feature-name warning and lets
# a wrong number of values go unnoticed until predict fails).
feature_names = getattr(best_class, 'feature_names_in_', None)
if feature_names is not None:
    input_data_reshaped = pd.DataFrame(input_data_reshaped, columns=feature_names)

prediction = best_class.predict(input_data_reshaped)
print(prediction)
if (prediction[0] == 0):
    print("The patient is not likely to get a stroke")
else:   
    print("The patient is likely to get a stroke")

[0]
The patient is not likely to get a stroke


In [667]:
# Select the model with the highest recorded score.
best_class=class_models[max(class_scores, key=class_scores.get)]


# Serialise the selected model to a .sav file.
sav_filename = 'trained_model.sav'
# Use a context manager so the file is flushed and closed before any later
# cell reads it back (the original left the handle open).
with open(sav_filename, 'wb') as model_file:
    pickle.dump(best_class, model_file)

In [668]:
# Load the saved model back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# come from a trusted source (here, the file written by this notebook).
# The context manager closes the file handle, which the original left open.
with open('trained_model.sav', 'rb') as model_file:
    trained_model = pickle.load(model_file)

In [669]:
# One patient's feature values. They must be listed in the same column order
# as the frame the saved model was fitted on.
input_data2 =(1,64,0,0,1,0,113.68,24.2,0,1,0,1,0,0,0,0)


# No scaler is applied to the input: the raw values are passed to the loaded
# model unchanged.
input_data_as_numpy_array2 = np.asarray(input_data2)

# reshape the array as we are predicting for one instance
input_data_reshaped2 = input_data_as_numpy_array2.reshape(1,-1)

# When the loaded model was fitted on a DataFrame it carries the training
# column names; label the input with them so scikit-learn can check the
# features line up instead of receiving an unnamed array (which raises a
# feature-name warning and leaves the column order unchecked).
feature_names2 = getattr(trained_model, 'feature_names_in_', None)
if feature_names2 is not None:
    input_data_reshaped2 = pd.DataFrame(input_data_reshaped2, columns=feature_names2)

prediction2 = trained_model.predict(input_data_reshaped2)
print(prediction2)

if (prediction2[0] == 0):
    print("The patient is not likely to get a stroke")
else:   
    print("The patient is likely to get a stroke")

[0]
The patient is not likely to get a stroke
