Proportion Of Different Smoking Categories Among Stroke Population

In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



import plotly
import plotly.express as px

# import warnings
import warnings
# filter warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
import os

In [184]:
origin_data = pd.read_csv(r'..\Dataset\strokeDataset.csv')
data = origin_data.copy()

In [185]:
#Dropping the id column that has no correlation with the target variable
data.drop(['id'], axis=1, inplace=True)

In [186]:
data['bmi'].fillna(data['bmi'].mean(), inplace=True)

In [187]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [188]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [189]:
# Convert Marrital Status, Residence and Gender into 0's and 1's
data['gender']=data['gender'].apply(lambda x : 1 if x=='Male' else 0) 
data["Residence_type"] = data["Residence_type"].apply(lambda x: 1 if x=="Urban" else 0)
data["ever_married"] = data["ever_married"].apply(lambda x: 1 if x=="Yes" else 0)
# Removing the observations that have smoking type unknown. 
data=data[data['smoking_status']!='Unknown']

In [190]:
# One Hot encoding smoking_status, work_type
data_dummies = data[['smoking_status','work_type']]
data_dummies=pd.get_dummies(data_dummies)
data.drop(columns=['smoking_status','work_type'],inplace=True)

In [191]:
data_dummies

Unnamed: 0,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,True,False,False,False,False,True,False,False
1,False,True,False,False,False,False,True,False
2,False,True,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False
4,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
5102,False,True,False,False,False,True,False,False
5105,False,True,False,False,False,True,False,False
5106,False,True,False,False,False,False,True,False
5107,False,True,False,False,False,False,True,False


In [192]:
data_stroke=data['stroke']
data.drop(columns=['stroke'],inplace=True)
data=data.merge(data_dummies,left_index=True, right_index=True,how='left')


In [193]:
data_dummies

Unnamed: 0,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,True,False,False,False,False,True,False,False
1,False,True,False,False,False,False,True,False
2,False,True,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False
4,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
5102,False,True,False,False,False,True,False,False
5105,False,True,False,False,False,True,False,False
5106,False,True,False,False,False,False,True,False
5107,False,True,False,False,False,False,True,False


In [194]:
X_train, X_test, Y_train, Y_test=train_test_split(data,data_stroke,test_size=0.2,random_state=10)

In [195]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
5005,0,21.0,0,0,0,0,71.34,24.0,False,True,False,False,False,True,False,False
1583,0,37.0,0,0,1,1,65.29,32.9,False,True,False,True,False,False,False,False
222,0,63.0,0,0,1,0,205.35,42.2,True,False,False,True,False,False,False,False
382,1,53.0,1,1,1,0,109.51,41.9,False,True,False,True,False,False,False,False
4895,0,56.0,1,0,1,0,177.56,30.1,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4928,1,11.0,0,0,0,0,121.71,23.4,False,True,False,False,False,False,False,True
1878,1,63.0,0,0,1,0,104.70,21.0,True,False,False,False,False,False,True,False
709,0,23.0,0,0,0,1,60.50,27.1,True,False,False,False,False,True,False,False
4586,0,53.0,0,0,1,1,85.46,30.0,False,True,False,False,False,True,False,False


In [196]:
Y_train

5005    0
1583    0
222     1
382     0
4895    0
       ..
4928    0
1878    0
709     0
4586    0
1793    0
Name: stroke, Length: 2852, dtype: int64

In [197]:
sm = SMOTE(random_state=4)
X_train, Y_train = sm.fit_resample(X_train, Y_train.ravel())

In [198]:
#commented out because of errors 
# std=StandardScaler()
# X_train=std.fit_transform(X_train)
# X_test=std.fit_transform(X_test)

In [199]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,0,21.000000,0,0,0,0,71.340000,24.000000,False,True,False,False,False,True,False,False
1,0,37.000000,0,0,1,1,65.290000,32.900000,False,True,False,True,False,False,False,False
2,0,63.000000,0,0,1,0,205.350000,42.200000,True,False,False,True,False,False,False,False
3,1,53.000000,1,1,1,0,109.510000,41.900000,False,True,False,True,False,False,False,False
4,0,56.000000,1,0,1,0,177.560000,30.100000,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5387,0,78.987326,0,0,1,1,97.671698,21.593705,True,False,True,False,False,True,True,False
5388,0,45.832450,0,0,1,0,95.125175,30.399788,True,False,True,False,False,True,False,False
5389,0,62.133719,0,0,0,1,90.414883,31.465155,True,True,False,False,False,True,False,False
5390,0,38.078759,0,0,0,1,82.355609,24.181146,True,True,False,False,False,True,True,False


In [200]:
X_test

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
4050,0,26.0,0,0,0,0,58.55,29.0,False,True,False,False,False,True,False,False
3915,1,61.0,0,0,1,1,68.17,43.8,True,False,False,False,False,True,False,False
2025,0,54.0,0,0,1,0,151.33,30.9,True,False,False,False,False,True,False,False
4176,1,51.0,0,0,1,0,232.64,45.2,False,True,False,False,False,True,False,False
4977,0,44.0,0,0,0,0,116.95,26.1,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,0,80.0,1,0,1,0,72.67,28.9,False,True,False,False,False,False,True,False
3363,1,61.0,0,0,1,0,81.25,43.4,False,False,True,False,False,True,False,False
4199,0,44.0,0,0,1,0,87.71,34.0,True,False,False,False,False,True,False,False
2349,1,47.0,0,0,0,0,111.15,23.8,False,True,False,False,False,True,False,False


In [201]:
dtclass=DecisionTreeClassifier()

dtclass.fit(X_train,Y_train)

In [202]:

dtclass.fit(X_train,Y_train)
score = dtclass.score(X_test, Y_test)
# prediction = dtclass.predict(X_test_std)
print('Testing Score \n',score)



Testing Score 
 0.8851540616246498


In [203]:
data[data['avg_glucose_level'] == 228.69]


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,1,67.0,0,1,1,1,228.69,36.6,True,False,False,False,False,True,False,False


In [204]:
input_data =(1,64,0,0,1,0,113.68,24.2,0,1,0,1,0,0,0,0)
#input_data =(0,49,0,0,1,1,301.23,34.4,0,0,1,0,0,1,0,0)
#input_data=(1,67,0,1,1,1,228.69,36.6,1,0,0,0,0,1,0,0)

#commented because of the errors
# input_data=std.fit_transform([input_data])


input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = dtclass.predict(input_data_reshaped)
print(prediction)
if (prediction[0] == 0):
    print("The patient is not likely to get a stroke")
else:   
    print("The patient is likely to get a stroke")

[0]
The patient is not likely to get a stroke


In [206]:
#create a .sav file to save the model

sav_filename = 'trained_model.sav'
pickle.dump(dtclass, open(sav_filename, 'wb'))

In [207]:
trained_model = pickle.load(open('trained_model.sav', 'rb'))

In [221]:
input_data2 =(1,64,0,0,1,0,113.68,24.2,0,1,0,1,0,0,0,0)


#commented bec of errors
# input_data2=std.transform([input_data2])
input_data_as_numpy_array2 = np.asarray(input_data2)

# reshape the array as we are predicting for one instance
input_data_reshaped2 = input_data_as_numpy_array2.reshape(1,-1)

prediction2 = trained_model.predict(input_data_reshaped2)
print(prediction2)

if (prediction2[0] == 0):
    print("The patient is not likely to get a stroke")
else:   
    print("The patient is likely to get a stroke")

[0]
The patient is not likely to get a stroke
