Proportion Of Different Smoking Categories Among Stroke Population

In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



import plotly
import plotly.express as px

# import warnings
import warnings
# filter warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
import os

In [142]:
origin_data = pd.read_csv(r'C:\Users\nrhhe\Downloads\SLIIT\FDM\Datasets\strokeDataset.csv')
data = origin_data.copy()

In [143]:
#Dropping the id column that has no correlation with the target variable
data.drop(['id'], axis=1, inplace=True)

In [144]:
data['bmi'].fillna(data['bmi'].mean(), inplace=True)

In [145]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [146]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [147]:
# Convert Marrital Status, Residence and Gender into 0's and 1's
data['gender']=data['gender'].apply(lambda x : 1 if x=='Male' else 0) 
data["Residence_type"] = data["Residence_type"].apply(lambda x: 1 if x=="Urban" else 0)
data["ever_married"] = data["ever_married"].apply(lambda x: 1 if x=="Yes" else 0)
# Removing the observations that have smoking type unknown. 
data=data[data['smoking_status']!='Unknown']

In [148]:
# One Hot encoding smoking_status, work_type
data_dummies = data[['smoking_status','work_type']]
data_dummies=pd.get_dummies(data_dummies)
data.drop(columns=['smoking_status','work_type'],inplace=True)

In [149]:
data_dummies

Unnamed: 0,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,True,False,False,False,False,True,False,False
1,False,True,False,False,False,False,True,False
2,False,True,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False
4,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
5102,False,True,False,False,False,True,False,False
5105,False,True,False,False,False,True,False,False
5106,False,True,False,False,False,False,True,False
5107,False,True,False,False,False,False,True,False


In [150]:
data_stroke=data['stroke']
data.drop(columns=['stroke'],inplace=True)
data=data.merge(data_dummies,left_index=True, right_index=True,how='left')


In [151]:
data_dummies

Unnamed: 0,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,True,False,False,False,False,True,False,False
1,False,True,False,False,False,False,True,False
2,False,True,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False
4,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
5102,False,True,False,False,False,True,False,False
5105,False,True,False,False,False,True,False,False
5106,False,True,False,False,False,False,True,False
5107,False,True,False,False,False,False,True,False


In [152]:
X_train, X_test, Y_train, Y_test=train_test_split(data,data_stroke,test_size=0.2,random_state=10)

In [153]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
5005,0,21.0,0,0,0,0,71.34,24.0,False,True,False,False,False,True,False,False
1583,0,37.0,0,0,1,1,65.29,32.9,False,True,False,True,False,False,False,False
222,0,63.0,0,0,1,0,205.35,42.2,True,False,False,True,False,False,False,False
382,1,53.0,1,1,1,0,109.51,41.9,False,True,False,True,False,False,False,False
4895,0,56.0,1,0,1,0,177.56,30.1,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4928,1,11.0,0,0,0,0,121.71,23.4,False,True,False,False,False,False,False,True
1878,1,63.0,0,0,1,0,104.70,21.0,True,False,False,False,False,False,True,False
709,0,23.0,0,0,0,1,60.50,27.1,True,False,False,False,False,True,False,False
4586,0,53.0,0,0,1,1,85.46,30.0,False,True,False,False,False,True,False,False


In [154]:
Y_train

5005    0
1583    0
222     1
382     0
4895    0
       ..
4928    0
1878    0
709     0
4586    0
1793    0
Name: stroke, Length: 2852, dtype: int64

In [155]:

std=StandardScaler()
X_train=std.fit_transform(X_train)
X_test=std.transform(X_test)

In [156]:
sm = SMOTE(random_state=4)
X_train, Y_train = sm.fit_resample(X_train, Y_train.ravel())

In [157]:
dtclass=DecisionTreeClassifier()

dtclass.fit(X_train,Y_train)

In [158]:

dtclass.fit(X_train,Y_train)
score = dtclass.score(X_test, Y_test)
# prediction = dtclass.predict(X_test_std)
print('Testing Score \n',score)



Testing Score 
 0.8417366946778712


In [160]:
input_data =(0,49,0,0,1,1,171.23,34.4,0,0,1,0,0,1,0,0)

input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = dtclass.predict(input_data_reshaped)
print(prediction)
if (prediction[0] == 0):
    print("The patient is not likely to get a stroke")
else:   
    print("The patient is likely to get a stroke")

[1]
The patient is likely to get a stroke


In [161]:
#create a .sav file to save the model

sav_filename = 'trained_model.sav'
pickle.dump(dtclass, open(sav_filename, 'wb'))

In [162]:
trained_model = pickle.load(open('trained_model.sav', 'rb'))

In [163]:
input_data =(0,49,0,0,1,1,171.23,34.4,0,0,1,0,0,1,0,0)

input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = trained_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic
