In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

In [2]:
data = pd.read_csv("data/healthcare-dataset-stroke-data.csv")

In [3]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
data['bmi'].fillna(data['bmi'].mean(),inplace=True)

In [5]:
data.drop('id',inplace=True,axis=1)

In [6]:
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 
                        'Residence_type', 'smoking_status']
encoders = {}

In [7]:
def age_group(x):
    if x<13: return "Child"
    elif 13<x<20: return "Teenager"
    elif 20<x<=60: return "Adult"
    else: return "Elder"
    
data["age_group"] = data.age.apply(age_group)

def bmi_group(x):
    if x<18.5 : return "UnderWeight"
    elif 18.5<x<25: return "Healthy"
    elif 25<x<30: return "OverWeight"
    else: return "Obese"

data["bmi_group"] = data.bmi.apply(bmi_group);

categorical_features.extend(['age_group', 'bmi_group'])

In [8]:
for each in categorical_features:
    encoders[each] = LabelEncoder()
    data[each] = encoders[each].fit_transform(data[each])

In [9]:
X = data.drop('stroke',axis=1)
y = data['stroke']

In [10]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_group,bmi_group
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1,2,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2,1,2,2
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1,0,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1,2,0


In [11]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(X,y,test_size =0.2)

In [12]:
x_train_std = std.fit_transform(x_train)
x_test_std = std.transform(x_test)
print(x_train_std)
print(x_test_std)

[[-0.8437384  -0.31612935 -0.3229055  ... -1.28560955 -0.8331852
   0.90751964]
 [-0.8437384   0.30331701  3.09688127 ...  0.57972207 -0.8331852
  -0.22806049]
 [-0.8437384   0.25907084 -0.3229055  ...  0.57972207 -0.8331852
   0.90751964]
 ...
 [-0.8437384  -0.67009869 -0.3229055  ... -0.35294374 -0.8331852
  -1.36364063]
 [-0.8437384   0.65728635 -0.3229055  ... -0.35294374 -0.8331852
   0.90751964]
 [ 1.1828201   0.1705785  -0.3229055  ...  0.57972207 -0.8331852
  -0.22806049]]
[[-0.8437384   0.21482467 -0.3229055  ... -1.28560955 -0.8331852
  -0.22806049]
 [ 1.1828201   0.74577869 -0.3229055  ...  0.57972207 -0.8331852
  -0.22806049]
 [ 1.1828201  -1.89837231 -0.3229055  ... -1.28560955  0.14025324
  -1.36364063]
 ...
 [-0.8437384  -0.75859103 -0.3229055  ... -1.28560955 -0.8331852
   0.90751964]
 [-0.8437384  -0.71434486 -0.3229055  ...  1.51238788 -0.8331852
   0.90751964]
 [ 1.1828201  -1.64351439 -0.3229055  ... -1.28560955  0.14025324
   2.04309978]]


In [13]:
columns = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married','work_type', 'Residence_type','avg_glucose_level', 'bmi', 'smoking_status']
def process_input(x):
    x = pd.DataFrame(x, columns=columns)
    x["bmi_group"] = data.bmi.apply(bmi_group);
    x["age_group"] = data.age.apply(age_group)
    for each in categorical_features:
        x[each] = encoders[each].transform(x[each])
    x = std.transform(x)
    return x
        
x = [['Female', 67.0, 0, 1, 'Yes', 'Self-employed', 'Urban', 228.69, 36.6, 'never smoked']]
x = process_input(x)  

In [14]:
from sklearn.svm import SVC
sv = SVC(probability=True)
sv.fit(x_train_std, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [23]:
y_pred = sv.predict(x_test)
ac_rf = accuracy_score(y_test,y_pred)
print(f"Accuracy of support vector classifier in total dataset: {ac_rf}")
y1 = sv.predict(x)
print(y1)
results = sv.predict_proba(x)
print(results)

Accuracy of support vector classifier in total dataset: 0.9432485322896281
[0]
[[0.94545379 0.05454621]]


In [17]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train_std,y_train)
print(dt.feature_importances_)
print(x_train.columns.values)

[0.04160376 0.20723647 0.04431137 0.01086074 0.00710964 0.03881722
 0.03687878 0.2735354  0.24660159 0.07920381 0.         0.01384122]
['gender' 'age' 'hypertension' 'heart_disease' 'ever_married' 'work_type'
 'Residence_type' 'avg_glucose_level' 'bmi' 'smoking_status' 'age_group'
 'bmi_group']


In [24]:
for j,i in zip(dt.feature_importances_,x_train.columns.values):
    print(f"{i} : {j}")

gender : 0.04160376346835837
age : 0.20723647213493282
hypertension : 0.044311371754317375
heart_disease : 0.010860736901365574
ever_married : 0.007109638471478938
work_type : 0.03881722082796566
Residence_type : 0.03687878015290564
avg_glucose_level : 0.2735353980164986
bmi : 0.246601586858204
smoking_status : 0.07920380955348266
age_group : 0.0
bmi_group : 0.013841221860490315
