In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv("data/healthcare-dataset-stroke-data.csv")

In [3]:
data["bmi"].describe()

count    4909.000000
mean       28.893237
std         7.854067
min        10.300000
25%        23.500000
50%        28.100000
75%        33.100000
max        97.600000
Name: bmi, dtype: float64

In [5]:
data['bmi'].fillna(data['bmi'].mean(),inplace=True,)

In [6]:
data['bmi']

0       36.600000
1       28.893237
2       32.500000
3       34.400000
4       24.000000
          ...    
5105    28.893237
5106    40.000000
5107    30.600000
5108    25.600000
5109    26.200000
Name: bmi, Length: 5110, dtype: float64

In [7]:
data.drop('id',inplace=True,axis=1)

In [8]:
data['work_type'].unique

<bound method Series.unique of 0             Private
1       Self-employed
2             Private
3             Private
4       Self-employed
            ...      
5105          Private
5106    Self-employed
5107    Self-employed
5108          Private
5109         Govt_job
Name: work_type, Length: 5110, dtype: object>

In [9]:
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 
                        'Residence_type', 'smoking_status']
enc = LabelEncoder()

In [10]:
for each in categorical_features:
    data[each] = enc.fit_transform(data[each])

In [11]:
X = data.drop('stroke',axis=1)
y = data['stroke']

In [12]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(X,y,test_size =0.2)

In [13]:
data.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,0.414286,43.226614,0.097456,0.054012,0.656164,2.16771,0.508023,106.147677,28.893237,1.376908,0.048728
std,0.493044,22.612647,0.296607,0.226063,0.475034,1.090293,0.499985,45.28356,7.698018,1.071534,0.21532
min,0.0,0.08,0.0,0.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0
25%,0.0,25.0,0.0,0.0,0.0,2.0,0.0,77.245,23.8,0.0,0.0
50%,0.0,45.0,0.0,0.0,1.0,2.0,1.0,91.885,28.4,2.0,0.0
75%,1.0,61.0,0.0,0.0,1.0,3.0,1.0,114.09,32.8,2.0,0.0
max,2.0,82.0,1.0,1.0,1.0,4.0,1.0,271.74,97.6,3.0,1.0


In [14]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

In [15]:
x_train_std = std.fit_transform(x_train)
x_test_std = std.transform(x_test)

In [16]:
print(x_train_std)
print(x_test_std)

[[-0.8340046   0.82742926 -0.33160926 ... -0.39762635  0.97799873
  -1.27745579]
 [-0.8340046  -0.06139403 -0.33160926 ... -0.58216069 -0.33694768
   0.58530122]
 [ 1.1966153   1.13851741  3.01559731 ...  2.72350723  0.16273195
   1.51667973]
 ...
 [ 1.1966153   0.20525295  3.01559731 ...  1.74412392 -0.36324661
   0.58530122]
 [-0.8340046  -0.99465849 -0.33160926 ... -0.37281501 -0.91552411
   1.51667973]
 [ 1.1966153   1.18295858  3.01559731 ... -0.71397093  0.03123731
   0.58530122]]
[[ 1.1966153   1.00519392 -0.33160926 ...  3.01348976  0.84650409
   1.51667973]
 [-0.8340046   0.16081179 -0.33160926 ... -0.24809372  1.30673533
  -0.34607729]
 [ 1.1966153  -1.21686432 -0.33160926 ... -0.08792767 -0.91552411
  -1.27745579]
 ...
 [-0.8340046  -1.12798199 -0.33160926 ... -0.38588527  0.96484926
  -1.27745579]
 [-0.8340046  -0.19471753 -0.33160926 ...  3.0405164   1.09634391
   1.51667973]
 [-0.8340046   0.82742926 -0.33160926 ... -0.32009091 -1.21796178
  -1.27745579]]


In [17]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [18]:
dt.fit(x_train_std,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [19]:
dt.feature_importances_

array([0.02531676, 0.17792981, 0.03091387, 0.02458648, 0.03171355,
       0.04424361, 0.03547507, 0.3264886 , 0.23407804, 0.06925422])

In [20]:
x_train.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
y_pred = dt.predict(x_test)

In [23]:
ac = accuracy_score(y_test,y_pred)

In [36]:
columns = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married','work_type', 'Residence_type','avg_glucose_level', 'bmi', 'smoking_status']

def process_input(X):
    X = pd.DataFrame(X, columns=columns)
    for each in categorical_features:
        X[each] = enc.fit_transform(X[each])
        
    return X
    
    

In [37]:
X = [['Male', 67.0, 0, 1, 'Yes', 'Private', 'Urban', 228.69, 36.6, 'formerly smoked']]
X = process_input(X)

In [27]:
x_test_std_test = std.transform(X)

ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [28]:
x_test_std = std.transform(x_test)

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()


In [30]:
rf.fit(x_train_std,y_train)
y_pred = rf.predict(x_test)
ac_rf = accuracy_score(y_test,y_pred)

In [None]:
print(ac_rf)

In [None]:
## SVM

In [32]:
from sklearn.svm import SVC
sv = SVC()

In [33]:
sv.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [38]:
y_pred = sv.predict(X)

ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
ac_rf = accuracy_score(y_test,y_pred)

In [None]:
print(ac_rf)