In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
# Memuat dataset stroke ke dalam DataFrame pandas
stroke_dataset = pd.read_csv('stroke_cleaned.csv')

In [3]:
# Mencetak 10 baris pertama dari dataset
print(stroke_dataset.head(10))

      id   age  hypertension  heart_disease  avg_glucose_level   bmi  \
0   9046  67.0             0              1             228.69  36.6   
1  51676  61.0             0              0             202.21   NaN   
2  31112  80.0             0              1             105.92  32.5   
3  60182  49.0             0              0             171.23  34.4   
4   1665  79.0             1              0             174.12  24.0   
5  56669  81.0             0              0             186.21  29.0   
6  53882  74.0             1              1              70.09  27.4   
7  10434  69.0             0              0              94.39  22.8   
8  27419  59.0             0              0              76.15   NaN   
9  60491  78.0             0              0              58.57  24.2   

    smoking_status  stroke  
0  formerly smoked       1  
1     never smoked       1  
2     never smoked       1  
3           smokes       1  
4     never smoked       1  
5  formerly smoked       1  
6   

In [4]:
# Jumlah baris dan kolom dalam dataset
stroke_dataset.shape

(5110, 8)

In [5]:
# Mendapatkan ukuran statistik dari data
stroke_dataset.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
stroke_dataset['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [7]:
stroke_dataset['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [8]:
stroke_dataset

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,9046,67.0,0,1,228.69,36.6,formerly smoked,1
1,51676,61.0,0,0,202.21,,never smoked,1
2,31112,80.0,0,1,105.92,32.5,never smoked,1
3,60182,49.0,0,0,171.23,34.4,smokes,1
4,1665,79.0,1,0,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,,never smoked,0
5106,44873,81.0,0,0,125.20,40.0,never smoked,0
5107,19723,35.0,0,0,82.99,30.6,never smoked,0
5108,37544,51.0,0,0,166.29,25.6,formerly smoked,0


In [9]:
##mengubah nilai smoking_status menjadi 0 dan 1, 0 jika tidak dan 1 jika ya merokok
stroke_dataset['smoking_status'] = stroke_dataset['smoking_status'].replace({
    'never smoked': 0,
    'Unknown': 0,
    'formerly smoked': 1,
    'smokes': 1
})

In [10]:
## mengubah nilai nan pada bmi menjadi median dari kolom bmi agar tidak kosong
imputer = SimpleImputer(strategy='median')
stroke_dataset['bmi'] = imputer.fit_transform(stroke_dataset[['bmi']])

In [11]:
stroke_dataset

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,9046,67.0,0,1,228.69,36.6,1,1
1,51676,61.0,0,0,202.21,28.1,0,1
2,31112,80.0,0,1,105.92,32.5,0,1
3,60182,49.0,0,0,171.23,34.4,1,1
4,1665,79.0,1,0,174.12,24.0,0,1
...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,28.1,0,0
5106,44873,81.0,0,0,125.20,40.0,0,0
5107,19723,35.0,0,0,82.99,30.6,0,0
5108,37544,51.0,0,0,166.29,25.6,1,0


In [12]:
stroke_dataset.groupby('stroke').mean()

Unnamed: 0_level_0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,36487.236371,41.971545,0.088871,0.04711,104.795513,28.799115,0.321333
1,37115.068273,67.728193,0.26506,0.188755,132.544739,30.090361,0.449799


In [13]:
# Memisahkan data dan label
X = stroke_dataset.drop(columns = 'stroke', axis=1)
Y = stroke_dataset['stroke']

In [14]:
print(X)

         id   age  hypertension  heart_disease  avg_glucose_level   bmi  \
0      9046  67.0             0              1             228.69  36.6   
1     51676  61.0             0              0             202.21  28.1   
2     31112  80.0             0              1             105.92  32.5   
3     60182  49.0             0              0             171.23  34.4   
4      1665  79.0             1              0             174.12  24.0   
...     ...   ...           ...            ...                ...   ...   
5105  18234  80.0             1              0              83.75  28.1   
5106  44873  81.0             0              0             125.20  40.0   
5107  19723  35.0             0              0              82.99  30.6   
5108  37544  51.0             0              0             166.29  25.6   
5109  44679  44.0             0              0              85.28  26.2   

      smoking_status  
0                  1  
1                  0  
2                  0  
3      

In [15]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64


## Standarisasi Data

In [16]:
scaler = StandardScaler()

In [17]:
scaler.fit(X)

In [18]:
standardized_data = scaler.transform(X)

In [19]:
print(standardized_data)

[[-1.29831203  1.05143428 -0.32860186 ...  2.70637544  1.00508597
   1.43267885]
 [ 0.71637149  0.78607007 -0.32860186 ...  2.12155854 -0.09898092
  -0.69799313]
 [-0.25547819  1.62639008 -0.32860186 ... -0.0050283   0.47253605
  -0.69799313]
 ...
 [-0.79371959 -0.36384151 -0.32860186 ... -0.51144264  0.22574463
  -0.69799313]
 [ 0.04849658  0.34379639 -0.32860186 ...  1.32825706 -0.42370648
   1.43267885]
 [ 0.38569496  0.03420481 -0.32860186 ... -0.46086746 -0.34577235
  -0.69799313]]


In [20]:
X = standardized_data
Y = stroke_dataset['stroke']

In [21]:
print(X)
print(Y)

[[-1.29831203  1.05143428 -0.32860186 ...  2.70637544  1.00508597
   1.43267885]
 [ 0.71637149  0.78607007 -0.32860186 ...  2.12155854 -0.09898092
  -0.69799313]
 [-0.25547819  1.62639008 -0.32860186 ... -0.0050283   0.47253605
  -0.69799313]
 ...
 [-0.79371959 -0.36384151 -0.32860186 ... -0.51144264  0.22574463
  -0.69799313]
 [ 0.04849658  0.34379639 -0.32860186 ...  1.32825706 -0.42370648
   1.43267885]
 [ 0.38569496  0.03420481 -0.32860186 ... -0.46086746 -0.34577235
  -0.69799313]]
0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64


## Pemisahan Data Train dan Test

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(5110, 7) (4088, 7) (1022, 7)


## Pelatihan Model

In [24]:
classifier = svm.SVC(kernel='linear')

In [25]:
# Pelatihan Klasifikasi Support Vector Machine (SVM)
classifier.fit(X_train, Y_train)

## Evaluasi Model

#### akurasi score

In [26]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [27]:
print('Accuracy score dari training data : ', training_data_accuracy)

Accuracy score dari training data :  0.951320939334638


### Membuat Sistem Prediksi

In [32]:
input_data = (19723,35.0,0,0,82.99,30.6,0)

# Mengubah data masukan menjadi array numpy
input_data_as_numpy_array = np.asarray(input_data)

# Mengubah bentuk array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# Menyesuaikan data masuk
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('Anda bukan penderita stroke')
else:
  print('Anda penderita stroke')

[[-0.79371959 -0.36384151 -0.32860186 -0.2389468  -0.51144264  0.22574463
  -0.69799313]]
[0]
Anda penderita stroke


