In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Load the diabetes dataset from the project-relative CSV file.
csv_path = 'dataset/diabetes.csv'
dbt = pd.read_csv(csv_path)

# Preview the first five rows to sanity-check the parse.
dbt.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Check the column names of the loaded frame
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
# Count the missing (null/NaN) entries in each column.
dbt.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
# Replace placeholder zeros with the column mean.
#
# Only the measurement columns listed below are imputed: for them a value of 0
# is not a plausible reading and stands in for "not recorded". 'Pregnancies' is
# deliberately left alone, because 0 is a legitimate count there and must not
# be overwritten with the mean of the non-zero rows.
#
# NOTE(review): the imputer is fit on the full frame before the train/test
# split, so test-row values contribute to the imputed means. Consider fitting
# it on the training rows only.
feature_columns = dbt.columns[:-1]  # every column except 'Outcome'
zero_invalid_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
imputer = SimpleImputer(missing_values=0, strategy='mean')
dbt[zero_invalid_columns] = imputer.fit_transform(dbt[zero_invalid_columns])

In [7]:
# Separate the predictor columns (X) from the target column (y).
X = dbt.loc[:, feature_columns]
y = dbt.loc[:, 'Outcome']

In [8]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [10]:
# Logistic regression: tune the regularization parameter C with
# 5-fold cross-validation on the standardized training features.
logreg = LogisticRegression(solver='liblinear')
param_grid_logreg = {'C': [0.01, 0.1, 1, 10, 100]}

grid_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid_logreg,
    cv=5,
).fit(X_train_std, y_train)

# Keep the best configuration found by the search.
best_logreg = grid_logreg.best_estimator_

In [11]:
# Polynomial-kernel SVM: search over C and the polynomial degree with
# 5-fold cross-validation on the standardized training features.
svm_poly = SVC(kernel='poly')
param_grid_svm = {'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3, 4]}

grid_svm = GridSearchCV(
    estimator=svm_poly,
    param_grid=param_grid_svm,
    cv=5,
).fit(X_train_std, y_train)

# Keep the best configuration found by the search.
best_svm = grid_svm.best_estimator_

In [12]:
# Decision tree: tune depth and minimum split size with 5-fold cross-validation.
#
# random_state is fixed (same seed as the train/test split) because
# scikit-learn permutes the candidate features at every split; without a seed
# the fitted tree can differ from one run to the next.
dtree = DecisionTreeClassifier(random_state=42)

param_grid_dtree = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_dtree = GridSearchCV(dtree, param_grid_dtree, cv=5)
# The search runs on the unscaled training features (X_train), unlike the
# logistic-regression and SVM searches, which use X_train_std.
grid_dtree.fit(X_train, y_train)

# Keep the best configuration found by the search.
best_dtree = grid_dtree.best_estimator_

In [13]:
# Combine the three tuned models into a hard-voting ensemble.
tuned_estimators = [
    ('logreg', best_logreg),
    ('svm_poly', best_svm),
    ('decision_tree', best_dtree),
]
voting = VotingClassifier(estimators=tuned_estimators, voting='hard')

# Train the ensemble on the standardized training features.
voting.fit(X_train_std, y_train)

# Predict on the standardized test features and measure accuracy.
y_pred_voting = voting.predict(X_test_std)
accuracy_voting = accuracy_score(y_true=y_test, y_pred=y_pred_voting)

print("Voting Classifier Test set accuracy: {:.2f}".format(accuracy_voting))

Voting Classifier Test set accuracy: 0.76
