# **Case**:
In this case we will use one of the stacking methods, namely voting, to classify diabetes patients according to several characteristics.  Patients will be classified into patients suffering from diabetes (1) and not suffering from diabetes (0).  

First of all, we will use several classification algorithms separately, namely Logistic Regression, SVM polynomial, and Decision Tree.  After that, we will combine the performance of the 3 algorithms using the ensemble voting method. 

# **Import Libraries and Load Data**

In [37]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Load the data
dbt = pd.read_csv('../Data/diabetes.csv')
# show the first 5 rows of the data
dbt.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


# **Check Data Description**

In [38]:
dbt.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# **Check columns name**

In [39]:
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

# **Check Null Column**

In [40]:
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# **Data Imputation**

In [41]:
# In this case, it doesn't make sense for some parameters to be 0
#  for example the values for 'Glucose', 'BloodPlessure' or 'Insulin'.
#  No matter how small the values are, every living human being must have 
#  these values

# We will manipulate the value 0 by 'imputation' or replacing the value 
# with a synthetic value
# In this case, we will use the mean value 

# Check the number of 0 values in each column
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

# Replace 0 with mean value
from sklearn.impute import SimpleImputer
fill_values = SimpleImputer(missing_values=0, strategy='mean', copy=False)
dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


# **Split Data**

In [42]:
X = dbt[feature_columns]
y = dbt['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
334,1.0,95.0,60.0,18.0,58.0,23.9,0.26,22.0
139,5.0,105.0,72.0,29.0,325.0,36.9,0.159,28.0
485,4.494673,135.0,68.0,42.0,250.0,42.3,0.365,24.0
547,4.0,131.0,68.0,21.0,166.0,33.1,0.16,28.0
18,1.0,103.0,30.0,38.0,83.0,43.3,0.183,33.0
593,2.0,82.0,52.0,22.0,115.0,28.5,1.699,25.0
140,3.0,128.0,78.0,29.15342,155.548223,21.1,0.268,55.0
326,1.0,122.0,64.0,32.0,156.0,35.1,0.692,30.0
266,4.494673,138.0,72.405184,29.15342,155.548223,36.3,0.933,25.0
626,4.494673,125.0,68.0,29.15342,155.548223,24.7,0.206,21.0


# **Feature Standardization**

In [43]:
# So we need to standardize 
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standardization in X_train dan X_test data
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# **Train Model**

## **Function to Check Mean accuracy and STD**

In [44]:
from sklearn.model_selection import cross_val_score
def check_model(model, X_train, y_train):
    scores = cross_val_score(model, X_train, y_train, cv=5)

    # Calculate mean and standard deviation
    mean_score = np.mean(scores)
    std_dev = np.std(scores)
    
    print(f'Mean Cross-Validation Score: {mean_score:.4f}')
    print(f'Standard Deviation: {std_dev:.4f}')
    
    # Interpretation
    print('Interpretation:')
    if std_dev > 0.1:
        print("The model's performance is inconsistent across folds.")
    elif mean_score < 0.7:
        print("The model is performing poorly.")
    else:
        print("The model appears to generalize well.")

## **Logistic Regression**

### **Hyperparameter Tuning**

In [45]:
# create default logistic regression object
logreg = LogisticRegression()

# define hyperparameter
param_logreg = {
    'penalty': ['l2', 'l1'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga'],
    'max_iter': [50, 100, 200, 400, 800]
}

# create GridSearchCV object
logreg_tuned = GridSearchCV(logreg, param_logreg, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# fit data that has been standardized
logreg_tuned.fit(X_train_std, y_train)

# print best parameter after tuning
print(f'Best Hyperparameter: {logreg_tuned.best_params_}')

# print best score after tuning
print(f'Best Accuracy Score: {logreg_tuned.best_score_}')

Fitting 5 folds for each of 140 candidates, totalling 700 fits
Best Hyperparameter: {'C': 1, 'max_iter': 50, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy Score: 0.7783142956040152


### **Model Evaluation**

In [46]:
# set best estimator
logreg = logreg_tuned.best_estimator_

# fit data 
logreg.fit(X_train_std, y_train)

# predict train set
y_pred_train = logreg.predict(X_train_std)

# predict test set
y_pred_test = logreg.predict(X_test_std)

# calculate train and test data accuracy score
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

check_model(logreg, X_train_std, y_train)

# print train and test data accuracy score
print(f"\nAccuracy on train set: {acc_train * 100:.2f}%")
print(f"Accuracy on test set: {acc_test * 100:.2f}%")

Mean Cross-Validation Score: 0.7783
Standard Deviation: 0.0329
Interpretation:
The model appears to generalize well.

Accuracy on train set: 78.77%
Accuracy on test set: 73.59%


## **SVM Polynomial**

### **Hyperparameter Tuning**

In [47]:
# create SVM object with polynomial kernel
svm_poly = SVC(kernel='poly')

# define hyperparameter
param_svm_poly = {
    'degree': [2, 3, 4, 5],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'coef0': [0.0, 0.1, 0.5, 1.0]
}

# create GridSearchCV object
svm_poly_tuned = GridSearchCV(svm_poly, param_svm_poly, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# fit data that has been standardized
svm_poly_tuned.fit(X_train_std, y_train)

# print best parameter after tuning
print(f'Best Hyperparameter: {svm_poly_tuned.best_params_}')

# print best score after tuning
print(f'Best Accuracy Score: {svm_poly_tuned.best_score_}')

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best Hyperparameter: {'C': 0.1, 'coef0': 1.0, 'degree': 2, 'gamma': 'scale'}
Best Accuracy Score: 0.7783662166839737


### **Model Evaluation**

In [48]:
# set best estimator
svm_poly = svm_poly_tuned.best_estimator_

# fit data 
svm_poly.fit(X_train_std, y_train)

# predict train set
y_pred_train = svm_poly.predict(X_train_std)

# predict test set
y_pred_test = svm_poly.predict(X_test_std)

# calculate train and test data accuracy score
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

check_model(svm_poly, X_train_std, y_train)

# print train and test data accuracy score
print(f"\nAccuracy on train set: {acc_train * 100:.2f}%")
print(f"Accuracy on test set: {acc_test * 100:.2f}%")

Mean Cross-Validation Score: 0.7784
Standard Deviation: 0.0129
Interpretation:
The model appears to generalize well.

Accuracy on train set: 79.33%
Accuracy on test set: 75.76%


## **Decision Tree**

### **Hyperparameter Tuning**

In [49]:
# define default DecisionTreeClassifier model
dt = DecisionTreeClassifier()

# define the grid of hyperparameters
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20, 25, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 5, 10, 15]
}

# perform GridSearch
dt_grid = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# fit the model
dt_grid.fit(X_train_std, y_train)

# print best hyperparameters
print(f'Best hyperparameters: {dt_grid.best_params_}')

# print best model accuracy
print(f'\nBest model accuracy: {dt_grid.best_score_}')

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 15, 'min_samples_split': 2}

Best model accuracy: 0.7597784700588439


### **Model Evaluation**

In [50]:
# set best estimator
dt = dt_grid.best_estimator_

# fit data
dt.fit(X_train_std, y_train)

# predict train set
y_pred_train = dt.predict(X_train_std)

# predict test set
y_pred_test = dt.predict(X_test_std)

# calculate train and test data accuracy score
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

check_model(dt, X_train_std, y_train)

# print train and test data accuracy score
print(f"\nAccuracy on train set: {acc_train * 100:.2f}%")
print(f"Accuracy on test set: {acc_test * 100:.2f}%")

Mean Cross-Validation Score: 0.7598
Standard Deviation: 0.0246
Interpretation:
The model appears to generalize well.

Accuracy on train set: 80.26%
Accuracy on test set: 77.06%


# **Voting Classifier**

In [53]:
# define algorithm that used in voting classifier
clf1 = logreg
clf2 = svm_poly
clf3 = dt

# create hard voting classifier object
voting_clf = VotingClassifier(estimators=[('Log-Reg', clf1), ('SVM-POLY', clf2), ('Dec-Tree', clf3)], voting='hard')

# fit data that has been standardized
voting_clf.fit(X_train_std, y_train)

# predict train set
y_pred_train = voting_clf.predict(X_train_std)

# predict test set
y_pred_voting = voting_clf.predict(X_test_std)

# calculate test and train data accuracy score
acc_train = accuracy_score(y_train, y_pred_train)
acc_voting = accuracy_score(y_test, y_pred_voting)

check_model(voting_clf, X_train_std, y_train)

# print accuracy score
print('\nVoting Hard')
print(f"Accuracy on train set: {acc_train * 100:.2f}%")
print(f"Accuracy on test set: {acc_voting * 100:.2f}%")

Mean Cross-Validation Score: 0.7783
Standard Deviation: 0.0156
Interpretation:
The model appears to generalize well.

Voting Hard
Accuracy on train set: 79.14%
Accuracy on test set: 75.76%
