# Model ensemble with Stacking: K-Nearest Neighbors (KNN), Support Vector Machine (SVM), Random Forest and Logistic Regression

In [196]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [197]:
# Load the dataset
df = pd.read_csv('Model_ensemble_diabetes.csv')
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Feature description:
- `pregnancies`: Number of times pregnant
- `glucose`: Plasma glucose concentration over 2 hours in an oral glucose tolerance test
- `diastolic` (blood pressure): Diastolic blood pressure (mm Hg)
- `triceps` (skin thickness): Triceps skin fold thickness (mm)
- `insulin`: 2-Hour serum insulin (mu U/ml)
- `bmi`: Body mass index (weight in kg/(height in m)2)
- `dpf` (Diabetes Pedigree Function): Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
- `age`: Age (years)

`diabetes` (outcome): Class variable (0 if non-diabetic, 1 if diabetic)

In [198]:
# Dataset shape
print("Shape of dataset: " + str(df.shape))

Shape of dataset: (768, 9)


In [199]:
# Separate data features and data labels
X = df.drop(columns = ['diabetes'])     # Data features
y = df['diabetes']                      # Data lebels

In [200]:
# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("Shape of training set: " + str(X_train.shape))
print("Shape of test set: " + str(X_test.shape))

Shape of training set: (537, 8)
Shape of test set: (231, 8)


In [201]:
# Initialize and use StandardScaler to normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)     # Fit and transform thr training data
X_test = scaler.transform(X_test)           # Only transform the test data.

# K-Nearest Neighbors (KNN)

KNN, or K-Nearest Neighbors, is a simple yet powerful algorithm used in machine learning for both classification and regression tasks. It works by assuming that similar data points tend to have similar labels or values.

The KNN algorithm stores the entire training dataset. This dataset consists of data points that have already been labeled or have known values. When a new data point comes in, the algorithm calculates the distance between it and all the data points in the training set, then identifies the K nearest neighbors to the new data point. The value of K is a hyperparameter that needs to be tuned, and it determines how many neighbors will influence the prediction.

In [202]:
# Train a K-Nearest Neighbor (KNN) model
knn = KNeighborsClassifier()                           # Initialize KNN model.
params_knn = {'n_neighbors': np.arange(1, 25)}         # n_neighbors in KNeighborsClassifier() indicates the number of neighbors K.
knn_gs = GridSearchCV(knn, params_knn, cv=5)           # Initialize GridSearchCV to find an optimal value of K.
knn_gs.fit(X_train, y_train)                           # Fit GridSearch in training set to find the optimal K.

In [203]:
# Best number of neighbors K
knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)

{'n_neighbors': 22}


# Support Vector Machine (SVM)

In [204]:
# Train a Support Vector Machine (SVM) model
svm = SVC()

# C is a hyperparameter that controls the trade-off between training error and margin maximization.
# Higher C: Stricter enforcement of correct classification for all training points, and less emphasis on margin maximization.
# Lower C: More tolerance for misclassifications on the training data, and more emphasis on margin maximization.
# For more hyperparameters of SVM, kindly refer to https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
params_svm = {"C": [0.1, 1, 10, 100]}

svm_gs = GridSearchCV(svm, params_svm, cv=5)    # Initialize GridSearchCV to find an optimal value if the hyperparameter C.
svm_gs.fit(X_train, y_train)                    # Fit GridSearch in training set to find the optimal C.

In [205]:
# Best value of the hyperparameter C.
svm_best = svm_gs.best_estimator_
print(svm_gs.best_params_)

{'C': 1}


# Random Forest

In [206]:
# Train a Random Forest classifier
rf = RandomForestClassifier()                        # Initialize a Random Forest Classifier.
params_rf = {'n_estimators': [50, 100, 200]}         # n_estimator in RandomForestClassifier(...) indicates the number of Trees in the Forest.
rf_gs = GridSearchCV(rf, params_rf, cv=5)            # Initialize GridSearchCV to find an optimal number of Trees.
rf_gs.fit(X_train, y_train)                          # Fit GridSearch in training set to find the optimal number of Trees.

In [207]:
# Best number of Trees.
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)

{'n_estimators': 50}


# Logistic Regression

In [208]:
# Train a Logistic Regression model
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)   # Initialize Logistic Regression model.
log_reg.fit(X_train, y_train)                                 # Fit the model to training set.

# Model testing

In [209]:
# Print accuracy of single models on the test set
print('KNN: {}'.format(knn_best.score(X_test, y_test)))                     # KNN accuracy
print('SVM: {}'.format(svm_best.score(X_test, y_test)))                     # SVM accuracy
print('Random Forest: {}'.format(rf_best.score(X_test, y_test)))            # Random Forest accuracy
print('Logistic Regression: {}'.format(log_reg.score(X_test, y_test)))      # Logistic Regression accuracy

KNN: 0.7662337662337663
SVM: 0.7792207792207793
Random Forest: 0.7835497835497836
Logistic Regression: 0.7705627705627706


# Model ensembling

In [210]:
# Ensemble the four models using hard (majority) voting
estimators=[('knn', knn_best), ('svm', svm_best), ('rf', rf_best), ('log_reg', log_reg)]    # Initialize base models in the ensemble
ensemble = VotingClassifier(estimators, voting='hard')                                      # Define how to ensemble them, i.e., hard voting

In [211]:
# Train the model ensemble on the training set
ensemble.fit(X_train, y_train)          # Train the ensemble on the training set
ensemble.score(X_test, y_test)          # Test the ensemble on the test set

0.7878787878787878