# Model ensemble with Bagging and Boosting

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [28]:
# Load the dataset
df = pd.read_csv('Model_ensemble_diabetes.csv')
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Feature description:
- `pregnancies`: Number of times pregnant
- `glucose`: Plasma glucose concentration over 2 hours in an oral glucose tolerance test
- `diastolic` (blood pressure): Diastolic blood pressure (mm Hg)
- `triceps` (skin thickness): Triceps skin fold thickness (mm)
- `insulin`: 2-Hour serum insulin (mu U/ml)
- `bmi`: Body mass index (weight in kg/(height in m)2)
- `dpf` (Diabetes Pedigree Function): Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
- `age`: Age (years)

`diabetes` (outcome): Class variable (0 if non-diabetic, 1 if diabetic)

In [29]:
# Dataset shape
print("Shape of dataset: " + str(df.shape))

Shape of dataset: (768, 9)


In [30]:
# Seperate data features and data labels
X = df.drop(columns = ['diabetes'])     # Data features
y = df['diabetes']                      # Data lebels

In [31]:
# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("Shape of training set: " + str(X_train.shape))
print("Shape of test set: " + str(X_test.shape))

Shape of training set: (537, 8)
Shape of test set: (231, 8)


In [32]:
# Initialize and use StandardScaler to normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)     # Fit and transform thr training data
X_test = scaler.transform(X_test)           # Only transform the test data.

# Bagging with Support Vector Machine (SVM)

In [33]:
# Creating a Support Vector Machine Classifier as the base estimator
base_svm = SVC(kernel='linear', C=1.0)

In [34]:
# Create a Bagging Classifier with SVM as the base model
# estimator: the base model
# n_estimators: the number of base models to use
# max_samples: the fraction (if float) or the number of samples (if integer) to draw from X_train to train each base estimator (with replacement by default, i.e., bootstrap)
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
bagging_clf = BaggingClassifier(estimator=base_svm, n_estimators=10, max_samples=0.5)

In [35]:
# Training the Bagging Classifier
bagging_clf.fit(X_train, y_train)

In [36]:
# Making predictions on the test set
y_pred = bagging_clf.predict(X_test)

# Evaluating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7662337662337663


# Boosting with AdaBoost, Gradient Boosting and XGBoost

## AdaBoost

In [37]:
# Create an AdaBoost Classifier with Decision Tree as the base model
# estimator: the base model
# n_estimators: the number of base models to use
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
ada_clf = AdaBoostClassifier(estimator = DecisionTreeClassifier(), n_estimators=10)

# Train the AdaBoost Classifier
ada_clf.fit(X_train, y_train)

# Making predictions on the test set
y_pred_ada = ada_clf.predict(X_test)

# Evaluating the accuracy of the model
accuracy_ada = accuracy_score(y_test, y_pred_ada)
print("AdaBoost Classifier Accuracy:", accuracy_ada)

AdaBoost Classifier Accuracy: 0.683982683982684


## Gradient Boosting

In [38]:
# Create a Gradient Boosting Classifier which uses Decision Tree as boosting model by default
# n_estimators: the number of boosting stages to perform.
# learning_rate: learning rate shrinks the contribution of each tree by learning_rate, i.e., f_0 + lr*delta_1 + lr*delta_2 + ...
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
gb_clf = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1)

# Train the Gradient Boosting Classifier
gb_clf.fit(X_train, y_train)

# Making predictions on the test set
y_pred_gb = gb_clf.predict(X_test)

# Evaluating the accuracy of the model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Classifier Accuracy:", accuracy_gb)

Gradient Boosting Classifier Accuracy: 0.7316017316017316


## XGBoost

In [39]:
# Create an XGBoost Classifier
# n_estimators: the number of boosting stages to perform.
# learning_rate: learning rate shrinks the contribution of each tree by learning_rate, i.e., f_0 + lr*delta_1 + lr*delta_2 + ...
# Reference: https://xgboost.readthedocs.io/en/latest/python/python_api.html
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.1)

# Train the XGBoost Classifier
xgb_clf.fit(X_train, y_train)

# Making predictions on the test set
y_pred_xgb = xgb_clf.predict(X_test)

# Evaluating the accuracy of the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Classifier Accuracy:", accuracy_xgb)

XGBoost Classifier Accuracy: 0.7619047619047619
