In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the BRFSS 2015 50/50-split diabetes CSV.
# Set the DIABETES_DATA_PATH environment variable to point at the file on another
# machine; when it is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'DIABETES_DATA_PATH',
    'C:\\Users\\HP\\Documents\\KESHTECH\\DSML\\Projects\\Diabetes_Prediction\\Diabetes_Classification\\Datasets\\diabetes_binary_5050split_health_indicators_BRFSS2015.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RandomizedSearchCV, cross_validate, GridSearchCV
from sklearn.metrics import classification_report

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score, recall_score, precision_score

In [5]:
# Separate the label column from the predictor columns.
y = df["Diabetes_binary"]
X = df.drop(columns=["Diabetes_binary"])

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# check shapes of split
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(56553, 21)
(14139, 21)
(56553,)
(14139,)


In [7]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Random-forest pipeline: min-max scaling followed by the forest.
# The step names are the lowercase class names that make_pipeline would generate.
pipe_rfc = Pipeline(
    steps=[
        ("minmaxscaler", MinMaxScaler()),
        (
            "randomforestclassifier",
            RandomForestClassifier(
                n_estimators=300,
                criterion='entropy',
                max_depth=15,
                min_samples_split=8,
                min_samples_leaf=3,
                max_features='log2',
                bootstrap=False,
                oob_score=False,
                ccp_alpha=0.005,
            ),
        ),
    ]
)

In [9]:
# Polynomial-kernel SVM pipeline: min-max scaling followed by the SVC.
# The step names are the lowercase class names that make_pipeline would generate.
pipe_svmp = Pipeline(
    steps=[
        ("minmaxscaler", MinMaxScaler()),
        ("svc", SVC(kernel='poly', degree=2, C=4, gamma='scale')),
    ]
)

In [10]:
# XGBoost pipeline: min-max scaling followed by the gradient-boosted trees.
# The step names are the lowercase class names that make_pipeline would generate.
pipe_xgb = Pipeline(
    steps=[
        ("minmaxscaler", MinMaxScaler()),
        (
            "xgbclassifier",
            XGBClassifier(
                booster='gbtree',
                learning_rate=0.1,
                n_estimators=200,
                max_depth=5,
                min_child_weight=2,
                gamma=0.1,
                subsample=0.9,
                colsample_bytree=0.8,
                reg_alpha=0.01,
                reg_lambda=1.5,
            ),
        ),
    ]
)

#### Implement Bagging with Scikit-learn

A single BaggingClassifier works with only one type of base estimator, so it cannot combine different model types by itself. Instead, we train a separate BaggingClassifier for each model (SVM, XGBoost and Random Forest) and then combine their predictions with a custom majority-voting mechanism.

In [11]:
from scipy.stats import mode

def majority_vote(predictions):
    """
    Reduce per-model predictions to a single label per sample by majority vote.

    :param predictions: 2D array-like with one row per sample and one column per
        model; the most frequent value is taken along axis 1 (across each row).
    :return: 1D array containing the most frequent label of every row.
    """
    row_modes = mode(predictions, axis=1).mode
    # Flatten so the result is 1D whether scipy returns shape (n,) or (n, 1).
    return row_modes.flatten()

In [12]:
# Accumulators filled by the training cells: the fitted bagging ensembles and
# their test-set predictions.
bagging_models, predictions = [], []

# Number of base estimators each BaggingClassifier fits
n_estimators = 5

In [13]:
from sklearn.ensemble import BaggingClassifier
# Train SVM with bagging.
# The pipeline is passed positionally: the keyword for the wrapped estimator was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases (the
# old keyword now raises TypeError), whereas the first positional argument is the
# wrapped estimator under either name.
svm_bagging = BaggingClassifier(pipe_svmp, n_estimators=n_estimators, random_state=42)
svm_bagging.fit(X_train, y_train)
svm_preds = svm_bagging.predict(X_test)
# NOTE: these appends accumulate; re-running this cell adds another copy to both lists.
bagging_models.append(svm_bagging)
predictions.append(svm_preds)

In [14]:
# Train XGBoost with bagging.
# The pipeline is passed positionally: the keyword for the wrapped estimator was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases (the
# old keyword now raises TypeError), whereas the first positional argument is the
# wrapped estimator under either name.
xgb_bagging = BaggingClassifier(pipe_xgb, n_estimators=n_estimators, random_state=42)
xgb_bagging.fit(X_train, y_train)
xgb_preds = xgb_bagging.predict(X_test)
# NOTE: these appends accumulate; re-running this cell adds another copy to both lists.
bagging_models.append(xgb_bagging)
predictions.append(xgb_preds)

In [15]:
# Train Random Forest with bagging.
# The pipeline is passed positionally: the keyword for the wrapped estimator was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases (the
# old keyword now raises TypeError), whereas the first positional argument is the
# wrapped estimator under either name.
rf_bagging = BaggingClassifier(pipe_rfc, n_estimators=n_estimators, random_state=42)
rf_bagging.fit(X_train, y_train)
rf_preds = rf_bagging.predict(X_test)
# NOTE: these appends accumulate; re-running this cell adds another copy to both lists.
bagging_models.append(rf_bagging)
predictions.append(rf_preds)

In [16]:
# Combine predictions from all models.
# The three named prediction arrays are stacked explicitly instead of reading the
# `predictions` list: that list gains an extra entry every time a training cell is
# re-run, which would silently give the re-run model additional votes.
# Each 1D prediction array becomes one column, so rows are samples and columns are models.
combined_predictions = np.column_stack([svm_preds, xgb_preds, rf_preds])
final_predictions = majority_vote(combined_predictions)

#### Evaluation

In [17]:
def evaluate_model(y_true, y_pred):
    """
    Score a set of predictions against the true labels.

    :param y_true: Ground-truth labels.
    :param y_pred: Predicted labels, aligned with ``y_true``.
    :return: Tuple ``(accuracy, recall, precision, f1)``.
    """
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [18]:
# Score the majority-vote ensemble on the held-out test labels.
(
    ensemble_accuracy,
    ensemble_recall,
    ensemble_precision,
    ensemble_f1,
) = evaluate_model(y_true=y_test, y_pred=final_predictions)

In [19]:
# Create a DataFrame to display the results.
# Every row is scored through evaluate_model rather than through hand-copied
# metric calls, so each model is guaranteed to be evaluated the same way and a
# column cannot be paired with the wrong predictions.
model_predictions = {
    'SVM Bagging': svm_preds,
    'XGB Bagging': xgb_preds,
    'RF Bagging': rf_preds,
    'Ensemble': final_predictions,
}
# Same order as the tuple returned by evaluate_model.
metric_names = ['Accuracy', 'Recall', 'Precision', 'F1 Score']
model_scores = [evaluate_model(y_test, preds) for preds in model_predictions.values()]

# Transpose the per-model score tuples into one list per metric column.
results = {'Model': list(model_predictions)}
for metric_name, metric_column in zip(metric_names, zip(*model_scores)):
    results[metric_name] = list(metric_column)

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score
0,SVM Bagging,0.749912,0.82204,0.718383,0.766724
1,XGB Bagging,0.753236,0.800396,0.731386,0.764336
2,RF Bagging,0.73612,0.782006,0.716248,0.747684
3,Ensemble,0.751892,0.810157,0.72558,0.765539
