In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('C:\\Users\\HP\\Documents\\KESHTECH\\DSML\\Projects\\Diabetes_Prediction\\Diabetes_Classification\\Datasets\\diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RandomizedSearchCV, cross_validate, GridSearchCV
from sklearn.metrics import classification_report

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score, recall_score, precision_score

In [5]:
X = df.drop(columns="Diabetes_binary")
y = df["Diabetes_binary"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify = y, random_state=42)

# check shapes of split
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(56553, 21)
(14139, 21)
(56553,)
(14139,)


In [7]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [8]:
pipe_rfc = make_pipeline(
    MinMaxScaler(),
    RandomForestClassifier(
        n_estimators=300,
        criterion='entropy',
        max_depth=15,
        min_samples_split=8,
        min_samples_leaf=3,
        max_features='log2',
        bootstrap=False,
        oob_score=False,
        ccp_alpha=0.005
    )
)

In [9]:
pipe_svmp = make_pipeline(MinMaxScaler(), SVC(C = 4, degree = 2, gamma = 'scale', kernel = 'poly'))

In [10]:
pipe_xgb = make_pipeline(
    MinMaxScaler(),
    XGBClassifier(
        booster='gbtree',
        learning_rate=0.1,
        n_estimators=200,
        max_depth=5,
        min_child_weight=2,
        gamma=0.1,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_alpha=0.01,
        reg_lambda=1.5
    )
)

In [11]:
# Train SVM model
pipe_svmp.fit(X_train, y_train)
svm_preds = pipe_svmp.predict(X_test)

# Evaluate SVM model
svm_accuracy = accuracy_score(y_test, svm_preds)
svm_recall = recall_score(y_test, svm_preds)
svm_precision = precision_score(y_test, svm_preds)
svm_f1 = f1_score(y_test, svm_preds)

print(f"SVM Model - Accuracy: {svm_accuracy}, Recall: {svm_recall}, Precision: {svm_precision}, F1 Score: {svm_f1}")

SVM Model - Accuracy: 0.7492043284532145, Recall: 0.8199179516197482, Precision: 0.7183046226298179, F1 Score: 0.7657550535077289


In [12]:
# Train XGBoost model
pipe_xgb.fit(X_train, y_train)
xgb_preds = pipe_xgb.predict(X_test)

# Evaluate XGBoost model
xgb_accuracy = accuracy_score(y_test, xgb_preds)
xgb_recall = recall_score(y_test, xgb_preds)
xgb_precision = precision_score(y_test, xgb_preds)
xgb_f1 = f1_score(y_test, xgb_preds)

print(f"XGBoost Model - Accuracy: {xgb_accuracy}, Recall: {xgb_recall}, Precision: {xgb_precision}, F1 Score: {xgb_f1}")

XGBoost Model - Accuracy: 0.7540844472734989, Recall: 0.8019521855990946, Precision: 0.7318616059901885, F1 Score: 0.765305433682079


In [13]:
# Train RandomForest model
pipe_rfc.fit(X_train, y_train)
rf_preds = pipe_rfc.predict(X_test)

# Evaluate RandomForest model
rf_accuracy = accuracy_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)

print(f"RandomForest Model - Accuracy: {rf_accuracy}, Recall: {rf_recall}, Precision: {rf_precision}, F1 Score: {rf_f1}")

RandomForest Model - Accuracy: 0.73520050922979, Recall: 0.7803083887395671, Precision: 0.7157129881925522, F1 Score: 0.7466161342717921


In [14]:
# Create a list of base estimators
base_estimators = [
    ('rfc', pipe_rfc.named_steps['randomforestclassifier']),
    ('svmp', pipe_svmp.named_steps['svc']),
    ('xgb', pipe_xgb.named_steps['xgbclassifier'])
]

#### Implement Boosting using ADABoost

In [15]:
# Create a custom AdaBoost classifier
class CustomAdaBoostClassifier(AdaBoostClassifier):
    def __init__(self, base_estimators, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        self.base_estimators = base_estimators
        super().__init__(base_estimator=None, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm, random_state=random_state)
    
    def fit(self, X, y):
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)
        
        for iboost in range(self.n_estimators):
            sample_weight = np.ones(X.shape[0]) / X.shape[0]
            estimator = self.base_estimators[iboost % len(self.base_estimators)][1]
            estimator.fit(X, y, sample_weight=sample_weight)
            y_predict = estimator.predict(X)
            incorrect = (y_predict != y)
            estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))
            
            if estimator_error <= 0:
                break
                
            self.estimator_errors_[iboost] = estimator_error
            self.estimator_weights_[iboost] = self.learning_rate * (np.log((1. - estimator_error) / estimator_error))
            
            if iboost + 1 < self.n_estimators:
                sample_weight *= np.exp(self.estimator_weights_[iboost] * incorrect * (sample_weight > 0))
                sample_weight /= np.sum(sample_weight)
            
            self.estimators_.append(estimator)
        
        return self

    def predict(self, X):
        pred = sum(weight * estimator.predict(X) for weight, estimator in zip(self.estimator_weights_, self.estimators_))
        return np.sign(pred)

In [16]:
# Initialize the custom AdaBoost classifier
ada_boost = CustomAdaBoostClassifier(base_estimators=base_estimators, n_estimators=5, learning_rate=1.0, algorithm='SAMME.R', random_state=42)

# Train the AdaBoost ensemble
ada_boost.fit(X_train, y_train)

In [17]:
# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, recall, precision, f1

In [19]:
# Calculate metrics for Random Forest
rfc_metrics = calculate_metrics(y_test, rf_preds)

In [21]:
# Calculate metrics for SVM
svmp_metrics = calculate_metrics(y_test, svm_preds)

In [22]:
# Calculate metrics for XGBoost
xgb_metrics = calculate_metrics(y_test, xgb_preds)

In [23]:
# Calculate metrics for AdaBoost ensemble
ada_preds = ada_boost.predict(X_test)
ada_metrics = calculate_metrics(y_test, ada_preds)

In [24]:
# Create a table of results
results = pd.DataFrame({
    'Model': ['Random Forest', 'SVM', 'XGBoost', 'AdaBoost Ensemble'],
    'Accuracy': [rfc_metrics[0], svmp_metrics[0], xgb_metrics[0], ada_metrics[0]],
    'Recall': [rfc_metrics[1], svmp_metrics[1], xgb_metrics[1], ada_metrics[1]],
    'Precision': [rfc_metrics[2], svmp_metrics[2], xgb_metrics[2], ada_metrics[2]],
    'F1 Score': [rfc_metrics[3], svmp_metrics[3], xgb_metrics[3], ada_metrics[3]]
})

print(results)

               Model  Accuracy    Recall  Precision  F1 Score
0      Random Forest  0.735201  0.780308   0.715713  0.746616
1                SVM  0.749204  0.819918   0.718305  0.765755
2            XGBoost  0.754084  0.801952   0.731862  0.765305
3  AdaBoost Ensemble  0.736049  0.788513   0.713609  0.749194
