In [78]:
!pip install ucimlrepo --quiet

In [79]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data (as pandas dataframes)
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# metadata
print(cdc_diabetes_health_indicators.metadata)

# variable information
print(cdc_diabetes_health_indicators.variables)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [80]:
# Read csv file from UCI ML repo
import pandas as pd
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00529/diabetes_data_upload.csv')

In [81]:
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


In [82]:
# Import all necessary libraries for training a RandomForestClassifier, evaluating accuracy, and xgboost
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from xgboost import XGBClassifier, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz

In [83]:
X = df.drop('class', axis=1)
y = df['class']

OneHotEncode categorical columns & MinMaxScale

In [84]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [85]:
encoder = OneHotEncoder(sparse_output=False)
scaler = MinMaxScaler()

In [86]:
X = encoder.fit_transform(X.select_dtypes(include=['object']))
X = scaler.fit_transform(X)

In [87]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

In [88]:
model = RandomForestClassifier(n_estimators=50, max_depth=15, max_features=15)

Prepare the dataset for the model by including only numeric cols for X_train, X_val,

In [89]:
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_val, y_val))

0.9939759036144579
0.9642857142857143


In [90]:
print('Accuracy: ', model.score(X_test, y_test))

Accuracy:  0.9711538461538461


# Using a XG Booster

We'll need to OneHotEncode the y_train, y_val, and y_test columns to allow for the XGB booster to predict a positive or negative case

In [91]:
y_train

Unnamed: 0,class
288,Negative
343,Positive
512,Negative
383,Positive
315,Negative
...,...
110,Positive
390,Negative
98,Positive
495,Negative


In [94]:
from sklearn.preprocessing import LabelEncoder

# Intialize the encoder
label_encoder = LabelEncoder()

# Convert 'negative' to 0 and 'positive' to 1
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.fit_transform(y_val)
y_test_encoded = label_encoder.fit_transform(y_test)

# Get the number of unique classes
num_classes = len(np.unique(y_train_encoded))

modelXGB = XGBClassifier(objective='multi:softmax', learning_rate=0.05,
                      max_depth=1, n_estimators=250, num_class=num_classes)
modelXGB.fit(X_train, y_train_encoded)

In [96]:
preds = modelXGB.predict(X_val)
print(sum(preds == y_val_encoded), len(y_val))
print('Accuracy: ', accuracy_score(y_val_encoded, preds))

75 84
Accuracy:  0.8928571428571429


In [97]:
print('Accuracy of test set: ', accuracy_score(y_test_encoded, modelXGB.predict(X_test)))

Accuracy of test set:  0.8942307692307693


# Findings: Random Forest Classifier yields an impressive 8% higher accuracy than an XGBoost model.



1.   Random Forest Classifiers grow **multiple decision trees**, with all their decisions merged together for a more accurate prediction, whereas **XGBoost** builds trees **sequentially** - each tree is trained to correct the errors of the previous one. While **XGBoost** can **reduce bias** through this method and **create highly accurate models**, it **limits diversity** across trees - each new tree focuses heavily on correcting the same residuals. For datasets where **capturing varied perspectives is essential**, such as **medical predictions** involving **complex, interdependent features**, ***Random Forest's independent, ensemble-driven approach*** more often leads to a better performance.
2.  **Random Forest model** has the advantage of multiple uncorrelated models - they perform much better as a group. **Each tree** gives a classification or a **"vote"**
3.  **An Analogy for Predicting Heart Disease with Random Forest Classifiers:** Imagine consulting a panel of 200 medical professionals who analyze a patient's data (without knowing the actual diagnosis). Each medical expert assesses the likelihood of heart disease based on different criteria and training backgrounds, analogous to individual decision trees in the Random Forest. While any single expert might make a misjudgment, the collective opinion of all 200 experts is more likely to be accurate, as it averages out individual biases or errors. This collaborative approach is akin to how Random Forests aggregate predictions, making them highly effective for complex, feature-rich data like heart disease diagnosis.




