In [7]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Read the CSV file
data = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')
# Separate features (X) and label (y)
x = data.drop('Diabetes_binary', axis=1)
y = data['Diabetes_binary']

In [None]:
x.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [9]:
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [10]:
# Split Data for Train, Validation, Testing as 70%, 20%, 10% respectively.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.10, random_state = 42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=42)

In [11]:
x_train.shape

(182649, 21)

In [12]:
y_train.shape

(182649,)

In [13]:
x_test.shape

(25368, 21)

In [14]:
y_test.shape

(25368,)

In [15]:
x_val.shape

(45663, 21)

In [16]:
y_val.shape

(45663,)

In [17]:
# Build Decision tree model and explain its hyper parameters.
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=2)

In [25]:
missing_values = x_train.isnull().sum()
print(missing_values)

HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [26]:
# Replace missing values with the mean of each column
x_train = x_train.fillna(x_train.mean())

In [27]:
# Train the model on the training data
model.fit(x_train, y_train)

In [28]:
# Predict labels for the validation set
y_pred = model.predict(x_val)

In [29]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [39]:
# State the Evaluation score (Accuracy, Precision, Recall, F1)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred,average='weighted',zero_division=0)
recall = recall_score(y_val, y_pred,average='weighted',zero_division=0)
f1 = f1_score(y_val, y_pred,average='weighted',zero_division=0)

## *Results*

In [40]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8612005343494733
Precision: 0.7416663603638184
Recall: 0.8612005343494733
F1 Score: 0.7969763028496503


In [41]:
# 5-Fold Cross-Validation
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
scores = cross_val_score(model, x, y, cv=cv, scoring='accuracy')
print("5-Fold Cross-Validation Accuracy:", scores.mean())

5-Fold Cross-Validation Accuracy: 0.8606709239987385
