In [117]:
import pandas as pd

In [118]:
df = pd.read_csv(r"C:\Users\HP\Documents\GitHub\health-risk-prediction\data\Diabetes_Classification.csv")

In [119]:
df.head(10)

Unnamed: 0,Age,Gender,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise,Diagnosis
0,45,Male,25,Normal,100,5.7,No,No,Healthy,Regular,No
1,55,Female,30,High,120,6.4,Yes,Yes,Poor,No,Yes
2,65,Male,35,High,140,7.1,Yes,Yes,Poor,No,Yes
3,75,Female,40,High,160,7.8,Yes,Yes,Poor,No,Yes
4,40,Male,20,Normal,80,5.0,No,No,Healthy,Regular,No
5,50,Female,25,Normal,100,5.7,No,No,Healthy,Regular,No
6,60,Male,30,Normal,120,6.4,No,No,Healthy,Regular,No
7,70,Female,35,Normal,140,7.1,No,No,Healthy,Regular,No
8,45,Male,25,Low,80,5.0,Yes,Yes,Poor,No,No
9,55,Female,30,Normal,100,5.7,Yes,Yes,Poor,No,No


In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         128 non-null    int64  
 1   Gender                      128 non-null    object 
 2   BMI                         128 non-null    int64  
 3   Blood Pressure              128 non-null    object 
 4   FBS                         128 non-null    int64  
 5   HbA1c                       128 non-null    float64
 6   Family History of Diabetes  128 non-null    object 
 7   Smoking                     128 non-null    object 
 8   Diet                        128 non-null    object 
 9   Exercise                    128 non-null    object 
 10  Diagnosis                   128 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 11.1+ KB


In [121]:
df.describe()

Unnamed: 0,Age,BMI,FBS,HbA1c
count,128.0,128.0,128.0,128.0
mean,42.03125,35.359375,162.5,7.8875
std,16.783915,14.981739,61.323975,2.146339
min,12.0,10.0,80.0,5.0
25%,28.0,24.0,120.0,6.4
50%,40.0,34.0,160.0,7.8
75%,55.0,45.5,205.0,9.375
max,75.0,67.0,280.0,12.0


In [122]:
df.isnull().sum()

Age                           0
Gender                        0
BMI                           0
Blood Pressure                0
FBS                           0
HbA1c                         0
Family History of Diabetes    0
Smoking                       0
Diet                          0
Exercise                      0
Diagnosis                     0
dtype: int64

In [123]:
df.duplicated().sum()

0

The dataset is a fully clean set, not needing data cleaning again

## Decision Tree for the Dataset

1st, Define features and targets

In [124]:
X = df.drop('Diagnosis', axis=1)
y =df['Diagnosis']

In [125]:
X

Unnamed: 0,Age,Gender,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise
0,45,Male,25,Normal,100,5.7,No,No,Healthy,Regular
1,55,Female,30,High,120,6.4,Yes,Yes,Poor,No
2,65,Male,35,High,140,7.1,Yes,Yes,Poor,No
3,75,Female,40,High,160,7.8,Yes,Yes,Poor,No
4,40,Male,20,Normal,80,5.0,No,No,Healthy,Regular
...,...,...,...,...,...,...,...,...,...,...
123,17,Female,15,Normal,100,5.7,No,Yes,Poor,No
124,22,Male,19,Normal,120,6.4,No,Yes,Poor,No
125,27,Female,24,High,140,7.1,No,Yes,Poor,No
126,32,Male,29,High,160,7.8,No,Yes,Poor,No


There are numerical and categorical(non numerical) features, lets preprocess by encoding the categorical variables 

In [126]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

numerical_cols = ['Age','BMI','FBS','HbA1c']
categorical_cols = ['Gender', 'Family History of Diabetes', 'Smoking', 'Diet', 'Exercise', 'Blood Pressure']

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

### Train-Test

In [127]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [128]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

dt_pipeline = Pipeline(steps=[
    ('preprocessor' , preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

dt_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [129]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set
y_test_pred = dt_pipeline.predict(X_test)

In [131]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9230769230769231


In [132]:
# Detailed performance
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Classification Report:
               precision    recall  f1-score   support

          No       1.00      0.90      0.95        20
         Yes       0.75      1.00      0.86         6

    accuracy                           0.92        26
   macro avg       0.88      0.95      0.90        26
weighted avg       0.94      0.92      0.93        26



Lets compare the train and test metrics to detect over or underfitting

In [133]:
# Make predictions on the train set
y_train_pred = dt_pipeline.predict(X_train)

In [135]:
# Evaluate accuracy
accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [136]:
# Detailed performance
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))


Classification Report:
               precision    recall  f1-score   support

          No       1.00      1.00      1.00        77
         Yes       1.00      1.00      1.00        25

    accuracy                           1.00       102
   macro avg       1.00      1.00      1.00       102
weighted avg       1.00      1.00      1.00       102

