In [1]:
import pandas as pd  
import numpy as np  

from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Loading the data
df_dt = pd.read_csv('diabetic_data_clean.csv')
df_dt.head()

  df_dt = pd.read_csv('diabetic_data_clean.csv')


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,readmit_30d
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,NO,0
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,0
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,Yes,NO,0
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,Up,No,No,No,No,No,Ch,Yes,NO,0
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,Steady,No,No,No,No,No,Ch,Yes,NO,0


In [3]:
# Set the "readmit_30d" as target/model output y and the rest features as model inputs X
y_data = df_dt['readmit_30d']
X_data = df_dt.drop(['readmit_30d', 'readmitted', 'encounter_id', 'patient_nbr'],axis=1)


In [4]:
# Split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data
)

In [5]:
# Identify numeric vs categorical columns
numeric_cols = X_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X_data.columns if c not in numeric_cols]

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)


In [6]:
# =========================
# 5) Decision Tree model
# =========================

# Create Decision Tree Model
# Intentional constrained depth tp preserve interpretability and avoid overfitting
dt_pipeline = Pipeline([
    ("preprocess", preprocess),   
    ("model", DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=42)) #random state defined for reproducibility
])

# Fit on training data
dt_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [7]:
# =========================
# 6) Evaluation
# =========================
def evaluate(pipe, name):
    # For decision trees, just use predict
    pred = pipe.predict(X_test)
    auc = np.nan  # optional, trees might not have predict_proba
    proba = dt_pipeline.predict_proba(X_test)[:, 1]
    
    return {
        "model": name,
        "accuracy": accuracy_score(y_test, pred),
        "precision": precision_score(y_test, pred, zero_division=0),
        "recall": recall_score(y_test, pred, zero_division=0),
        "f1": f1_score(y_test, pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, proba),
        "confusion_matrix": confusion_matrix(y_test, pred)
    }

# Evaluate your Decision Tree pipeline
results = [evaluate(dt_pipeline, "Decision Tree")]

In [8]:
# Print summary metrics
print(pd.DataFrame([{k:v for k,v in r.items() if k!="confusion_matrix"} for r in results]))

# Print confusion matrix
print("\nConfusion Matrix:")
for r in results:
    print(f"\n{r['model']}")
    print(r["confusion_matrix"])


           model  accuracy  precision    recall        f1   roc_auc
0  Decision Tree  0.629753   0.171963  0.607662  0.268065  0.656326

Confusion Matrix:

Decision Tree
[[11438  6645]
 [  891  1380]]


Tree depth was varied to study the bias–variance tradeoff. Performance improved up to a depth of 5, after which gains were marginal. A maximum depth of 5 was therefore selected as it provided the best balance between minority-class F1-score and model complexity.

To summarize, the decision tree identified intuitive non-linear relationships, particularly the number of prior inpatient visits and discharge disposition. However, when constrained to a shallow depth to reduce overfitting, the model tended to favor the majority class and underperformed the logistic regression baseline in terms of F1-score. This suggests that while decision trees offer interpretability, they require careful tuning to effectively handle class imbalance in this dataset.