# XGBOOST Version by Precious Kings

In [None]:
# xgboost_diabetes.py
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from xgboost import XGBClassifier

In [None]:
# xgboost_diabetes.py
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from xgboost import XGBClassifier

# ---------------------------
# CONFIG
# ---------------------------
CSV_PATH = "../data/new_diabetic_data.csv"   # <-- change to your path
RANDOM_STATE = 42  # seed reused by the train/test split and both XGBoost models
TEST_SIZE = 0.2    # fraction of rows held out for evaluation

# ---------------------------
# 1) Load & basic cleaning
# ---------------------------
# Read the CSV and turn the "?" placeholder into a real missing value in one go.
df = pd.read_csv(CSV_PATH).replace("?", np.nan)

# The two identifier columns are used later to order each patient's visits, so
# force them to numbers; anything unparseable becomes NaN instead of raising.
for id_col in ('encounter_id', 'patient_nbr'):
    df[id_col] = pd.to_numeric(df[id_col], errors='coerce')

# ---------------------------
# 2) Map A1Cresult & max_glu_serum to numeric proxies
# (A1C_numeric feeds the risk_score computed below; max_glu_numeric is not part
#  of that formula and is only carried along as a model feature)
# ---------------------------
# Ordinal encodings: 'None' -> 0, 'Norm' -> 1, then the two elevated bands.
a1c_map = {
    'None': 0.0, 'Norm': 1.0, '>7': 2.0, '>8': 3.0
}
glu_map = {
    'None': 0.0, 'Norm': 1.0, '>200': 2.0, '>300': 3.0
}

# A value that does not hit the mapping comes back as NaN. That includes real
# missing values: the "?" cleanup above produces NaN, and depending on the
# pandas version the CSV reader may already have parsed the literal text 'None'
# as NaN. Fill both proxies with 0.0 (the same value 'None' maps to) so that
# "no result" is encoded consistently, instead of leaving max_glu_numeric to be
# median-imputed from the measured rows further down the pipeline.
for source_col, proxy_col, mapping in (
    ('A1Cresult', 'A1C_numeric', a1c_map),
    ('max_glu_serum', 'max_glu_numeric', glu_map),
):
    df[proxy_col] = df[source_col].map(mapping).fillna(0.0)

# ---------------------------
# 3) Ensure numeric columns are numeric
# ---------------------------
# Count / duration columns that should hold numbers.
num_cols = [
    'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient',
    'number_diagnoses', 'time_in_hospital'
]
# Only touch the columns that are actually present; values that cannot be
# parsed are turned into NaN rather than raising.
present_num_cols = [col for col in num_cols if col in df.columns]
for col in present_num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# ---------------------------
# 4) Compute risk_score exactly as before and risk_class
#    risk_score = 0.4*A1C + 0.3*number_inpatient + 0.2*number_diagnoses + 0.1*num_medications
# ---------------------------
# Every input of the formula must be NaN-free, otherwise the score is NaN too.
df['A1C_numeric'] = df['A1C_numeric'].fillna(0.0)
for count_col in ('number_inpatient', 'number_diagnoses', 'num_medications'):
    # A column that is absent altogether is created as all zeros.
    df[count_col] = df[count_col].fillna(0) if count_col in df.columns else 0

# Weights of the linear score; insertion order is the order the terms are added.
risk_weights = {
    'A1C_numeric': 0.4,
    'number_inpatient': 0.3,
    'number_diagnoses': 0.2,
    'num_medications': 0.1,
}
weighted_terms = [weight * df[col] for col, weight in risk_weights.items()]
raw_score = weighted_terms[0]
for term in weighted_terms[1:]:
    raw_score = raw_score + term
df['risk_score_raw'] = raw_score

# normalize 0-100 (safe: if constant, set to 0)
minv = df['risk_score_raw'].min()
maxv = df['risk_score_raw'].max()
can_rescale = pd.notna(minv) and pd.notna(maxv) and maxv != minv
if can_rescale:
    df['risk_score'] = 100 * (df['risk_score_raw'] - minv) / (maxv - minv)
else:
    # Empty or constant scores: there is no range to stretch over.
    df['risk_score'] = 0.0

def risk_class(score):
    """Bucket a risk score into an ordinal class.

    Returns 0 (Fair) for scores up to and including 40, 1 (Moderate) for
    scores up to and including 70, and 2 (High) for everything else. Because
    only ``<=`` comparisons are used, a value that fails both of them
    (NaN included) lands in class 2.
    """
    # (inclusive upper bound, class label), checked in ascending order
    for upper_bound, label in ((40, 0), (70, 1)):
        if score <= upper_bound:
            return label
    return 2

df['risk_class'] = df['risk_score'].apply(risk_class)

# ---------------------------
# 5) Build progression label using patient_nbr (next visit worse than current)
# ---------------------------
# Order each patient's encounters by encounter_id, then look one row ahead
# within the same patient to get the class of the following visit.
df = df.sort_values(by=['patient_nbr', 'encounter_id'])
upcoming_class = df.groupby('patient_nbr')['risk_class'].shift(-1)
df['next_risk_class'] = upcoming_class
# 1 = worsened, 0 = same/improved. Rows without a following visit compare as
# False here and are removed right below.
df['progression'] = upcoming_class.gt(df['risk_class']).astype('Int64')

# drop last encounters (no next visit)
has_next_visit = df['next_risk_class'].notna()
df = df[has_next_visit].copy()
df['progression'] = df['progression'].astype(int)

# ---------------------------
# 6) Feature list: remove identifiers and targets
# ---------------------------
# NOTE(review): A1C_numeric, number_inpatient, number_diagnoses and
# num_medications stay in the feature set even though risk_class is computed
# directly from them - confirm this is intended for the risk model.
exclude = {
    'encounter_id', 'patient_nbr',                   # identifiers
    'risk_score_raw', 'risk_score',                  # intermediate scores
    'risk_class', 'next_risk_class', 'progression',  # targets / look-ahead
}

# Keep every remaining column that has at least one non-missing value
# (small safety: entirely-NA columns carry no information).
features = [
    col for col in df.columns
    if col not in exclude and df[col].notna().any()
]

X = df[features]
y_risk, y_prog = df['risk_class'], df['progression']

# ---------------------------
# 7) Preprocessing pipeline (numeric impute+scale, categorical impute+OHE)
# ---------------------------
# Columns with a numeric dtype are treated as numeric; everything else is
# treated as categorical.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
_numeric_lookup = set(numeric_features)
categorical_features = [col for col in X.columns if col not in _numeric_lookup]

# These names are already imported at the top of the file; repeating the
# imports here is harmless.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own 'Missing' level before
# one-hot encoding; levels unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# ---------------------------
# 8) Train-test split (stratify by risk_class so class balance preserved)
# ---------------------------
# One call splits the features and both targets, so the three stay row-aligned.
# NOTE(review): the split is row-wise, so encounters of the same patient_nbr
# can land on both sides - confirm that is acceptable for the evaluation.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_risk)
(
    X_train, X_test,
    y_risk_train, y_risk_test,
    y_prog_train, y_prog_test,
) = train_test_split(X, y_risk, y_prog, **split_options)

# ---------------------------
# 9) XGBoost pipelines for both tasks
# ---------------------------
from sklearn.base import clone


def _make_xgb_pipeline(objective, eval_metric):
    """Build an unfitted preprocessing + XGBoost pipeline for one task.

    Each call wraps its own clone of ``preprocessor``. A Pipeline fits the
    step objects it is given in place, so handing the same transformer
    instance to two pipelines would make the second ``fit`` overwrite the
    fitted state the first pipeline relies on.

    Parameters:
        objective: XGBoost learning objective, e.g. 'multi:softprob'.
        eval_metric: XGBoost evaluation metric, e.g. 'mlogloss'.

    Returns:
        An unfitted ``Pipeline`` with steps 'preproc' and 'model'.
    """
    return Pipeline(steps=[
        ('preproc', clone(preprocessor)),
        ('model', XGBClassifier(
            objective=objective,
            eval_metric=eval_metric,
            # NOTE(review): use_label_encoder is deprecated in newer XGBoost
            # releases - confirm against the installed version.
            use_label_encoder=False,
            random_state=RANDOM_STATE,
            tree_method='hist',
            verbosity=0
        ))
    ])


# Risk classification (multiclass)
xgb_risk = _make_xgb_pipeline('multi:softprob', 'mlogloss')
xgb_risk.fit(X_train, y_risk_train)
y_risk_pred = xgb_risk.predict(X_test)

# Progression (binary)
xgb_prog = _make_xgb_pipeline('binary:logistic', 'logloss')
xgb_prog.fit(X_train, y_prog_train)
y_prog_pred = xgb_prog.predict(X_test)

# ---------------------------
# 10) Evaluation & save metrics
# ---------------------------
# Each score is computed once and reused for both the printout and the CSV.
print("==== XGBoost - Risk Stratification (multiclass) ====")
print(classification_report(y_risk_test, y_risk_pred))
risk_accuracy = accuracy_score(y_risk_test, y_risk_pred)
risk_macro_f1 = f1_score(y_risk_test, y_risk_pred, average='macro')
print("Accuracy:", risk_accuracy)
print("Macro F1:", risk_macro_f1)

print("\n==== XGBoost - Progression (binary) ====")
print(classification_report(y_prog_test, y_prog_pred))
prog_accuracy = accuracy_score(y_prog_test, y_prog_pred)
prog_f1 = f1_score(y_prog_test, y_prog_pred, average='binary')
print("Accuracy:", prog_accuracy)
print("F1 (binary):", prog_f1)

# Save basic metrics for comparison (one row, one column per metric)
metrics = {
    'model': 'xgboost',
    'risk_accuracy': risk_accuracy,
    'risk_macro_f1': risk_macro_f1,
    'prog_accuracy': prog_accuracy,
    'prog_f1': prog_f1,
}
pd.DataFrame([metrics]).to_csv("metrics_xgboost.csv", index=False)
print("\nSaved metrics_xgboost.csv")

==== XGBoost - Risk Stratification (multiclass) ====
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4640
           1       0.99      0.99      0.99      1381
           2       0.96      0.90      0.93        29

    accuracy                           1.00      6050
   macro avg       0.98      0.96      0.97      6050
weighted avg       1.00      1.00      1.00      6050

Accuracy: 0.9958677685950413
Macro F1: 0.9724161540181487

==== XGBoost - Progression (binary) ====
              precision    recall  f1-score   support

           0       0.83      0.99      0.90      5016
           1       0.42      0.05      0.09      1034

    accuracy                           0.83      6050
   macro avg       0.63      0.52      0.49      6050
weighted avg       0.76      0.83      0.76      6050

Accuracy: 0.8261157024793389
F1 (binary): 0.08521739130434783

Saved metrics_xgboost.csv
