# Model training Diabetes Prediction Challenge

## 1. Imports 

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier,BaggingClassifier
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import xgboost as xgb
import optuna

## 2. Helper functions

In [17]:
def create_my_balanced_accuracy(weights):
    """
    Creates scoring function for cross_val_score with weights 
    """
    def my_scoring(est,X,y):
        preds = est.predict(X)
        wagi = np.asarray([weights[i] for i in y])

        score = balanced_accuracy_score(y,preds,sample_weight=wagi)

        return score
    
    return my_scoring


In [18]:
def calculate_weights(y: pd.Series):
    """
    Calculates weights for inbalanced class datasets.
    Returns the dictionary with the weights for each class.

    Args:
        y: The target variable
    """
    values = y.value_counts()
    weights = {}
    total = len(y)

    for class_label, count in values.items():
        other_counts = total - count
        weights[class_label] = 1.0 / other_counts if other_counts > 0 else 0

    return weights
        

In [19]:
def compute_scale_pos_weight(y):
    neg = np.sum(y==0)
    pos = np.sum(y==1)
    return neg / pos

In [20]:
def create_transformer(scaler_name,quantitative_columns,categorical_columns):
    if scaler_name == "StandardScaler":
        scaler = StandardScaler()
    elif scaler_name == "RobustScaler":
        scaler = RobustScaler()
    elif scaler_name == "MinMaxScaler":
        scaler = MinMaxScaler()

    numeric_transformer = Pipeline([
        ('inputer', SimpleImputer(strategy='median')),
        ('scaler', scaler)
    ])

    categorical_transformer = Pipeline([
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore',drop='first'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, quantitative_columns),
            ('cat', categorical_transformer, categorical_columns)
        ],
        remainder='drop',
        n_jobs=-1
    )

    return preprocessor

In [21]:
def create_best_model(X_train,y_train,weights,quantitative_columns,categorical_columns):
    skf = StratifiedKFold(n_splits=5,shuffle=True)

    def objective(trial:optuna.trial.Trial):
        n_features = trial.suggest_int('n_features',3,X_train.shape[1])
        model_name = trial.suggest_categorical('model_name',["SVC","KNN"])

        if model_name == "SVC":
            degree = 3
            C = trial.suggest_float("C", 0.001, 100, log=True)
            kernel = trial.suggest_categorical("kernel", ["rbf","poly","linear"])

            if kernel == "poly":
                degree = trial.suggest_int("degree",2,5)

            model = SVC(C=C, kernel=kernel, degree=degree)
        elif model_name == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 3, 16)
            weights_knn = trial.suggest_categorical("weights_knn",["uniform","distance"])

            model = KNeighborsClassifier(n_neighbors=n_neighbors,weights=weights_knn)


        bagging = BaggingClassifier(
            estimator=model,
            n_estimators=3,
            max_samples=50000,
            bootstrap=False,
            n_jobs=-1
        )

        preprocessor_name = trial.suggest_categorical("preprocessor",["StandardScaler","RobustScaler","MinMaxScaler"])
        #imputer_strategy = trial.suggest_categorical("imputer_strategy",['median',"mean"])

        #preprocessor = create_transformer(preprocessor_name,imputer_strategy,quantitative_columns,categorical_columns)
        preprocessor = create_transformer(
            preprocessor_name,
            quantitative_columns,
            categorical_columns
        )


        # if estimator_name == "SVC":
        #     estimator = SVC(kernel="linear")
        # elif estimator_name == "GBC":
        estimator = GradientBoostingClassifier(n_estimators=20)

        pipe = Pipeline([
            ("Preprocessor", preprocessor),
            ("RFE",SelectFromModel(estimator=estimator,max_features=n_features)),
            ("Classifier",bagging)
        ])

        scorer = create_my_balanced_accuracy(weights)
        print("Cross Val score")
        scores = cross_val_score(
            pipe,
            X_train,
            y_train,
            cv=skf,
            scoring=scorer,
            n_jobs=-1
        )
        print("END Cross Val Score")
        return scores.mean()

    study = optuna.create_study(direction="maximize")
    study.optimize(objective,n_trials=30)

    return study.best_params, study.best_value


In [22]:
def create_best_model2(X, y, numeric_cols, categorical_cols, use_gpu=False):

    scale_pos_weight = compute_scale_pos_weight(y)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def objective(trial:optuna.trial.Trial):

        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 800),
            "max_depth": trial.suggest_int("max_depth", 3, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
            "scale_pos_weight": scale_pos_weight,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "tree_method": "gpu_hist" if use_gpu else "hist",
            "n_jobs": -1,
            "random_state": 42
        }

        preprocessor_name = trial.suggest_categorical("preprocessor",["StandardScaler","RobustScaler","MinMaxScaler"])

        model = xgb.XGBClassifier(**params)

        pipe = Pipeline([
            ("preprocessor", create_transformer(preprocessor_name,numeric_cols, categorical_cols)),
            ("model", model)
        ])

        scores = cross_val_score(
            pipe,
            X,
            y,
            cv=skf,
            scoring="balanced_accuracy",
            n_jobs=-1
        )

        return scores.mean()

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    return study.best_params, study.best_value


## 3. Training

In [23]:
train_data = pd.read_csv("../Data/DiabetesPredictionChallenge/train.csv")
test_data = pd.read_csv("../Data/DiabetesPredictionChallenge/test.csv")

In [24]:
X_train = train_data.drop(columns=["id","diagnosed_diabetes"],axis=1)
y_train = train_data['diagnosed_diabetes']

X_test = test_data.drop(columns=["id"],axis=1)

In [25]:
X_train.isna().sum()

age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0
dtype: int64

In [26]:
categorical_columns = ["gender","education_level","income_level","employment_status","ethnicity","smoking_status","family_history_diabetes","hypertension_history","cardiovascular_history"]
quantitative_columns = [name for name in train_data.columns if name not in categorical_columns and name not in ["id","diagnosed_diabetes"]]

In [27]:
# preporcessor = create_transformer("StandardScaler","constant",quantitative_columns,categorical_columns)

# preporcessor.fit(X_train,y_train)
# dir(preporcessor)
# data = preporcessor.transform(X_train)

In [28]:
#data[:10]

In [29]:
weights = calculate_weights(y_train)

In [30]:
best_params,best_score = create_best_model2(X_train,y_train,quantitative_columns,categorical_columns)

[I 2025-12-27 12:01:48,383] A new study created in memory with name: no-name-3481e25f-39c3-4725-bc2e-1a2c9774ef5d
[I 2025-12-27 12:02:49,396] Trial 0 finished with value: 0.6449267029678425 and parameters: {'n_estimators': 413, 'max_depth': 4, 'learning_rate': 0.011282467650258034, 'subsample': 0.7553712656802619, 'colsample_bytree': 0.988549154204735, 'min_child_weight': 5, 'gamma': 0.358178851315642, 'reg_alpha': 3.8463017959303167, 'reg_lambda': 0.6243435854073187, 'preprocessor': 'StandardScaler'}. Best is trial 0 with value: 0.6449267029678425.
[I 2025-12-27 12:03:49,277] Trial 1 finished with value: 0.6634005684141875 and parameters: {'n_estimators': 565, 'max_depth': 4, 'learning_rate': 0.17114149516559996, 'subsample': 0.846217577550171, 'colsample_bytree': 0.9274721400642025, 'min_child_weight': 7, 'gamma': 1.4992012714585545, 'reg_alpha': 2.129620760376984, 'reg_lambda': 0.1371709669575849, 'preprocessor': 'MinMaxScaler'}. Best is trial 1 with value: 0.6634005684141875.
[I 20

In [31]:

estimator = GradientBoostingClassifier(n_estimators=best_params.pop("n_estimators"))

preprocessor_name = best_params.pop("preprocessor")
preprocessor = create_transformer(preprocessor_name,quantitative_columns,categorical_columns)

# if best_params['model_name'] == "KNN":
#     model = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'],weights=best_params['weights_knn'])
# elif best_params['model_name'] == "SVC":
#     model = SVC(C=best_params["C"],kernel=best_params['kernel'],degree=best_params['degree'])

model = xgb.XGBClassifier(**best_params)

pipe = Pipeline([
    ("Preprocessor",preprocessor),
    #("RFE",RFE(estimator=estimator,n_features_to_select=best_params['n_features'])),
    ('Classifier', model)
])

pipe.fit(X_train,y_train)

0,1,2
,steps,"[('Preprocessor', ...), ('Classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9588194183555717
,device,
,early_stopping_rounds,
,enable_categorical,False


In [32]:
import joblib

In [33]:
joblib.dump(pipe,"DiabetesPredictionModel.pkl")

['DiabetesPredictionModel.pkl']

## 3.2 Adjust the decision threshold

In [34]:
from sklearn.model_selection import cross_val_predict

In [37]:
predict_proba

array([[0.49908417, 0.5009158 ],
       [0.37135983, 0.6286402 ],
       [0.7450325 , 0.2549675 ],
       ...,
       [0.36774194, 0.63225806],
       [0.3876645 , 0.6123355 ],
       [0.31140035, 0.68859965]], shape=(700000, 2), dtype=float32)

In [39]:
skf = StratifiedKFold(n_splits=5, shuffle=True)
predict_proba = cross_val_predict(
    pipe,
    X_train,
    y_train,
    cv=skf,
    method='predict_proba'
)

predict_1 = predict_proba[:,1]

best_threshold = 0.5
best_accuracy = 0.0

for thr in np.linspace(0.1,0.9+0.05,18):
    preds = (predict_1 >= thr).astype(int)

    accuracy = balanced_accuracy_score(y_train,preds)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = thr

    print(f"Threshold: {thr} | Accuracy: {accuracy}")

print("-----")
print(f"Best threshold: {best_threshold} | best accuracy: {best_accuracy}")


Threshold: 0.1 | Accuracy: 0.5000860766594086
Threshold: 0.15000000000000002 | Accuracy: 0.5007728577922916
Threshold: 0.2 | Accuracy: 0.5040490137505003
Threshold: 0.25 | Accuracy: 0.5117912097003297
Threshold: 0.30000000000000004 | Accuracy: 0.5239767998699415
Threshold: 0.35 | Accuracy: 0.5405815920225154
Threshold: 0.4 | Accuracy: 0.5629702179226374
Threshold: 0.45000000000000007 | Accuracy: 0.5906280432113235
Threshold: 0.5 | Accuracy: 0.6182692433875953
Threshold: 0.55 | Accuracy: 0.6416177890174648
Threshold: 0.6 | Accuracy: 0.6539548500138204
Threshold: 0.65 | Accuracy: 0.652761328062182
Threshold: 0.7000000000000001 | Accuracy: 0.6384776905785826
Threshold: 0.75 | Accuracy: 0.616999262381101
Threshold: 0.8 | Accuracy: 0.5938757149355689
Threshold: 0.85 | Accuracy: 0.5706307729637137
Threshold: 0.9 | Accuracy: 0.5370545699182141
Threshold: 0.9500000000000001 | Accuracy: 0.5050261647450497
-----
Best threshold: 0.6 | best accuracy: 0.6539548500138204


## 4. Predictions

In [40]:
test_proba = pipe.predict_proba(X_test)[:,1]
yhat = (test_proba >= best_threshold).astype(int)

yhat[:10]


array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1])

In [52]:
proba = pd.DataFrame({'diagnosed_diabetes':test_proba})
result_df = pd.concat([test_data['id'],proba],axis=1)
result_df.set_index("id",inplace=True)

In [None]:
result_df

Unnamed: 0_level_0,diagnosed_diabetes
id,Unnamed: 1_level_1
700000,0.531740
700001,0.631553
700002,0.731613
700003,0.488986
700004,0.883706
...,...
999995,0.737392
999996,0.670430
999997,0.504522
999998,0.619035


In [54]:
result_df.to_csv("diagnosed_diabetes_test.csv")