In [1]:
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

In [2]:
from pathlib import Path
DATA_PATH = Path("C:/ML_Projects/kaggle-workflow/data/")
OUTPUT_PATH = Path("C:/ML_Projects/kaggle-workflow/output/")

# General settings
SEED = 42
N_FOLDS = 5

# Metric choice placeholder
#Adjust depending on the competition
METRIC = "auc"

In [3]:
import pandas as pd 
import numpy as np
#Load Data
train = pd.read_csv(DATA_PATH / "Diabetes Prediction Challenge" / "train.csv")
test = pd.read_csv(DATA_PATH / "Diabetes Prediction Challenge" / "test.csv")

# DISPLAY DATA
print("Train shape, test shape:")
print(train.shape, test.shape)
train.head()

Train shape, test shape:
(700000, 26) (300000, 25)


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


In [4]:
for df in [train,test]:
    df.rename(columns = ({"alcohol_consumption_per_week":"alc",'physical_activity_minutes_per_week':"activity", 'diet_score':"diet",
                          'sleep_hours_per_day': "sleep", 'screen_time_hours_per_day':"screen", 'education_level':"edu",
                           'income_level':"inc", 'smoking_status': "smoke", 'employment_status':"empl", 'family_history_diabetes': "fam_his",
                          'hypertension_history': "hyp_his", 'cardiovascular_history': "card_his", 'diagnosed_diabetes': "label"} ), inplace = True)

In [5]:
train.head()

Unnamed: 0,id,age,alc,activity,diet,sleep,screen,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,edu,inc,smoke,empl,fam_his,hyp_his,card_his,label
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


In [6]:
train.columns

Index(['id', 'age', 'alc', 'activity', 'diet', 'sleep', 'screen', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'edu', 'inc', 'smoke', 'empl',
       'fam_his', 'hyp_his', 'card_his', 'label'],
      dtype='object')

In [7]:
FEATURES = ['age','activity','diet','sleep','screen','bmi',
    'waist_to_hip_ratio','systolic_bp','diastolic_bp',
    'heart_rate','cholesterol_total','hdl_cholesterol',
    'ldl_cholesterol','triglycerides']
TARGET = "label"

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.layers import Activation
import tensorflow.keras.backend as K

print('TF Version',tf.__version__)

  if not hasattr(np, "object"):


TF Version 2.20.0


In [9]:
def build_model(size):
    x_in = Input(shape=(size,))

    x = Dense(32)(x_in)
    x = BatchNormalization()(x)
    x = Activation("swish")(x)

    x = Dense(64)(x)
    x = BatchNormalization()(x)
    x = Activation("swish")(x)

    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation("swish")(x)

    # Binary classification → sigmoid
    x = Dense(1, activation="sigmoid")(x)

    model = Model(inputs = x_in, outputs = x)
    return model


In [10]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
def make_callbacks():
    lr_callback = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=3,
        verbose=1,
        min_lr=1e-6
    )

    early_stop_cb = EarlyStopping(
        monitor="val_loss",
        patience=10,
        restore_best_weights=True,
        verbose=1
    )

    return [lr_callback, early_stop_cb]

EPOCHS = 100

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import tensorflow.keras.backend as K

FOLDS = 5
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof = np.zeros(len(train))
pred = np.zeros(len(test))

for i, (train_idx, valid_idx) in enumerate(kf.split(train, train[TARGET])):

    print(f"\n{'#'*28}")
    print(f"{'#'*10} Fold {i+1} {'#'*10}")
    print(f"{'#'*28}")

    # SPLIT
    X_train = train.loc[train_idx, FEATURES].copy()
    y_train = train.loc[train_idx, TARGET].values

    X_valid = train.loc[valid_idx, FEATURES].copy()
    y_valid = train.loc[valid_idx, TARGET].values

    X_test = test[FEATURES].copy()

    # NORMALIZATION (numeric only assumed)
    norm_cols = FEATURES
    means = X_train[norm_cols].mean()
    stds = X_train[norm_cols].std().replace(0, 1)

    X_train[norm_cols] = (X_train[norm_cols] - means) / stds
    X_valid[norm_cols] = (X_valid[norm_cols] - means) / stds
    X_test[norm_cols] = (X_test[norm_cols] - means) / stds

    # MODEL
    K.clear_session()
    model = build_model(X_train.shape[1])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.AUC(curve="ROC")]
    )

    model.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        callbacks=make_callbacks(),
        batch_size=256,
        epochs=EPOCHS,
        verbose=2
    )

    # PREDICTIONS
    oof[valid_idx] = model.predict(X_valid, batch_size=512).ravel()
    pred += model.predict(X_test, batch_size=512).ravel()

    fold_auc = roc_auc_score(y_valid, oof[valid_idx])
    print(f"Fold {i+1} AUC: {fold_auc:.5f}")

pred /= FOLDS


############################
########## Fold 1 ##########
############################

Epoch 1/100
2188/2188 - 7s - 3ms/step - auc: 0.6458 - loss: 0.6305 - val_auc: 0.6527 - val_loss: 0.6268 - learning_rate: 0.0010
Epoch 2/100
2188/2188 - 5s - 2ms/step - auc: 0.6512 - loss: 0.6275 - val_auc: 0.6523 - val_loss: 0.6270 - learning_rate: 0.0010
Epoch 3/100
2188/2188 - 5s - 2ms/step - auc: 0.6518 - loss: 0.6271 - val_auc: 0.6536 - val_loss: 0.6263 - learning_rate: 0.0010
Epoch 4/100
2188/2188 - 5s - 2ms/step - auc: 0.6521 - loss: 0.6269 - val_auc: 0.6540 - val_loss: 0.6257 - learning_rate: 0.0010
Epoch 5/100
2188/2188 - 5s - 2ms/step - auc: 0.6526 - loss: 0.6266 - val_auc: 0.6542 - val_loss: 0.6258 - learning_rate: 0.0010
Epoch 6/100
2188/2188 - 5s - 2ms/step - auc: 0.6528 - loss: 0.6266 - val_auc: 0.6544 - val_loss: 0.6257 - learning_rate: 0.0010
Epoch 7/100

Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
2188/2188 - 5s - 2ms/step - auc: 0.6530 - loss: 0.6264

In [12]:
cv_auc = roc_auc_score(train[TARGET], oof)
print("CV AUC:", cv_auc)

CV AUC: 0.6538628354277897


In [24]:
submission = pd.DataFrame({
    "id": test["id"],      # or your index column
    "label": pred          # predicted probability
})
submission.head()

Unnamed: 0,id,label
0,700000,0.60857
1,700001,0.644877
2,700002,0.696341
3,700003,0.62672
4,700004,0.869465


In [31]:
FEATURES

['age',
 'activity',
 'diet',
 'sleep',
 'screen',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides']

In [36]:
nina =  pd.read_csv(DATA_PATH / "Diabetes Prediction Challenge" / "Nina.csv")
nina.rename(columns = ({"alcohol_consumption_per_week":"alc",'physical_activity_minutes_per_week':"activity", 'diet_score':"diet",
                          'sleep_hours_per_day': "sleep", 'screen_time_hours_per_day':"screen", 'education_level':"edu",
                           'income_level':"inc", 'smoking_status': "smoke", 'employment_status':"empl", 'family_history_diabetes': "fam_his",
                          'hypertension_history': "hyp_his", 'cardiovascular_history': "card_his", 'diagnosed_diabetes': "label"} ), inplace = True)
nina[FEATURES].head()

Unnamed: 0,age,activity,diet,sleep,screen,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides
0,35,180,4.8,8.0,12.0,18.0,0.75,110,70,90,265,76,189,88


In [47]:
x = np.array([[35, 180, 4.8, 8.0, 12.0, 18.0, 0.75, 110, 70, 90,265, 76, 189,88 ]], dtype=np.float32)
p = model.predict(x)[0, 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


In [48]:
p

np.float32(0.41547245)

In [39]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

def permutation_importance_mlp(model, X_val, y_val, metric=roc_auc_score, n_repeats=5):
    baseline = metric(y_val, model.predict(X_val, batch_size=512).ravel())
    importances = []

    for col in X_val.columns:
        scores = []
        for _ in range(n_repeats):
            X_perm = X_val.copy()
            X_perm[col] = np.random.permutation(X_perm[col].values)
            score = metric(y_val, model.predict(X_perm, batch_size=512).ravel())
            scores.append(baseline - score)

        importances.append(np.mean(scores))

    return pd.DataFrame({
        "feature": X_val.columns,
        "perm_importance": importances
    }).sort_values("perm_importance", ascending=False)

In [40]:
fi = permutation_importance_mlp(model, X_valid, y_valid)
fi

[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m274/274[

Unnamed: 0,feature,perm_importance
1,activity,0.061694
0,age,0.051466
13,triglycerides,0.006408
5,bmi,0.005015
11,hdl_cholesterol,0.00322
2,diet,0.002559
12,ldl_cholesterol,0.00231
9,heart_rate,0.001924
10,cholesterol_total,0.001631
4,screen,0.001181
