In [2]:
!pip -q install xgboost lightgbm imbalanced-learn
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV

from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# passed explicitly as `random_state` to the train/validation split and to the
# tree-based models defined in later cells.
RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)


In [3]:

# Import Data
# Both CSV paths are relative, so they resolve against the notebook's working directory.

TRAIN_PATH = "data_train.csv"
TEST_PATH = "data_test.csv"
# Label column; per the column listings printed by this cell it exists in the
# training file only.
TARGET_COL = "Diabetes_binary"

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

# Quick sanity check of what was loaded: row/column counts and column names.
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
print("\nTrain columns:", df_train.columns.tolist())
print("\nTest columns:", df_test.columns.tolist())

Train shape: (170000, 23)
Test shape: (30000, 22)

Train columns: ['ID', 'Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']

Test columns: ['ID', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']


In [4]:

# Feature Engineering

def add_engineered_features(
    df: pd.DataFrame,
    unhealthy_days_threshold: float = 10,
    obese_bmi_threshold: float = 30,
    old_age_threshold: float = 9,
) -> pd.DataFrame:
    """Return a copy of ``df`` with derived risk-indicator columns appended.

    Each feature is only added when the column(s) it is derived from are present,
    so the function can be applied to frames with a partial schema. The input
    frame is never modified.

    Added columns (all 0/1 integers except ``BMI_Age``):
      * ``BMI_Age``              - BMI divided by Age; rows with Age == 0 (or a missing
                                   operand) are filled with the median of the ratio
                                   computed on *this* frame.
      * ``Physically_Unhealthy`` - PhysHlth  > ``unhealthy_days_threshold``
      * ``Mentally_Unhealthy``   - MentHlth  > ``unhealthy_days_threshold``
      * ``Obese``                - BMI      >= ``obese_bmi_threshold``
      * ``Old``                  - Age      >= ``old_age_threshold``
      * ``HighRiskCombo``        - HighBP, HighChol and Smoker are all positive

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    unhealthy_days_threshold : float, default 10
        Strict lower bound on PhysHlth / MentHlth for the "unhealthy" flags.
    obese_bmi_threshold : float, default 30
        Inclusive lower bound on BMI for the ``Obese`` flag.
    old_age_threshold : float, default 9
        Inclusive lower bound on Age for the ``Old`` flag.
        NOTE(review): Age looks like an ordinal bracket code rather than years - confirm.

    Returns
    -------
    pd.DataFrame
        A new frame containing the original columns plus the engineered ones.
    """
    df = df.copy()

    def _to_binary(series: pd.Series) -> pd.Series:
        """Coerce a numeric, boolean or "Yes"/"No" column to integer 0/1.

        Missing values and unrecognised labels become 0.
        """
        if pd.api.types.is_bool_dtype(series) or pd.api.types.is_numeric_dtype(series):
            return series.astype(float).fillna(0).astype(int)
        # Text labels may arrive as object, string or category dtype. Going through
        # object keeps .map() from returning a Categorical that cannot be filled with 0.
        return series.astype(object).map({"Yes": 1, "No": 0}).fillna(0).astype(int)

    # BMI / Age ratio. Age == 0 is masked to NaN first to avoid division by zero.
    if "BMI" in df.columns and "Age" in df.columns:
        df["BMI_Age"] = df["BMI"] / (df["Age"].replace(0, np.nan))
        df["BMI_Age"] = df["BMI_Age"].fillna(df["BMI_Age"].median())

    # Physically unhealthy days above the threshold
    if "PhysHlth" in df.columns:
        df["Physically_Unhealthy"] = (df["PhysHlth"] > unhealthy_days_threshold).astype(int)

    # Mentally unhealthy days above the threshold
    if "MentHlth" in df.columns:
        df["Mentally_Unhealthy"] = (df["MentHlth"] > unhealthy_days_threshold).astype(int)

    # Obesity flag
    if "BMI" in df.columns:
        df["Obese"] = (df["BMI"] >= obese_bmi_threshold).astype(int)

    # Older-age flag
    if "Age" in df.columns:
        df["Old"] = (df["Age"] >= old_age_threshold).astype(int)

    # High-risk combo: HighBP + HighChol + Smoker
    if all(c in df.columns for c in ["HighBP", "HighChol", "Smoker"]):
        hb = _to_binary(df["HighBP"])
        hc = _to_binary(df["HighChol"])
        sm = _to_binary(df["Smoker"])
        df["HighRiskCombo"] = ((hb == 1) & (hc == 1) & (sm == 1)).astype(int)

    return df

# Run the same feature engineering on both frames so they gain the same derived columns.
# Note that add_engineered_features fills BMI_Age gaps with the median of the frame it is
# given, so train and test are each filled from their own data.
df_train = add_engineered_features(df_train)
df_test = add_engineered_features(df_test)

print("Train shape after feature engineering:", df_train.shape)
print("Test shape after feature engineering:", df_test.shape)


Train shape after feature engineering: (170000, 29)
Test shape after feature engineering: (30000, 28)


In [5]:

# Separate the label from the predictors.
y = df_train[TARGET_COL].astype(int)
X = df_train.drop(columns=[TARGET_COL]).copy()


def _looks_like_id(column_name):
    """True when the lower-cased column name begins or ends with "id"."""
    lowered = column_name.lower()
    return lowered.startswith("id") or lowered.endswith("id")


# Identifier candidates are searched in the test frame; the first match wins.
candidate_id_cols = [col for col in df_test.columns if _looks_like_id(col)]

if not candidate_id_cols:
    # No identifier column: fall back to a 1-based running number.
    print("No explicit ID column found. Using row index as ID.")
    ID_COL = "ID"
    test_ids = pd.Series(np.arange(1, len(df_test) + 1), name=ID_COL)
    X_test = df_test.copy()
else:
    ID_COL = candidate_id_cols[0]
    print(f"Using '{ID_COL}' as ID column for submission.")
    test_ids = df_test[ID_COL].copy()
    X_test = df_test.drop(columns=[ID_COL]).copy()

# Keep only the columns present in both frames, in the training frame's order.
# Because the ID column was just removed from X_test, it drops out of X as well.
common_cols = [col for col in X.columns if col in X_test.columns]
X = X[common_cols].copy()
X_test = X_test[common_cols].copy()

print("\nNumber of feature columns:", len(common_cols))


Using 'ID' as ID column for submission.

Number of feature columns: 27


In [6]:

# Split
# Hold out 20% of the labelled rows for validation. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed random_state makes the split repeatable.

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print("X_train:", X_train.shape, "X_val:", X_val.shape)


X_train: (136000, 27) X_val: (34000, 27)


In [7]:
# Identify numeric and categorical columns.
# "number" selects every numeric dtype regardless of bit width, so integer columns whose
# platform-default width is not 64 bits (e.g. the flags built with `.astype(int)`) are
# not left out of both groups.
numeric_features = X_train.select_dtypes(include="number").columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "bool", "category"]).columns.tolist()

print("Numeric features:", len(numeric_features))
print("Categorical features:", len(categorical_features))

# The ColumnTransformer below is built without `remainder` (scikit-learn's default drops
# unlisted columns), so a column in neither group would never reach the models.
# Surface that instead of letting it pass unnoticed.
unassigned_features = [
    col for col in X_train.columns
    if col not in numeric_features and col not in categorical_features
]
if unassigned_features:
    print("WARNING: columns not routed to any transformer:", unassigned_features)

# Numeric columns: fill gaps with the column median (no scaling; the tree models do not need it).
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical columns: fill gaps with the most frequent level, then one-hot encode.
# handle_unknown="ignore" lets levels unseen during fit pass through transform without an error.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


Numeric features: 14
Categorical features: 13


In [8]:

# Threshold Tuning

def tune_threshold_for_f1(y_true, y_proba, positive_label=1, thresholds=None):
    """Brute-force search over thresholds to maximize F1 for the positive class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_proba : array-like of float
        Predicted score for the positive class, one per sample. Lists are accepted.
    positive_label : default 1
        Label passed to ``f1_score`` as ``pos_label``. Predictions are always
        encoded as 0/1, so this is expected to be one of those two values.
    thresholds : iterable of float, optional
        Candidate cut-offs, tried in the given order. Defaults to 0.20, 0.21, ..., 0.80.

    Returns
    -------
    (float, float)
        The best threshold and the F1 it achieves. On ties the first candidate
        wins. With an empty ``thresholds`` the result is ``(0.5, -1.0)``.
    """
    # Coerce once so plain Python lists support the vectorised comparison below.
    y_proba = np.asarray(y_proba, dtype=float)
    if thresholds is None:
        thresholds = np.linspace(0.2, 0.8, 61)  # step = 0.01

    best_threshold = 0.5
    best_f1 = -1.0

    for thr in thresholds:
        y_pred = (y_proba >= thr).astype(int)
        # zero_division=0 keeps the score at 0 (the value the default also yields)
        # without emitting a warning for cut-offs that predict no positives.
        f1 = f1_score(y_true, y_pred, pos_label=positive_label, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = thr

    return float(best_threshold), float(best_f1)

def evaluate_model(name, model, X_tr, y_tr, X_val, y_val):
    """Fit ``model``, pick its F1-optimal threshold on the validation data, and print a report.

    The model is fitted in place on ``(X_tr, y_tr)``. Its positive-class scores on
    ``X_val`` are passed to ``tune_threshold_for_f1``, and the confusion matrix and
    classification report are printed for the predictions at the chosen threshold.

    Returns a dict with keys ``"name"``, ``"model"`` (the fitted estimator),
    ``"threshold"`` and ``"f1_class1"``.
    """
    print(f"\n=============================")
    print(f"Training model: {name}")
    print(f"=============================")
    model.fit(X_tr, y_tr)

    # Second predict_proba column = score for the positive class.
    positive_scores = model.predict_proba(X_val)[:, 1]
    best_thr, best_f1 = tune_threshold_for_f1(y_val, positive_scores, positive_label=1)

    # Hard 0/1 labels at the tuned cut-off, used for the printed diagnostics only.
    val_labels_at_best = (positive_scores >= best_thr).astype(int)

    print(f"Best threshold for F1(class 1): {best_thr:.4f}")
    print(f"Best F1(class 1) on validation: {best_f1:.4f}")
    print("\nConfusion Matrix (val):")
    print(confusion_matrix(y_val, val_labels_at_best, labels=[0, 1]))
    print("\nClassification Report (val):")
    print(classification_report(y_val, val_labels_at_best, zero_division=0))

    return dict(
        name=name,
        model=model,
        threshold=best_thr,
        f1_class1=best_f1,
    )


In [9]:

results = []

# Hyper-parameters shared by every XGBoost variant in this cell.
xgb_common_params = dict(
    random_state=RANDOM_STATE,
    tree_method="hist",
    objective="binary:logistic",
    eval_metric="logloss",
    learning_rate=0.07,
    max_depth=7,
    n_jobs=-1,
)

# XGBoost (unweighted)
xgb_unweighted = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", xgb.XGBClassifier(
        n_estimators=400,
        subsample=0.9,
        colsample_bytree=0.8,
        **xgb_common_params
    ))
])

res_xgb_unweighted = evaluate_model("XGBoost_unweighted", xgb_unweighted, X_train, y_train, X_val, y_val)
results.append(res_xgb_unweighted)

# XGBoost (with scale_pos_weight)
# Ratio of negatives to positives in the training split.
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\nscale_pos_weight (train) = {pos_weight:.4f}")

# One definition for the weighted configuration, reused by the plain and the
# calibrated variant so the two cannot drift apart.
xgb_weighted_params = dict(
    xgb_common_params,
    n_estimators=516,
    min_child_weight=2,
    subsample=0.8821102743060054,
    colsample_bytree=0.7814047095321688,
    scale_pos_weight=pos_weight,
)

xgb_weighted = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", xgb.XGBClassifier(**xgb_weighted_params))
])

res_xgb_weighted = evaluate_model("XGBoost_weighted", xgb_weighted, X_train, y_train, X_val, y_val)
results.append(res_xgb_weighted)

# XGBoost (weighted + Calibrated)
# NOTE: this variant is fitted on a subsample of at most 40,000 training rows, while the
# other models see all of X_train. Keep that in mind when comparing validation scores.
CALIBRATION_SUBSAMPLE = min(40000, len(X_train))
# Draw from a locally seeded generator instead of NumPy's global RNG, so the subsample
# does not depend on how many random draws earlier cell executions consumed.
# Re-running this cell therefore yields the same rows.
calibration_rng = np.random.RandomState(RANDOM_STATE)
idx_sub = calibration_rng.choice(len(X_train), size=CALIBRATION_SUBSAMPLE, replace=False)

X_cal = X_train.iloc[idx_sub]
y_cal = y_train.iloc[idx_sub]

xgb_weighted_for_cal = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", xgb.XGBClassifier(**xgb_weighted_params))
])

calib_xgb = CalibratedClassifierCV(xgb_weighted_for_cal, method="sigmoid", cv=3)
res_xgb_calibrated = evaluate_model("XGBoost_weighted_calibrated", calib_xgb, X_cal, y_cal, X_val, y_val)
results.append(res_xgb_calibrated)

# Balanced Random Forest
brf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", BalancedRandomForestClassifier(
        n_estimators=400,
        max_depth=12,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

res_brf = evaluate_model("BalancedRandomForest", brf, X_train, y_train, X_val, y_val)
results.append(res_brf)

# LightGBM
lgbm = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("clf", lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.03,
        num_leaves=50,
        max_depth=-1,
        objective="binary",
        class_weight="balanced",
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

res_lgbm = evaluate_model("LightGBM_balanced", lgbm, X_train, y_train, X_val, y_val)
results.append(res_lgbm)

# Logistic Regression (baseline)
# Same imputation as the tree models, plus standardisation of the numeric columns.
numeric_scaler = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor_lr = ColumnTransformer(
    transformers=[
        ("num", numeric_scaler, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

lr = Pipeline(steps=[
    ("preprocess", preprocessor_lr),
    ("clf", LogisticRegression(
        C=0.008111941985431923,
        max_iter=1000,
        solver="liblinear"
    ))
])

res_lr = evaluate_model("LogisticRegression", lr, X_train, y_train, X_val, y_val)
results.append(res_lr)

# Summary of models
print("\n======================")
print("Validation F1 (class 1) summary")
print("======================")
for r in results:
    print(f"{r['name']}: F1(class1) = {r['f1_class1']:.4f}, threshold = {r['threshold']:.4f}")

# Pick best model based on F1 (the first one wins on ties)
best_result = max(results, key=lambda r: r["f1_class1"])
best_name = best_result["name"]
best_threshold = best_result["threshold"]

print(f"\nBEST MODEL: {best_name}")
print(f"Using threshold: {best_threshold:.4f}")



Training model: XGBoost_unweighted
Best threshold for F1(class 1): 0.2500
Best F1(class 1) on validation: 0.5134

Confusion Matrix (val):
[[22406  5571]
 [ 2019  4004]]

Classification Report (val):
              precision    recall  f1-score   support

           0       0.92      0.80      0.86     27977
           1       0.42      0.66      0.51      6023

    accuracy                           0.78     34000
   macro avg       0.67      0.73      0.68     34000
weighted avg       0.83      0.78      0.79     34000


scale_pos_weight (train) = 4.6448

Training model: XGBoost_weighted
Best threshold for F1(class 1): 0.5900
Best F1(class 1) on validation: 0.5087

Confusion Matrix (val):
[[22611  5366]
 [ 2138  3885]]

Classification Report (val):
              precision    recall  f1-score   support

           0       0.91      0.81      0.86     27977
           1       0.42      0.65      0.51      6023

    accuracy                           0.78     34000
   macro avg       0.6



Best threshold for F1(class 1): 0.6300
Best F1(class 1) on validation: 0.5169

Confusion Matrix (val):
[[23034  4943]
 [ 2201  3822]]

Classification Report (val):
              precision    recall  f1-score   support

           0       0.91      0.82      0.87     27977
           1       0.44      0.63      0.52      6023

    accuracy                           0.79     34000
   macro avg       0.67      0.73      0.69     34000
weighted avg       0.83      0.79      0.80     34000


Training model: LogisticRegression
Best threshold for F1(class 1): 0.2400
Best F1(class 1) on validation: 0.5138

Confusion Matrix (val):
[[22518  5459]
 [ 2054  3969]]

Classification Report (val):
              precision    recall  f1-score   support

           0       0.92      0.80      0.86     27977
           1       0.42      0.66      0.51      6023

    accuracy                           0.78     34000
   macro avg       0.67      0.73      0.69     34000
weighted avg       0.83      0.78    

In [10]:

# Submission
def build_model_by_name(name: str):
    """Return a new, unfitted estimator for the given model name.

    Recognised names: "XGBoost_unweighted", "XGBoost_weighted",
    "XGBoost_weighted_calibrated", "BalancedRandomForest", "LightGBM_balanced"
    and "LogisticRegression". The estimators are assembled from the notebook-level
    ``preprocessor`` / ``preprocessor_lr``, ``pos_weight`` and ``RANDOM_STATE``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the recognised names.
    """

    def _xgb_unweighted():
        return Pipeline(steps=[
            ("preprocess", preprocessor),
            ("clf", xgb.XGBClassifier(
                random_state=RANDOM_STATE,
                tree_method="hist",
                objective="binary:logistic",
                eval_metric="logloss",
                n_estimators=400,
                learning_rate=0.07,
                max_depth=7,
                subsample=0.9,
                colsample_bytree=0.8,
                n_jobs=-1
            ))
        ])

    def _xgb_weighted():
        return Pipeline(steps=[
            ("preprocess", preprocessor),
            ("clf", xgb.XGBClassifier(
                random_state=RANDOM_STATE,
                tree_method="hist",
                objective="binary:logistic",
                eval_metric="logloss",
                n_estimators=516,
                learning_rate=0.07,
                max_depth=7,
                min_child_weight=2,
                subsample=0.8821102743060054,
                colsample_bytree=0.7814047095321688,
                scale_pos_weight=pos_weight,
                n_jobs=-1
            ))
        ])

    def _xgb_weighted_calibrated():
        # Sigmoid calibration wrapped around a fresh weighted-XGBoost pipeline.
        return CalibratedClassifierCV(_xgb_weighted(), method="sigmoid", cv=3)

    def _balanced_random_forest():
        return Pipeline(steps=[
            ("preprocess", preprocessor),
            ("clf", BalancedRandomForestClassifier(
                n_estimators=400,
                max_depth=12,
                random_state=RANDOM_STATE,
                n_jobs=-1
            ))
        ])

    def _lightgbm_balanced():
        return Pipeline(steps=[
            ("preprocess", preprocessor),
            ("clf", lgb.LGBMClassifier(
                n_estimators=500,
                learning_rate=0.03,
                num_leaves=50,
                max_depth=-1,
                objective="binary",
                class_weight="balanced",
                subsample=0.9,
                colsample_bytree=0.8,
                random_state=RANDOM_STATE,
                n_jobs=-1
            ))
        ])

    def _logistic_regression():
        return Pipeline(steps=[
            ("preprocess", preprocessor_lr),
            ("clf", LogisticRegression(
                C=0.008111941985431923,
                max_iter=1000,
                solver="liblinear"
            ))
        ])

    # (name, builder) pairs, checked in order. Nothing is constructed until a name matches.
    registry = (
        ("XGBoost_unweighted", _xgb_unweighted),
        ("XGBoost_weighted", _xgb_weighted),
        ("XGBoost_weighted_calibrated", _xgb_weighted_calibrated),
        ("BalancedRandomForest", _balanced_random_forest),
        ("LightGBM_balanced", _lightgbm_balanced),
        ("LogisticRegression", _logistic_regression),
    )
    for known_name, build in registry:
        if name == known_name:
            return build()

    raise ValueError(f"Unknown model name: {name}")

# Rebuild the winning configuration from scratch so it can be refitted on every labelled row.
final_model = build_model_by_name(best_name)

print(f"\nFitting BEST model on FULL training data: {best_name}")
final_model.fit(X, y)

# Score the test set, then cut at the threshold tuned on the validation split.
test_proba = final_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_threshold).astype(int)

# Two-column submission: identifier first, predicted label second.
submission = pd.DataFrame({ID_COL: test_ids})
submission[TARGET_COL] = test_pred

submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)
print(f"\nSaved submission file to: {submission_path}")
submission.head()



Fitting BEST model on FULL training data: LightGBM_balanced
[LightGBM] [Info] Number of positive: 30116, number of negative: 139884
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 489
[LightGBM] [Info] Number of data points in the train set: 170000, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000





Saved submission file to: submission.csv


Unnamed: 0,ID,Diabetes_binary
0,182992,1
1,148963,0
2,82811,1
3,110563,1
4,208808,0
