In [None]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix


In [None]:
# Read the train and test CSV files into DataFrames (same paths, same order).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [None]:



# Numeric columns the notebook treats as drift-prone; the loop below
# winsorizes each of them when it is present in `train`.
DRIFT_COLS = [
    "physical_activity_minutes_per_week",
    "triglycerides", "ldl_cholesterol", "cholesterol_total",
    "alcohol_consumption_per_week",
]

# Winsorization helper
def clip_series(s, lower=0.01, upper=0.99):
    """Return ``s`` with values limited to its own quantile range.

    Parameters
    ----------
    s : pandas.Series
        Series to winsorize.
    lower, upper : float
        Quantile levels (fractions in [0, 1]) whose values become the
        floor and the ceiling of the result.

    Returns
    -------
    pandas.Series
        A clipped series; the input is not modified.
    """
    bounds = s.quantile([lower, upper])
    return s.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

# Winsorize the drift-prone columns.
# The limits are learned from `train` only and then reused for `test`, so a
# given raw value is mapped to the same clipped value in both frames.
# (Clipping `test` with its own quantiles would make the transformation depend
# on which rows happen to be in the test file.)
for col in DRIFT_COLS:
    if col not in train.columns:
        continue
    train[col] = clip_series(train[col])
    if col in test.columns:
        # After winsorization, train's min/max are the clipping limits that
        # were just applied, so this reuses exactly the same floor/ceiling.
        test[col] = test[col].clip(train[col].min(), train[col].max())

# Log transforms for skewed cols
LOG_COLS = [
    "physical_activity_minutes_per_week",
    "triglycerides",
    "alcohol_consumption_per_week"
]

# Add a `<col>_log` companion column holding log(1 + x). The feature is only
# created when the source column exists in BOTH frames, so train and test
# never end up with different feature sets (and a column missing from `test`
# no longer raises a KeyError).
for col in LOG_COLS:
    if col in train.columns and col in test.columns:
        train[col + "_log"] = np.log1p(train[col])
        test[col + "_log"]  = np.log1p(test[col])


In [None]:
# Column-name constants used by the rest of the notebook.
TARGET = "diagnosed_diabetes"
ID_COL = "id"

# Store the label column as integers.
train[TARGET] = train[TARGET].astype(int)

# Remove the label column from the test frame when it is present there.
test = test.drop(columns=[TARGET]) if TARGET in test.columns else test


In [None]:

#  FEATURE ENGINEERING

def add_risk_flags(df):
    """Add the notebook's binary (0/1) threshold flags to ``df`` in place.

    Columns created, in this order: ``sedentary``, ``bmi_overweight``,
    ``bmi_obese``, ``bp_high``, ``ldl_high``, ``high_triglycerides``,
    ``sleep_outside_healthy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``physical_activity_minutes_per_week``, ``bmi``,
        ``systolic_bp``, ``ldl_cholesterol``, ``triglycerides`` and
        ``sleep_hours_per_day``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenient reassignment.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    bmi = df["bmi"]
    sleep = df["sleep_hours_per_day"]

    # Physical activity (drift-prone) -> robust flag
    df["sedentary"] = (df["physical_activity_minutes_per_week"] < 30).astype(int)

    # BMI categories. The overweight band is the half-open interval [25, 30):
    # the previous `between(25, 29.9)` left values in (29.9, 30) flagged as
    # neither overweight nor obese.
    df["bmi_overweight"] = ((bmi >= 25) & (bmi < 30)).astype(int)
    df["bmi_obese"] = (bmi >= 30).astype(int)

    # Blood pressure flag
    df["bp_high"] = (df["systolic_bp"] >= 130).astype(int)

    # Lipids (drift-prone) -> coarse flags
    df["ldl_high"] = (df["ldl_cholesterol"] > 130).astype(int)
    df["high_triglycerides"] = (df["triglycerides"] > 150).astype(int)

    # Sleep flag: fewer than 6 or more than 8 hours per day
    df["sleep_outside_healthy"] = ((sleep < 6) | (sleep > 8)).astype(int)
    return df


# One definition applied to both frames keeps train and test features in sync.
train = add_risk_flags(train)
test = add_risk_flags(test)


In [None]:
#  ONE-HOT ENCODE categoricals  + align
cat_cols = train.select_dtypes(include="object").columns.tolist()
if cat_cols:
    # Give both frames the same categorical dtype, with the category list
    # learned from train. Encoding each frame independently with
    # drop_first=True can drop a *different* baseline level whenever the two
    # frames do not contain exactly the same set of values, which silently
    # changes the meaning of the remaining dummy columns.
    # Values that appear only in test become missing and encode as all zeros.
    for col in cat_cols:
        shared_dtype = pd.CategoricalDtype(pd.Categorical(train[col]).categories)
        train[col] = train[col].astype(shared_dtype)
        test[col] = test[col].astype(shared_dtype)

    train = pd.get_dummies(train, columns=cat_cols, drop_first=True)
    test  = pd.get_dummies(test,  columns=cat_cols, drop_first=True)

# Make test carry exactly train's columns (missing ones are filled with 0).
train, test = train.align(test, join="left", axis=1, fill_value=0)


In [None]:
# Feature groups to exclude from the model, identified by column-name prefix.
# This cell only collects the matching names; the actual drop happens when
# X / X_test are built.
DROP_PREFIXES = [
    "education_", "income_", "employment_status_",
    "ethnicity_", "smoking_status_",
]

# str.startswith accepts a tuple and is true when any prefix matches.
_drop_prefixes = tuple(DROP_PREFIXES)
drop_cols = [name for name in train.columns if name.startswith(_drop_prefixes)]


In [None]:
# Assemble the model inputs: features X, labels y and the test features X_test.
y = train[TARGET]

# Identifier, label and the excluded feature groups are not model inputs;
# names that are absent are skipped silently.
X = train.drop(columns=[ID_COL, TARGET] + drop_cols, errors="ignore")

# Force X_test to have exactly X's columns, in X's order (gaps filled with 0).
X_test = (
    test
    .drop(columns=[ID_COL] + drop_cols, errors="ignore")
    .reindex(columns=X.columns, fill_value=0)
)

# Boolean columns (as typed in X) are stored as integers in both matrices.
bool_cols = X.select_dtypes(include="bool").columns
as_int = {name: int for name in bool_cols}
X = X.astype(as_int)
X_test = X_test.astype(as_int)

print(" Final X shape:", X.shape)
print(" Final X_test shape:", X_test.shape)


✅ Final X shape: (700000, 30)
✅ Final X_test shape: (300000, 30)


In [None]:
# 6) QUICK AUC CHECK
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Class-imbalance weight: negatives per positive, counted over the full label
# vector. The final-model cell reuses `scale_pos_weight`.
neg = y.eq(0).sum()
pos = y.eq(1).sum()
scale_pos_weight = neg / pos

quick_params = dict(
    n_estimators=400,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.8,
    min_child_weight=15,
    gamma=1.0,
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
model = XGBClassifier(**quick_params)
model.fit(X_train, y_train)

# Score the hold-out rows with the second predict_proba column.
valid_proba = model.predict_proba(X_valid)[:, 1]
print(" Validation ROC-AUC:", roc_auc_score(y_valid, valid_proba))


✅ Validation ROC-AUC: 0.715221214735183


In [None]:
# Fit the submission model on every labelled row (no hold-out).
# NOTE(review): this uses 600 trees whereas the quick AUC check uses 400, so
# the validation score above was not measured on this exact configuration.
final_params = dict(
    n_estimators=600,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.8,
    min_child_weight=15,
    gamma=1.0,
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
final_model = XGBClassifier(**final_params)

final_model.fit(X, y)


In [None]:

# Score the test rows and write the submission file.
# The second predict_proba column is used as the predicted probability.
test_proba = final_model.predict_proba(X_test)[:, 1]

# Two-column frame: the test ids followed by the predicted probability.
submission = test[[ID_COL]].copy()
submission[TARGET] = test_proba

submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

print(f"\n Saved: {submission_path} (probabilities for ROC-AUC)")
print("Submission preview:")
display(submission.head())

# Report the smallest and largest predicted probability.
predicted = submission[TARGET]
print("\nSanity check (min/max proba):", predicted.min(), predicted.max())


✅ Saved: submission.csv (probabilities for ROC-AUC)
Submission preview:


Unnamed: 0,id,diagnosed_diabetes
0,700000,0.404538
1,700001,0.551726
2,700002,0.636839
3,700003,0.343274
4,700004,0.833294



Sanity check (min/max proba): 0.034617260098457336 0.9824862480163574
