In [10]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

# ==========================================
# 1. DATA LOADING & FEATURE ENGINEERING
# ==========================================
print("--- Loading and Engineering Features ---")

# Read each split and tag its rows with an origin flag so the combined frame
# can be separated again after the shared preprocessing steps.
train_df = pd.read_csv('train.csv').assign(is_train=1)
# The target column is set to NaN on every test row so both frames expose it.
test_df = pd.read_csv('test.csv').assign(is_train=0, diagnosed_diabetes=np.nan)

# Combine for consistent processing: train rows first, then test rows, with a
# fresh 0..n-1 index.
all_data = pd.concat([train_df, test_df], ignore_index=True)

--- Loading and Engineering Features ---


In [11]:
# --- Feature Engineering (Applied to All) ---
# Adds derived columns to `all_data` (train and test rows together) and
# declares the column groups used by the model-specific preparation cells.

# Cut-offs used by the flags below, named once so every feature that refers to
# the same concept uses the same boundary.
BMI_OVERWEIGHT = 25
BMI_OBESE = 30
SYSTOLIC_HIGH = 130
SENIOR_AGE = 65
LOW_ACTIVITY_MINUTES = 60
SHORT_SLEEP_HOURS = 6

# 1. Interaction Features (pairwise products of age, BMI and systolic BP)
all_data['age_bmi'] = all_data['age'] * all_data['bmi']
all_data['age_bp'] = all_data['age'] * all_data['systolic_bp']
all_data['bmi_bp'] = all_data['bmi'] * all_data['systolic_bp']

# 2. Threshold Flags (0/1). A comparison against NaN evaluates to False, so a
# missing measurement produces a 0 flag.
all_data['is_obese'] = (all_data['bmi'] >= BMI_OBESE).astype(int)
all_data['is_overweight'] = (
    (all_data['bmi'] >= BMI_OVERWEIGHT) & (all_data['bmi'] < BMI_OBESE)
).astype(int)
all_data['high_bp'] = (all_data['systolic_bp'] >= SYSTOLIC_HIGH).astype(int)
all_data['is_senior'] = (all_data['age'] >= SENIOR_AGE).astype(int)

# 3. Lifestyle Score: count (0-3) of obesity, low activity and short sleep.
# The obesity term reuses `is_obese` so that a BMI of exactly 30 is treated the
# same way here as in the flag above (previously this term used `bmi > 30`).
all_data['lifestyle_risk'] = (
    all_data['is_obese'] +
    (all_data['physical_activity_minutes_per_week'] < LOW_ACTIVITY_MINUTES).astype(int) +
    (all_data['sleep_hours_per_day'] < SHORT_SLEEP_HOURS).astype(int)
)

# Lists of columns
# Columns encoded as categories (one-hot for XGBoost, native for CatBoost).
categorical_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
# Columns that are mean-imputed (and standardised for XGBoost).
numerical_cols = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day',
    'bmi', 'waist_to_hip_ratio', 'systolic_bp',
    'age_bmi', 'age_bp', 'bmi_bp', 'lifestyle_risk'
]

In [12]:
# ==========================================
# 2. DATA PREPARATION (Splitting for each model)
# ==========================================

# --- A. PREP FOR XGBOOST (One-Hot + Scaled) ---
print("Preparing data for XGBoost...")
xgb_data = all_data.copy()

# One-Hot Encode. dtype=float keeps the indicator columns numeric; the pandas
# default on recent versions is bool, and a frame mixing bool and float columns
# converts to an object-dtype array.
xgb_data = pd.get_dummies(xgb_data, columns=categorical_cols, drop_first=True, dtype=float)

# Scale Numerical
# NOTE(review): the scaler statistics are computed over train and test rows
# together, because `xgb_data` still holds both splits at this point.
scaler = StandardScaler()
xgb_data[numerical_cols] = scaler.fit_transform(xgb_data[numerical_cols])
# Impute the listed numeric columns with their (post-scaling) column mean,
# then zero-fill whatever NaN remains in any other column.
xgb_data[numerical_cols] = xgb_data[numerical_cols].fillna(xgb_data[numerical_cols].mean())
xgb_data = xgb_data.fillna(0)

# Split XGB Data
# Identifier, split flag and target are not model inputs.
non_feature_cols = ['id', 'is_train', 'diagnosed_diabetes']
xgb_features = xgb_data.drop(columns=non_feature_cols)
# Build explicit float32 matrices instead of relying on `.values`, whose dtype
# depends on the mix of column dtypes in the frame.
X_xgb = xgb_features[xgb_data['is_train'] == 1].to_numpy(dtype=np.float32)
X_test_xgb = xgb_features[xgb_data['is_train'] == 0].to_numpy(dtype=np.float32)
# Labels come from the original training frame; its row order matches the
# is_train == 1 rows because the train rows were concatenated first.
y = train_df['diagnosed_diabetes'].values

Preparing data for XGBoost...


In [13]:
# --- B. PREP FOR CATBOOST (Raw Text + No Scaling) ---
print("Preparing data for CatBoost...")
cat_data = all_data.copy()

# Fill NaNs
# Listed numeric columns: impute with the column mean (computed over train and
# test rows together, since `cat_data` holds both splits).
cat_data[numerical_cols] = cat_data[numerical_cols].fillna(cat_data[numerical_cols].mean())
# Categorical columns only: use an explicit 'Missing' level. The fill is not
# applied frame-wide because writing a string into a numeric column that is not
# declared in `cat_features` would turn it into an object column. Any NaN left
# in other numeric columns is passed through to CatBoost unchanged.
cat_data[categorical_cols] = cat_data[categorical_cols].fillna('Missing')

# Split CatBoost Data
# Identifier, split flag and target are not model inputs.
non_feature_cols = ['id', 'is_train', 'diagnosed_diabetes']
X_cat = cat_data[cat_data['is_train'] == 1].drop(columns=non_feature_cols)
X_test_cat = cat_data[cat_data['is_train'] == 0].drop(columns=non_feature_cols)

Preparing data for CatBoost...


In [14]:
# ==========================================
# 3. TRAINING XGBOOST (5-Fold CV)
# ==========================================
print("\n--- Starting XGBoost Training ---")

# Best Manual Params (Stable)
xgb_params = dict(
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    objective='binary:logistic',
    eval_metric='auc',
)

# Shared CV splitter; the CatBoost cell reuses `skf` and `folds`.
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# Accumulators: fold-averaged test predictions and out-of-fold predictions.
xgb_test_preds = np.zeros(X_test_xgb.shape[0])
xgb_oof_preds = np.zeros(X_xgb.shape[0])

for fold, (train_idx, val_idx) in enumerate(skf.split(X_xgb, y)):
    X_train_fold, X_val_fold = X_xgb[train_idx], X_xgb[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # The held-out fold doubles as the early-stopping evaluation set.
    model = XGBClassifier(early_stopping_rounds=100, **xgb_params)
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=0,
    )

    # Positive-class probabilities for the held-out fold and for the test set.
    val_p = model.predict_proba(X_val_fold)[:, 1]
    xgb_oof_preds[val_idx] = val_p
    test_p = model.predict_proba(X_test_xgb)[:, 1]
    # Each fold contributes 1/folds of the final test prediction.
    xgb_test_preds += test_p / folds

    fold_auc = roc_auc_score(y_val_fold, val_p)
    print(f"XGB Fold {fold+1} AUC: {fold_auc:.5f}")

overall_auc = roc_auc_score(y, xgb_oof_preds)
print(f"Overall XGB CV AUC: {overall_auc:.5f}")


--- Starting XGBoost Training ---
XGB Fold 1 AUC: 0.72627
XGB Fold 2 AUC: 0.72428
XGB Fold 3 AUC: 0.72524
XGB Fold 4 AUC: 0.72659
XGB Fold 5 AUC: 0.72574
Overall XGB CV AUC: 0.72562


In [15]:
# ==========================================
# 4. TRAINING CATBOOST (5-Fold CV)
# ==========================================
print("\n--- Starting CatBoost Training ---")

# Accumulators: fold-averaged test predictions and out-of-fold predictions.
cat_test_preds = np.zeros(X_test_cat.shape[0])
cat_oof_preds = np.zeros(X_cat.shape[0])

# Heavy Duty Params (identical for every fold, so built once)
cat_params = dict(
    iterations=5000,
    learning_rate=0.02,
    depth=8,
    eval_metric='AUC',
    random_seed=42,
    bagging_temperature=0.2,
    od_type='Iter',
    od_wait=200,
    verbose=0,  # Silent training
)

# Reuses the splitter and fold count defined in the XGBoost cell.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_cat, y)):
    X_train_fold, X_val_fold = X_cat.iloc[train_idx], X_cat.iloc[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    model = CatBoostClassifier(**cat_params)

    # The held-out fold is the evaluation set used for overfitting detection
    # and for selecting the best iteration.
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        cat_features=categorical_cols,
        use_best_model=True,
    )

    # Positive-class probabilities for the held-out fold and for the test set.
    val_p = model.predict_proba(X_val_fold)[:, 1]
    cat_oof_preds[val_idx] = val_p
    test_p = model.predict_proba(X_test_cat)[:, 1]
    # Each fold contributes 1/folds of the final test prediction.
    cat_test_preds += test_p / folds

    fold_auc = roc_auc_score(y_val_fold, val_p)
    print(f"CatBoost Fold {fold+1} AUC: {fold_auc:.5f}")

overall_auc = roc_auc_score(y, cat_oof_preds)
print(f"Overall CatBoost CV AUC: {overall_auc:.5f}")


--- Starting CatBoost Training ---
CatBoost Fold 1 AUC: 0.72738
CatBoost Fold 2 AUC: 0.72544
CatBoost Fold 3 AUC: 0.72674
CatBoost Fold 4 AUC: 0.72770
CatBoost Fold 5 AUC: 0.72723
Overall CatBoost CV AUC: 0.72689


In [16]:
# ==========================================
# 5. ENSEMBLING & SUBMISSION
# ==========================================
print("\n--- Blending Models ---")

# Weighted Average (Giving slightly more weight to CatBoost as it performed better)
cat_weight, xgb_weight = 0.6, 0.4
final_predictions = cat_weight * cat_test_preds + xgb_weight * xgb_test_preds

# Two-column submission frame: test ids alongside the blended probability.
submission = test_df[['id']].assign(diagnosed_diabetes=final_predictions)

output_path = 'submission_ensemble_final.csv'
submission.to_csv(output_path, index=False)
print("submission_ensemble_final.csv saved successfully!")


--- Blending Models ---
submission_ensemble_final.csv saved successfully!
