In [None]:
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

np.random.seed(42)

In [None]:
# Label column; later cells read it from both `train` and `orig`.
TARGET = 'diagnosed_diabetes'

# Competition train/test splits plus the external dataset used further down
# for per-value target means.
train, test, orig = map(pd.read_csv, (
    '/kaggle/input/playground-series-s5e12/train.csv',
    '/kaggle/input/playground-series-s5e12/test.csv',
    '/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv',
))

print(f'Train: {train.shape}, Test: {test.shape}')

In [None]:
# Detect cutoff
# Find the first row POSITION at which the trailing 1000-row mean of weekly
# activity minutes exceeds 88. Later cells slice with `.iloc[cutoff_id:]` and
# `y_full[cutoff_id:]` and call the rows from there onward "test-like", so the
# cutoff has to be a plain integer position rather than an index label.
# NOTE(review): the window (1000) and threshold (88) are the author's choices;
# confirm they still match the data if the input files change.
rolling_mean = train['physical_activity_minutes_per_week'].rolling(window=1000).mean()

# Incomplete windows yield NaN, and NaN > 88 is False, so they never qualify.
above_threshold = np.flatnonzero(rolling_mean.to_numpy() > 88)
if above_threshold.size == 0:
    # Without this guard cutoff_id would be NaN and the later slicing would
    # fail with an unrelated-looking error.
    raise ValueError(
        'No 1000-row window of physical_activity_minutes_per_week has a mean '
        'above 88; cannot split train into early and test-like rows.'
    )

cutoff_id = int(above_threshold[0])
print(f'Cutoff ID: {cutoff_id}')

In [None]:
# Feature engineering
def create_features(df):
    """Return a copy of ``df`` with engineered columns appended.

    The input frame is not modified. Twenty derived columns are added, in a
    fixed order, from the lipid, BMI, blood-pressure, lifestyle and age
    columns that ``df`` must already contain.
    """
    out = df.copy()

    # Source columns, looked up once.
    age = out['age']
    bmi = out['bmi']
    waist_hip = out['waist_to_hip_ratio']
    systolic = out['systolic_bp']
    diastolic = out['diastolic_bp']
    chol_total = out['cholesterol_total']
    hdl = out['hdl_cholesterol']
    ldl = out['ldl_cholesterol']
    trig = out['triglycerides']
    activity = out['physical_activity_minutes_per_week']
    screen = out['screen_time_hours_per_day']
    diet = out['diet_score']
    sleep = out['sleep_hours_per_day']

    # Shared intermediates. HDL is shifted by 1 in every ratio denominator.
    hdl_shifted = hdl + 1
    trig_over_hdl = trig / hdl_shifted
    pulse = systolic - diastolic
    obese_flag = (bmi > 30).astype(int)
    low_hdl_flag = (hdl < 40).astype(int)

    # Insertion order below is the column order of the result.
    engineered = {
        # Lipid ratios
        'ldl_hdl_ratio': ldl / hdl_shifted,
        'total_hdl_ratio': chol_total / hdl_shifted,
        'trig_hdl_ratio': trig_over_hdl,
        'non_hdl_chol': chol_total - hdl,
        'atherogenic_index': np.log10(trig_over_hdl),
        # BMI
        'bmi_squared': bmi ** 2,
        'bmi_age': bmi * age,
        'bmi_waist': bmi * waist_hip,
        # BP
        'pulse_pressure': pulse,
        'mean_arterial_pressure': diastolic + (pulse / 3),
        'bp_product': systolic * diastolic,
        # Metabolic: three double-weighted flags plus one single-weighted flag.
        'metabolic_syndrome': (
            obese_flag * 2
            + (trig >= 150).astype(int) * 2
            + low_hdl_flag * 2
            + (systolic >= 130).astype(int)
        ),
        'insulin_resistance': trig * bmi * waist_hip / 1000,
        # Lifestyle
        'activity_per_bmi': activity / (bmi + 1),
        'sedentary_index': screen / (activity / 60 + 1),
        'health_behavior': (diet * activity * sleep) / (screen + 1),
        # Age
        'age_squared': age ** 2,
        'age_cholesterol': age * chol_total,
        'age_bp': age * systolic,
        # Diabetes risk: note the strict `> 150` triglyceride test here versus
        # `>= 150` in metabolic_syndrome above.
        'diabetes_risk_score': (
            (age > 45).astype(int) * 2
            + obese_flag * 2
            + (activity < 150).astype(int)
            + (trig > 150).astype(int)
            + low_hdl_flag
        ),
    }

    for column, values in engineered.items():
        out[column] = values

    return out

# Run the same feature engineering on both splits so their columns stay aligned.
train, test = (create_features(frame) for frame in (train, test))
print(f'Features created: {train.shape[1]} columns')

In [None]:
# Original dataset features
# For every base column, add `orig_mean_<col>`: the mean TARGET that the
# original dataset shows for that exact column value. Values that never occur
# in `orig` get NaN.
BASE_COLS = ['age', 'bmi', 'systolic_bp', 'diastolic_bp', 'cholesterol_total',
             'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
             'physical_activity_minutes_per_week', 'waist_to_hip_ratio']

for col in BASE_COLS:
    # Columns missing from `orig` are skipped for both splits, so train and
    # test still end up with the same set of columns.
    if col in orig.columns:
        mean_map = orig.groupby(col)[TARGET].mean()
        feature_name = f'orig_mean_{col}'
        # A value -> mean lookup gives the same result as a left merge on
        # `col`, but re-running this cell overwrites the column instead of
        # producing `_x`/`_y` suffixed duplicates that would leak into FEATURES.
        train = train.assign(**{feature_name: train[col].map(mean_map)})
        test = test.assign(**{feature_name: test[col].map(mean_map)})

print(f'Total columns: {train.shape[1]}')

In [None]:
# Prepare data
# Every column except the row id and the label is a model input.
FEATURES = [col for col in train.columns if col not in ['id', TARGET]]

X_full = train[FEATURES].copy()
y_full = train[TARGET].values
X_test = test[FEATURES].copy()

# Only object-typed FEATURE columns are encoded; taking them from the feature
# frame keeps a non-feature object column from triggering a KeyError below.
CATS = X_full.select_dtypes('object').columns.to_list()

for col in CATS:
    le = LabelEncoder()
    # Fit on the labels of both splits: an encoder fit on train alone raises
    # ValueError in transform() as soon as test contains a category train
    # never saw. When test has no new categories the codes are unchanged.
    le.fit(pd.concat([X_full[col], X_test[col]], ignore_index=True).astype(str))
    X_full[col] = le.transform(X_full[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

print(f'Features: {len(FEATURES)}')

In [None]:
# Stacking with test-like validation
print('Building stacking ensemble...')

# Rows from the detected cutoff onward form the validation pool; the stacking
# cell runs its cross-validation over these rows only.
X_val = X_full.iloc[cutoff_id:]
y_val = y_full[cutoff_id:]

# Base models with different configurations
# Three XGBoost and three LightGBM classifiers. Within each family the models
# go from many shallow, slowly-learned trees to fewer, deeper, faster-learned
# ones, with stronger row/column subsampling and regularisation as depth grows.
#
# LightGBM only applies `subsample` when `subsample_freq` is positive; with the
# default of 0 the row subsampling requested below is silently disabled. It is
# set to 1 so rows are re-sampled at every iteration.
base_models = [
    ('xgb1', XGBClassifier(
        n_estimators=1000, learning_rate=0.01, max_depth=5,
        subsample=0.8, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=1.0,
        random_state=42, eval_metric='auc', n_jobs=-1
    )),
    ('xgb2', XGBClassifier(
        n_estimators=800, learning_rate=0.015, max_depth=6,
        subsample=0.75, colsample_bytree=0.75,
        reg_alpha=0.15, reg_lambda=1.2,
        random_state=123, eval_metric='auc', n_jobs=-1
    )),
    ('xgb3', XGBClassifier(
        n_estimators=600, learning_rate=0.02, max_depth=7,
        subsample=0.7, colsample_bytree=0.7,
        reg_alpha=0.2, reg_lambda=1.5,
        random_state=456, eval_metric='auc', n_jobs=-1
    )),
    ('lgb1', LGBMClassifier(
        n_estimators=1000, learning_rate=0.01, max_depth=5, num_leaves=30,
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=1.0,
        random_state=42, verbose=-1
    )),
    ('lgb2', LGBMClassifier(
        n_estimators=800, learning_rate=0.015, max_depth=6, num_leaves=40,
        subsample=0.75, subsample_freq=1, colsample_bytree=0.75,
        reg_alpha=0.15, reg_lambda=1.2,
        random_state=123, verbose=-1
    )),
    ('lgb3', LGBMClassifier(
        n_estimators=600, learning_rate=0.02, max_depth=7, num_leaves=50,
        subsample=0.7, subsample_freq=1, colsample_bytree=0.7,
        reg_alpha=0.2, reg_lambda=1.5,
        random_state=456, verbose=-1
    ))
]

print(f'{len(base_models)} base models')

In [None]:
# 5-fold stacking on test-like validation
print('\nTraining base models with 5-fold CV on test-like data...')

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Rows before the cutoff are never held out; they are added to the training
# part of every fold.
X_early, y_early = X_full.iloc[:cutoff_id], y_full[:cutoff_id]

# OOF predictions for meta-features: one column per base model.
n_models = len(base_models)
oof_train = np.zeros((len(X_val), n_models))
test_preds = np.zeros((len(X_test), n_models))

for model_idx, (name, model) in enumerate(base_models):
    print(f'\nTraining {name}...')

    per_fold_test = []
    for fit_rows, holdout_rows in cv.split(X_val, y_val):
        # Training set = all early rows + this fold's share of the validation pool.
        fold_X = pd.concat([X_early, X_val.iloc[fit_rows]], axis=0)
        fold_y = np.concatenate([y_early, y_val[fit_rows]])

        # Fresh, unfitted estimator with the template's parameters.
        fold_model = type(model)(**model.get_params())
        fold_model.fit(fold_X, fold_y)

        holdout_X = X_val.iloc[holdout_rows]
        oof_train[holdout_rows, model_idx] = fold_model.predict_proba(holdout_X)[:, 1]
        per_fold_test.append(fold_model.predict_proba(X_test)[:, 1])

    # Test prediction for this model is the mean over its fold models.
    test_preds[:, model_idx] = np.stack(per_fold_test).mean(axis=0)
    score = roc_auc_score(y_val, oof_train[:, model_idx])
    print(f'{name} OOF AUC: {score:.5f}')

print('\nBase models training complete')

In [None]:
# Train meta-model
print('\nTraining meta-model (Logistic Regression)...')

# Score the stacker out-of-fold. Predicting on the very rows the meta-model was
# fit on gives an in-sample AUC, which is optimistic and not comparable with
# the simple-average baseline below (the average has no fitted parameters).
meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta_pred_val = np.zeros(len(y_val))
for fit_rows, holdout_rows in meta_cv.split(oof_train, y_val):
    fold_meta = LogisticRegression(max_iter=1000, random_state=42)
    fold_meta.fit(oof_train[fit_rows], y_val[fit_rows])
    meta_pred_val[holdout_rows] = fold_meta.predict_proba(oof_train[holdout_rows])[:, 1]

stacking_score = roc_auc_score(y_val, meta_pred_val)

# The meta-model used for the submission is still fit on all OOF rows.
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(oof_train, y_val)

print(f'Stacking validation AUC: {stacking_score:.5f}')

# Compare with simple average
avg_pred = oof_train.mean(axis=1)
avg_score = roc_auc_score(y_val, avg_pred)
print(f'Simple average AUC: {avg_score:.5f}')
# Signed format: a hard-coded '+' printed "+-0.00012" when stacking was worse.
print(f'Stacking improvement: {stacking_score - avg_score:+.5f}')

In [None]:
# Weighted refit for base models
print('\nWeighted refit (weight=15)...')

WEIGHT = 15.0

# Rows before the cutoff keep weight 1; rows from the cutoff onward get WEIGHT.
row_positions = np.arange(len(X_full))
sample_weights = np.where(row_positions >= cutoff_id, WEIGHT, 1.0)

# One column of test probabilities per base model, each refit on all of train.
test_preds_weighted = np.zeros((len(X_test), len(base_models)))

for model_idx, (name, model) in enumerate(base_models):
    # Fresh, unfitted estimator with the template's parameters.
    model_weighted = type(model)(**model.get_params())
    model_weighted.fit(X_full, y_full, sample_weight=sample_weights)
    positive_proba = model_weighted.predict_proba(X_test)[:, 1]
    test_preds_weighted[:, model_idx] = positive_proba

print('Weighted refit complete')

In [None]:
# Final predictions with meta-model
# The meta-model combines the weighted-refit base predictions into one
# probability per test row.
final_preds = meta_model.predict_proba(test_preds_weighted)[:, 1]

submission = pd.DataFrame({'id': test['id'], TARGET: final_preds})
submission.to_csv('submission.csv', index=False)

# Run summary, built as one list of lines and printed in a single call.
rule = '='*70
report_lines = [
    rule,
    'SUBMISSION CREATED - STACKING ENSEMBLE',
    rule,
    'Strategy:',
    f'  - {len(base_models)} diverse base models',
    '  - 5-fold CV on test-like data with concatenation',
    '  - Logistic Regression meta-model',
    '  - Weighted refit (weight=15) on base models',
    f'  - {len(FEATURES)} features',
    rule,
    f'Stacking validation: {stacking_score:.5f}',
    # The three lines below are fixed notes from the author, not computed values.
    'Previous LB: 0.70322',
    'Expected improvement: +0.003 to +0.005',
    'Expected LB: 0.706-0.709',
    rule,
    '\nSubmission statistics:',
]
print('\n'.join(report_lines))
print(submission[TARGET].describe())
print(rule)