In [None]:
import warnings
warnings.simplefilter('ignore')

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Seed NumPy's legacy global RNG so anything drawing from `np.random` is
# repeatable across runs.
np.random.seed(42)

In [None]:
# Name of the label column; it is read from both `train` and `orig` below.
TARGET = 'diagnosed_diabetes'

# Input locations: the competition train/test splits and the separate
# "original" dataset used later as a source of lookup features.
train_csv = '/kaggle/input/playground-series-s5e12/train.csv'
test_csv = '/kaggle/input/playground-series-s5e12/test.csv'
orig_csv = '/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
orig = pd.read_csv(orig_csv)

# Quick sanity check of the (rows, columns) of each frame.
shape_report = f'Train: {train.shape}, Test: {test.shape}, Original: {orig.shape}'
print(shape_report)

In [None]:
# Detect the row at which the training data starts to look "test-like", using
# a rolling mean of physical_activity_minutes_per_week taken in row order.
target_col = 'physical_activity_minutes_per_week'
window_size = 1000
# The first `window_size - 1` entries are NaN (incomplete window); NaN compares
# False against the threshold, so they can never be picked as the cutoff.
rolling_mean = train[target_col].rolling(window=window_size).mean()

threshold = 88
cutoff_mask = rolling_mean > threshold

# Fail fast: without this guard `.index.min()` on an empty selection yields
# NaN, and every later `>= cutoff_id` / `< cutoff_id` filter would silently
# select zero rows instead of raising.
if not cutoff_mask.any():
    raise ValueError(
        f'Rolling mean of {target_col!r} (window={window_size}) never exceeds '
        f'{threshold}; cannot detect a cutoff.'
    )

# `cutoff_id` is a label of `train`'s row index (the first row whose trailing
# window mean exceeds the threshold), stored as a plain int.
cutoff_id = int(rolling_mean[cutoff_mask].index.min())

print(f'Cutoff ID detected: {cutoff_id}')
print(f'Train before cutoff: {cutoff_id}')
print(f'Train after cutoff (test-like): {len(train) - cutoff_id}')

In [None]:
# Adversarial validation: train a classifier to tell post-cutoff training rows
# apart from test rows. A cross-validated AUC near 0.5 means it cannot
# distinguish them, i.e. the post-cutoff rows resemble the test set.
print('\nAdversarial Validation...')
train_subset = train[train.index >= cutoff_id].copy()
test_subset = test.copy()

# Label the origin of each row: 0 = post-cutoff train, 1 = test.
train_subset['is_test'] = 0
test_subset['is_test'] = 1

adv_data = pd.concat([train_subset, test_subset], ignore_index=True)
adv_data = adv_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Only columns present in `test` are usable; the id, the origin label and the
# real target are excluded.
drop_cols = ['id', 'is_test', TARGET]
features_adv = [c for c in test.columns if c not in drop_cols]

# Explicit copy so the label-encoding below writes to an independent frame
# rather than to a selection taken from `adv_data`.
X_adv = adv_data[features_adv].copy()
y_adv = adv_data['is_test']

# Integer-encode string columns so LightGBM can consume them.
for col in X_adv.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_adv[col] = le.fit_transform(X_adv[col].astype(str))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_adv, y_adv)):
    X_tr, X_vl = X_adv.iloc[train_idx], X_adv.iloc[val_idx]
    y_tr, y_vl = y_adv.iloc[train_idx], y_adv.iloc[val_idx]

    model = LGBMClassifier(random_state=42, verbose=-1)
    model.fit(X_tr, y_tr)

    # Out-of-fold probability of "is test" for the held-out rows.
    preds = model.predict_proba(X_vl)[:, 1]
    score = roc_auc_score(y_vl, preds)
    auc_scores.append(score)

mean_auc = np.mean(auc_scores)
print(f'Adversarial Validation AUC: {mean_auc:.4f}')
print('(Close to 0.5 = post-cutoff train matches test distribution)')

In [None]:
# Feature inventory.
#   BASE - every training column except the row id and the label
#   CATS - all object-dtype columns of `train`
#   NUMS - the BASE columns that are not in CATS
_NON_FEATURE_COLS = ('id', TARGET)
BASE = [name for name in train.columns if name not in _NON_FEATURE_COLS]
CATS = list(train.select_dtypes('object').columns)
NUMS = [name for name in BASE if name not in CATS]

n_base, n_cats, n_nums = len(BASE), len(CATS), len(NUMS)
print(f'{n_base} Base Features')
print(f'{n_cats} Categorical Features')
print(f'{n_nums} Numerical Features')

In [None]:
# Build lookup features from the original dataset: for every base column, the
# mean target and the row count that `orig` has for each distinct value of
# that column. Values absent from `orig` get NaN.
#
# The lookups are attached with Series.map instead of DataFrame.merge, so
# re-running this cell overwrites the same columns rather than creating
# `_x`/`_y` suffixed duplicates that would break FEATURES.
print('\nCreating features from original dataset...')

ORIG = []
for col in BASE:
    # One groupby per column serves both statistics.
    target_by_value = orig.groupby(col)[TARGET]

    # Mean target by column value in original dataset
    new_mean_col_name = f'orig_mean_{col}'
    mean_map = target_by_value.mean()
    train[new_mean_col_name] = train[col].map(mean_map)
    test[new_mean_col_name] = test[col].map(mean_map)
    ORIG.append(new_mean_col_name)

    # Count of samples in original dataset
    new_count_col_name = f'orig_count_{col}'
    count_map = target_by_value.size()
    train[new_count_col_name] = train[col].map(count_map)
    test[new_count_col_name] = test[col].map(count_map)
    ORIG.append(new_count_col_name)

print(f'{len(ORIG)} Original Dataset Features Created!')
FEATURES = BASE + ORIG
print(f'{len(FEATURES)} Total Features')

In [None]:
# Split on the detected cutoff: rows before it are used for fitting, rows from
# it onwards (the "test-like" part) for validation.
#
# `cutoff_id` was derived from `train`'s row index (and the adversarial
# validation cell filters on `train.index`), so the split compares against the
# index too, rather than the `id` column, which is only equivalent when `id`
# happens to equal the row position.
is_before_cutoff = train.index < cutoff_id
train_df = train[is_before_cutoff].copy()
val_df = train[~is_before_cutoff].copy()

# Independent copies: later cells change column dtypes on these frames, which
# should not be done on a selection of another DataFrame.
X_train = train_df[FEATURES].copy()
y_train = train_df[TARGET]
X_val = val_df[FEATURES].copy()
y_val = val_df[TARGET]

print(f'Train: {X_train.shape}')
print(f'Validation (test-like): {X_val.shape}')

In [None]:
# Set categorical columns for XGBoost.
#
# A plain `.astype('category')` infers the category list from each frame
# separately, so the same string can end up with a different integer code in
# train, validation and test whenever one frame lacks a level. One shared
# CategoricalDtype per column keeps the codes aligned across all three frames.
for col in CATS:
    all_values = pd.concat([X_train[col], X_val[col], test[col]], ignore_index=True)
    shared_dtype = pd.CategoricalDtype(all_values.astype('category').cat.categories)

    X_train[col] = X_train[col].astype(shared_dtype)
    X_val[col] = X_val[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

In [None]:
# Fit XGBoost on the pre-cutoff rows, using the post-cutoff rows as the
# evaluation set for AUC-based early stopping.
print('Training XGBoost...')

xgb_params = dict(
    n_estimators=10000,         # upper bound; early stopping decides the real count
    learning_rate=0.01,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    enable_categorical=True,    # consume pandas `category` columns directly
    eval_metric='auc',
    early_stopping_rounds=200,
    random_state=42,
    n_jobs=-1,
)
xgb_model = XGBClassifier(**xgb_params)

# verbose=200 logs the evaluation metric every 200 boosting rounds.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=200)

# Positive-class probabilities on the validation rows.
val_proba_xgb = xgb_model.predict_proba(X_val)
y_pred_val = val_proba_xgb[:, 1]
val_auc = roc_auc_score(y_val, y_pred_val)

print(f'\nValidation AUC: {val_auc:.5f}')

In [None]:
# Also train LightGBM for ensemble
print('\nTraining LightGBM...')

# Work on copies so the frames used by XGBoost keep their `category` columns.
X_train_lgb = X_train.copy()
X_val_lgb = X_val.copy()
test_lgb = test[FEATURES].copy()

# Convert categorical columns to integer codes for LightGBM.
# The encoder is fit on the labels of all three frames: `transform` raises on
# a label it has not seen, so fitting on train alone would crash as soon as
# validation or test contains a level that train lacks. When train already
# holds every level, the resulting codes are the same as a train-only fit.
for col in CATS:
    le = LabelEncoder()
    all_labels = pd.concat(
        [X_train_lgb[col], X_val_lgb[col], test_lgb[col]], ignore_index=True
    ).astype(str)
    le.fit(all_labels)
    X_train_lgb[col] = le.transform(X_train_lgb[col].astype(str))
    X_val_lgb[col] = le.transform(X_val_lgb[col].astype(str))
    test_lgb[col] = le.transform(test_lgb[col].astype(str))

# NOTE(review): LightGBM only applies `subsample` (bagging_fraction) when
# `subsample_freq` (bagging_freq) is > 0; with the default of 0 the 0.8 below
# has no effect. Confirm whether row subsampling was intended.
lgb_model = LGBMClassifier(
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=6,
    num_leaves=40,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# `lgb` is the module alias imported at the top of the notebook; the callback
# stops training after 200 rounds without improvement on the eval set.
# NOTE(review): no eval_metric is passed, so early stopping presumably tracks
# the objective's default metric rather than AUC - confirm this is intended.
lgb_model.fit(
    X_train_lgb, y_train,
    eval_set=[(X_val_lgb, y_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
)

# Positive-class probabilities on the validation rows.
y_pred_val_lgb = lgb_model.predict_proba(X_val_lgb)[:, 1]
val_auc_lgb = roc_auc_score(y_val, y_pred_val_lgb)

print(f'LightGBM Validation AUC: {val_auc_lgb:.5f}')

In [None]:
# Grid-search the XGBoost share of a two-model blend on the validation rows.
print('\nOptimizing ensemble weights...')

# Baseline: the better single model (weight 1.0 = pure XGBoost, 0.0 = pure
# LightGBM). A blend only replaces it when its AUC is strictly higher.
best_score = max(val_auc, val_auc_lgb)
if val_auc > val_auc_lgb:
    best_weight = 1.0
else:
    best_weight = 0.0

# Candidate XGBoost shares 0.0, 0.1, ..., 1.0; LightGBM gets the remainder.
candidate_weights = np.arange(0.0, 1.01, 0.1)
for xgb_share in candidate_weights:
    blended = xgb_share * y_pred_val + (1 - xgb_share) * y_pred_val_lgb
    blend_auc = roc_auc_score(y_val, blended)
    print(f'XGB={xgb_share:.1f}, LGB={1-xgb_share:.1f}: {blend_auc:.5f}')
    if blend_auc > best_score:
        best_score, best_weight = blend_auc, xgb_share

print(f'\nBest Ensemble: XGB={best_weight:.1f}, LGB={1-best_weight:.1f}')
print(f'Best Validation AUC: {best_score:.5f}')

In [None]:
# Generate predictions: score the test set with both models, blend them with
# the weight chosen on the validation rows, and write the submission file.
print('\nGenerating predictions...')

test_preds_xgb = xgb_model.predict_proba(test[FEATURES])[:, 1]
test_preds_lgb = lgb_model.predict_proba(test_lgb)[:, 1]
test_preds = best_weight * test_preds_xgb + (1 - best_weight) * test_preds_lgb

submission = pd.DataFrame({
    'id': test['id'],
    TARGET: test_preds
})

submission.to_csv('submission.csv', index=False)

# Run summary.
print('='*70)
print('SUBMISSION CREATED')
print('='*70)
print('Strategy:')
print(f'  - Cutoff detection: ID {cutoff_id}')
print(f'  - Adversarial validation AUC: {mean_auc:.4f} (close to 0.5 = good)')
print(f'  - Original dataset features: {len(ORIG)} features')
print(f'  - Total features: {len(FEATURES)}')
print(f'  - XGBoost + LightGBM ensemble')
print(f'  - Optimal weights: XGB={best_weight:.1f}, LGB={1-best_weight:.1f}')
print('='*70)
print(f'Validation AUC: {best_score:.5f}')
# The plus-minus sign is written as an escape: the literal had been corrupted
# into mojibake by an encoding round-trip.
print(f'Expected LB: {best_score:.5f} \u00b1 0.001')
print('='*70)
print('\nSubmission statistics:')
print(submission[TARGET].describe())
print('='*70)