# EXP-014: 3-Model L1 Stacking + L2 XGB/NN Blend

**LB: 0.8515** (рекорд)

## Пайплайн:
```
Step 0.5: Per-target Optuna (XGB 200k/3f/30t + CB 200k/3f/30t + LGB 100k/3f/20t)
Step 1:   L1 OOF 750k 5-fold (XGB GPU + CB GPU + LGB CPU) → 6 npy
Step 2a:  L2 XGB Optuna per-target (15 trials) → OOF 0.8457
Step 2b:  L2 NN v2 (LayerNorm, StandardScaler, 3 blocks) → OOF 0.8414
Final:    Blend 60/40 XGB+NN → OOF 0.8479 → LB 0.8515
```

**Каждый шаг сначала проверяет свои артефакты: если они уже есть, загружает их и пропускает вычисления.**

In [None]:
# ============================================================
# CELL 1: Setup + Environment
# ============================================================
!pip install -q xgboost catboost lightgbm optuna

import numpy as np
import pandas as pd
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import optuna
import json
import os
import gc
import time
import warnings
import sys
from datetime import datetime
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

# Suppress all Python warnings and limit Optuna's own logging to WARNING and above.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)
def log_msg(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``.

    Args:
        msg: Text (or any object usable in an f-string) to print.

    Returns:
        None.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")

# Banner followed by the versions of the modelling libraries imported above.
print("=" * 60)
print("EXP-014 FULL PIPELINE")
print("=" * 60)
print(f"XGBoost:  {xgb.__version__}")
print(f"CatBoost: {cb.__version__}")
print(f"LightGBM: {lgb.__version__}")
print(f"Optuna:   {optuna.__version__}")

# GPU check
!nvidia-smi --query-gpu=name,memory.total --format=csv,noheader

# Quick GPU smoke test: fit a tiny XGBoost model (device='cuda') on random
# data for 10 rounds. The "OK" line is only printed if training did not raise.
dm = xgb.DMatrix(np.random.randn(1000, 10), label=np.random.randint(0, 2, 1000))
xgb.train({'device': 'cuda', 'objective': 'binary:logistic', 'verbosity': 0}, dm, num_boost_round=10)
print("XGBoost GPU: OK")

# Same check for CatBoost with task_type='GPU' and the Poisson bootstrap
# settings that the tuning cells use later.
pool = cb.Pool(np.random.randn(1000, 10), label=np.random.randint(0, 2, 1000))
cb.CatBoostClassifier(iterations=10, task_type='GPU', verbose=0, bootstrap_type='Poisson', subsample=0.8).fit(pool)
print("CatBoost GPU: OK")
# Drop the throwaway test objects and run the garbage collector.
del dm, pool; gc.collect()
print("\nSETUP COMPLETE")

In [None]:
# ============================================================
# CELL 2: Config + Drive + Artifact Paths
# ============================================================
from google.colab import drive
# Mount Google Drive only when the MyDrive directory is not visible yet.
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')
else:
    print('Drive already mounted')

# --- Paths ---
# DATA is the project root on Drive; each ART_* directory holds the artifacts
# of one pipeline step (Step 0.5 per model family, L1 OOF, L2 stacking).
DATA = '/content/drive/MyDrive/data_fusion'
ART_XGB  = f'{DATA}/artifacts/step05_xgb'
ART_CB   = f'{DATA}/artifacts/step05_cb'
ART_LGB  = f'{DATA}/artifacts/step05_lgb'
ART_L1   = f'{DATA}/artifacts/l1_oof'
ART_L2   = f'{DATA}/artifacts/l2_stacking'

# Create every artifact directory up front (no error if it already exists).
for d in [ART_XGB, ART_CB, ART_LGB, ART_L1, ART_L2]:
    os.makedirs(d, exist_ok=True)

# --- Config ---
# RANDOM_SEED seeds numpy sampling, the CV splitters and the Optuna samplers
# in the cells below; N_FOLDS_L1 is the default fold count of the L1 OOF functions.
RANDOM_SEED = 42
N_FOLDS_L1 = 5
N_FOLDS_L2 = 5

# Step 0.5 config, one dict per model family:
#   sample    - number of training rows drawn for tuning
#   folds     - number of stratified CV folds per trial
#   trials    - number of Optuna trials per target
#   fs_thresh - threshold passed to the select_features_* functions
OPTUNA_XGB = {'sample': 200_000, 'folds': 3, 'trials': 30, 'fs_thresh': 0.85}
OPTUNA_CB  = {'sample': 200_000, 'folds': 3, 'trials': 30, 'fs_thresh': 0.85}
OPTUNA_LGB = {'sample': 100_000, 'folds': 3, 'trials': 20, 'fs_thresh': 0.85}

# --- Check artifacts ---
def check_file(path):
    """Report whether ``path`` exists and how large it is.

    Args:
        path: Filesystem path to probe.

    Returns:
        Tuple ``(exists, size_mb)``. ``exists`` is a bool; ``size_mb`` is the
        size in megabytes (bytes / 1e6), or ``0`` when the path cannot be
        stat'ed.
    """
    # Stat once (EAFP) instead of exists() followed by getsize(): the file can
    # disappear between the two calls, which previously raised from getsize().
    try:
        size_bytes = os.path.getsize(path)
    except (OSError, ValueError):
        return False, 0
    return True, size_bytes / 1e6

# Print a status table of every artifact file the pipeline reads or writes.
print("=" * 60)
print("ARTIFACT STATUS")
print("=" * 60)

# Human-readable label -> expected file path, grouped by pipeline step.
artifacts = {
    'Step 0.5 XGB params':  f'{ART_XGB}/xgb_best_params.json',
    'Step 0.5 XGB feats':   f'{ART_XGB}/xgb_best_features.json',
    'Step 0.5 CB params':   f'{ART_CB}/cb_best_params.json',
    'Step 0.5 CB feats':    f'{ART_CB}/cb_best_features.json',
    'Step 0.5 LGB params':  f'{ART_LGB}/lgb_best_params.json',
    'Step 0.5 LGB feats':   f'{ART_LGB}/lgb_best_features.json',
    'L1 OOF XGB':           f'{ART_L1}/oof_xgb.npy',
    'L1 OOF CB':            f'{ART_L1}/oof_cb.npy',
    'L1 OOF LGB':           f'{ART_L1}/oof_lgb.npy',
    'L1 Test XGB':          f'{ART_L1}/test_xgb.npy',
    'L1 Test CB':           f'{ART_L1}/test_cb.npy',
    'L1 Test LGB':          f'{ART_L1}/test_lgb.npy',
    'L2 XGB params':        f'{ART_L2}/l2_xgb_best_params.json',
    'L2 OOF XGB':           f'{ART_L2}/oof_l2_xgb.npy',
    'L2 Test XGB':          f'{ART_L2}/test_l2_xgb.npy',
    'L2 OOF NN':            f'{ART_L2}/oof_l2_nn_v2.npy',
    'L2 Test NN':           f'{ART_L2}/test_l2_nn_v2.npy',
}

# all_ok stays True only if every listed artifact exists.
all_ok = True
for name, path in artifacts.items():
    exists, size = check_file(path)
    status = f'OK ({size:.1f} MB)' if exists else 'MISSING'
    if not exists:
        all_ok = False
    print(f"  {name:<22} {status}")

# Data files: the raw parquet inputs. They are reported only and do not
# affect all_ok.
print(f"\nDATA FILES:")
for f in ['train_target.parquet', 'train_main_features.parquet',
          'train_extra_features.parquet', 'test_main_features.parquet',
          'test_extra_features.parquet']:
    exists, size = check_file(f'{DATA}/{f}')
    print(f"  {f:<35} {'OK' if exists else 'MISSING'} ({size:.0f} MB)")

# Summary hint: with all artifacts present the later steps can load them
# instead of recomputing.
if all_ok:
    print(f"\n>>> ВСЕ АРТЕФАКТЫ НАЙДЕНЫ — можно сразу прыгать на L2 или Blend <<<")
else:
    print(f"\n>>> Есть MISSING артефакты — нужно запустить соответствующие шаги <<<")

---
## Step 0.5: Per-target Optuna (XGB + CB + LGB)

**Пропускается, если артефакты уже есть.**

- XGB: 200k sample, 3-fold, 30 trials (~200 min)
- CB: 200k sample, 3-fold, 30 trials (~680 min)
- LGB: 100k sample, 3-fold, 20 trials (~216 min)

In [None]:
# ============================================================
# CELL 3: Load data for Step 0.5 / L1 (if needed)
# ============================================================

# Decide whether the full feature tables have to be loaded at all.
# Step 0.5 is needed if any of the three tuned-params files is missing;
# L1 is needed if the XGB OOF array is missing (only that one file is checked).
need_step05 = not all([
    os.path.exists(f'{ART_XGB}/xgb_best_params.json'),
    os.path.exists(f'{ART_CB}/cb_best_params.json'),
    os.path.exists(f'{ART_LGB}/lgb_best_params.json'),
])
need_l1 = not os.path.exists(f'{ART_L1}/oof_xgb.npy')

if need_step05 or need_l1:
    print("Загрузка данных (нужны для Step 0.5 или L1)...")
    t0 = time.time()

    # Targets: every column whose name starts with 'target_' is a target.
    train_target = pd.read_parquet(f'{DATA}/train_target.parquet')
    target_cols = [c for c in train_target.columns if c.startswith('target_')]
    train_ids = train_target['customer_id'].values
    print(f"  Targets: {train_target.shape}")

    # Main features: numeric columns (except customer_id) are cast to float32 to save RAM.
    train_main = pd.read_parquet(f'{DATA}/train_main_features.parquet')
    num_cols = train_main.select_dtypes(include=[np.number]).columns.drop('customer_id', errors='ignore')
    train_main[num_cols] = train_main[num_cols].astype(np.float32)
    print(f"  Main: {train_main.shape}, {train_main.memory_usage(deep=True).sum()/1e9:.1f} GB")

    # Extra features: same float32 cast to save RAM.
    train_extra = pd.read_parquet(f'{DATA}/train_extra_features.parquet')
    num_cols_e = train_extra.select_dtypes(include=[np.number]).columns.drop('customer_id', errors='ignore')
    train_extra[num_cols_e] = train_extra[num_cols_e].astype(np.float32)
    print(f"  Extra: {train_extra.shape}, {train_extra.memory_usage(deep=True).sum()/1e9:.1f} GB")
    gc.collect()

    # Merge main and extra features on customer_id, then free the two inputs.
    X_full = train_main.merge(train_extra, on='customer_id', how='inner')
    del train_main, train_extra; gc.collect()

    # Align features and targets to the row order of train_ids.
    # NOTE(review): .loc raises KeyError if the inner merge dropped any id
    # present in train_ids — confirm both feature files cover all customers.
    X_full = X_full.set_index('customer_id').loc[train_ids].reset_index()
    y_full = train_target.set_index('customer_id').loc[train_ids].reset_index()
    del train_target; gc.collect()

    # Split into the feature matrix and the target matrix used by later cells.
    feature_names = [c for c in X_full.columns if c != 'customer_id']
    y_sample_full = y_full[target_cols]
    X_sample_full = X_full[feature_names]
    train_customer_ids = X_full['customer_id'].values

    print(f"\n  X_full: {X_sample_full.shape}")
    print(f"  y_full: {y_sample_full.shape}")
    print(f"  Loaded in {time.time()-t0:.0f}s")
else:
    # Only the targets are loaded here (the original note says they are needed for L2).
    # NOTE(review): y_train_arr is defined only in this branch, not in the
    # full-load branch above — verify that later cells do not depend on it there.
    train_target = pd.read_parquet(f'{DATA}/train_target.parquet')
    target_cols = [c for c in train_target.columns if c.startswith('target_')]
    train_ids = train_target['customer_id'].values
    y_train_arr = train_target[target_cols].values.astype(np.int8)
    del train_target; gc.collect()
    print(f"Step 0.5 + L1 артефакты найдены, данные не грузим.")
    print(f"Загружены только таргеты: {y_train_arr.shape}")

In [None]:
# ============================================================
# CELL 4: Step 0.5 — Feature Selection + Optuna functions
# ============================================================

def select_features_xgb(X_data, y_target, threshold=0.85):
    """Select features by cumulative XGBoost ``total_gain`` importance (GPU).

    A 200-round XGBoost model is fitted on all columns. Features are then taken
    in descending order of total gain until their cumulative share of the
    summed gain reaches ``threshold``.

    Args:
        X_data: Feature table with a ``columns`` attribute (e.g. a DataFrame).
        y_target: Binary labels for the rows of ``X_data``.
        threshold: Cumulative gain share at which selection stops.

    Returns:
        List of selected feature names; all columns of ``X_data`` when the
        model reports no importance at all.
    """
    train_matrix = xgb.DMatrix(X_data, label=y_target, nthread=-1)
    booster = xgb.train(
        {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'device': 'cuda',
            'max_depth': 6,
            'learning_rate': 0.1,
            'verbosity': 0,
        },
        train_matrix,
        num_boost_round=200,
    )
    gain_by_feature = booster.get_score(importance_type='total_gain')
    if not gain_by_feature:
        # No feature was ever used in a split: fall back to every column.
        return list(X_data.columns)

    ranked = sorted(gain_by_feature.items(), key=lambda kv: kv[1], reverse=True)
    gain_sum = sum(g for _, g in ranked)

    chosen = []
    running_share = 0.0
    for feat_name, feat_gain in ranked:
        chosen.append(feat_name)
        running_share += feat_gain / gain_sum
        if running_share >= threshold:
            break

    del booster, train_matrix
    gc.collect()
    return chosen


def select_features_cb(X_data, y_target, threshold=0.85, min_features=10):
    """Select features by cumulative CatBoost PredictionValuesChange (GPU).

    A 200-iteration CatBoost model is fitted on all columns. Features are then
    taken in descending order of importance until their cumulative share of
    the total importance reaches ``threshold``. If that yields fewer than
    ``min_features`` names, the top ``min_features`` features are returned
    instead.

    Args:
        X_data: Feature table with a ``columns`` attribute (e.g. a DataFrame).
        y_target: Binary labels for the rows of ``X_data``.
        threshold: Cumulative importance share at which selection stops.
        min_features: Lower bound on the number of returned features
            (previously hard-coded to 10; the default keeps that behaviour).

    Returns:
        List of selected feature names; all columns of ``X_data`` when the
        total importance is zero.
    """
    model = cb.CatBoostClassifier(
        iterations=200, depth=6, learning_rate=0.1,
        loss_function='Logloss', bootstrap_type='Poisson', subsample=0.8,
        task_type='GPU', random_seed=RANDOM_SEED, verbose=0)
    model.fit(cb.Pool(X_data, label=y_target))
    imp = model.get_feature_importance(type=cb.EFstrType.PredictionValuesChange)
    feat_names = X_data.columns.tolist()
    pairs = sorted(zip(feat_names, imp), key=lambda x: x[1], reverse=True)
    total_imp = sum(v for _, v in pairs)
    if total_imp == 0:
        # Nothing informative was learnt: keep every column.
        return feat_names
    selected, cum = [], 0.0
    for name, importance in pairs:
        cum += importance
        selected.append(name)
        if cum / total_imp >= threshold:
            break
    # Enforce the floor so a few dominant features cannot shrink the set too far.
    if len(selected) < min_features:
        selected = [n for n, _ in pairs[:min_features]]
    del model; gc.collect()
    return selected


def select_features_lgb(X_data, y_target, threshold=0.85):
    """Select features by cumulative LightGBM gain importance (CPU).

    A 200-round LightGBM model is fitted on all columns. Features are then
    taken in descending order of gain until their cumulative share of the
    summed gain reaches ``threshold``.

    Args:
        X_data: Feature table with a ``columns`` attribute (e.g. a DataFrame).
        y_target: Binary labels for the rows of ``X_data``.
        threshold: Cumulative gain share at which selection stops.

    Returns:
        List of selected feature names; all columns of ``X_data`` when the
        total gain is zero.
    """
    train_set = lgb.Dataset(X_data, label=y_target)
    lgb_config = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'n_jobs': -1,
        'num_leaves': 63,
        'learning_rate': 0.1,
    }
    booster = lgb.train(
        lgb_config,
        train_set,
        num_boost_round=200,
        valid_sets=[train_set],
        valid_names=['train'],
        callbacks=[lgb.log_evaluation(0)],
    )

    gains = booster.feature_importance(importance_type='gain')
    names = booster.feature_name()
    ranked = sorted(zip(names, gains), key=lambda kv: kv[1], reverse=True)
    gain_sum = sum(g for _, g in ranked)
    if gain_sum == 0:
        # The model found no useful split: keep every column.
        return list(X_data.columns)

    chosen = []
    running_share = 0.0
    for feat_name, feat_gain in ranked:
        chosen.append(feat_name)
        running_share += feat_gain / gain_sum
        if running_share >= threshold:
            break

    del booster, train_set
    gc.collect()
    return chosen

print("FS functions defined")

In [None]:
# ============================================================
# CELL 5: Step 0.5 — XGBoost Optuna (SKIP if artifacts exist)
# ============================================================
from optuna.integration import XGBoostPruningCallback

# Per-target XGBoost tuning. If the final params file already exists, the
# saved params/features are loaded and tuning is skipped; otherwise each
# target gets feature selection followed by an Optuna study on a row sample.
if os.path.exists(f'{ART_XGB}/xgb_best_params.json'):
    print(">>> XGB Optuna: SKIP (артефакты найдены) <<<")
    with open(f'{ART_XGB}/xgb_best_params.json') as f:
        xgb_params = json.load(f)
    with open(f'{ART_XGB}/xgb_best_features.json') as f:
        xgb_features = json.load(f)
    print(f"  Loaded: {len(xgb_params)} targets")
else:
    cfg = OPTUNA_XGB
    n_targets = len(target_cols)
    # Never ask for more rows than exist: np.random.choice(replace=False)
    # raises ValueError when the sample exceeds the population.
    n_sample = min(cfg['sample'], len(X_sample_full))
    print("=" * 60)
    print("Step 0.5: XGBoost Optuna")
    print(f"Sample: {n_sample:,}, Folds: {cfg['folds']}, Trials: {cfg['trials']}")
    print("=" * 60)

    # Sample data (seeded, so a resumed run draws the same rows).
    np.random.seed(RANDOM_SEED)
    idx = np.random.choice(len(X_sample_full), n_sample, replace=False)
    X_s = X_sample_full.iloc[idx].reset_index(drop=True)
    y_s = y_sample_full.iloc[idx].reset_index(drop=True)

    xgb_params, xgb_features = {}, {}
    # Resume from the checkpoint written every 5 targets, if one exists.
    cp_path = f'{ART_XGB}/checkpoint_xgb.json'
    if os.path.exists(cp_path):
        with open(cp_path) as f:
            cp = json.load(f)
        xgb_params = cp.get('best_params', {})
        xgb_features = cp.get('best_features', {})
        print(f"  Checkpoint: {len(xgb_params)} targets done")

    t_start = time.time()
    for i, tcol in enumerate(target_cols):
        if tcol in xgb_params:
            log_msg(f"[{i+1}/{n_targets}] {tcol}: SKIP"); continue
        y_t = y_s[tcol]
        n_pos = int(y_t.sum())
        # Too few positives for stratified CV: store empty params (the L1
        # functions treat an empty dict as "no model") and all features.
        if n_pos < cfg['folds'] * 2:
            xgb_params[tcol] = {}; xgb_features[tcol] = list(X_s.columns)
            log_msg(f"[{i+1}/{n_targets}] {tcol}: SKIP (pos={n_pos})"); continue

        selected = select_features_xgb(X_s, y_t, threshold=cfg['fs_thresh'])
        X_sel = X_s[selected]
        log_msg(f"[{i+1}/{n_targets}] {tcol}: pos={y_t.mean():.4f}, FS: {X_s.shape[1]}>{len(selected)}")

        dm_full = xgb.DMatrix(X_sel, label=y_t, nthread=-1)
        skf = StratifiedKFold(n_splits=cfg['folds'], shuffle=True, random_state=RANDOM_SEED)
        folds = list(skf.split(X_sel, y_t))

        def objective(trial):
            """Return the best mean CV AUC of one sampled XGBoost configuration."""
            params = {
                'objective': 'binary:logistic', 'eval_metric': 'auc',
                'device': 'cuda', 'n_jobs': 1, 'verbosity': 0,
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 250),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 25.0, log=True),
                'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
                'max_bin': trial.suggest_int('max_bin', 128, 512, step=64),
                'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
            }
            pruning_cb = XGBoostPruningCallback(trial, 'test-auc')
            try:
                cv_res = xgb.cv(params, dm_full, num_boost_round=1000, folds=folds,
                                early_stopping_rounds=50, callbacks=[pruning_cb], verbose_eval=False)
            except optuna.exceptions.TrialPruned:
                raise
            except Exception as exc:
                # Best effort: a configuration that fails to train scores as
                # chance level. Only Exception is caught, so Ctrl-C still
                # interrupts the run, and the cause is logged rather than hidden.
                log_msg(f"  trial {trial.number} failed: {exc!r}")
                return 0.5
            return cv_res['test-auc-mean'].max()

        t0 = time.time()
        study = optuna.create_study(direction='maximize',
                                    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
                                    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
        study.optimize(objective, n_trials=cfg['trials'], show_progress_bar=True)

        xgb_params[tcol] = study.best_params
        xgb_features[tcol] = selected
        log_msg(f"  AUC={study.best_value:.4f}, time={time.time()-t0:.0f}s")

        del dm_full, study; gc.collect()

        # Periodic checkpoint so an interrupted session can resume.
        if (i + 1) % 5 == 0:
            with open(cp_path, 'w') as f:
                json.dump({'best_params': xgb_params, 'best_features': xgb_features}, f, indent=2)
            log_msg(f"  Checkpoint: {len(xgb_params)}/{n_targets}")

    # Final artifacts; the presence of the params file is what makes this
    # cell skip on the next run.
    with open(f'{ART_XGB}/xgb_best_params.json', 'w') as f:
        json.dump(xgb_params, f, indent=2)
    with open(f'{ART_XGB}/xgb_best_features.json', 'w') as f:
        json.dump(xgb_features, f, indent=2)
    log_msg(f"XGB Optuna DONE: {(time.time()-t_start)/60:.0f} min")
    del X_s, y_s; gc.collect()

In [None]:
# ============================================================
# CELL 6: Step 0.5 — CatBoost Optuna (SKIP if artifacts exist)
# ============================================================

# Per-target CatBoost tuning. If the final params file already exists, the
# saved params/features are loaded and tuning is skipped; otherwise each
# target gets feature selection followed by an Optuna study on a row sample.
if os.path.exists(f'{ART_CB}/cb_best_params.json'):
    print(">>> CB Optuna: SKIP (артефакты найдены) <<<")
    with open(f'{ART_CB}/cb_best_params.json') as f:
        cb_params = json.load(f)
    with open(f'{ART_CB}/cb_best_features.json') as f:
        cb_features = json.load(f)
    print(f"  Loaded: {len(cb_params)} targets")
else:
    cfg = OPTUNA_CB
    n_targets = len(target_cols)
    # Never ask for more rows than exist: np.random.choice(replace=False)
    # raises ValueError when the sample exceeds the population.
    n_sample = min(cfg['sample'], len(X_sample_full))
    print("=" * 60)
    print("Step 0.5: CatBoost Optuna")
    print(f"Sample: {n_sample:,}, Folds: {cfg['folds']}, Trials: {cfg['trials']}")
    print("=" * 60)

    # Seeded row sample, so a resumed run draws the same rows.
    np.random.seed(RANDOM_SEED)
    idx = np.random.choice(len(X_sample_full), n_sample, replace=False)
    X_s = X_sample_full.iloc[idx].reset_index(drop=True)
    y_s = y_sample_full.iloc[idx].reset_index(drop=True)

    cb_params, cb_features = {}, {}
    # Resume from the checkpoint written every 5 targets, if one exists.
    cp_path = f'{ART_CB}/checkpoint_cb.json'
    if os.path.exists(cp_path):
        with open(cp_path) as f:
            cp = json.load(f)
        cb_params = cp.get('best_params', {})
        cb_features = cp.get('best_features', {})
        print(f"  Checkpoint: {len(cb_params)} targets done")

    t_start = time.time()
    for i, tcol in enumerate(target_cols):
        if tcol in cb_params:
            log_msg(f"[{i+1}/{n_targets}] {tcol}: SKIP"); continue
        y_t = y_s[tcol]
        n_pos = int(y_t.sum())
        # Too few positives for stratified CV: store empty params (the L1
        # functions treat an empty dict as "no model") and all features.
        if n_pos < cfg['folds'] * 2:
            cb_params[tcol] = {}; cb_features[tcol] = list(X_s.columns)
            log_msg(f"[{i+1}/{n_targets}] {tcol}: SKIP (pos={n_pos})"); continue

        selected = select_features_cb(X_s, y_t.values, threshold=cfg['fs_thresh'])
        X_sel = X_s[selected]
        log_msg(f"[{i+1}/{n_targets}] {tcol}: pos={y_t.mean():.4f}, FS: {X_s.shape[1]}>{len(selected)}")

        # Build the per-fold pools once; every trial reuses them.
        skf = StratifiedKFold(n_splits=cfg['folds'], shuffle=True, random_state=RANDOM_SEED)
        folds = list(skf.split(X_sel, y_t))
        pools_tr = [cb.Pool(X_sel.iloc[tr], label=y_t.iloc[tr]) for tr, _ in folds]
        pools_val = [cb.Pool(X_sel.iloc[vl], label=y_t.iloc[vl]) for _, vl in folds]
        val_labels = [y_t.iloc[vl].values for _, vl in folds]

        def objective(trial):
            """Return the mean validation AUC over the folds for one configuration."""
            params = {
                'iterations': 2000, 'loss_function': 'Logloss',
                'task_type': 'GPU', 'random_seed': RANDOM_SEED, 'verbose': 0,
                'use_best_model': True, 'bootstrap_type': 'Poisson',
                'depth': trial.suggest_int('depth', 4, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 100, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'random_strength': trial.suggest_float('random_strength', 0.0, 20.0),
                'border_count': trial.suggest_int('border_count', 32, 254),
                'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            }
            aucs = []
            for fi in range(cfg['folds']):
                model = cb.CatBoostClassifier(**params)
                model.fit(pools_tr[fi], eval_set=pools_val[fi], early_stopping_rounds=50)
                pred = model.predict_proba(pools_val[fi])[:, 1]
                aucs.append(roc_auc_score(val_labels[fi], pred))
                del model
            gc.collect()
            return np.mean(aucs)

        t0 = time.time()
        study = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
        study.optimize(objective, n_trials=cfg['trials'], show_progress_bar=True)

        cb_params[tcol] = study.best_params
        cb_features[tcol] = selected
        log_msg(f"  AUC={study.best_value:.4f}, time={time.time()-t0:.0f}s")

        del pools_tr, pools_val, study; gc.collect()

        # Periodic checkpoint so an interrupted session can resume.
        if (i + 1) % 5 == 0:
            with open(cp_path, 'w') as f:
                json.dump({'best_params': cb_params, 'best_features': cb_features}, f, indent=2)
            log_msg(f"  Checkpoint: {len(cb_params)}/{n_targets}")

    # Final artifacts; the presence of the params file is what makes this
    # cell skip on the next run.
    with open(f'{ART_CB}/cb_best_params.json', 'w') as f:
        json.dump(cb_params, f, indent=2)
    with open(f'{ART_CB}/cb_best_features.json', 'w') as f:
        json.dump(cb_features, f, indent=2)
    log_msg(f"CB Optuna DONE: {(time.time()-t_start)/60:.0f} min")
    del X_s, y_s; gc.collect()

In [None]:
# ============================================================
# CELL 7: Step 0.5 — LightGBM Optuna (SKIP if artifacts exist)
# ============================================================

# Per-target LightGBM tuning. If the final params file already exists, the
# saved params/features are loaded and tuning is skipped; otherwise each
# target gets feature selection followed by an Optuna study on a row sample.
if os.path.exists(f'{ART_LGB}/lgb_best_params.json'):
    print(">>> LGB Optuna: SKIP (артефакты найдены) <<<")
    with open(f'{ART_LGB}/lgb_best_params.json') as f:
        lgb_params = json.load(f)
    with open(f'{ART_LGB}/lgb_best_features.json') as f:
        lgb_features = json.load(f)
    print(f"  Loaded: {len(lgb_params)} targets")
else:
    cfg = OPTUNA_LGB
    n_targets = len(target_cols)
    # Never ask for more rows than exist: np.random.choice(replace=False)
    # raises ValueError when the sample exceeds the population.
    n_sample = min(cfg['sample'], len(X_sample_full))
    print("=" * 60)
    print("Step 0.5: LightGBM Optuna")
    print(f"Sample: {n_sample:,}, Folds: {cfg['folds']}, Trials: {cfg['trials']}")
    print("=" * 60)

    # Seeded row sample, so a resumed run draws the same rows.
    np.random.seed(RANDOM_SEED)
    idx = np.random.choice(len(X_sample_full), n_sample, replace=False)
    X_s = X_sample_full.iloc[idx].reset_index(drop=True)
    y_s = y_sample_full.iloc[idx].reset_index(drop=True)

    lgb_params, lgb_features = {}, {}
    # Resume from the checkpoint written every 5 targets, if one exists.
    cp_path = f'{ART_LGB}/checkpoint_lgb.json'
    if os.path.exists(cp_path):
        with open(cp_path) as f:
            cp = json.load(f)
        lgb_params = cp.get('best_params', {})
        lgb_features = cp.get('best_features', {})
        print(f"  Checkpoint: {len(lgb_params)} targets done")

    from optuna.integration import LightGBMPruningCallback
    import warnings as w
    # The same trial reports at the same step numbers in every fold, which
    # makes Optuna warn that the repeated report is ignored; hide that warning.
    w.filterwarnings('ignore', message='The reported value is ignored')

    t_start = time.time()
    for i, tcol in enumerate(target_cols):
        if tcol in lgb_params:
            log_msg(f"[{i+1}/{n_targets}] {tcol}: SKIP"); continue
        y_t = y_s[tcol]
        n_pos = int(y_t.sum())
        # Too few positives for stratified CV: store empty params (the L1
        # functions treat an empty dict as "no model") and all features.
        if n_pos < cfg['folds'] * 2:
            lgb_params[tcol] = {}; lgb_features[tcol] = list(X_s.columns)
            log_msg(f"[{i+1}/{n_targets}] {tcol}: SKIP (pos={n_pos})"); continue

        selected = select_features_lgb(X_s, y_t, threshold=cfg['fs_thresh'])
        X_sel = X_s[selected]
        log_msg(f"[{i+1}/{n_targets}] {tcol}: pos={y_t.mean():.4f}, FS: {X_s.shape[1]}>{len(selected)}")

        # Build the per-fold datasets once; every trial reuses them.
        skf = StratifiedKFold(n_splits=cfg['folds'], shuffle=True, random_state=RANDOM_SEED)
        folds = list(skf.split(X_sel, y_t))
        X_np = X_sel.values; y_np = y_t.values; feat_names = selected
        ds_tr = [lgb.Dataset(X_np[tr], label=y_np[tr], feature_name=feat_names, free_raw_data=False) for tr, _ in folds]
        ds_val = [lgb.Dataset(X_np[vl], label=y_np[vl], reference=ds_tr[j], feature_name=feat_names, free_raw_data=False) for j, (_, vl) in enumerate(folds)]
        val_idxs = [vl for _, vl in folds]

        def objective(trial):
            """Return the mean validation AUC over the folds for one configuration."""
            params = {
                'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'n_jobs': -1,
                'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 16, 255),
                'max_depth': trial.suggest_int('max_depth', 4, 10),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 2500),
                'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 100.0, log=True),
                'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 100.0, log=True),
                'feature_fraction': trial.suggest_float('feature_fraction', 0.05, 1.0),
                'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
                'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 15.0),
                'path_smooth': trial.suggest_float('path_smooth', 0.0, 10.0),
                'max_bin': trial.suggest_int('max_bin', 63, 511),
            }
            aucs = []
            for fi in range(cfg['folds']):
                pruning_cb = LightGBMPruningCallback(trial, 'auc', valid_name='val')
                model = lgb.train(params, ds_tr[fi], num_boost_round=1000,
                                  valid_sets=[ds_val[fi]], valid_names=['val'],
                                  callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0), pruning_cb])
                pred = model.predict(X_np[val_idxs[fi]])
                aucs.append(roc_auc_score(y_np[val_idxs[fi]], pred))
                del model
            gc.collect()
            return np.mean(aucs)

        t0 = time.time()
        study = optuna.create_study(direction='maximize',
                                    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
                                    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
        study.optimize(objective, n_trials=cfg['trials'], show_progress_bar=True)

        lgb_params[tcol] = study.best_params
        lgb_features[tcol] = selected
        log_msg(f"  AUC={study.best_value:.4f}, time={time.time()-t0:.0f}s")

        del ds_tr, ds_val, study; gc.collect()

        # Periodic checkpoint so an interrupted session can resume.
        if (i + 1) % 5 == 0:
            with open(cp_path, 'w') as f:
                json.dump({'best_params': lgb_params, 'best_features': lgb_features}, f, indent=2)
            log_msg(f"  Checkpoint: {len(lgb_params)}/{n_targets}")

    # Final artifacts; the presence of the params file is what makes this
    # cell skip on the next run.
    with open(f'{ART_LGB}/lgb_best_params.json', 'w') as f:
        json.dump(lgb_params, f, indent=2)
    with open(f'{ART_LGB}/lgb_best_features.json', 'w') as f:
        json.dump(lgb_features, f, indent=2)
    log_msg(f"LGB Optuna DONE: {(time.time()-t_start)/60:.0f} min")
    del X_s, y_s; gc.collect()

---
## Step 1: L1 OOF (750k, 5-fold, 3 models)

**~400 min. Пропускается, если `oof_xgb.npy` уже есть.**

In [None]:
# ============================================================
# CELL 8: L1 OOF functions
# ============================================================

def l1_oof_xgb(X, y, X_te, params_dict, features, target_col,
               n_folds=N_FOLDS_L1, seed=RANDOM_SEED):
    """Compute L1 out-of-fold and test predictions with XGBoost (GPU).

    Args:
        X: Training feature table, indexable by a list of column names.
        y: Training labels for ``target_col`` (object exposing ``.values``).
        X_te: Test feature table with the same columns as ``X``.
        params_dict: Mapping target name -> tuned XGBoost params. An empty
            dict means the target was not tuned and no model is trained.
        features: Mapping target name -> list of selected feature names.
        target_col: Name of the target to process.
        n_folds: Number of stratified CV folds.
        seed: Random state of the fold splitter.

    Returns:
        Tuple ``(oof, test, auc)``: OOF predictions of shape (n,), fold-averaged
        test predictions of shape (m,), and the OOF ROC AUC. For an untuned
        target both arrays are all zeros and the AUC is ``0.0``.
    """
    feats = features[target_col]
    p = params_dict[target_col]
    if not p:
        return np.zeros(len(X)), np.zeros(len(X_te)), 0.0
    X_sel = X[feats].values; X_te_sel = X_te[feats].values; y_arr = y.values
    oof = np.zeros(len(X)); test_preds = np.zeros(len(X_te))
    dm_te = xgb.DMatrix(X_te_sel)

    # The params are identical for every fold, so build them once. Bookkeeping
    # keys that may be stored next to the tuned values are not passed on.
    xgb_p = {'objective': 'binary:logistic', 'eval_metric': 'auc',
             'device': 'cuda', 'n_jobs': 1, 'verbosity': 0}
    xgb_p.update({k: v for k, v in p.items()
                  if k not in ('best_auc', 'default_auc', 'n_rounds')})

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for tr_idx, val_idx in skf.split(X_sel, y_arr):
        dm_tr = xgb.DMatrix(X_sel[tr_idx], label=y_arr[tr_idx])
        dm_val = xgb.DMatrix(X_sel[val_idx], label=y_arr[val_idx])
        model = xgb.train(xgb_p, dm_tr, num_boost_round=2000,
                          evals=[(dm_val, 'val')], verbose_eval=0, early_stopping_rounds=50)
        # The native Booster keeps the trees grown after the best round and
        # predicts with the full model unless told otherwise. Restrict the
        # prediction to the best iteration, as the CatBoost (use_best_model)
        # and LightGBM siblings effectively do.
        best_range = (0, model.best_iteration + 1)
        oof[val_idx] = model.predict(dm_val, iteration_range=best_range)
        test_preds += model.predict(dm_te, iteration_range=best_range) / n_folds
        del model, dm_tr, dm_val
    del dm_te; gc.collect()
    return oof, test_preds, roc_auc_score(y_arr, oof)


def l1_oof_cb(X, y, X_te, params_dict, features, target_col,
              n_folds=N_FOLDS_L1, seed=RANDOM_SEED):
    """Compute L1 out-of-fold and test predictions with CatBoost (GPU).

    Args:
        X: Training feature table, indexable by a list of column names.
        y: Training labels for ``target_col`` (object exposing ``.values``).
        X_te: Test feature table with the same columns as ``X``.
        params_dict: Mapping target name -> tuned CatBoost params. An empty
            dict means no model is trained for that target.
        features: Mapping target name -> list of selected feature names.
        target_col: Name of the target to process.
        n_folds: Number of stratified CV folds.
        seed: Random state of the fold splitter and CatBoost ``random_seed``.

    Returns:
        Tuple ``(oof, test, auc)``: OOF predictions of shape (n,), fold-averaged
        test predictions of shape (m,), and the OOF ROC AUC. For an untuned
        target both arrays are all zeros and the AUC is ``0.0``.
    """
    cols = features[target_col]
    tuned = params_dict[target_col]
    if not tuned:
        return np.zeros(len(X)), np.zeros(len(X_te)), 0.0

    train_mat = X[cols].values
    test_mat = X_te[cols].values
    labels = y.values
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_te))
    test_pool = cb.Pool(test_mat)

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fit_rows, holdout_rows in splitter.split(train_mat, labels):
        # Fixed settings first, then the tuned values (minus bookkeeping keys).
        model_kwargs = {
            'iterations': 2000,
            'loss_function': 'Logloss',
            'task_type': 'GPU',
            'random_seed': seed,
            'verbose': 0,
            'use_best_model': True,
            'bootstrap_type': 'Poisson',
            **{key: val for key, val in tuned.items()
               if key not in ('best_auc', 'default_auc')},
        }
        fit_pool = cb.Pool(train_mat[fit_rows], label=labels[fit_rows])
        holdout_pool = cb.Pool(train_mat[holdout_rows], label=labels[holdout_rows])

        clf = cb.CatBoostClassifier(**model_kwargs)
        clf.fit(fit_pool, eval_set=holdout_pool, early_stopping_rounds=50)

        oof[holdout_rows] = clf.predict_proba(holdout_pool)[:, 1]
        test_preds += clf.predict_proba(test_pool)[:, 1] / n_folds
        del clf, fit_pool, holdout_pool

    del test_pool
    gc.collect()
    return oof, test_preds, roc_auc_score(labels, oof)


def l1_oof_lgb(X, y, X_te, params_dict, features, target_col,
               n_folds=N_FOLDS_L1, seed=RANDOM_SEED):
    """Build LightGBM (CPU) out-of-fold and test predictions for one target.

    Args:
        X: training features; columns are selected by name and converted
            with ``.values`` (presumably a pandas DataFrame).
        y: labels for ``target_col``; ``.values`` is taken.
        X_te: test features, indexed by the same column names.
        params_dict: per-target dict of tuned LightGBM parameters. The
            bookkeeping keys ``best_auc`` / ``default_auc`` are dropped;
            every other key overrides the base configuration.
        features: per-target list of feature column names.
        target_col: key used to look up both ``features`` and ``params_dict``.
        n_folds: number of stratified folds.
        seed: ``random_state`` of the fold splitter (it is not passed to
            LightGBM by this function).

    Returns:
        ``(oof, test_preds, auc)``: OOF predictions with one entry per row of
        ``X``, test predictions averaged over the folds, and the ROC AUC of
        ``oof`` against ``y``. When the target has no tuned parameters, two
        all-zero arrays and ``0.0`` are returned instead.
    """
    columns = features[target_col]
    tuned = params_dict[target_col]
    if not tuned:
        return np.zeros(len(X)), np.zeros(len(X_te)), 0.0

    train_matrix = X[columns].values
    test_matrix = X_te[columns].values
    labels = y.values
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_te))

    # Base configuration first, tuned values second, so tuned values win.
    booster_params = {
        'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'n_jobs': -1,
        **{name: value for name, value in tuned.items()
           if name not in ('best_auc', 'default_auc')},
    }

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fit_rows, holdout_rows in splitter.split(train_matrix, labels):
        fit_set = lgb.Dataset(train_matrix[fit_rows], label=labels[fit_rows],
                              feature_name=columns, free_raw_data=False)
        holdout_set = lgb.Dataset(train_matrix[holdout_rows], label=labels[holdout_rows],
                                  reference=fit_set, feature_name=columns,
                                  free_raw_data=False)
        # A fresh dict per fold, exactly as if it had been rebuilt here.
        booster = lgb.train(dict(booster_params), fit_set, num_boost_round=2000,
                            valid_sets=[holdout_set], valid_names=['val'],
                            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
        oof[holdout_rows] = booster.predict(train_matrix[holdout_rows])
        test_preds += booster.predict(test_matrix) / n_folds
        del booster, fit_set, holdout_set
    gc.collect()
    return oof, test_preds, roc_auc_score(labels, oof)

# Progress marker: printed once the cell defining the L1 OOF helpers has run.
print("L1 OOF functions defined")

In [None]:
# ============================================================
# CELL 9: L1 OOF Main Loop (SKIP if artifacts exist)
# ============================================================
# Produces, at notebook level, the six L1 prediction matrices
# (oof_xgb / oof_cb / oof_lgb and test_xgb / test_cb / test_lgb, one column
# per target) plus y_train_arr. They are either loaded from ART_L1 or computed
# target by target with a resumable checkpoint written every 5 targets.

L1_ARRAY_NAMES = ['oof_xgb', 'oof_cb', 'oof_lgb', 'test_xgb', 'test_cb', 'test_lgb']

# Skip only when ALL six final files exist: they are written one after another,
# so an interrupted save can leave oof_xgb.npy on disk without the others.
if all(os.path.exists(f'{ART_L1}/{name}.npy') for name in L1_ARRAY_NAMES):
    print(">>> L1 OOF: SKIP (артефакты найдены) <<<")
    oof_xgb = np.load(f'{ART_L1}/oof_xgb.npy')
    oof_cb  = np.load(f'{ART_L1}/oof_cb.npy')
    oof_lgb = np.load(f'{ART_L1}/oof_lgb.npy')
    test_xgb = np.load(f'{ART_L1}/test_xgb.npy')
    test_cb  = np.load(f'{ART_L1}/test_cb.npy')
    test_lgb = np.load(f'{ART_L1}/test_lgb.npy')
    print(f"  OOF: {oof_xgb.shape}, Test: {test_xgb.shape}")
    # Load targets if not loaded
    # NOTE(review): this reads the whole train_target.parquet and assumes its
    # rows line up with the saved OOF rows — confirm against the sampling cell.
    if 'y_train_arr' not in dir():
        target = pd.read_parquet(f'{DATA}/train_target.parquet')
        target_cols = [c for c in target.columns if c.startswith('target_')]
        y_train_arr = target[target_cols].values.astype(np.int8)
        train_ids = target['customer_id'].values
        del target; gc.collect()
else:
    n_l1_targets = len(target_cols)
    print("=" * 60)
    print("L1 OOF: 750k, 5-fold, 3 models")
    print("=" * 60)

    # Load test: main + extra feature tables, numeric columns cast to float32
    # (customer_id is left untouched), joined on customer_id.
    t0 = time.time()
    main_te = pd.read_parquet(f'{DATA}/test_main_features.parquet')
    num_cols = main_te.select_dtypes(include=[np.number]).columns.drop('customer_id', errors='ignore')
    main_te[num_cols] = main_te[num_cols].astype(np.float32)
    extra_te = pd.read_parquet(f'{DATA}/test_extra_features.parquet')
    num_cols_e = extra_te.select_dtypes(include=[np.number]).columns.drop('customer_id', errors='ignore')
    extra_te[num_cols_e] = extra_te[num_cols_e].astype(np.float32)
    X_test = main_te.merge(extra_te, on='customer_id', how='inner')
    del main_te, extra_te; gc.collect()
    test_customer_ids = X_test['customer_id'].values
    X_test = X_test.drop(columns=['customer_id'])
    print(f"Test loaded: {X_test.shape}, {time.time()-t0:.0f}s")

    n_train = len(X_sample_full)
    n_test = len(X_test)
    oof_xgb = np.zeros((n_train, n_l1_targets), dtype=np.float32)
    oof_cb = np.zeros((n_train, n_l1_targets), dtype=np.float32)
    oof_lgb = np.zeros((n_train, n_l1_targets), dtype=np.float32)
    test_xgb = np.zeros((n_test, n_l1_targets), dtype=np.float32)
    test_cb = np.zeros((n_test, n_l1_targets), dtype=np.float32)
    test_lgb = np.zeros((n_test, n_l1_targets), dtype=np.float32)
    # Explicit name -> array registry for checkpoint I/O (no locals() lookups).
    l1_arrays = {'oof_xgb': oof_xgb, 'oof_cb': oof_cb, 'oof_lgb': oof_lgb,
                 'test_xgb': test_xgb, 'test_cb': test_cb, 'test_lgb': test_lgb}
    results_log = []

    # Checkpoint: restore partially filled matrices and continue after the
    # last target recorded in the checkpoint file.
    start_idx = 0
    cp_path = f'{ART_L1}/checkpoint_l1.json'
    if os.path.exists(cp_path):
        with open(cp_path) as f:
            cp = json.load(f)
        start_idx = cp['last_target_idx'] + 1
        for name, arr in l1_arrays.items():
            arr[:] = np.load(f'{ART_L1}/{name}_partial.npy')
        results_log = cp.get('results_log', [])
        print(f"Checkpoint: resume from {start_idx}/{n_l1_targets}")

    t_start = time.time()
    for i, tcol in enumerate(target_cols):
        if i < start_idx:
            continue
        y_t = y_sample_full[tcol]
        log_msg(f"[{i+1}/{n_l1_targets}] {tcol}: pos={y_t.mean():.4f}")

        t0 = time.time()
        oof_xgb[:, i], test_xgb[:, i], auc_x = l1_oof_xgb(
            X_sample_full, y_t, X_test, xgb_params, xgb_features, tcol)
        t_x = time.time() - t0

        t0 = time.time()
        oof_cb[:, i], test_cb[:, i], auc_c = l1_oof_cb(
            X_sample_full, y_t, X_test, cb_params, cb_features, tcol)
        t_c = time.time() - t0

        t0 = time.time()
        oof_lgb[:, i], test_lgb[:, i], auc_l = l1_oof_lgb(
            X_sample_full, y_t, X_test, lgb_params, lgb_features, tcol)
        t_l = time.time() - t0

        log_msg(f"  XGB={auc_x:.4f}({t_x:.0f}s) CB={auc_c:.4f}({t_c:.0f}s) LGB={auc_l:.4f}({t_l:.0f}s)")
        results_log.append({'target': tcol, 'auc_xgb': round(auc_x, 4),
                            'auc_cb': round(auc_c, 4), 'auc_lgb': round(auc_l, 4)})
        gc.collect()

        if (i + 1) % 5 == 0:
            # Arrays are written before the JSON, so the JSON never points
            # past what is already on disk.
            for name, arr in l1_arrays.items():
                np.save(f'{ART_L1}/{name}_partial.npy', arr)
            with open(cp_path, 'w') as f:
                json.dump({'last_target_idx': i, 'results_log': results_log}, f)
            elapsed = (time.time() - t_start) / 60
            done = i + 1 - start_idx
            eta = elapsed / done * (n_l1_targets - i - 1) if done > 0 else 0
            log_msg(f"  Checkpoint {i+1}/{n_l1_targets}, elapsed={elapsed:.0f}min, ETA={eta:.0f}min")

    # Save final
    for name, arr in l1_arrays.items():
        np.save(f'{ART_L1}/{name}.npy', arr)
    with open(f'{ART_L1}/results_log.json', 'w') as f:
        json.dump(results_log, f, indent=2)

    # Select the target columns explicitly so the label matrix has the same
    # column order as the OOF matrices, whatever else y_sample_full contains.
    y_train_arr = y_sample_full[target_cols].values.astype(np.int8)
    elapsed = (time.time() - t_start) / 60
    log_msg(f"L1 OOF DONE: {elapsed:.0f} min")
    del X_test; gc.collect()

---
## Step 2: L2 Stacking (XGB + NN Blend)

In [None]:
# ============================================================
# CELL 10: Build L2 matrix (123 OOF + 82 meta = 205 features)
# ============================================================
# Column layout of both matrices: XGB | CatBoost | LightGBM predictions,
# followed by their per-target mean and per-target std across the 3 models.
print("\n".join(("=" * 60, "Building L2 matrix", "=" * 60)))

# Make sure the label matrix is available (e.g. after a kernel restart).
if 'y_train_arr' not in globals():
    target = pd.read_parquet(f'{DATA}/train_target.parquet')
    target_cols = [col for col in target.columns if col.startswith('target_')]
    y_train_arr = target[target_cols].values.astype(np.int8)
    train_ids = target['customer_id'].values
    del target
    gc.collect()

# Make sure the six L1 prediction matrices are available.
if 'oof_xgb' not in globals():
    oof_xgb, oof_cb, oof_lgb, test_xgb, test_cb, test_lgb = (
        np.load(f'{ART_L1}/{stem}.npy')
        for stem in ('oof_xgb', 'oof_cb', 'oof_lgb', 'test_xgb', 'test_cb', 'test_lgb')
    )

# Model axis first, so reducing over axis 0 gives per-target statistics
# across the three L1 models.
oof_stack = np.stack([oof_xgb, oof_cb, oof_lgb], axis=0)
test_stack = np.stack([test_xgb, test_cb, test_lgb], axis=0)

# Raw L1 predictions followed by the mean/std meta-features, in one pass.
X_l2_train = np.concatenate(
    [oof_xgb, oof_cb, oof_lgb, oof_stack.mean(axis=0), oof_stack.std(axis=0)],
    axis=1)
X_l2_test = np.concatenate(
    [test_xgb, test_cb, test_lgb, test_stack.mean(axis=0), test_stack.std(axis=0)],
    axis=1)
del oof_stack, test_stack
gc.collect()

print(f"X_l2_train: {X_l2_train.shape} ({X_l2_train.nbytes/1e6:.0f} MB)")
print(f"X_l2_test:  {X_l2_test.shape}")
print(f"Features: 123 OOF + 41 mean + 41 std = {X_l2_train.shape[1]}")

In [None]:
# ============================================================
# CELL 11: L2 XGB Optuna + OOF (SKIP if artifacts exist)
# ============================================================
# Per target: tune a shallow XGBoost on the L2 matrix with Optuna, then refit
# with the best parameters to produce OOF and fold-averaged test predictions.
N_TRIALS_L2 = 15

if os.path.exists(f'{ART_L2}/oof_l2_xgb.npy'):
    print(">>> L2 XGB: SKIP (артефакты найдены) <<<")
    with open(f'{ART_L2}/l2_xgb_best_params.json') as f:
        l2_best_params = json.load(f)
    oof_l2_xgb = np.load(f'{ART_L2}/oof_l2_xgb.npy')
    test_l2_xgb = np.load(f'{ART_L2}/test_l2_xgb.npy')
    macro = np.mean([roc_auc_score(y_train_arr[:, i], oof_l2_xgb[:, i])
                     for i in range(y_train_arr.shape[1])])
    print(f"  OOF Macro AUC: {macro:.4f}")
else:
    n_l2_targets = len(target_cols)
    print("=" * 60)
    print(f"L2 XGB Optuna: {N_TRIALS_L2} trials, {N_FOLDS_L2}-fold")
    print("=" * 60)

    # Settings shared by the tuning runs and the final OOF run.
    L2_XGB_FIXED = {'objective': 'binary:logistic', 'eval_metric': 'auc',
                    'device': 'cuda', 'n_jobs': 1, 'verbosity': 0}
    # Tuned keys copied from the Optuna result ('best_auc' is bookkeeping only).
    L2_XGB_TUNED_KEYS = ('max_depth', 'learning_rate', 'colsample_bytree', 'subsample',
                         'reg_lambda', 'reg_alpha', 'min_child_weight')

    def _predict_best(model, dmatrix):
        """Predict using only the trees up to the early-stopping best iteration.

        The native ``Booster.predict`` uses the full model unless
        ``iteration_range`` is given, i.e. it would include the rounds trained
        after the best validation score.
        """
        return model.predict(dmatrix, iteration_range=(0, model.best_iteration + 1))

    def l2_objective(trial, X, y):
        """Optuna objective: mean validation AUC over N_FOLDS_L2 stratified folds.

        Args:
            trial: Optuna trial used to sample the hyper-parameters.
            X: L2 feature matrix (2-D array, indexed by row positions).
            y: binary labels of one target, aligned with the rows of X.

        Returns:
            Mean ROC AUC across folds, scored at each fold's best iteration.
        """
        params = {
            **L2_XGB_FIXED,
            'max_depth': trial.suggest_int('max_depth', 2, 4),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.6),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 50.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        }
        skf = StratifiedKFold(n_splits=N_FOLDS_L2, shuffle=True, random_state=RANDOM_SEED)
        aucs = []
        for tr_idx, val_idx in skf.split(X, y):
            dm_tr = xgb.DMatrix(X[tr_idx], label=y[tr_idx])
            dm_val = xgb.DMatrix(X[val_idx], label=y[val_idx])
            model = xgb.train(params, dm_tr, num_boost_round=500,
                              evals=[(dm_val, 'val')], verbose_eval=0, early_stopping_rounds=30)
            aucs.append(roc_auc_score(y[val_idx], _predict_best(model, dm_val)))
            del model, dm_tr, dm_val
        gc.collect()
        return np.mean(aucs)

    # Per-target Optuna
    l2_best_params = {}
    t_start = time.time()
    for i, tcol in enumerate(target_cols):
        t0 = time.time()
        y_col = y_train_arr[:, i]
        study = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
        # optimize() runs synchronously, so the lambda sees this iteration's y_col.
        study.optimize(lambda trial: l2_objective(trial, X_l2_train, y_col),
                       n_trials=N_TRIALS_L2, show_progress_bar=False)
        bp = dict(study.best_params)
        bp['best_auc'] = study.best_value
        l2_best_params[tcol] = bp
        log_msg(f"[{i+1}/{n_l2_targets}] {tcol}: AUC={study.best_value:.4f}, depth={bp['max_depth']}, time={time.time()-t0:.0f}s")
        del study; gc.collect()

    with open(f'{ART_L2}/l2_xgb_best_params.json', 'w') as f:
        json.dump(l2_best_params, f, indent=2)
    log_msg(f"L2 Optuna DONE: {(time.time()-t_start)/60:.1f} min")

    # OOF + test predictions
    print(f"\nL2 XGB OOF + Test...")
    oof_l2_xgb = np.zeros((len(X_l2_train), n_l2_targets), dtype=np.float32)
    test_l2_xgb = np.zeros((len(X_l2_test), n_l2_targets), dtype=np.float32)

    # The test matrix is the same for every target and fold: build it once.
    dm_te = xgb.DMatrix(X_l2_test)
    for i, tcol in enumerate(target_cols):
        y_col = y_train_arr[:, i]
        bp = l2_best_params[tcol]
        xgb_p = {**L2_XGB_FIXED, **{key: bp[key] for key in L2_XGB_TUNED_KEYS}}
        skf = StratifiedKFold(n_splits=N_FOLDS_L2, shuffle=True, random_state=RANDOM_SEED)
        for tr_idx, val_idx in skf.split(X_l2_train, y_col):
            dm_tr = xgb.DMatrix(X_l2_train[tr_idx], label=y_col[tr_idx])
            dm_val = xgb.DMatrix(X_l2_train[val_idx], label=y_col[val_idx])
            model = xgb.train(xgb_p, dm_tr, num_boost_round=500,
                              evals=[(dm_val, 'val')], verbose_eval=0, early_stopping_rounds=30)
            oof_l2_xgb[val_idx, i] = _predict_best(model, dm_val)
            test_l2_xgb[:, i] += _predict_best(model, dm_te) / N_FOLDS_L2
            del model, dm_tr, dm_val
        gc.collect()
        if (i + 1) % 10 == 0:
            auc = roc_auc_score(y_col, oof_l2_xgb[:, i])
            log_msg(f"  [{i+1}/{n_l2_targets}] {tcol}: AUC={auc:.4f}")
    del dm_te; gc.collect()

    np.save(f'{ART_L2}/oof_l2_xgb.npy', oof_l2_xgb)
    np.save(f'{ART_L2}/test_l2_xgb.npy', test_l2_xgb)
    macro = np.mean([roc_auc_score(y_train_arr[:, i], oof_l2_xgb[:, i])
                     for i in range(n_l2_targets)])
    log_msg(f"L2 XGB OOF Macro AUC: {macro:.4f}")

In [None]:
# ============================================================
# CELL 12: L2 NN (SKIP if v3/v2 artifacts exist; a fresh run saves as v3)
# ============================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Check for v3 first, then v2
nn_version = None
if os.path.exists(f'{ART_L2}/oof_l2_nn_v3.npy'):
    nn_version = 'v3'
elif os.path.exists(f'{ART_L2}/oof_l2_nn_v2.npy'):
    nn_version = 'v2'

if nn_version:
    print(f">>> L2 NN: SKIP (артефакты {nn_version} найдены) <<<")
    oof_l2_nn = np.load(f'{ART_L2}/oof_l2_nn_{nn_version}.npy')
    test_l2_nn = np.load(f'{ART_L2}/test_l2_nn_{nn_version}.npy')
    macro = np.mean([roc_auc_score(y_train_arr[:, i], oof_l2_nn[:, i])
                     for i in range(y_train_arr.shape[1])])
    print(f"  OOF Macro AUC: {macro:.4f}")
else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"PyTorch device: {device}")

    # Standardize features; the scaler is fitted on the whole L2 train matrix
    # and reused for the test matrix.
    scaler = StandardScaler()
    X_l2_train_scaled = scaler.fit_transform(X_l2_train).astype(np.float32)
    X_l2_test_scaled = scaler.transform(X_l2_test).astype(np.float32)
    print(f"Scaled range: [{X_l2_train_scaled.min():.2f}, {X_l2_train_scaled.max():.2f}]")

    # Network dimensions follow the data instead of relying on the class defaults.
    n_nn_features = X_l2_train_scaled.shape[1]
    n_nn_targets = y_train_arr.shape[1]

    class L2NetV2(nn.Module):
        """Three-block MLP that emits one logit per target.

        Layout: input LayerNorm, then Linear -> LayerNorm -> SiLU -> Dropout
        blocks of width h1, h2 and h3, then a linear classifier. The second
        block also adds a linear projection of the first block's output,
        scaled by 0.5, before its activation.
        """

        def __init__(self, in_dim=205, h1=512, h2=256, h3=128, n_targets=41,
                     drop1=0.3, drop2=0.25, drop3=0.2):
            super().__init__()
            self.input_norm = nn.LayerNorm(in_dim)
            self.fc1 = nn.Linear(in_dim, h1)
            self.ln1 = nn.LayerNorm(h1)
            self.fc2 = nn.Linear(h1, h2)
            self.ln2 = nn.LayerNorm(h2)
            self.skip_proj = nn.Linear(h1, h2)  # projects h1 to width h2 for the skip path
            self.fc3 = nn.Linear(h2, h3)
            self.ln3 = nn.LayerNorm(h3)
            self.classifier = nn.Linear(h3, n_targets)
            self.drop1 = nn.Dropout(drop1)
            self.drop2 = nn.Dropout(drop2)
            self.drop3 = nn.Dropout(drop3)

        def forward(self, x):
            """Return raw logits (no sigmoid) for a batch of feature rows."""
            x = self.input_norm(x)
            h1 = self.drop1(F.silu(self.ln1(self.fc1(x))))
            h2 = self.ln2(self.fc2(h1))
            h2 = self.drop2(F.silu(h2 + self.skip_proj(h1) * 0.5))
            h3 = self.drop3(F.silu(self.ln3(self.fc3(h2))))
            return self.classifier(h3)

    N_EPOCHS = 60; BATCH = 512; LR = 0.001; WD = 1e-5; PATIENCE = 15
    print(f"\n{'='*60}")
    print(f"L2 NN v3: {N_FOLDS_L2}-fold, {N_EPOCHS} ep, patience={PATIENCE}, batch={BATCH}, lr={LR}")
    print(f"{'='*60}")

    oof_l2_nn = np.zeros((len(X_l2_train), n_nn_targets), dtype=np.float32)
    test_l2_nn = np.zeros((len(X_l2_test), n_nn_targets), dtype=np.float32)
    X_te_tensor = torch.FloatTensor(X_l2_test_scaled).to(device)
    y_tensor_all = torch.FloatTensor(y_train_arr.astype(np.float32))
    skf = StratifiedKFold(n_splits=N_FOLDS_L2, shuffle=True, random_state=RANDOM_SEED)
    t_start = time.time()

    # Folds are stratified on the first target column only.
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_l2_train_scaled, y_train_arr[:, 0])):
        t0 = time.time()
        # Seed per fold so weight init, batch shuffling and dropout are
        # repeatable across runs (GPU kernels may still add small nondeterminism).
        torch.manual_seed(RANDOM_SEED + fold)

        X_tr = torch.FloatTensor(X_l2_train_scaled[tr_idx]).to(device)
        y_tr = y_tensor_all[tr_idx].to(device)
        X_val = torch.FloatTensor(X_l2_train_scaled[val_idx]).to(device)
        train_dl = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH, shuffle=True)

        model = L2NetV2(in_dim=n_nn_features, n_targets=n_nn_targets).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=LR, epochs=N_EPOCHS,
            steps_per_epoch=len(train_dl), pct_start=0.3)
        criterion = nn.BCEWithLogitsLoss()
        best_auc = 0; best_state = None; no_improve = 0

        for epoch in range(N_EPOCHS):
            model.train()
            for xb, yb in train_dl:
                optimizer.zero_grad()
                loss = criterion(model(xb), yb)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()  # OneCycleLR is stepped once per batch

            # Early stopping on the macro AUC of the validation fold.
            model.eval()
            with torch.no_grad():
                val_probs = torch.sigmoid(model(X_val)).cpu().numpy()
            aucs = [roc_auc_score(y_train_arr[val_idx, j], val_probs[:, j])
                    for j in range(n_nn_targets)]
            macro = np.mean(aucs)
            if macro > best_auc:
                best_auc = macro
                # Keep a CPU copy of the best weights to restore after training.
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
                no_improve = 0
            else:
                no_improve += 1
            if no_improve >= PATIENCE:
                break

        # Predict with the best-epoch weights, not the last-epoch ones.
        model.load_state_dict(best_state)
        model.eval()
        with torch.no_grad():
            oof_l2_nn[val_idx] = torch.sigmoid(model(X_val)).cpu().numpy()
            test_l2_nn += torch.sigmoid(model(X_te_tensor)).cpu().numpy() / N_FOLDS_L2
        log_msg(f"Fold {fold+1}: AUC={best_auc:.4f}, best_ep={epoch+1-no_improve}, stopped={epoch+1}, time={time.time()-t0:.0f}s")
        del model, X_tr, y_tr, X_val, best_state
        torch.cuda.empty_cache(); gc.collect()

    del X_te_tensor; torch.cuda.empty_cache()
    macro_nn = np.mean([roc_auc_score(y_train_arr[:, i], oof_l2_nn[:, i])
                        for i in range(n_nn_targets)])
    log_msg(f"L2 NN OOF Macro AUC: {macro_nn:.4f}, time: {(time.time()-t_start)/60:.1f} min")
    np.save(f'{ART_L2}/oof_l2_nn_v3.npy', oof_l2_nn)
    np.save(f'{ART_L2}/test_l2_nn_v3.npy', test_l2_nn)
    print(f"Saved oof_l2_nn_v3.npy, test_l2_nn_v3.npy")

In [None]:
# ============================================================
# CELL 13: Final Blend + Submission
# ============================================================
# Picks the XGB/NN blend weight with the best OOF macro AUC from a fixed grid,
# applies it to the test predictions and writes the submission parquet.
print("\n".join(("=" * 60, "FINAL BLEND", "=" * 60)))

# Test alphas (alpha = weight of the L2 XGB predictions, 1 - alpha = NN weight)
alphas = [1.0, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50]
best_alpha, best_macro = 1.0, 0

for alpha in alphas:
    blend = alpha * oof_l2_xgb + (1 - alpha) * oof_l2_nn
    aucs = [roc_auc_score(y_train_arr[:, i], blend[:, i]) for i in range(41)]
    macro = np.mean(aucs)
    # Decide once; the flag drives both the printed marker and the update.
    improved = macro > best_macro
    marker = ' <<< BEST' if improved else ''
    print(f"  XGB {alpha*100:3.0f}% / NN {(1-alpha)*100:3.0f}%: Macro AUC = {macro:.4f}{marker}")
    if improved:
        best_macro, best_alpha = macro, alpha

print(f"\nBest: alpha={best_alpha:.2f}, OOF Macro AUC = {best_macro:.4f}")

# --- Submission ---
blend_test = (best_alpha * test_l2_xgb + (1 - best_alpha) * test_l2_nn).astype(np.float64)

test_main = pd.read_parquet(f'{DATA}/test_main_features.parquet', columns=['customer_id'])
predict_cols = [c.replace('target_', 'predict_') for c in target_cols]

# customer_id first, then one predict_* column per target, in target order.
submission = pd.DataFrame({
    'customer_id': test_main['customer_id'].values,
    **{pcol: blend_test[:, col_idx] for col_idx, pcol in enumerate(predict_cols)},
})

os.makedirs(f'{DATA}/submissions', exist_ok=True)
sub_path = f'{DATA}/submissions/sub_exp014_blend.parquet'
submission.to_parquet(sub_path, index=False)

# Validation: quick sanity report on the frame that was just written.
pred_frame = submission[predict_cols]
print(f"\n--- Submission validation ---")
print(f"Shape:   {submission.shape}")
print(f"Columns: {list(submission.columns[:4])}... (predict_*, NOT target_*)")
print(f"Dtype:   {submission[predict_cols[0]].dtype} (must be float64)")
print(f"Range:   [{pred_frame.min().min():.6f}, {pred_frame.max().max():.6f}]")
print(f"NaN:     {pred_frame.isna().any().any()}")
del pred_frame
print(f"\nSaved: {sub_path}")
print(f"\n>>> DONE! OOF={best_macro:.4f}, expected LB ~0.8515 <<<")