# EXP-015: NN L2 Boost (RankGauss + Input Dropout)

**Базируется на EXP-014** (LB 0.8522)

## Что нового:
- **NN v4**: RankGauss (QuantileTransformer) вместо StandardScaler + Input Dropout 0.10
- **3-way blend**: XGB 55% + NN_v3 20% + NN_v4 25% (diversity от двух разных скалеров)

## Результат:
- NN v4 OOF: **0.8426** (v3 was 0.8415, **+0.0011**)
- 3-way blend OOF: **0.8487** (vs 2-way 0.8482, **+0.0005**)
- **LB: 0.8522** (паритет с EXP-014)

## Ключевой инсайт:
RankGauss дал +0.0011 OOF для NN (улучшились 32 из 41 таргета). При этом 3-way blend
эффективнее 2-way: v3 и v4 обучены с разными скалерами (StandardScaler и RankGauss) → разные ошибки → diversity.

In [None]:
# ============================================================
# CELL 1: Setup + Load EXP-014 artifacts
# ============================================================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc, time, os, json
from datetime import datetime

from google.colab import drive
# Mount Google Drive; every data path below lives under /content/drive.
drive.mount('/content/drive')


def log_msg(msg):
    """Print ``msg`` prefixed with the current wall-clock time as [HH:MM:SS]."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")


# --- Paths (the Drive folder is data_fusion, NOT data_fusion_2026!) ---
DATA = '/content/drive/MyDrive/data_fusion'
ART_L1 = f'{DATA}/artifacts/l1_oof'
ART_L2 = f'{DATA}/artifacts/l2_stacking'

RANDOM_SEED = 42
N_FOLDS_L2 = 5
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# --- Targets (do NOT wrap the columns in sorted()! Sorting breaks the column
# order and the AUC collapses to 0.50) ---
target = pd.read_parquet(f'{DATA}/train_target.parquet')
target_cols = [name for name in target.columns if name.startswith('target_')]
y_train_arr = target[target_cols].values.astype(np.int8)
train_ids = target['customer_id'].values
# The frame is no longer needed once labels and ids are extracted.
del target
gc.collect()
print(f"Targets: {y_train_arr.shape}, cols: {len(target_cols)}")
print(f"Порядок: {target_cols[:3]} ... {target_cols[-2:]}")

# --- L1 out-of-fold (train) and test predictions of the three L1 models ---
oof_xgb = np.load(f'{ART_L1}/oof_xgb.npy')
oof_cb = np.load(f'{ART_L1}/oof_cb.npy')
oof_lgb = np.load(f'{ART_L1}/oof_lgb.npy')
test_xgb = np.load(f'{ART_L1}/test_xgb.npy')
test_cb = np.load(f'{ART_L1}/test_cb.npy')
test_lgb = np.load(f'{ART_L1}/test_lgb.npy')
print(f"L1 OOF: XGB {oof_xgb.shape}, CB {oof_cb.shape}, LGB {oof_lgb.shape}")

# --- L2 matrix: the raw L1 predictions followed by their per-element mean and
# std across the three models (original note: 123 OOF + 82 meta = 205 features) ---
oof_stack = np.stack([oof_xgb, oof_cb, oof_lgb], axis=0)
test_stack = np.stack([test_xgb, test_cb, test_lgb], axis=0)
X_l2_train = np.hstack([
    oof_xgb, oof_cb, oof_lgb,
    oof_stack.mean(0), oof_stack.std(0),
])
X_l2_test = np.hstack([
    test_xgb, test_cb, test_lgb,
    test_stack.mean(0), test_stack.std(0),
])
# The stacked copies were only needed for the mean/std meta-features.
del oof_stack, test_stack
gc.collect()
print(f"L2 matrix: train {X_l2_train.shape}, test {X_l2_test.shape}")

# --- L2 XGB OOF (the anchor model) ---
oof_l2_xgb = np.load(f'{ART_L2}/oof_l2_xgb.npy')
test_l2_xgb = np.load(f'{ART_L2}/test_l2_xgb.npy')
# Macro AUC = plain mean of the per-target ROC AUCs over the 41 target columns.
xgb_macro = np.mean([
    roc_auc_score(y_train_arr[:, col], oof_l2_xgb[:, col])
    for col in range(41)
])
print(f"L2 XGB OOF Macro AUC: {xgb_macro:.4f}")

# --- L2 NN v3 OOF (the file is oof_l2_nn_v3.npy, NOT oof_l2_nn.npy!) ---
oof_l2_nn_v3 = np.load(f'{ART_L2}/oof_l2_nn_v3.npy')
test_l2_nn_v3 = np.load(f'{ART_L2}/test_l2_nn_v3.npy')
nn_v3_macro = np.mean([
    roc_auc_score(y_train_arr[:, col], oof_l2_nn_v3[:, col])
    for col in range(41)
])
print(f"L2 NN v3 OOF Macro AUC: {nn_v3_macro:.4f}")

# --- Baseline blend: 60% XGB + 40% NN v3; later cells compare against it ---
blend = 0.6 * oof_l2_xgb + 0.4 * oof_l2_nn_v3
blend_macro = np.mean([
    roc_auc_score(y_train_arr[:, col], blend[:, col])
    for col in range(41)
])
print(f"\n{'='*60}")
print(f"BASELINE: XGB={xgb_macro:.4f}, NN_v3={nn_v3_macro:.4f}, Blend 60/40={blend_macro:.4f}")
print(f"LB 0.8522 (рекорд)")
print(f"{'='*60}")

In [None]:
# ============================================================
# CELL 2: NN v4 — RankGauss + Input Dropout
# ============================================================
# Start of the cell's wall-clock timer.
t_total = time.time()

# --- RankGauss instead of StandardScaler ---
log_msg("Fitting QuantileTransformer (RankGauss)...")
qt = QuantileTransformer(
    n_quantiles=1000,
    output_distribution='normal',
    random_state=RANDOM_SEED,
)
# Quantiles are learned on the train matrix only; the test matrix is mapped
# through the already-fitted transformer.
X_train_rg = qt.fit_transform(X_l2_train).astype(np.float32)
X_test_rg = qt.transform(X_l2_test).astype(np.float32)
rg_low, rg_high = X_train_rg.min(), X_train_rg.max()
print(f"RankGauss range: [{rg_low:.2f}, {rg_high:.2f}]")

# --- Модель с Input Dropout ---
class L2NetV4(nn.Module):
    """MLP with three hidden layers that scores all targets at once.

    v4 = v3 + RankGauss + Input Dropout 0.10. The RankGauss scaling is not part
    of this module; the caller is expected to feed already-transformed features.

    Layout: input dropout -> LayerNorm -> three Linear/LayerNorm/SiLU/Dropout
    stages (the second one with a half-weighted projected skip connection from
    the first) -> Linear head with one output per target.

    Args:
        in_dim: Number of input features.
        h1: Width of the first hidden layer.
        h2: Width of the second hidden layer.
        h3: Width of the third hidden layer.
        n_targets: Number of outputs (one per target).
        drop_input: Dropout probability applied to the raw input features.
        drop1: Dropout probability after the first hidden layer.
        drop2: Dropout probability after the second hidden layer.
        drop3: Dropout probability after the third hidden layer.
    """

    def __init__(self, in_dim=205, h1=512, h2=256, h3=128, n_targets=41,
                 drop_input=0.10, drop1=0.3, drop2=0.25, drop3=0.2):
        super().__init__()
        self.input_drop = nn.Dropout(drop_input)
        self.input_norm = nn.LayerNorm(in_dim)
        self.fc1 = nn.Linear(in_dim, h1)
        self.ln1 = nn.LayerNorm(h1)
        self.fc2 = nn.Linear(h1, h2)
        self.ln2 = nn.LayerNorm(h2)
        # Projects the first hidden state to width h2 so it can be added as a skip.
        self.skip_proj = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, h3)
        self.ln3 = nn.LayerNorm(h3)
        self.classifier = nn.Linear(h3, n_targets)
        self.drop1 = nn.Dropout(drop1)
        self.drop2 = nn.Dropout(drop2)
        self.drop3 = nn.Dropout(drop3)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (pre-sigmoid) scores of the classifier head for ``x``.

        Args:
            x: Feature tensor whose last dimension equals ``in_dim``.

        Returns:
            Tensor whose last dimension equals ``n_targets``; no sigmoid is applied.
        """
        # Dropout is applied to the features before they are normalised.
        x = self.input_drop(x)
        x = self.input_norm(x)
        h1 = self.drop1(F.silu(self.ln1(self.fc1(x))))
        h2 = self.ln2(self.fc2(h1))
        # Skip connection from h1, down-weighted by 0.5, added before the activation.
        h2 = self.drop2(F.silu(h2 + self.skip_proj(h1) * 0.5))
        h3 = self.drop3(F.silu(self.ln3(self.fc3(h2))))
        return self.classifier(h3)

# --- Hyperparameters ---
N_EPOCHS = 60
BATCH = 512
LR = 0.001
WD = 1e-5
PATIENCE = 15  # epochs without a validation macro-AUC gain before a fold stops

# Size the network and the prediction buffers from the data itself instead of
# relying on the hard-coded 205-feature / 41-target defaults.
n_features = X_train_rg.shape[1]
n_targets = y_train_arr.shape[1]

print(f"\n{'='*60}")
print(f"L2 NN v4: RankGauss + InputDrop(0.10)")
print(f"{N_FOLDS_L2}-fold, {N_EPOCHS} ep, patience={PATIENCE}, batch={BATCH}, lr={LR}")
print(f"{'='*60}")

# OOF predictions are filled fold by fold; test predictions are the fold average.
oof_l2_nn_v4 = np.zeros((len(X_train_rg), n_targets), dtype=np.float32)
test_l2_nn_v4 = np.zeros((len(X_test_rg), n_targets), dtype=np.float32)
X_te_tensor = torch.FloatTensor(X_test_rg).to(device)
y_tensor_all = torch.FloatTensor(y_train_arr.astype(np.float32))
# Folds are stratified on the first target column only.
skf = StratifiedKFold(n_splits=N_FOLDS_L2, shuffle=True, random_state=RANDOM_SEED)
fold_aucs = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_rg, y_train_arr[:, 0])):
    t0 = time.time()
    # Seed torch per fold so weight init, batch shuffling and dropout masks are
    # repeatable between runs (previously only the fold split was seeded).
    torch.manual_seed(RANDOM_SEED + fold)

    X_tr = torch.FloatTensor(X_train_rg[tr_idx]).to(device)
    y_tr = y_tensor_all[tr_idx].to(device)
    X_val = torch.FloatTensor(X_train_rg[val_idx]).to(device)
    train_dl = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH, shuffle=True)

    model = L2NetV4(in_dim=n_features, n_targets=n_targets).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
    # The one-cycle schedule is stepped once per batch and sized for the full
    # N_EPOCHS; early stopping may end a fold before the cycle completes.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=LR, epochs=N_EPOCHS,
        steps_per_epoch=len(train_dl), pct_start=0.3)
    criterion = nn.BCEWithLogitsLoss()
    best_auc = 0; best_state = None; no_improve = 0

    for epoch in range(N_EPOCHS):
        model.train()
        for xb, yb in train_dl:
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

        # Validate on the whole held-out fold in one forward pass.
        model.eval()
        with torch.no_grad():
            val_probs = torch.sigmoid(model(X_val)).cpu().numpy()
        aucs = [roc_auc_score(y_train_arr[val_idx, j], val_probs[:, j]) for j in range(n_targets)]
        macro = np.mean(aucs)
        if macro > best_auc:
            best_auc = macro
            # Snapshot the weights on CPU so the best epoch can be restored later.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1

        if (epoch + 1) % 10 == 0 or no_improve >= PATIENCE:
            log_msg(f"  Fold {fold} ep {epoch+1}: AUC={macro:.4f} (best={best_auc:.4f}, no_imp={no_improve})")
        if no_improve >= PATIENCE:
            break

    # Predict with the best-epoch weights, not the last-epoch ones.
    model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        oof_l2_nn_v4[val_idx] = torch.sigmoid(model(X_val)).cpu().numpy()
        test_l2_nn_v4 += torch.sigmoid(model(X_te_tensor)).cpu().numpy() / N_FOLDS_L2

    fold_aucs.append(best_auc)
    log_msg(f"Fold {fold}: AUC={best_auc:.4f}, time={time.time()-t0:.0f}s")
    # Release per-fold tensors and the model before the next fold allocates its own.
    del X_tr, y_tr, X_val, model, optimizer, scheduler, best_state
    torch.cuda.empty_cache(); gc.collect()

# --- Result: macro AUC of the full v4 OOF matrix versus v3 ---
v4_macro = np.mean([
    roc_auc_score(y_train_arr[:, col], oof_l2_nn_v4[:, col])
    for col in range(41)
])
print(f"\n{'='*60}")
print(f"NN v4 OOF Macro AUC: {v4_macro:.4f} (v3 was {nn_v3_macro:.4f}, diff={v4_macro-nn_v3_macro:+.4f})")
print(f"Per-fold: {[f'{a:.4f}' for a in fold_aucs]}")

# Two-way blend with XGB: w is the XGB weight, the remainder goes to NN v4.
for w in (0.5, 0.6, 0.7):
    bl = w * oof_l2_xgb + (1-w) * oof_l2_nn_v4
    bl_auc = np.mean([
        roc_auc_score(y_train_arr[:, col], bl[:, col])
        for col in range(41)
    ])
    print(f"Blend {w:.0%}/{1-w:.0%} XGB+NNv4: {bl_auc:.4f}")

elapsed_min = (time.time()-t_total)/60
print(f"\nTotal time: {elapsed_min:.1f} min")

In [None]:
# ============================================================
# CELL 3: 3-way blend analysis + save v4
# ============================================================

print("=== 3-way blend: XGB + NN_v3 + NN_v4 ===")
# Grid over the XGB and NN v3 weights; NN v4 receives whatever is left.
for w_xgb in (0.55, 0.60, 0.65):
    for w_v3 in (0.10, 0.15, 0.20):
        w_v4 = round(1.0 - w_xgb - w_v3, 2)
        # Combinations that leave v4 with less than 5% weight are skipped.
        if w_v4 >= 0.05:
            bl = w_xgb * oof_l2_xgb + w_v3 * oof_l2_nn_v3 + w_v4 * oof_l2_nn_v4
            auc = np.mean([
                roc_auc_score(y_train_arr[:, col], bl[:, col])
                for col in range(41)
            ])
            # Mark blends that beat the 60/40 baseline by more than 0.00005.
            if auc > blend_macro + 0.00005:
                tag = " <<<"
            else:
                tag = ""
            print(f"  XGB={w_xgb:.2f} v3={w_v3:.2f} v4={w_v4:.2f} -> OOF={auc:.4f} (diff={auc-blend_macro:+.5f}){tag}")

# Per-target comparison of NN v4 against NN v3 on the OOF predictions.
print(f"\n=== Per-target: v4 vs v3 ===")
better = 0
worse = 0
for i, tc in enumerate(target_cols):
    a3 = roc_auc_score(y_train_arr[:, i], oof_l2_nn_v3[:, i])
    a4 = roc_auc_score(y_train_arr[:, i], oof_l2_nn_v4[:, i])
    diff = a4 - a3
    # Only differences larger than 0.002 in either direction are listed.
    if abs(diff) > 0.002:
        print(f"  {tc}: v3={a3:.4f} v4={a4:.4f} diff={diff:+.4f}")
    if diff > 0:
        better += 1
    else:
        # Any non-improvement, including an exact tie, is tallied as "worse".
        worse += 1
print(f"v4 лучше: {better}/41, v4 хуже: {worse}/41")

# Persist the v4 OOF and test predictions in the L2 artifacts folder.
oof_v4_path = f'{ART_L2}/oof_l2_nn_v4.npy'
test_v4_path = f'{ART_L2}/test_l2_nn_v4.npy'
np.save(oof_v4_path, oof_l2_nn_v4)
np.save(test_v4_path, test_l2_nn_v4)
print(f"\nv4 артефакты сохранены в {ART_L2}/")

In [None]:
# ============================================================
# CELL 4: Submission — 3-way blend
# ============================================================

# Blend weights for the three L2 models: XGB, NN v3, NN v4.
W_XGB, W_V3, W_V4 = 0.55, 0.20, 0.25

test_blend = W_XGB * test_l2_xgb + W_V3 * test_l2_nn_v3 + W_V4 * test_l2_nn_v4
print(f"Test blend: XGB={W_XGB}, v3={W_V3}, v4={W_V4}")
print(f"Test range: [{test_blend.min():.6f}, {test_blend.max():.6f}]")

# Only the id column is read; it supplies the row order of the submission.
test_df = pd.read_parquet(f'{DATA}/test_main_features.parquet', columns=['customer_id'])
sub = pd.DataFrame({'customer_id': test_df['customer_id'].values})
for i, tc in enumerate(target_cols):
    # Rewrite only the leading 'target_' (count=1), never a later occurrence.
    pred_col = tc.replace('target_', 'predict_', 1)
    sub[pred_col] = test_blend[:, i].astype(np.float64)

# Sanity checks on the submission layout before it is written.
assert sub.shape == (250000, 42)
assert all(c.startswith('predict_') for c in sub.columns[1:])
# Check every prediction column; looking only at the first unique dtype would
# let a stray non-float64 column slip through.
assert all(dt == np.float64 for dt in sub.dtypes.iloc[1:])

out_path = f'{DATA}/submission_exp015_3way_blend.parquet'
sub.to_parquet(out_path, index=False)
print(f"\nСохранено: {out_path}")
# The two lines below are recorded results, not values computed in this cell.
print(f"OOF: 0.8487 (vs baseline 0.8482, +0.0005)")
print(f"LB: 0.8522")

del test_df; gc.collect()