In [26]:
!pip -q install -U "tabpfn == 2.2.1"
!pip -q install pytorch-tabnet

# ============================================================
# CELL 1: IMPORTS & CONFIGURATION
# ============================================================

import os, random, gc, warnings
warnings.filterwarnings("ignore")

SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TF_DETERMINISTIC_OPS"] = "1"

random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import pandas as pd
import joblib

import tensorflow as tf
tf.random.set_seed(SEED)
try:
    tf.config.experimental.enable_op_determinism()
except Exception:
    pass

from tensorflow import keras
from tensorflow.keras.layers import (
    Input, Dense, GRU, Bidirectional, Layer,
    Concatenate, Dropout, SpatialDropout1D, BatchNormalization,
    GaussianNoise, Masking
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences

from pathlib import Path
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.optimize import minimize_scalar

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from tabpfn import TabPFNClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import optuna
from optuna.samplers import TPESampler

keras.backend.clear_session()
gc.collect()

print("TF:", tf.__version__, "| PyTorch:", torch.__version__, "| SEED =", SEED)

DATA_PATH = Path("/kaggle/input/project/mallorn-astronomical-classification-challenge")

BANDS = ["u", "g", "r", "i", "z", "y"]
BAND_MAP = {b: i for i, b in enumerate(BANDS)}

MAX_SEQ_LEN = 300
N_FEATURES_PER_STEP = 4

MODEL_DIR = "saved_models"
os.makedirs(MODEL_DIR, exist_ok=True)

GBM_MODEL_DIR = "/kaggle/working/models"
os.makedirs(GBM_MODEL_DIR, exist_ok=True)

FEAT_TRAIN_PKL = "/kaggle/input/2d-gp-features/kaggle/working/cache/train_features_2dgp_gpy.pkl"
FEAT_TEST_PKL  = "/kaggle/input/2d-gp-features/kaggle/working/cache/test_features_2dgp_gpy.pkl"

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# SelectKBest configuration (cho cả NN meta và GBM)
USE_SELECTKBEST = False   # Set False to disable feature selection
K_BEST_RATIO = 0.8       # Keep 80% of features (or use absolute number if < 1)
K_BEST_MIN = 200         # Minimum number of features to keep
K_BEST_MAX = None        # Maximum number of features (None = no limit)

# Device for PyTorch models
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"PyTorch models will use device: {DEVICE}")

print("Config done!")

TF: 2.19.0 | PyTorch: 2.8.0+cu126 | SEED = 42
PyTorch models will use device: cpu
Config done!


In [27]:
# ============================================================
# CELL 2: LOAD DATA
# ============================================================

print("\n--- Loading Data ---")

train_log = pd.read_csv(DATA_PATH / "train_log.csv")
test_log  = pd.read_csv(DATA_PATH / "test_log.csv")

def load_lc(split, kind):
    return pd.read_csv(DATA_PATH / split / f"{kind}_full_lightcurves.csv")

train_lc = pd.concat([load_lc(s, "train") for s in train_log["split"].unique()], ignore_index=True)
test_lc  = pd.concat([load_lc(s, "test")  for s in test_log["split"].unique()],  ignore_index=True)

train_feat = pd.read_pickle(FEAT_TRAIN_PKL)
test_feat  = pd.read_pickle(FEAT_TEST_PKL)

y = train_log["target"].values.astype(np.int32)

print(f"Train LC: {train_lc.shape}, Test LC: {test_lc.shape}")
print(f"Train objects: {len(train_log)}, Test objects: {len(test_log)}")
print(f"TDE count: {train_log['target'].sum()}, TDE ratio: {train_log['target'].mean()*100:.2f}%")


--- Loading Data ---
Train LC: (479384, 5), Test LC: (1145125, 5)
Train objects: 3043, Test objects: 7135
TDE count: 148, TDE ratio: 4.86%


In [28]:
# ============================================================
# CELL 8: TRAIN LIGHTGBM (WITH BEST-FOLD SUBMISSION)
# ============================================================
import numpy as np
import pandas as pd
import joblib
from itertools import combinations
from scipy.stats import rankdata
from scipy.optimize import minimize_scalar

print("\n" + "="*60)
print("TRAINING LIGHTGBM")
print("="*60)

lgb_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "max_depth": 7,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_child_samples": 20,
    "scale_pos_weight": scale_pos_weight,
    "verbose": -1,
    "seed": SEED,
    "n_jobs": -1,
    "feature_fraction_seed": SEED,
    "bagging_seed": SEED,
    "data_random_seed": SEED,
}

lgb_oof = np.zeros(len(y))
lgb_test = np.zeros(len(X_test))
lgb_models = []
lgb_fold_scores = []
lgb_fold_thresholds = []
lgb_test_folds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold}/{N_FOLDS} ---")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )

    lgb_models.append(model)
    fold_oof = model.predict(X_val)
    lgb_oof[val_idx] = fold_oof

    fold_test = model.predict(X_test)
    lgb_test += fold_test / N_FOLDS
    lgb_test_folds.append(fold_test)

    best_th, best_f1 = 0.5, 0
    for th in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_val, (fold_oof >= th).astype(int))
        if f1 > best_f1:
            best_f1, best_th = f1, th

    lgb_fold_scores.append(best_f1)
    lgb_fold_thresholds.append(best_th)
    print(f"Fold {fold}: F1={best_f1:.4f} at threshold={best_th:.2f}")

def neg_f1(th):
    return -f1_score(y, (lgb_oof >= th).astype(int))

result = minimize_scalar(neg_f1, bounds=(0.1, 0.9), method="bounded")
lgb_best_th = result.x
lgb_best_f1 = -result.fun
lgb_auc = roc_auc_score(y, lgb_oof)

print(f"\nLightGBM CV F1: {np.mean(lgb_fold_scores):.4f} ± {np.std(lgb_fold_scores):.4f}")
print(f"LightGBM Global F1: {lgb_best_f1:.4f} at threshold={lgb_best_th:.2f}")
print(f"LightGBM ROC-AUC: {lgb_auc:.4f}")

# Submission từ fold có F1 cao nhất
best_lgb_fold = int(np.argmax(lgb_fold_scores))
best_lgb_th = float(lgb_fold_thresholds[best_lgb_fold])
best_lgb_test = np.asarray(lgb_test_folds[best_lgb_fold], dtype=float)

lgb_single_preds = (best_lgb_test >= best_lgb_th).astype(int)
sub_lgb_single = pd.DataFrame({
    "object_id": test_log["object_id"],
    "target": lgb_single_preds
})
sub_lgb_single.to_csv("submission_single_fold_LGB.csv", index=False)
print(f"\nSaved single-fold LGB submission (best fold #{best_lgb_fold+1}) -> submission_single_fold_LGB.csv")


TRAINING LIGHTGBM

--- Fold 1/5 ---
Fold 1: F1=0.5263 at threshold=0.19

--- Fold 2/5 ---
Fold 2: F1=0.6250 at threshold=0.30

--- Fold 3/5 ---
Fold 3: F1=0.5067 at threshold=0.12

--- Fold 4/5 ---
Fold 4: F1=0.6667 at threshold=0.34

--- Fold 5/5 ---
Fold 5: F1=0.5758 at threshold=0.35

LightGBM CV F1: 0.5801 ± 0.0597
LightGBM Global F1: 0.5631 at threshold=0.34
LightGBM ROC-AUC: 0.9425

Saved single-fold LGB submission (best fold #4) -> submission_single_fold_LGB.csv


In [29]:
# ============================================================
# CELL 9: FEATURE STATISTICS ANALYSIS
# ============================================================

print("\n" + "="*70)
print("ANALYSIS 1: FEATURE STATISTICS")
print("="*70)

# Analyze original features before scaling
print("\n1. NaN/Inf Analysis:")
nan_counts = pd.DataFrame({
    'feature': feature_names,
    'nan_count': [np.isnan(X_raw[:, i]).sum() for i in range(X_raw.shape[1])],
    'inf_count': [np.isinf(X_raw[:, i]).sum() for i in range(X_raw.shape[1])],
    'zero_count': [(X_raw[:, i] == 0).sum() for i in range(X_raw.shape[1])],
})

nan_counts['nan_pct'] = (nan_counts['nan_count'] / len(X_raw) * 100).round(2)
nan_counts['zero_pct'] = (nan_counts['zero_count'] / len(X_raw) * 100).round(2)

high_nan = nan_counts[nan_counts['nan_pct'] > 50].sort_values('nan_pct', ascending=False)
print(f"\n  Total features: {len(nan_counts)}")
print(f"  Features with >50% NaN: {len(high_nan)}")
if len(high_nan) > 0:
    print(f"  Top 10 features with most NaN:")
    print(high_nan.head(10)[['feature', 'nan_pct', 'zero_pct']].to_string(index=False))

print("\n2. Feature Scale Analysis:")
feature_stats = pd.DataFrame({
    'feature': feature_names,
    'mean': [np.nanmean(X_raw[:, i]) for i in range(X_raw.shape[1])],
    'std': [np.nanstd(X_raw[:, i]) for i in range(X_raw.shape[1])],
    'min': [np.nanmin(X_raw[:, i]) for i in range(X_raw.shape[1])],
    'max': [np.nanmax(X_raw[:, i]) for i in range(X_raw.shape[1])],
    'q25': [np.nanpercentile(X_raw[:, i], 25) for i in range(X_raw.shape[1])],
    'q75': [np.nanpercentile(X_raw[:, i], 75) for i in range(X_raw.shape[1])],
})

# Remove NaN/inf for analysis
feature_stats = feature_stats.replace([np.inf, -np.inf], np.nan)

print(f"  Mean std across features: {feature_stats['std'].mean():.4f}")
print(f"  Max std: {feature_stats['std'].max():.4f}")
print(f"  Min std: {feature_stats['std'].min():.4f}")

# Features with extreme scales
extreme_std = feature_stats[feature_stats['std'] > 1000].sort_values('std', ascending=False)
if len(extreme_std) > 0:
    print(f"\n  Features with VERY HIGH std (>1000): {len(extreme_std)}")
    print(extreme_std.head(10)[['feature', 'std', 'min', 'max']].to_string(index=False))

zero_var = feature_stats[feature_stats['std'] < 1e-6]
if len(zero_var) > 0:
    print(f"\n  Features with ZERO/NEAR-ZERO variance (<1e-6): {len(zero_var)}")
    print(zero_var[['feature', 'std']].head(20).to_string(index=False))

print("\n3. Feature Distribution Skewness:")
from scipy.stats import skew
feature_stats['skewness'] = [skew(X_raw[:, i]) if np.nanstd(X_raw[:, i]) > 1e-6 else 0 
                             for i in range(X_raw.shape[1])]
high_skew = feature_stats[feature_stats['skewness'].abs() > 5].sort_values('skewness', key=abs, ascending=False)
if len(high_skew) > 0:
    print(f"  Features with HIGH skewness (|skew| > 5): {len(high_skew)}")
    print(high_skew.head(10)[['feature', 'skewness', 'mean', 'std']].to_string(index=False))


ANALYSIS 1: FEATURE STATISTICS

1. NaN/Inf Analysis:

  Total features: 308
  Features with >50% NaN: 0

2. Feature Scale Analysis:
  Mean std across features: 3663.5524
  Max std: 1097813.4700
  Min std: 0.0000

  Features with VERY HIGH std (>1000): 7
               feature          std           min          max
         gp2d_ls_ratio 1.097813e+06  6.153001e-08 4.299914e+07
u_gp2d_integrated_flux 2.955390e+03 -3.252791e+04 8.786704e+04
g_gp2d_integrated_flux 2.624058e+03 -2.264464e+04 6.584307e+04
r_gp2d_integrated_flux 2.447970e+03 -1.457523e+04 4.652942e+04
i_gp2d_integrated_flux 2.395894e+03 -1.254920e+04 4.101660e+04
y_gp2d_integrated_flux 2.349733e+03 -1.171562e+04 3.981636e+04
z_gp2d_integrated_flux 2.348133e+03 -1.210663e+04 4.057562e+04

  Features with ZERO/NEAR-ZERO variance (<1e-6): 5
     feature  std
       Z_err  0.0
count_snr_-3  0.0
count_snr_-5  0.0
 frac_snr_-3  0.0
 frac_snr_-5  0.0

3. Feature Distribution Skewness:
  Features with HIGH skewness (|skew| > 5): 11

=> Apply Select Features

In [30]:
# ============================================================
# CELL 7: PREPARE FEATURES WITH SelectKBest (FOR GBM MODELS)
# ============================================================
USE_SELECTKBEST = True
print("\n" + "="*60)
print("LOADING DATA (REUSED IN-MEMORY) FOR GBM")
print("="*60)
K_BEST_RATIO = 0.8
X_df = train_feat.copy()
X_test_df = test_feat.copy()

print(f"Train features: {X_df.shape}")
print(f"Test features: {X_test_df.shape}")
print(f"TDE count: {y.sum()}, TDE ratio: {y.mean()*100:.2f}%")

print("\n--- Preparing GBM Features ---")

if "object_id" in X_df.columns:
    X_df = X_df.drop(columns=["object_id"])
if "object_id" in X_test_df.columns:
    X_test_df = X_test_df.drop(columns=["object_id"])

# Deterministic order
common_cols = sorted(set(X_df.columns) & set(X_test_df.columns))
X_df = X_df[common_cols]
X_test_df = X_test_df[common_cols]

X_df = X_df.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test_df = X_test_df.replace([np.inf, -np.inf], np.nan).fillna(0)

feature_names = X_df.columns.tolist()
X_raw = X_df.values
X_test_raw = X_test_df.values

n_features_original = X_raw.shape[1]
print(f"Original number of GBM features: {n_features_original}")

feature_selector = None
selected_feature_names = feature_names

os.makedirs(GBM_MODEL_DIR, exist_ok=True)

if USE_SELECTKBEST and n_features_original > K_BEST_MIN:
    # Calculate k_best
    if K_BEST_RATIO < 1.0:
        k_best = int(n_features_original * K_BEST_RATIO)
    else:
        k_best = int(K_BEST_RATIO)
    
    # Apply min/max constraints
    k_best = max(K_BEST_MIN, k_best)
    if K_BEST_MAX is not None:
        k_best = min(K_BEST_MAX, k_best)
    k_best = min(k_best, n_features_original)
    
    print(f"\nApplying SelectKBest (GBM): {n_features_original} -> {k_best} features")
    print(f"  (ratio={K_BEST_RATIO}, min={K_BEST_MIN}, max={K_BEST_MAX})")
    
    # Fit SelectKBest on full training data
    feature_selector = SelectKBest(score_func=f_classif, k=k_best)
    X = feature_selector.fit_transform(X_raw, y)
    X_test = feature_selector.transform(X_test_raw)
    
    # Get selected feature names
    selected_mask = feature_selector.get_support()
    selected_feature_names = [feature_names[i] for i in range(len(feature_names)) if selected_mask[i]]
    
    print(f"Selected {len(selected_feature_names)} features for GBM")
    
    # Save selector
    joblib.dump(feature_selector, f"{GBM_MODEL_DIR}/gbm_feature_selector.pkl")
    joblib.dump(selected_feature_names, f"{GBM_MODEL_DIR}/gbm_selected_feature_names.pkl")
    print(f"✅ Saved GBM feature selector and selected feature names")
    
else:
    print(f"\nSkipping SelectKBest for GBM (using all {n_features_original} features)")
    X = X_raw
    X_test = X_test_raw
    
    # Save all feature names
    joblib.dump(selected_feature_names, f"{GBM_MODEL_DIR}/gbm_all_feature_names.pkl")

print(f"\nFinal GBM feature count: {X.shape[1]}")
print(f"Final GBM train shape: {X.shape}, Final GBM test shape: {X_test.shape}")

scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"Scale pos weight: {scale_pos_weight:.2f}")


LOADING DATA (REUSED IN-MEMORY) FOR GBM
Train features: (3043, 309)
Test features: (7135, 309)
TDE count: 148, TDE ratio: 4.86%

--- Preparing GBM Features ---
Original number of GBM features: 308

Applying SelectKBest (GBM): 308 -> 246 features
  (ratio=0.8, min=200, max=None)
Selected 246 features for GBM
✅ Saved GBM feature selector and selected feature names

Final GBM feature count: 246
Final GBM train shape: (3043, 246), Final GBM test shape: (7135, 246)
Scale pos weight: 19.56


In [31]:
# ============================================================
# CELL 8: TRAIN LIGHTGBM (WITH BEST-FOLD SUBMISSION)
# ============================================================
import numpy as np
import pandas as pd
import joblib
from itertools import combinations
from scipy.stats import rankdata
from scipy.optimize import minimize_scalar

print("\n" + "="*60)
print("TRAINING LIGHTGBM")
print("="*60)

lgb_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "max_depth": 7,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_child_samples": 20,
    "scale_pos_weight": scale_pos_weight,
    "verbose": -1,
    "seed": SEED,
    "n_jobs": -1,
    "feature_fraction_seed": SEED,
    "bagging_seed": SEED,
    "data_random_seed": SEED,
}

lgb_oof = np.zeros(len(y))
lgb_test = np.zeros(len(X_test))
lgb_models = []
lgb_fold_scores = []
lgb_fold_thresholds = []
lgb_test_folds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold}/{N_FOLDS} ---")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )

    lgb_models.append(model)
    fold_oof = model.predict(X_val)
    lgb_oof[val_idx] = fold_oof

    fold_test = model.predict(X_test)
    lgb_test += fold_test / N_FOLDS
    lgb_test_folds.append(fold_test)

    best_th, best_f1 = 0.5, 0
    for th in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_val, (fold_oof >= th).astype(int))
        if f1 > best_f1:
            best_f1, best_th = f1, th

    lgb_fold_scores.append(best_f1)
    lgb_fold_thresholds.append(best_th)
    print(f"Fold {fold}: F1={best_f1:.4f} at threshold={best_th:.2f}")

def neg_f1(th):
    return -f1_score(y, (lgb_oof >= th).astype(int))

result = minimize_scalar(neg_f1, bounds=(0.1, 0.9), method="bounded")
lgb_best_th = result.x
lgb_best_f1 = -result.fun
lgb_auc = roc_auc_score(y, lgb_oof)

print(f"\nLightGBM CV F1: {np.mean(lgb_fold_scores):.4f} ± {np.std(lgb_fold_scores):.4f}")
print(f"LightGBM Global F1: {lgb_best_f1:.4f} at threshold={lgb_best_th:.2f}")
print(f"LightGBM ROC-AUC: {lgb_auc:.4f}")

# Submission từ fold có F1 cao nhất
best_lgb_fold = int(np.argmax(lgb_fold_scores))
best_lgb_th = float(lgb_fold_thresholds[best_lgb_fold])
best_lgb_test = np.asarray(lgb_test_folds[best_lgb_fold], dtype=float)

lgb_single_preds = (best_lgb_test >= best_lgb_th).astype(int)
sub_lgb_single = pd.DataFrame({
    "object_id": test_log["object_id"],
    "target": lgb_single_preds
})
sub_lgb_single.to_csv("submission_single_fold_LGB.csv", index=False)
print(f"\nSaved single-fold LGB submission (best fold #{best_lgb_fold+1}) -> submission_single_fold_LGB.csv")


TRAINING LIGHTGBM

--- Fold 1/5 ---
Fold 1: F1=0.5263 at threshold=0.19

--- Fold 2/5 ---
Fold 2: F1=0.6250 at threshold=0.30

--- Fold 3/5 ---
Fold 3: F1=0.5067 at threshold=0.12

--- Fold 4/5 ---
Fold 4: F1=0.6667 at threshold=0.34

--- Fold 5/5 ---
Fold 5: F1=0.5758 at threshold=0.35

LightGBM CV F1: 0.5801 ± 0.0597
LightGBM Global F1: 0.5631 at threshold=0.34
LightGBM ROC-AUC: 0.9425

Saved single-fold LGB submission (best fold #4) -> submission_single_fold_LGB.csv


In [32]:
# ============================================================
# CELL 18: SHAP ANALYSIS FOR WEAKNESS DETECTION
# ============================================================

print("\n" + "="*70)
print("ANALYSIS: SHAP VALUES FOR FEATURE IMPORTANCE")
print("="*70)

try:
    import shap
    print("✅ SHAP library available\n")
except ImportError:
    print("❌ Installing SHAP...")
    !pip install -q shap
    import shap
    print("✅ SHAP installed\n")

# Use a sample for SHAP (tính toán nhanh hơn)
# Hoặc dùng toàn bộ data nếu có đủ thời gian
SAMPLE_SIZE = min(10000, len(X))  # Sample size for SHAP calculation

print(f"Using sample size: {SAMPLE_SIZE} for SHAP calculation")
print("(For full dataset, increase SAMPLE_SIZE or use tree explainer)\n")

# Sample data
sample_idx = np.random.choice(len(X), size=SAMPLE_SIZE, replace=False)
X_sample = X[sample_idx]
y_sample = y[sample_idx]

# Use one of the trained models (best fold)
best_model_idx = np.argmax(lgb_fold_scores)
shap_model = lgb_models[best_model_idx]

print(f"Using model from fold {best_model_idx + 1} (best F1: {lgb_fold_scores[best_model_idx]:.4f})\n")

# Calculate SHAP values
print("Calculating SHAP values...")
explainer = shap.TreeExplainer(shap_model)
shap_values = explainer.shap_values(X_sample)

# shap_values is a list: [shap_values_class_0, shap_values_class_1]
# For binary classification, we'll use class 1 (TDE)
shap_values_class1 = shap_values[1] if isinstance(shap_values, list) else shap_values

print(f"✅ SHAP values calculated: shape {shap_values_class1.shape}\n")

# ============================================================
# 1. GLOBAL FEATURE IMPORTANCE (SHAP)
# ============================================================
print("="*70)
print("1. GLOBAL FEATURE IMPORTANCE (SHAP)")
print("="*70)

# Mean absolute SHAP values = global importance
shap_importance = np.abs(shap_values_class1).mean(axis=0)
shap_importance_df = pd.DataFrame({
    'feature': feature_names[:len(shap_importance)],  # Ensure same length
    'shap_importance': shap_importance,
}).sort_values('shap_importance', ascending=False)

print("\nTop 30 Features by SHAP Importance:")
print(shap_importance_df.head(30).to_string(index=False))

print("\nBottom 20 Features by SHAP Importance (potentially useless):")
print(shap_importance_df.tail(20).to_string(index=False))

# Compare SHAP importance vs LGB gain importance
if 'feature_importance_df' in locals():
    print("\n" + "-"*70)
    print("SHAP vs LGB Gain Importance Comparison:")
    
    comparison_df = shap_importance_df.merge(
        feature_importance_df[['feature', 'importance']],
        on='feature',
        how='left'
    )
    comparison_df['importance_norm'] = (
        comparison_df['importance'] - comparison_df['importance'].min()
    ) / (comparison_df['importance'].max() - comparison_df['importance'].min() + 1e-10)
    comparison_df['shap_norm'] = (
        comparison_df['shap_importance'] - comparison_df['shap_importance'].min()
    ) / (comparison_df['shap_importance'].max() - comparison_df['shap_importance'].min() + 1e-10)
    
    corr_shap_lgb = np.corrcoef(
        comparison_df['shap_norm'], 
        comparison_df['importance_norm']
    )[0, 1]
    
    print(f"   Correlation: {corr_shap_lgb:.4f}")
    
    if corr_shap_lgb < 0.5:
        print(f"   ⚠️  LOW correlation! Different features are important from SHAP perspective!")
    
    # Features important in SHAP but not in LGB
    comparison_df['importance_diff'] = comparison_df['shap_norm'] - comparison_df['importance_norm']
    underused_shap = comparison_df.nlargest(20, 'importance_diff')
    
    print(f"\n   Top 20 Features MORE Important in SHAP than LGB Gain:")
    print(underused_shap[['feature', 'shap_importance', 'importance', 'importance_diff']].to_string(index=False))
    print(f"   → These features may be underutilized by current model!")

# ============================================================
# 2. FEATURE IMPACT BY CLASS
# ============================================================
print("\n" + "="*70)
print("2. FEATURE IMPACT BY CLASS (TDE vs Non-TDE)")
print("="*70)

# Separate SHAP values by actual class
tde_shap = shap_values_class1[y_sample == 1]
nontde_shap = shap_values_class1[y_sample == 0]

print(f"\nTDE samples: {len(tde_shap)}, Non-TDE samples: {len(nontde_shap)}")

tde_mean_shap = np.abs(tde_shap).mean(axis=0)
nontde_mean_shap = np.abs(nontde_shap).mean(axis=0)

class_impact_df = pd.DataFrame({
    'feature': feature_names[:len(tde_mean_shap)],
    'shap_TDE': tde_mean_shap,
    'shap_NonTDE': nontde_mean_shap,
    'shap_diff': tde_mean_shap - nontde_mean_shap,
}).sort_values('shap_diff', ascending=False)

print("\nTop 20 Features MORE Important for TDE predictions:")
print(class_impact_df.head(20)[['feature', 'shap_TDE', 'shap_NonTDE', 'shap_diff']].to_string(index=False))

print("\nTop 20 Features MORE Important for Non-TDE predictions:")
print(class_impact_df.tail(20)[['feature', 'shap_TDE', 'shap_NonTDE', 'shap_diff']].to_string(index=False))

# Features with very different importance between classes
class_specific = class_impact_df[np.abs(class_impact_df['shap_diff']) > 0.01].sort_values('shap_diff', key=abs, ascending=False)
print(f"\n   Found {len(class_specific)} features with CLASS-SPECIFIC importance:")
print(class_specific.head(15)[['feature', 'shap_TDE', 'shap_NonTDE', 'shap_diff']].to_string(index=False))
print(f"   → These features behave differently for TDE vs Non-TDE")

# ============================================================
# 3. MISCLASSIFICATION ANALYSIS WITH SHAP
# ============================================================
print("\n" + "="*70)
print("3. MISCLASSIFICATION ANALYSIS WITH SHAP")
print("="*70)

# Get predictions for sample
sample_preds = shap_model.predict(X_sample)
sample_preds_binary = (sample_preds >= lgb_fold_thresholds[best_model_idx]).astype(int)
sample_y = y_sample

# Find misclassifications
fp_mask = (sample_preds_binary == 1) & (sample_y == 0)
fn_mask = (sample_preds_binary == 0) & (sample_y == 1)
tp_mask = (sample_preds_binary == 1) & (sample_y == 1)
tn_mask = (sample_preds_binary == 0) & (sample_y == 0)

print(f"\nFP (False Positives): {fp_mask.sum()}")
print(f"FN (False Negatives): {fn_mask.sum()}")
print(f"TP (True Positives): {tp_mask.sum()}")
print(f"TN (True Negatives): {tn_mask.sum()}")

if fp_mask.sum() > 0:
    fp_shap = shap_values_class1[fp_mask]
    tn_shap = shap_values_class1[tn_mask][:min(fp_mask.sum(), tn_mask.sum())]
    
    fp_mean_shap = np.abs(fp_shap).mean(axis=0)
    tn_mean_shap = np.abs(tn_shap).mean(axis=0)
    
    fp_analysis = pd.DataFrame({
        'feature': feature_names[:len(fp_mean_shap)],
        'fp_shap': fp_mean_shap,
        'tn_shap': tn_mean_shap,
        'diff': fp_mean_shap - tn_mean_shap,
    }).sort_values('diff', ascending=False)
    
    print("\n   Top 15 Features causing False Positives (confusing Non-TDE as TDE):")
    print(fp_analysis.head(15)[['feature', 'fp_shap', 'tn_shap', 'diff']].to_string(index=False))

if fn_mask.sum() > 0:
    fn_shap = shap_values_class1[fn_mask]
    tp_shap = shap_values_class1[tp_mask][:min(fn_mask.sum(), tp_mask.sum())]
    
    fn_mean_shap = np.abs(fn_shap).mean(axis=0)
    tp_mean_shap = np.abs(tp_shap).mean(axis=0)
    
    fn_analysis = pd.DataFrame({
        'feature': feature_names[:len(fn_mean_shap)],
        'fn_shap': fn_mean_shap,
        'tp_shap': tp_mean_shap,
        'diff': fn_mean_shap - tp_mean_shap,
    }).sort_values('diff', ascending=False)
    
    print("\n   Top 15 Features causing False Negatives (missing TDE):")
    print(fn_analysis.head(15)[['feature', 'fn_shap', 'tp_shap', 'diff']].to_string(index=False))

# ============================================================
# 4. FEATURE VALUES THAT CONFUSE MODEL
# ============================================================
print("\n" + "="*70)
print("4. FEATURE VALUES THAT CAUSE CONFUSION")
print("="*70)

# For misclassified samples, check if SHAP values are unusually high/low
if fp_mask.sum() > 0 or fn_mask.sum() > 0:
    misclassified_shap = shap_values_class1[fp_mask | fn_mask]
    correctly_classified_shap = shap_values_class1[tp_mask | tn_mask]
    
    if len(misclassified_shap) > 0 and len(correctly_classified_shap) > 0:
        misclassified_mean = misclassified_shap.mean(axis=0)
        correct_mean = correctly_classified_shap.mean(axis=0)
        
        confusion_df = pd.DataFrame({
            'feature': feature_names[:len(misclassified_mean)],
            'misclassified_shap': misclassified_mean,
            'correct_shap': correct_mean,
            'shap_diff': misclassified_mean - correct_mean,
        }).sort_values('shap_diff', key=abs, ascending=False)
        
        print("\n   Features with DIFFERENT SHAP values for misclassified vs correctly classified:")
        print(confusion_df.head(20)[['feature', 'misclassified_shap', 'correct_shap', 'shap_diff']].to_string(index=False))

print("\n" + "="*70)
print("KEY INSIGHTS FROM SHAP")
print("="*70)

insights = []

if 'underused_shap' in locals() and len(underused_shap) > 10:
    insights.append(f"❌ {len(underused_shap)} features are important in SHAP but underused by LGB gain importance")
    insights.append("   → Model may benefit from feature engineering or different algorithm")

if len(class_specific) > 10:
    insights.append(f"⚠️  {len(class_specific)} features have class-specific importance")
    insights.append("   → Consider class-specific feature engineering or weighted features")

if fp_mask.sum() > 5 or fn_mask.sum() > 5:
    insights.append(f"⚠️  Misclassifications found: {fp_mask.sum()} FP, {fn_mask.sum()} FN")
    insights.append("   → Check feature values and interactions for these cases")

if len(insights) == 0:
    print("\n✅ SHAP analysis shows consistent feature importance")
else:
    print("\n")
    for i, insight in enumerate(insights, 1):
        print(f"{i}. {insight}")

print("\n" + "="*70)

# Save SHAP importance for later use
shap_importance_df.to_csv(f"{GBM_MODEL_DIR}/shap_importance.csv", index=False)
print(f"\n✅ Saved SHAP importance to {GBM_MODEL_DIR}/shap_importance.csv")


ANALYSIS: SHAP VALUES FOR FEATURE IMPORTANCE
✅ SHAP library available

Using sample size: 3043 for SHAP calculation
(For full dataset, increase SAMPLE_SIZE or use tree explainer)

Using model from fold 4 (best F1: 0.6667)

Calculating SHAP values...
✅ SHAP values calculated: shape (3043, 246)

1. GLOBAL FEATURE IMPORTANCE (SHAP)

Top 30 Features by SHAP Importance:
                feature  shap_importance
i_gp2d_peaks_pos_frac_2         0.488558
    i_gp2d_time_fwd_0.5         0.420239
              i_snr_max         0.355115
    i_gp2d_decline_rate         0.312741
           flux_std_all         0.203499
         flux_range_all         0.187759
           flux_max_all         0.181696
    r_gp2d_time_bwd_0.8         0.167534
      count_max_fall_30         0.159222
             r_flux_std         0.157541
             i_flux_min         0.151830
    g_gp2d_time_fwd_0.5         0.148489
  i_gp2d_positive_width         0.137879
               g_pct_25         0.131938
       g_gp2d_pe

In [33]:
# ============================================================
# CELL 19: SHAP-BASED FEATURE OPTIMIZATION
# ============================================================

print("\n" + "="*70)
print("SHAP-BASED FEATURE OPTIMIZATION")
print("="*70)

# Load SHAP importance if not in memory
import os
if 'shap_importance_df' not in locals():
    if os.path.exists(f"{GBM_MODEL_DIR}/shap_importance.csv"):
        shap_importance_df = pd.read_csv(f"{GBM_MODEL_DIR}/shap_importance.csv")
        print("✅ Loaded SHAP importance from file")
    else:
        print("❌ SHAP importance not found. Run CELL 18 first!")
        raise FileNotFoundError("Please run CELL 18 (SHAP Analysis) first")

print(f"\nTotal features (after SelectKBest): {len(shap_importance_df)}")

# Get original feature data (before SelectKBest transformation)
# We need to work with original X_df, X_test_df
X_df_orig = train_feat.copy()
X_test_df_orig = test_feat.copy()

if "object_id" in X_df_orig.columns:
    X_df_orig = X_df_orig.drop(columns=["object_id"])
if "object_id" in X_test_df_orig.columns:
    X_test_df_orig = X_test_df_orig.drop(columns=["object_id"])

common_cols_orig = sorted(set(X_df_orig.columns) & set(X_test_df_orig.columns))
X_df_orig = X_df_orig[common_cols_orig]
X_test_df_orig = X_test_df_orig[common_cols_orig]

X_df_orig = X_df_orig.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test_df_orig = X_test_df_orig.replace([np.inf, -np.inf], np.nan).fillna(0)

# ============================================================
# STEP 1: REMOVE USELESS FEATURES (SHAP importance < threshold)
# ============================================================
print("\n" + "-"*70)
print("STEP 1: REMOVE USELESS FEATURES")
print("-"*70)

# Threshold: remove features with SHAP importance < 0.001
SHAP_THRESHOLD = 0.001

useless_features = shap_importance_df[
    shap_importance_df['shap_importance'] < SHAP_THRESHOLD
]['feature'].tolist()

print(f"Features to remove (SHAP < {SHAP_THRESHOLD}): {len(useless_features)}")
if len(useless_features) > 0:
    print(f"  Examples: {useless_features[:10]}")

# Remove from original datasets
X_df_cleaned = X_df_orig.drop(columns=[f for f in useless_features if f in X_df_orig.columns], errors='ignore')
X_test_df_cleaned = X_test_df_orig.drop(columns=[f for f in useless_features if f in X_test_df_orig.columns], errors='ignore')

print(f"\n✅ Removed {len(X_df_orig.columns) - len(X_df_cleaned.columns)} useless features")
print(f"   Remaining features: {len(X_df_cleaned.columns)}")

# ============================================================
# STEP 2: CREATE FEATURE INTERACTIONS FOR TOP FEATURES
# ============================================================
print("\n" + "-"*70)
print("STEP 2: CREATE FEATURE INTERACTIONS FOR TOP FEATURES")
print("-"*70)

# Get top 10 features by SHAP importance
top_shap_features = shap_importance_df.head(10)['feature'].tolist()
print(f"\nTop 10 SHAP features:")
for i, feat in enumerate(top_shap_features, 1):
    shap_val = shap_importance_df[shap_importance_df['feature'] == feat]['shap_importance'].iloc[0]
    print(f"  {i}. {feat}: {shap_val:.4f}")

# Create interactions between top features (multiplicative)
print(f"\nCreating interactions for top i-band features...")
interaction_count = 0

# Focus on i-band features (they're most important)
i_band_features = [f for f in top_shap_features if f.startswith('i_')]
other_top_features = [f for f in top_shap_features if not f.startswith('i_')]

# Create interactions: top i-feature × top other feature
for i_feat in i_band_features[:3]:  # Top 3 i-features
    if i_feat not in X_df_cleaned.columns:
        continue
    
    for other_feat in other_top_features[:3]:  # Top 3 other features
        if other_feat not in X_df_cleaned.columns:
            continue
        
        # Skip self-interaction
        if i_feat == other_feat:
            continue
        
        # Create multiplicative interaction
        interaction_name = f"{i_feat}_x_{other_feat}"
        
        # Normalize before multiplication to avoid extreme values
        i_norm = (X_df_cleaned[i_feat] - X_df_cleaned[i_feat].mean()) / (X_df_cleaned[i_feat].std() + 1e-10)
        other_norm = (X_df_cleaned[other_feat] - X_df_cleaned[other_feat].mean()) / (X_df_cleaned[other_feat].std() + 1e-10)
        
        X_df_cleaned[interaction_name] = i_norm * other_norm
        X_test_df_cleaned[interaction_name] = (
            (X_test_df_cleaned[i_feat] - X_df_cleaned[i_feat].mean()) / (X_df_cleaned[i_feat].std() + 1e-10) *
            (X_test_df_cleaned[other_feat] - X_df_cleaned[other_feat].mean()) / (X_df_cleaned[other_feat].std() + 1e-10)
        )
        interaction_count += 1

print(f"✅ Created {interaction_count} feature interactions")

# ============================================================
# STEP 3: CREATE CLASS-SPECIFIC FEATURES (for TDE)
# ============================================================
print("\n" + "-"*70)
print("STEP 3: CREATE CLASS-SPECIFIC FEATURES")
print("-"*70)

# Features important for TDE predictions (from SHAP analysis)
tde_important_features = [
    'i_gp2d_decline_rate',
    'flux_range_all',
    'i_snr_max',
    'i_gp2d_time_fwd_0.5',
]

print(f"\nCreating weighted features for TDE-important features...")
class_specific_count = 0

for feat in tde_important_features:
    if feat not in X_df_cleaned.columns:
        continue
    
    # Create squared feature (non-linear transformation)
    X_df_cleaned[f"{feat}_squared"] = X_df_cleaned[feat] ** 2
    X_test_df_cleaned[f"{feat}_squared"] = X_test_df_cleaned[feat] ** 2
    
    # Create log feature if positive
    if (X_df_cleaned[feat] > 0).all():
        X_df_cleaned[f"{feat}_log"] = np.log1p(X_df_cleaned[feat])
        X_test_df_cleaned[f"{feat}_log"] = np.log1p(X_test_df_cleaned[feat])
        class_specific_count += 1
    
    class_specific_count += 1

print(f"✅ Created {class_specific_count} class-specific features")

# ============================================================
# STEP 4: CREATE ANTI-FP FEATURES (to reduce False Positives)
# ============================================================
print("\n" + "-"*70)
print("STEP 4: CREATE ANTI-FP FEATURES")
print("-"*70)

# Features that cause False Positives (from SHAP analysis)
fp_features = [
    'i_gp2d_time_fwd_0.5',
    'i_gp2d_decline_rate',
    'flux_range_all',
    'u_gp2d_abs_diff',
]

print(f"\nCreating anti-FP features (to reduce false positives)...")
anti_fp_count = 0

for feat in fp_features:
    if feat not in X_df_cleaned.columns:
        continue
    
    # Create threshold-based feature: is feature value > median?
    median_val = X_df_cleaned[feat].median()
    X_df_cleaned[f"{feat}_high"] = (X_df_cleaned[feat] > median_val).astype(float)
    X_test_df_cleaned[f"{feat}_high"] = (X_test_df_cleaned[feat] > median_val).astype(float)
    
    anti_fp_count += 1

print(f"✅ Created {anti_fp_count} anti-FP features")

# ============================================================
# STEP 5: CREATE ANTI-FN FEATURES (to reduce False Negatives)
# ============================================================
print("\n" + "-"*70)
print("STEP 5: CREATE ANTI-FN FEATURES")
print("-"*70)

# Features important for FN cases (from SHAP analysis)
fn_features = [
    'u_gp2d_abs_diff',
    'i_snr_mean',
    'Z',
]

print(f"\nCreating anti-FN features (to reduce false negatives)...")
anti_fn_count = 0

for feat in fn_features:
    if feat not in X_df_cleaned.columns:
        continue
    
    # Create threshold-based feature
    median_val = X_df_cleaned[feat].median()
    X_df_cleaned[f"{feat}_low"] = (X_df_cleaned[feat] < median_val).astype(float)
    X_test_df_cleaned[f"{feat}_low"] = (X_test_df_cleaned[feat] < median_val).astype(float)
    
    anti_fn_count += 1

print(f"✅ Created {anti_fn_count} anti-FN features")

# ============================================================
# STEP 6: FINAL CLEANUP
# ============================================================
print("\n" + "-"*70)
print("STEP 6: FINAL CLEANUP")
print("-"*70)

# Ensure same columns
common_cols = sorted(set(X_df_cleaned.columns) & set(X_test_df_cleaned.columns))
X_df_cleaned = X_df_cleaned[common_cols]
X_test_df_cleaned = X_test_df_cleaned[common_cols]

print(f"\nFinal feature count: {len(common_cols)}")
print(f"  - Original: {len(X_df_orig.columns)}")
print(f"  - Removed useless: {len(useless_features)}")
print(f"  - Added (interactions + transforms): {len(common_cols) - (len(X_df_orig.columns) - len([f for f in useless_features if f in X_df_orig.columns]))}")

# Replace NaN/Inf
X_df_cleaned = X_df_cleaned.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test_df_cleaned = X_test_df_cleaned.replace([np.inf, -np.inf], np.nan).fillna(0)

# Update feature names and data
feature_names_optimized = X_df_cleaned.columns.tolist()
X_raw_optimized = X_df_cleaned.values
X_test_raw_optimized = X_test_df_cleaned.values

print(f"\n✅ Feature optimization completed!")
print(f"   Optimized shape: Train {X_raw_optimized.shape}, Test {X_test_raw_optimized.shape}")

# Save optimized feature names
joblib.dump(feature_names_optimized, f"{GBM_MODEL_DIR}/optimized_feature_names.pkl")
print(f"✅ Saved optimized feature names")


SHAP-BASED FEATURE OPTIMIZATION

Total features (after SelectKBest): 246

----------------------------------------------------------------------
STEP 1: REMOVE USELESS FEATURES
----------------------------------------------------------------------
Features to remove (SHAP < 0.001): 24
  Examples: ['y_flux_min', 'gp2d_peak_dt_r_z', 'i_gp2d_time_bwd_0.5', 'i_gp2d_time_bwd_0.8', 'snr_median', 'r_gp2d_peaks_pos_count', 'i_gp2d_negative_width', 'i_gp2d_rise_decline_ratio', 'y_flux_mean', 'u_gp2d_fwhm']

✅ Removed 24 useless features
   Remaining features: 284

----------------------------------------------------------------------
STEP 2: CREATE FEATURE INTERACTIONS FOR TOP FEATURES
----------------------------------------------------------------------

Top 10 SHAP features:
  1. i_gp2d_peaks_pos_frac_2: 0.4886
  2. i_gp2d_time_fwd_0.5: 0.4202
  3. i_snr_max: 0.3551
  4. i_gp2d_decline_rate: 0.3127
  5. flux_std_all: 0.2035
  6. flux_range_all: 0.1878
  7. flux_max_all: 0.1817
  8. r_gp2d_t

In [34]:
# ============================================================
# CELL 20: TRAIN LIGHTGBM WITH OPTIMIZED FEATURES
# ============================================================

print("\n" + "="*60)
print("TRAINING LIGHTGBM (SHAP-OPTIMIZED FEATURES)")
print("="*60)

# Use optimized features
X_raw = X_raw_optimized
X_test_raw = X_test_raw_optimized
feature_names = feature_names_optimized
n_features_original = X_raw.shape[1]

print(f"\nUsing optimized features: {n_features_original} features")

# Apply Robust Scaling
print("\n--- Applying Robust Scaling ---")
scaler_robust_opt = RobustScaler()
X_scaled = scaler_robust_opt.fit_transform(X_raw)
X_test_scaled = scaler_robust_opt.transform(X_test_raw)

print(f"Scaled features - Train: {X_scaled.shape}, Test: {X_test_scaled.shape}")

# Save scaler
joblib.dump(scaler_robust_opt, f"{GBM_MODEL_DIR}/robust_scaler_optimized.pkl")

# Use scaled features
X = X_scaled
X_test = X_test_scaled

# ============================================================
# TRAIN LIGHTGBM WITH OPTIMIZED HYPERPARAMETERS
# ============================================================

# Tuned parameters based on SHAP insights
lgb_params_optimized = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "max_depth": 7,
    "learning_rate": 0.03,  # Lower LR for better convergence
    "feature_fraction": 0.7,  # Lower - focus on top features
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_child_samples": 30,  # Higher - more regularization
    "scale_pos_weight": scale_pos_weight,
    "lambda_l1": 0.1,  # L1 regularization
    "lambda_l2": 0.1,  # L2 regularization
    "verbose": -1,
    "seed": SEED,
    "n_jobs": -1,
    "feature_fraction_seed": SEED,
    "bagging_seed": SEED,
    "data_random_seed": SEED,
}

print(f"\nScale pos weight: {scale_pos_weight:.2f}")

lgb_oof_opt = np.zeros(len(y))
lgb_test_opt = np.zeros(len(X_test))
lgb_models_opt = []
lgb_fold_scores_opt = []
lgb_fold_thresholds_opt = []
lgb_test_folds_opt = []

print("\n--- Training LightGBM with 5-fold CV (Optimized) ---")
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold}/{N_FOLDS} ---")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params_optimized,
        train_data,
        num_boost_round=3000,  # More rounds with lower LR
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(150, verbose=False)],
    )

    lgb_models_opt.append(model)
    fold_oof = model.predict(X_val)
    lgb_oof_opt[val_idx] = fold_oof

    fold_test = model.predict(X_test)
    lgb_test_opt += fold_test / N_FOLDS
    lgb_test_folds_opt.append(fold_test)

    best_th, best_f1 = 0.5, 0
    for th in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_val, (fold_oof >= th).astype(int))
        if f1 > best_f1:
            best_f1, best_th = f1, th

    lgb_fold_scores_opt.append(best_f1)
    lgb_fold_thresholds_opt.append(best_th)
    print(f"Fold {fold}: F1={best_f1:.4f} at threshold={best_th:.2f}")

# Find best global threshold
def neg_f1(th):
    return -f1_score(y, (lgb_oof_opt >= th).astype(int))

result = minimize_scalar(neg_f1, bounds=(0.1, 0.9), method="bounded")
lgb_best_th_opt = result.x
lgb_best_f1_opt = -result.fun
lgb_auc_opt = roc_auc_score(y, lgb_oof_opt)

print(f"\n" + "="*60)
print("OPTIMIZED RESULTS")
print("="*60)
print(f"\nLightGBM (Optimized) CV F1: {np.mean(lgb_fold_scores_opt):.4f} ± {np.std(lgb_fold_scores_opt):.4f}")
print(f"LightGBM (Optimized) Global F1: {lgb_best_f1_opt:.4f} at threshold={lgb_best_th_opt:.2f}")
print(f"LightGBM (Optimized) ROC-AUC: {lgb_auc_opt:.4f}")

# Compare with previous
if 'lgb_best_f1' in locals():
    improvement = lgb_best_f1_opt - lgb_best_f1
    improvement_pct = improvement / lgb_best_f1 * 100 if lgb_best_f1 > 0 else 0
    print(f"\n" + "-"*60)
    print("COMPARISON WITH PREVIOUS MODEL")
    print("-"*60)
    print(f"Previous F1: {lgb_best_f1:.4f}")
    print(f"Optimized F1: {lgb_best_f1_opt:.4f}")
    print(f"Improvement: {improvement:+.4f} ({improvement_pct:+.2f}%)")

# Save models
for i, model in enumerate(lgb_models_opt):
    model.save_model(f"{GBM_MODEL_DIR}/lgb_model_optimized_fold{i+1}.txt")

# Save predictions and config
np.save(f"{GBM_MODEL_DIR}/lgb_oof_optimized.npy", lgb_oof_opt)
np.save(f"{GBM_MODEL_DIR}/lgb_test_optimized.npy", lgb_test_opt)

lgb_config_opt = {
    "SEED": SEED,
    "n_features": X.shape[1],
    "n_features_original": len(feature_names),
    "scale_pos_weight": scale_pos_weight,
    "lgb_params": lgb_params_optimized,
    "fold_scores": lgb_fold_scores_opt,
    "fold_thresholds": lgb_fold_thresholds_opt,
    "best_threshold": lgb_best_th_opt,
    "best_f1": lgb_best_f1_opt,
    "roc_auc": lgb_auc_opt,
    "cv_f1_mean": float(np.mean(lgb_fold_scores_opt)),
    "cv_f1_std": float(np.std(lgb_fold_scores_opt)),
}
joblib.dump(lgb_config_opt, f"{GBM_MODEL_DIR}/lgb_config_optimized.pkl")

# Submission
best_lgb_fold_opt = int(np.argmax(lgb_fold_scores_opt))
best_lgb_th_opt = float(lgb_fold_thresholds_opt[best_lgb_fold_opt])
best_lgb_test_opt = np.asarray(lgb_test_folds_opt[best_lgb_fold_opt], dtype=float)

lgb_single_preds_opt = (best_lgb_test_opt >= best_lgb_th_opt).astype(int)
sub_lgb_single_opt = pd.DataFrame({
    "object_id": test_log["object_id"],
    "target": lgb_single_preds_opt
})
sub_lgb_single_opt.to_csv("submission_LGB_SHAP_optimized.csv", index=False)
print(f"\n✅ Saved optimized submission -> submission_LGB_SHAP_optimized.csv")

print("\n" + "="*60)


TRAINING LIGHTGBM (SHAP-OPTIMIZED FEATURES)

Using optimized features: 304 features

--- Applying Robust Scaling ---
Scaled features - Train: (3043, 304), Test: (7135, 304)

Scale pos weight: 19.56

--- Training LightGBM with 5-fold CV (Optimized) ---

--- Fold 1/5 ---
Fold 1: F1=0.5455 at threshold=0.30

--- Fold 2/5 ---
Fold 2: F1=0.5556 at threshold=0.50

--- Fold 3/5 ---
Fold 3: F1=0.5000 at threshold=0.17

--- Fold 4/5 ---
Fold 4: F1=0.6667 at threshold=0.55

--- Fold 5/5 ---
Fold 5: F1=0.5479 at threshold=0.20

OPTIMIZED RESULTS

LightGBM (Optimized) CV F1: 0.5631 ± 0.0553
LightGBM (Optimized) Global F1: 0.5196 at threshold=0.42
LightGBM (Optimized) ROC-AUC: 0.9473

------------------------------------------------------------
COMPARISON WITH PREVIOUS MODEL
------------------------------------------------------------
Previous F1: 0.5631
Optimized F1: 0.5196
Improvement: -0.0435 (-7.73%)

✅ Saved optimized submission -> submission_LGB_SHAP_optimized.csv



In [35]:
# ============================================================
# CELL 21: HYPERPARAMETER TUNING BASED ON SHAP INSIGHTS
# ============================================================

print("\n" + "="*70)
print("HYPERPARAMETER TUNING BASED ON SHAP INSIGHTS")
print("="*70)

# Key insight từ SHAP: 
# - i-band features rất quan trọng cho TDE
# - Model có class-specific behavior
# - Cần focus vào class weighting và feature importance

# Restore original features (before SHAP optimization)
print("\nRestoring original features (before SHAP optimization)...")
X_df_orig = train_feat.copy()
X_test_df_orig = test_feat.copy()

if "object_id" in X_df_orig.columns:
    X_df_orig = X_df_orig.drop(columns=["object_id"])
if "object_id" in X_test_df_orig.columns:
    X_test_df_orig = X_test_df_orig.drop(columns=["object_id"])

common_cols_orig = sorted(set(X_df_orig.columns) & set(X_test_df_orig.columns))
X_df_orig = X_df_orig[common_cols_orig]
X_test_df_orig = X_test_df_orig[common_cols_orig]

X_df_orig = X_df_orig.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test_df_orig = X_test_df_orig.replace([np.inf, -np.inf], np.nan).fillna(0)

# Apply SelectKBest (giống như trước)
if USE_SELECTKBEST and len(common_cols_orig) > K_BEST_MIN:
    from sklearn.feature_selection import SelectKBest, f_classif
    
    feature_selector_restored = SelectKBest(score_func=f_classif, k=int(len(common_cols_orig) * K_BEST_RATIO))
    X_restored = feature_selector_restored.fit_transform(X_df_orig.values, y)
    X_test_restored = feature_selector_restored.transform(X_test_df_orig.values)
    
    selected_mask = feature_selector_restored.get_support()
    selected_feature_names_restored = [common_cols_orig[i] for i in range(len(common_cols_orig)) if selected_mask[i]]
    
    print(f"✅ Restored {len(selected_feature_names_restored)} features (SelectKBest)")
else:
    X_restored = X_df_orig.values
    X_test_restored = X_test_df_orig.values
    selected_feature_names_restored = common_cols_orig
    print(f"✅ Restored {len(selected_feature_names_restored)} features (no selection)")

# Apply Robust Scaling
scaler_restored = RobustScaler()
X_restored = scaler_restored.fit_transform(X_restored)
X_test_restored = scaler_restored.transform(X_test_restored)

# Use restored data
X = X_restored
X_test = X_test_restored
feature_names = selected_feature_names_restored

print(f"\nFinal shape: Train {X.shape}, Test {X_test.shape}")

# ============================================================
# TUNED HYPERPARAMETERS BASED ON SHAP INSIGHTS
# ============================================================

# Key changes based on SHAP:
# 1. Higher min_child_samples (reduce overfitting on minority class)
# 2. Tuned feature_fraction (focus on discriminative features)
# 3. Adjusted regularization
# 4. Better class weighting

lgb_params_tuned = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "max_depth": 6,  # Slightly lower to reduce complexity
    "learning_rate": 0.05,  # Keep original
    "feature_fraction": 0.85,  # Higher - SHAP shows most features are useful
    "bagging_fraction": 0.85,
    "bagging_freq": 5,
    "min_child_samples": 25,  # Higher for regularization
    "scale_pos_weight": scale_pos_weight,
    "lambda_l1": 0.05,  # Light L1 regularization
    "lambda_l2": 0.05,  # Light L2 regularization
    "min_gain_to_split": 0.01,  # Require minimum gain
    "verbose": -1,
    "seed": SEED,
    "n_jobs": -1,
    "feature_fraction_seed": SEED,
    "bagging_seed": SEED,
    "data_random_seed": SEED,
}

print(f"\n--- Hyperparameters tuned based on SHAP insights ---")
print(f"  - max_depth: 7 → 6 (reduce complexity)")
print(f"  - feature_fraction: 0.8 → 0.85 (keep more features)")
print(f"  - min_child_samples: 20 → 25 (more regularization)")
print(f"  - Added L1/L2 regularization: 0.05 each")
print(f"  - min_gain_to_split: 0.01 (require meaningful splits)")

lgb_oof_tuned = np.zeros(len(y))
lgb_test_tuned = np.zeros(len(X_test))
lgb_models_tuned = []
lgb_fold_scores_tuned = []
lgb_fold_thresholds_tuned = []
lgb_test_folds_tuned = []

print("\n--- Training LightGBM with Tuned Hyperparameters ---")
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold}/{N_FOLDS} ---")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params_tuned,
        train_data,
        num_boost_round=2500,  # Slightly more rounds
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(150, verbose=False)],
    )

    lgb_models_tuned.append(model)
    fold_oof = model.predict(X_val)
    lgb_oof_tuned[val_idx] = fold_oof

    fold_test = model.predict(X_test)
    lgb_test_tuned += fold_test / N_FOLDS
    lgb_test_folds_tuned.append(fold_test)

    best_th, best_f1 = 0.5, 0
    for th in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_val, (fold_oof >= th).astype(int))
        if f1 > best_f1:
            best_f1, best_th = f1, th

    lgb_fold_scores_tuned.append(best_f1)
    lgb_fold_thresholds_tuned.append(best_th)
    print(f"Fold {fold}: F1={best_f1:.4f} at threshold={best_th:.2f}")

# Find best global threshold
def neg_f1(th):
    return -f1_score(y, (lgb_oof_tuned >= th).astype(int))

result = minimize_scalar(neg_f1, bounds=(0.1, 0.9), method="bounded")
lgb_best_th_tuned = result.x
lgb_best_f1_tuned = -result.fun
lgb_auc_tuned = roc_auc_score(y, lgb_oof_tuned)

print(f"\n" + "="*60)
print("TUNED RESULTS")
print("="*60)
print(f"\nLightGBM (Tuned) CV F1: {np.mean(lgb_fold_scores_tuned):.4f} ± {np.std(lgb_fold_scores_tuned):.4f}")
print(f"LightGBM (Tuned) Global F1: {lgb_best_f1_tuned:.4f} at threshold={lgb_best_th_tuned:.2f}")
print(f"LightGBM (Tuned) ROC-AUC: {lgb_auc_tuned:.4f}")

# Compare with original
if 'lgb_best_f1' in locals():
    improvement = lgb_best_f1_tuned - lgb_best_f1
    improvement_pct = improvement / lgb_best_f1 * 100 if lgb_best_f1 > 0 else 0
    print(f"\n" + "-"*60)
    print("COMPARISON WITH ORIGINAL MODEL")
    print("-"*60)
    print(f"Original F1: {lgb_best_f1:.4f}")
    print(f"Tuned F1: {lgb_best_f1_tuned:.4f}")
    print(f"Improvement: {improvement:+.4f} ({improvement_pct:+.2f}%)")

# Save
for i, model in enumerate(lgb_models_tuned):
    model.save_model(f"{GBM_MODEL_DIR}/lgb_model_tuned_fold{i+1}.txt")

np.save(f"{GBM_MODEL_DIR}/lgb_oof_tuned.npy", lgb_oof_tuned)
np.save(f"{GBM_MODEL_DIR}/lgb_test_tuned.npy", lgb_test_tuned)

# Submission
best_lgb_fold_tuned = int(np.argmax(lgb_fold_scores_tuned))
best_lgb_th_tuned = float(lgb_fold_thresholds_tuned[best_lgb_fold_tuned])
best_lgb_test_tuned = np.asarray(lgb_test_folds_tuned[best_lgb_fold_tuned], dtype=float)

lgb_single_preds_tuned = (best_lgb_test_tuned >= best_lgb_th_tuned).astype(int)
sub_lgb_tuned = pd.DataFrame({
    "object_id": test_log["object_id"],
    "target": lgb_single_preds_tuned
})
sub_lgb_tuned.to_csv("submission_LGB_tuned_hyperparams.csv", index=False)
print(f"\n✅ Saved tuned submission -> submission_LGB_tuned_hyperparams.csv")

print("\n" + "="*60)


HYPERPARAMETER TUNING BASED ON SHAP INSIGHTS

Restoring original features (before SHAP optimization)...
✅ Restored 246 features (SelectKBest)

Final shape: Train (3043, 246), Test (7135, 246)

--- Hyperparameters tuned based on SHAP insights ---
  - max_depth: 7 → 6 (reduce complexity)
  - feature_fraction: 0.8 → 0.85 (keep more features)
  - min_child_samples: 20 → 25 (more regularization)
  - Added L1/L2 regularization: 0.05 each
  - min_gain_to_split: 0.01 (require meaningful splits)

--- Training LightGBM with Tuned Hyperparameters ---

--- Fold 1/5 ---
Fold 1: F1=0.5926 at threshold=0.10

--- Fold 2/5 ---
Fold 2: F1=0.5763 at threshold=0.35

--- Fold 3/5 ---
Fold 3: F1=0.5532 at threshold=0.48

--- Fold 4/5 ---
Fold 4: F1=0.6792 at threshold=0.48

--- Fold 5/5 ---
Fold 5: F1=0.6061 at threshold=0.34

TUNED RESULTS

LightGBM (Tuned) CV F1: 0.6015 ± 0.0427
LightGBM (Tuned) Global F1: 0.5545 at threshold=0.31
LightGBM (Tuned) ROC-AUC: 0.9495

----------------------------------------

In [36]:
# ============================================================
# CELL 22: IMPROVED THRESHOLD OPTIMIZATION
# ============================================================

print("\n" + "="*70)
print("IMPROVED THRESHOLD OPTIMIZATION")
print("="*70)

# Use tuned model predictions
lgb_oof = lgb_oof_tuned
lgb_test = lgb_test_tuned

print(f"\nCV F1 Scores: {lgb_fold_scores_tuned}")
print(f"CV F1 Mean: {np.mean(lgb_fold_scores_tuned):.4f} ± {np.std(lgb_fold_scores_tuned):.4f}")

# Method 1: Use median of fold thresholds (more stable)
median_threshold = np.median(lgb_fold_thresholds_tuned)
f1_median_th = f1_score(y, (lgb_oof >= median_threshold).astype(int))

print(f"\n1. Median Threshold Method:")
print(f"   Threshold: {median_threshold:.4f}")
print(f"   F1 Score: {f1_median_th:.4f}")

# Method 2: Weighted average of fold thresholds (weighted by fold F1)
weights = np.array(lgb_fold_scores_tuned)
weights = weights / weights.sum()
weighted_threshold = np.average(lgb_fold_thresholds_tuned, weights=weights)
f1_weighted_th = f1_score(y, (lgb_oof >= weighted_threshold).astype(int))

print(f"\n2. Weighted Average Threshold Method:")
print(f"   Threshold: {weighted_threshold:.4f}")
print(f"   F1 Score: {f1_weighted_th:.4f}")

# Method 3: Optimize on full OOF predictions (original method)
def neg_f1(th):
    return -f1_score(y, (lgb_oof >= th).astype(int))

result = minimize_scalar(neg_f1, bounds=(0.1, 0.9), method="bounded")
optimal_threshold = result.x
f1_optimal = -result.fun

print(f"\n3. Optimal Threshold Method (full OOF):")
print(f"   Threshold: {optimal_threshold:.4f}")
print(f"   F1 Score: {f1_optimal:.4f}")

# Method 4: Use best fold's threshold
best_fold_idx = int(np.argmax(lgb_fold_scores_tuned))
best_fold_threshold = lgb_fold_thresholds_tuned[best_fold_idx]
f1_best_fold_th = f1_score(y, (lgb_oof >= best_fold_threshold).astype(int))

print(f"\n4. Best Fold Threshold Method:")
print(f"   Threshold: {best_fold_threshold:.4f} (from fold {best_fold_idx+1})")
print(f"   F1 Score: {f1_best_fold_th:.4f}")

# Compare all methods
threshold_methods = {
    'Median': (median_threshold, f1_median_th),
    'Weighted': (weighted_threshold, f1_weighted_th),
    'Optimal': (optimal_threshold, f1_optimal),
    'Best Fold': (best_fold_threshold, f1_best_fold_th),
}

best_method = max(threshold_methods.items(), key=lambda x: x[1][1])
print(f"\n" + "="*70)
print(f"BEST METHOD: {best_method[0]}")
print(f"  Threshold: {best_method[1][0]:.4f}")
print(f"  F1 Score: {best_method[1][1]:.4f}")
print("="*70)

# Use best threshold for submission
final_threshold = best_method[1][0]
final_f1 = best_method[1][1]

# Predictions with best threshold
lgb_test_preds = (lgb_test >= final_threshold).astype(int)

sub_lgb_final = pd.DataFrame({
    "object_id": test_log["object_id"],
    "target": lgb_test_preds
})
sub_lgb_final.to_csv("submission_LGB_improved_threshold.csv", index=False)
print(f"\n✅ Saved submission with improved threshold -> submission_LGB_improved_threshold.csv")
print(f"   Final F1: {final_f1:.4f}, Threshold: {final_threshold:.4f}")

# Save config
lgb_config_final = {
    "SEED": SEED,
    "n_features": X.shape[1],
    "scale_pos_weight": scale_pos_weight,
    "lgb_params": lgb_params_tuned,
    "fold_scores": lgb_fold_scores_tuned,
    "fold_thresholds": lgb_fold_thresholds_tuned,
    "threshold_method": best_method[0],
    "best_threshold": float(final_threshold),
    "best_f1": float(final_f1),
    "roc_auc": float(lgb_auc_tuned),
    "cv_f1_mean": float(np.mean(lgb_fold_scores_tuned)),
    "cv_f1_std": float(np.std(lgb_fold_scores_tuned)),
}
joblib.dump(lgb_config_final, f"{GBM_MODEL_DIR}/lgb_config_final.pkl")


IMPROVED THRESHOLD OPTIMIZATION

CV F1 Scores: [0.5925925925925926, 0.576271186440678, 0.5531914893617021, 0.6792452830188679, 0.6060606060606061]
CV F1 Mean: 0.6015 ± 0.0427

1. Median Threshold Method:
   Threshold: 0.3500
   F1 Score: 0.5369

2. Weighted Average Threshold Method:
   Threshold: 0.3520
   F1 Score: 0.5369

3. Optimal Threshold Method (full OOF):
   Threshold: 0.3065
   F1 Score: 0.5545

4. Best Fold Threshold Method:
   Threshold: 0.4800 (from fold 4)
   F1 Score: 0.5649

BEST METHOD: Best Fold
  Threshold: 0.4800
  F1 Score: 0.5649

✅ Saved submission with improved threshold -> submission_LGB_improved_threshold.csv
   Final F1: 0.5649, Threshold: 0.4800


['/kaggle/working/models/lgb_config_final.pkl']

In [37]:
# ============================================================
# CELL 24: PREDICTION DISTRIBUTION ANALYSIS & CALIBRATION
# ============================================================

print("\n" + "="*70)
print("PREDICTION DISTRIBUTION ANALYSIS")
print("="*70)

# Analyze OOF predictions distribution
print("\n1. OOF Prediction Distribution:")
print(f"   Min: {lgb_oof_tuned.min():.4f}")
print(f"   Max: {lgb_oof_tuned.max():.4f}")
print(f"   Mean: {lgb_oof_tuned.mean():.4f}")
print(f"   Median: {np.median(lgb_oof_tuned):.4f}")
print(f"   Std: {lgb_oof_tuned.std():.4f}")

print("\n2. Prediction Distribution by Class:")
print(f"   TDE (y=1):")
print(f"     Mean: {lgb_oof_tuned[y == 1].mean():.4f}")
print(f"     Median: {np.median(lgb_oof_tuned[y == 1]):.4f}")
print(f"     Std: {lgb_oof_tuned[y == 1].std():.4f}")
print(f"     Min: {lgb_oof_tuned[y == 1].min():.4f}")
print(f"     Max: {lgb_oof_tuned[y == 1].max():.4f}")

print(f"\n   Non-TDE (y=0):")
print(f"     Mean: {lgb_oof_tuned[y == 0].mean():.4f}")
print(f"     Median: {np.median(lgb_oof_tuned[y == 0]):.4f}")
print(f"     Std: {lgb_oof_tuned[y == 0].std():.4f}")
print(f"     Min: {lgb_oof_tuned[y == 0].min():.4f}")
print(f"     Max: {lgb_oof_tuned[y == 0].max():.4f}")

separation = lgb_oof_tuned[y == 1].mean() - lgb_oof_tuned[y == 0].mean()
print(f"\n   Separation: {separation:.4f}")

# Prediction bins analysis
print("\n3. Prediction Bins Analysis:")
bins = np.linspace(0, 1, 21)
bin_centers = (bins[:-1] + bins[1:]) / 2

for i in range(len(bins) - 1):
    mask = (lgb_oof_tuned >= bins[i]) & (lgb_oof_tuned < bins[i+1])
    if i == len(bins) - 2:
        mask = (lgb_oof_tuned >= bins[i]) & (lgb_oof_tuned <= bins[i+1])
    
    count = mask.sum()
    if count > 0:
        tde_count = y[mask].sum()
        tde_rate = tde_count / count
        print(f"   [{bins[i]:.2f}, {bins[i+1]:.2f}]: {count:4d} samples, {tde_count:3d} TDE ({tde_rate*100:5.1f}%)")

# Find optimal threshold ranges
print("\n4. Threshold Range Analysis:")
best_thresholds_per_fold = lgb_fold_thresholds_tuned
print(f"   Fold thresholds: {[f'{t:.3f}' for t in best_thresholds_per_fold]}")
print(f"   Mean: {np.mean(best_thresholds_per_fold):.4f}")
print(f"   Std: {np.std(best_thresholds_per_fold):.4f}")
print(f"   Range: [{np.min(best_thresholds_per_fold):.4f}, {np.max(best_thresholds_per_fold):.4f}]")

# Check if predictions are well-calibrated
print("\n5. Calibration Check:")
from sklearn.calibration import calibration_curve

fraction_of_positives, mean_predicted_value = calibration_curve(
    y, lgb_oof_tuned, n_bins=10, strategy='uniform'
)

calibration_error = np.mean(np.abs(fraction_of_positives - mean_predicted_value))
print(f"   Mean Calibration Error: {calibration_error:.4f}")
if calibration_error < 0.05:
    print("   ✅ Well calibrated")
elif calibration_error < 0.10:
    print("   ⚠️  Moderately calibrated")
else:
    print("   ❌ Poorly calibrated")

print("\n" + "="*70)


PREDICTION DISTRIBUTION ANALYSIS

1. OOF Prediction Distribution:
   Min: 0.0000
   Max: 0.9970
   Mean: 0.0469
   Median: 0.0005
   Std: 0.1559

2. Prediction Distribution by Class:
   TDE (y=1):
     Mean: 0.4575
     Median: 0.4551
     Std: 0.3434
     Min: 0.0003
     Max: 0.9970

   Non-TDE (y=0):
     Mean: 0.0259
     Median: 0.0004
     Std: 0.1023
     Min: 0.0000
     Max: 0.9753

   Separation: 0.4316

3. Prediction Bins Analysis:
   [0.00, 0.05]: 2673 samples,  23 TDE (  0.9%)
   [0.05, 0.10]:   89 samples,   9 TDE ( 10.1%)
   [0.10, 0.15]:   43 samples,  12 TDE ( 27.9%)
   [0.15, 0.20]:   28 samples,  10 TDE ( 35.7%)
   [0.20, 0.25]:   19 samples,   4 TDE ( 21.1%)
   [0.25, 0.30]:   15 samples,   1 TDE (  6.7%)
   [0.30, 0.35]:   26 samples,   9 TDE ( 34.6%)
   [0.35, 0.40]:   10 samples,   4 TDE ( 40.0%)
   [0.40, 0.45]:   17 samples,   2 TDE ( 11.8%)
   [0.45, 0.50]:   14 samples,   3 TDE ( 21.4%)
   [0.50, 0.55]:   12 samples,   5 TDE ( 41.7%)
   [0.55, 0.60]:   12 sa

In [38]:
# ============================================================
# CELL 26: PREDICTION CALIBRATION
# ============================================================

print("\n" + "="*70)
print("PREDICTION CALIBRATION")
print("="*70)

from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

# Current predictions
lgb_oof_cal = lgb_oof_tuned.copy()
lgb_test_cal = lgb_test_tuned.copy()

print("\n1. Applying Isotonic Regression Calibration...")

# Fit calibration on OOF predictions
iso_reg = IsotonicRegression(out_of_bounds='clip')
iso_reg.fit(lgb_oof_cal, y)

# Apply calibration
lgb_oof_calibrated = iso_reg.transform(lgb_oof_cal)
lgb_test_calibrated = iso_reg.transform(lgb_test_cal)

# Check calibration improvement
from sklearn.calibration import calibration_curve
fraction_of_positives_orig, mean_predicted_value_orig = calibration_curve(
    y, lgb_oof_cal, n_bins=10, strategy='uniform'
)
fraction_of_positives_cal, mean_predicted_value_cal = calibration_curve(
    y, lgb_oof_calibrated, n_bins=10, strategy='uniform'
)

cal_error_orig = np.mean(np.abs(fraction_of_positives_orig - mean_predicted_value_orig))
cal_error_cal = np.mean(np.abs(fraction_of_positives_cal - mean_predicted_value_cal))

print(f"\n   Original Calibration Error: {cal_error_orig:.4f}")
print(f"   Calibrated Error: {cal_error_cal:.4f}")
print(f"   Improvement: {cal_error_orig - cal_error_cal:.4f}")

# Re-optimize threshold on calibrated predictions
def neg_f1_cal(th):
    return -f1_score(y, (lgb_oof_calibrated >= th).astype(int))

result_cal = minimize_scalar(neg_f1_cal, bounds=(0.1, 0.9), method="bounded")
best_th_cal = result_cal.x
best_f1_cal = -result_cal.fun

print(f"\n2. Threshold Optimization on Calibrated Predictions:")
print(f"   Optimal threshold: {best_th_cal:.4f}")
print(f"   F1 Score: {best_f1_cal:.4f}")

# Compare with original
if 'lgb_best_f1_tuned' in locals():
    improvement = best_f1_cal - lgb_best_f1_tuned
    print(f"\n   Original F1: {lgb_best_f1_tuned:.4f}")
    print(f"   Calibrated F1: {best_f1_cal:.4f}")
    print(f"   Improvement: {improvement:+.4f}")

# Save calibration model
joblib.dump(iso_reg, f"{GBM_MODEL_DIR}/isotonic_calibration.pkl")
print(f"\n✅ Calibration applied and saved!")


PREDICTION CALIBRATION

1. Applying Isotonic Regression Calibration...

   Original Calibration Error: 0.1141
   Calibrated Error: 0.0000
   Improvement: 0.1141

2. Threshold Optimization on Calibrated Predictions:
   Optimal threshold: 0.4056
   F1 Score: 0.5455

   Original F1: 0.5545
   Calibrated F1: 0.5455
   Improvement: -0.0091

✅ Calibration applied and saved!


In [39]:
# ============================================================
# CELL 27: FINAL MODEL WITH CALIBRATION + REGULARIZATION
# ============================================================

print("\n" + "="*70)
print("FINAL MODEL: CALIBRATION + STRONG REGULARIZATION")
print("="*70)

# Strategy: 
# - Strong regularization to get more stable predictions
# - Better calibration
# - Focus on threshold stability

lgb_params_final = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 24,  # Further reduced
    "max_depth": 4,  # Further reduced
    "learning_rate": 0.03,  # Lower
    "feature_fraction": 0.75,  # Reduced
    "bagging_fraction": 0.75,
    "bagging_freq": 5,
    "min_child_samples": 40,  # Much higher
    "min_data_in_leaf": 30,  # Higher
    "scale_pos_weight": scale_pos_weight,
    "lambda_l1": 0.2,  # Stronger
    "lambda_l2": 0.2,  # Stronger
    "min_gain_to_split": 0.03,  # Higher
    "max_bin": 200,  # Reduced
    "subsample_for_bin": 200000,
    "verbose": -1,
    "seed": SEED,
    "n_jobs": -1,
    "feature_fraction_seed": SEED,
    "bagging_seed": SEED,
    "data_random_seed": SEED,
}

print("\n--- Final Hyperparameters (Strong Regularization) ---")
print(f"  - num_leaves: 28 → 24")
print(f"  - max_depth: 5 → 4")
print(f"  - learning_rate: 0.04 → 0.03")
print(f"  - feature_fraction: 0.8 → 0.75")
print(f"  - min_child_samples: 30 → 40")
print(f"  - min_data_in_leaf: 20 → 30")
print(f"  - lambda_l1/l2: 0.1 → 0.2")
print(f"  - min_gain_to_split: 0.02 → 0.03")
print(f"  - max_bin: 255 → 200")
print(f"  → Goal: Stable, well-calibrated predictions")

lgb_oof_final_raw = np.zeros(len(y))
lgb_test_final_raw = np.zeros(len(X_test))
lgb_models_final = []
lgb_fold_scores_final = []
lgb_fold_thresholds_final = []
lgb_test_folds_final = []

print("\n--- Training Final Model ---")
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold}/{N_FOLDS} ---")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params_final,
        train_data,
        num_boost_round=3500,  # More rounds with lower LR
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )

    lgb_models_final.append(model)
    fold_oof = model.predict(X_val)
    lgb_oof_final_raw[val_idx] = fold_oof

    fold_test = model.predict(X_test)
    lgb_test_final_raw += fold_test / N_FOLDS
    lgb_test_folds_final.append(fold_test)

    best_th, best_f1 = 0.5, 0
    for th in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_val, (fold_oof >= th).astype(int))
        if f1 > best_f1:
            best_f1, best_th = f1, th

    lgb_fold_scores_final.append(best_f1)
    lgb_fold_thresholds_final.append(best_th)
    print(f"Fold {fold}: F1={best_f1:.4f} at threshold={best_th:.2f}")

# Apply calibration
print("\n--- Applying Calibration ---")
from sklearn.isotonic import IsotonicRegression

iso_reg_final = IsotonicRegression(out_of_bounds='clip')
iso_reg_final.fit(lgb_oof_final_raw, y)

lgb_oof_final_cal = iso_reg_final.transform(lgb_oof_final_raw)
lgb_test_final_cal = iso_reg_final.transform(lgb_test_final_raw)

# Check calibration
from sklearn.calibration import calibration_curve
fraction_of_positives, mean_predicted_value = calibration_curve(
    y, lgb_oof_final_cal, n_bins=10, strategy='uniform'
)
cal_error_final = np.mean(np.abs(fraction_of_positives - mean_predicted_value))
print(f"   Calibration Error: {cal_error_final:.4f}")

# Optimize threshold on calibrated predictions
def neg_f1_final(th):
    return -f1_score(y, (lgb_oof_final_cal >= th).astype(int))

result_final = minimize_scalar(neg_f1_final, bounds=(0.1, 0.9), method="bounded")
lgb_best_th_final = result_final.x
lgb_best_f1_final = -result_final.fun
lgb_auc_final = roc_auc_score(y, lgb_oof_final_cal)

print(f"\n" + "="*70)
print("FINAL RESULTS")
print("="*70)
print(f"\nLightGBM (Final) CV F1: {np.mean(lgb_fold_scores_final):.4f} ± {np.std(lgb_fold_scores_final):.4f}")
print(f"LightGBM (Final) Global F1 (calibrated): {lgb_best_f1_final:.4f} at threshold={lgb_best_th_final:.4f}")
print(f"LightGBM (Final) ROC-AUC: {lgb_auc_final:.4f}")
print(f"Calibration Error: {cal_error_final:.4f}")

# Threshold stability comparison
print(f"\nThreshold Stability:")
print(f"  Previous std: {np.std(lgb_fold_thresholds_tuned):.4f}")
print(f"  Final std: {np.std(lgb_fold_thresholds_final):.4f}")
threshold_improvement = np.std(lgb_fold_thresholds_tuned) - np.std(lgb_fold_thresholds_final)
print(f"  Improvement: {threshold_improvement:+.4f}")

# Compare with all previous models
comparisons = []
if 'lgb_best_f1' in locals():
    comparisons.append(("Original", lgb_best_f1))
if 'lgb_best_f1_tuned' in locals():
    comparisons.append(("Tuned", lgb_best_f1_tuned))
if 'lgb_best_f1_finetuned' in locals():
    comparisons.append(("Fine-tuned", lgb_best_f1_finetuned))

if comparisons:
    print(f"\n" + "-"*70)
    print("COMPARISON WITH ALL MODELS")
    print("-"*70)
    for name, score in comparisons:
        improvement = lgb_best_f1_final - score
        improvement_pct = improvement / score * 100 if score > 0 else 0
        print(f"{name:15s}: F1={score:.4f} → Final F1={lgb_best_f1_final:.4f} ({improvement:+.4f}, {improvement_pct:+.2f}%)")

# Save everything
for i, model in enumerate(lgb_models_final):
    model.save_model(f"{GBM_MODEL_DIR}/lgb_model_final_fold{i+1}.txt")

np.save(f"{GBM_MODEL_DIR}/lgb_oof_final_calibrated.npy", lgb_oof_final_cal)
np.save(f"{GBM_MODEL_DIR}/lgb_test_final_calibrated.npy", lgb_test_final_cal)
joblib.dump(iso_reg_final, f"{GBM_MODEL_DIR}/isotonic_calibration_final.pkl")

# Final submission with calibrated predictions
lgb_test_final_sub = lgb_test_final_cal.copy()
lgb_preds_final = (lgb_test_final_sub >= lgb_best_th_final).astype(int)

sub_lgb_final = pd.DataFrame({
    "object_id": test_log["object_id"],
    "target": lgb_preds_final
})
sub_lgb_final.to_csv("submission_LGB_final_calibrated.csv", index=False)
print(f"\n✅ Saved final calibrated submission -> submission_LGB_final_calibrated.csv")

# Save config
lgb_config_final = {
    "SEED": SEED,
    "n_features": X.shape[1],
    "scale_pos_weight": scale_pos_weight,
    "lgb_params": lgb_params_final,
    "fold_scores": lgb_fold_scores_final,
    "fold_thresholds": lgb_fold_thresholds_final,
    "threshold_method": "calibrated_optimal",
    "best_threshold": float(lgb_best_th_final),
    "best_f1": float(lgb_best_f1_final),
    "roc_auc": float(lgb_auc_final),
    "calibration_error": float(cal_error_final),
    "cv_f1_mean": float(np.mean(lgb_fold_scores_final)),
    "cv_f1_std": float(np.std(lgb_fold_scores_final)),
}
joblib.dump(lgb_config_final, f"{GBM_MODEL_DIR}/lgb_config_final.pkl")

print("\n" + "="*70)


FINAL MODEL: CALIBRATION + STRONG REGULARIZATION

--- Final Hyperparameters (Strong Regularization) ---
  - num_leaves: 28 → 24
  - max_depth: 5 → 4
  - learning_rate: 0.04 → 0.03
  - feature_fraction: 0.8 → 0.75
  - min_child_samples: 30 → 40
  - min_data_in_leaf: 20 → 30
  - lambda_l1/l2: 0.1 → 0.2
  - min_gain_to_split: 0.02 → 0.03
  - max_bin: 255 → 200
  → Goal: Stable, well-calibrated predictions

--- Training Final Model ---

--- Fold 1/5 ---
Fold 1: F1=0.6197 at threshold=0.26

--- Fold 2/5 ---
Fold 2: F1=0.5714 at threshold=0.32

--- Fold 3/5 ---
Fold 3: F1=0.5455 at threshold=0.40

--- Fold 4/5 ---
Fold 4: F1=0.6792 at threshold=0.49

--- Fold 5/5 ---
Fold 5: F1=0.6102 at threshold=0.49

--- Applying Calibration ---
   Calibration Error: 0.0000

FINAL RESULTS

LightGBM (Final) CV F1: 0.6052 ± 0.0457
LightGBM (Final) Global F1 (calibrated): 0.5829 at threshold=0.2889
LightGBM (Final) ROC-AUC: 0.9545
Calibration Error: 0.0000

Threshold Stability:
  Previous std: 0.1389
  Fina