# XGBoost model for TDE classification
Pipeline song song với LGBM: dùng GP features, SelectKBest tùy chọn, RobustScaler, CV 5-fold, tối ưu ngưỡng F1, lưu OOF/test và tạo submission (fold tốt nhất + ngưỡng global).

In [None]:
# 1) Imports & config
import os, warnings, gc, random
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd, joblib
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.optimize import minimize_scalar
import xgboost as xgb

# --- Reproducibility -------------------------------------------------------
# One seed drives Python's RNG, NumPy's RNG, the CV splitter and XGBoost.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Input / output locations ----------------------------------------------
# Competition logs and the precomputed GP feature pickles under /kaggle/input.
DATA_PATH = Path('/kaggle/input/project/mallorn-astronomical-classification-challenge')
FEAT_TRAIN_PKL = '/kaggle/input/2d-gp-features/kaggle/working/cache/train_features_2dgp_gpy.pkl'
FEAT_TEST_PKL  = '/kaggle/input/2d-gp-features/kaggle/working/cache/test_features_2dgp_gpy.pkl'

# Directory that receives the fitted selector/scaler and saved predictions.
MODEL_DIR = 'saved_models_xgb'
Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)

# --- Cross-validation ------------------------------------------------------
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# --- Optional univariate feature selection ---------------------------------
USE_SELECTKBEST = True
# Values below 1 are read as a fraction of the feature count; values >= 1 are
# read as an absolute number of features to keep.
K_BEST_RATIO = 0.8
# Lower bound on the number of kept features; selection is skipped entirely
# unless the input has more features than this.
K_BEST_MIN = 200
# Optional upper bound on the number of kept features (None = no cap).
K_BEST_MAX = None

print('Config ready')

In [None]:
# 2) Load the logs and the precomputed GP features
def _align_to_log(feat, log, name):
    """Return ``feat`` with its rows in the same order as ``log``.

    Labels and submission ids are taken from the log while the features come
    from a separate pickle, so the two must correspond row by row.

    Args:
        feat: Feature DataFrame loaded from the pickle.
        log: Log DataFrame loaded from the CSV.
        name: Short label ('train' / 'test') used in error messages.

    Returns:
        ``feat`` unchanged when it is already aligned (or carries no
        ``object_id`` column to check against), otherwise a reordered copy
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If the row counts differ, or if ``object_id`` is present
            but the feature rows cannot be matched to the log rows.
    """
    if len(feat) != len(log):
        raise ValueError(f'{name}: {len(feat)} feature rows but {len(log)} log rows')
    if 'object_id' not in feat.columns or 'object_id' not in log.columns:
        # Nothing to align on; rows are assumed to already follow the log order.
        return feat
    # Compare as strings so an int-vs-str dtype difference does not matter.
    feat_ids = pd.Index(feat['object_id'].astype(str))
    log_ids = pd.Index(log['object_id'].astype(str))
    if feat_ids.equals(log_ids):
        return feat
    if not feat_ids.is_unique:
        raise ValueError(f'{name}: duplicate object_id values in the feature table')
    order = feat_ids.get_indexer(log_ids)
    if (order < 0).any():
        raise ValueError(f'{name}: some log object_id values are missing from the feature table')
    return feat.iloc[order].reset_index(drop=True)


train_log = pd.read_csv(DATA_PATH / 'train_log.csv')
test_log  = pd.read_csv(DATA_PATH / 'test_log.csv')
# NOTE(security): unpickling executes arbitrary code — only load feature
# pickles that come from a trusted source.
train_feat = _align_to_log(pd.read_pickle(FEAT_TRAIN_PKL), train_log, 'train')
test_feat  = _align_to_log(pd.read_pickle(FEAT_TEST_PKL), test_log, 'test')
# Binary labels, one per training row.
y = train_log['target'].astype(int).values
print(train_feat.shape, test_feat.shape, 'TDE ratio', y.mean())

In [None]:
# 3) Feature preparation: align columns, clean NaN/Inf, optional SelectKBest, RobustScaler
# NOTE(review): the selector and the scaler below are fit on the full training
# set before the CV split, so validation-fold rows influence them and the CV
# scores may be slightly optimistic.

# Identifier / label columns must not be used as model inputs.
X_df = train_feat.drop(columns=['object_id', 'target'], errors='ignore')
X_test_df = test_feat.drop(columns=['object_id', 'target'], errors='ignore')

# Keep only the columns present in both tables, in a deterministic order.
common_cols = sorted(set(X_df.columns).intersection(X_test_df.columns))


def _clean_frame(frame):
    """Restrict ``frame`` to ``common_cols`` and replace +/-inf and NaN with 0."""
    return frame[common_cols].replace([np.inf, -np.inf], np.nan).fillna(0)


X_df = _clean_frame(X_df)
X_test_df = _clean_frame(X_test_df)
feature_names = X_df.columns.tolist()
X_raw = X_df.values.astype(np.float32)
X_test_raw = X_test_df.values.astype(np.float32)

n_feat = X_raw.shape[1]
if USE_SELECTKBEST and n_feat > K_BEST_MIN:
    # A ratio below 1 is a fraction of the features; otherwise an absolute count.
    if K_BEST_RATIO < 1:
        k_best = int(n_feat * K_BEST_RATIO)
    else:
        k_best = int(K_BEST_RATIO)
    # Clamp to [K_BEST_MIN, min(K_BEST_MAX, n_feat)].
    k_cap = n_feat if K_BEST_MAX is None else min(K_BEST_MAX, n_feat)
    k_best = min(max(K_BEST_MIN, k_best), k_cap)

    selector = SelectKBest(score_func=f_classif, k=k_best)
    X_sel = selector.fit_transform(X_raw, y)
    X_test_sel = selector.transform(X_test_raw)
    selected_names = [feature_names[i] for i in selector.get_support(indices=True)]

    # Persist the fitted selector and the names of the surviving features.
    joblib.dump(selector, f'{MODEL_DIR}/xgb_feature_selector.pkl')
    joblib.dump(selected_names, f'{MODEL_DIR}/xgb_selected_features.pkl')

    X_raw = X_sel
    X_test_raw = X_test_sel
    feature_names = selected_names
    print(f'SelectKBest: {n_feat} -> {X_raw.shape[1]} features')
else:
    # No selection: record the full feature list instead.
    joblib.dump(feature_names, f'{MODEL_DIR}/xgb_feature_names.pkl')

# Scale train and test with statistics fitted on the training matrix.
scaler = RobustScaler()
X = scaler.fit_transform(X_raw)
X_test = scaler.transform(X_test_raw)
joblib.dump(scaler, f'{MODEL_DIR}/xgb_robust_scaler.pkl')

# Negative-to-positive ratio for XGBoost's class weighting; the max() guard
# avoids a division by zero when there are no positives.
neg_count = (y == 0).sum()
pos_count = (y == 1).sum()
scale_pos_weight = neg_count / max(pos_count, 1)
print('Prepared features:', X.shape, X_test.shape, 'scale_pos_weight', scale_pos_weight)

In [None]:
# 4) Train XGBoost with 5-fold CV and tune the F1 decision threshold
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 7,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'scale_pos_weight': scale_pos_weight,
    'seed': SEED,
    'n_jobs': -1
}

oof = np.zeros(len(y))             # out-of-fold probabilities, filled fold by fold
test_pred = np.zeros(len(X_test))  # mean of the per-fold test probabilities
fold_scores, fold_ths, test_folds, fold_best_iters = [], [], [], []

# Fold-invariant objects are built once instead of once per fold.
dtest = xgb.DMatrix(X_test)
th_grid = np.arange(0.1, 0.9, 0.01)

for fold, (tr, va) in enumerate(skf.split(X, y), 1):
    print(f'Fold {fold}/{N_FOLDS}')
    dtrain = xgb.DMatrix(X[tr], label=y[tr])
    dval = xgb.DMatrix(X[va], label=y[va])
    booster = xgb.train(
        params, dtrain, num_boost_round=2000, evals=[(dval, 'val')],
        early_stopping_rounds=100, verbose_eval=False,
    )
    # xgb.train returns the model from the LAST round, i.e. including the
    # rounds trained after the best validation score. Restrict prediction to
    # the best iteration explicitly so the early-stopped model is the one used.
    best_range = (0, booster.best_iteration + 1)
    fold_best_iters.append(int(booster.best_iteration))
    oof[va] = booster.predict(dval, iteration_range=best_range)
    fold_test = booster.predict(dtest, iteration_range=best_range)
    test_pred += fold_test / N_FOLDS
    test_folds.append(fold_test)

    # Per-fold threshold scan; the first threshold reaching the maximum F1 wins,
    # and (0.5, 0.0) is kept when no threshold yields a positive F1.
    best_th, best_f1 = 0.5, 0.0
    for th in th_grid:
        f1 = f1_score(y[va], (oof[va] >= th).astype(int))
        if f1 > best_f1:
            best_f1, best_th = f1, th
    fold_scores.append(best_f1)
    fold_ths.append(best_th)
    print(f'  F1={best_f1:.4f} @th={best_th:.2f}')


def neg_f1(th):
    """Negative F1 of the OOF predictions at threshold ``th`` (for minimisation)."""
    return -f1_score(y, (oof >= th).astype(int))


opt = minimize_scalar(neg_f1, bounds=(0.1, 0.9), method='bounded')
best_th_global, best_f1_global = float(opt.x), float(-opt.fun)
# F1 as a function of the threshold is piecewise constant, so the bounded
# scalar optimiser can stop on a flat segment. Cross-check it against a plain
# grid scan and keep whichever threshold scores higher.
grid_f1 = np.array([-neg_f1(th) for th in th_grid])
grid_best = int(np.argmax(grid_f1))
if grid_f1[grid_best] > best_f1_global:
    best_th_global = float(th_grid[grid_best])
    best_f1_global = float(grid_f1[grid_best])

auc_global = roc_auc_score(y, oof)
print(f'CV F1 mean={np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}')
print(f'Global F1={best_f1_global:.4f} @th={best_th_global:.2f}, AUC={auc_global:.4f}')

np.save(f'{MODEL_DIR}/xgb_oof.npy', oof)
np.save(f'{MODEL_DIR}/xgb_test_mean.npy', test_pred)
# NOTE(review): the original `joblib.dump({` call was truncated in this file.
# The payload keys and the output file name below are a reconstruction —
# confirm them against whatever consumes this config.
joblib.dump({
    'params': params,
    'feature_names': feature_names,
    'n_folds': N_FOLDS,
    'seed': SEED,
    'fold_scores': [float(s) for s in fold_scores],
    'fold_thresholds': [float(t) for t in fold_ths],
    'fold_best_iterations': fold_best_iters,
    'best_th_global': best_th_global,
    'best_f1_global': best_f1_global,
    'auc_global': float(auc_global),
}, f'{MODEL_DIR}/xgb_config.pkl')
print('Saved OOF/test/config')

In [None]:
# 5) Submissions: (a) best single fold with its own threshold,
#    (b) fold-averaged probabilities with the global threshold
def _write_submission(probs, threshold, out_path):
    """Binarise ``probs`` at ``threshold`` and write an object_id/target CSV.

    Returns the 0/1 label array and the DataFrame that was written.
    """
    labels = (np.asarray(probs, dtype=float) >= threshold).astype(int)
    frame = pd.DataFrame({'object_id': test_log['object_id'], 'target': labels})
    frame.to_csv(out_path, index=False)
    return labels, frame


# Fold with the highest validation F1, paired with the threshold tuned on it.
best_fold = int(np.argmax(fold_scores))
best_fold_th = float(fold_ths[best_fold])
preds_fold, sub_fold = _write_submission(
    test_folds[best_fold], best_fold_th, 'submission_xgb_bestfold.csv'
)

# Mean test probability across folds, cut at the threshold tuned on all OOF rows.
preds_global, sub_global = _write_submission(
    test_pred, best_th_global, 'submission_xgb_global.csv'
)

print('Saved submissions: bestfold & global threshold')