In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedGroupKFold
from xgboost import XGBClassifier
from extinction import fitzpatrick99
import lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna
from scipy.optimize import curve_fit
import xgboost as xgb

FILTERS = ["u", "g", "r", "i", "z", "y"]

EFF_WL_AA = {
    "u": 3641.0,
    "g": 4704.0,
    "r": 6155.0,
    "i": 7504.0,
    "z": 8695.0,
    "y": 10056.0,
}

R_V = 3.1
PRE_BASE_FRAC = 0.20
MIN_BAND_POINTS = 5
PEAK_SIGMA_K = 3.0
REBRIGHT_FRAC = 0.30
EPS = 1e-8

SEASON_GAP_DAYS = 90.0

SF_LAGS = [5.0, 10.0, 20.0, 50.0, 100.0]

In [2]:
def safe_float(x, default=np.nan):
    try:
        if x is None:
            return default
        x = float(x)
        if np.isnan(x):
            return default
        return x
    except Exception:
        return default


def trapz_safe(y, x):
    if hasattr(np, "trapezoid"):
        return float(np.trapezoid(y, x))
    y = np.asarray(y)
    x = np.asarray(x)
    if len(x) < 2:
        return np.nan
    return float(np.sum((x[1:] - x[:-1]) * (y[1:] + y[:-1]) * 0.5))


def median_abs_dev(x):
    x = np.asarray(x)
    if len(x) == 0:
        return np.nan
    med = np.median(x)
    return float(np.median(np.abs(x - med)))


def iqr(x):
    x = np.asarray(x)
    if len(x) < 2:
        return np.nan
    q75, q25 = np.percentile(x, [75, 25])
    return float(q75 - q25)


def skewness(x):
    x = np.asarray(x)
    n = len(x)
    if n < 3:
        return np.nan
    mu = np.mean(x)
    s = np.std(x)
    if s < 1e-12:
        return 0.0
    m3 = np.mean((x - mu) ** 3)
    return float(m3 / (s ** 3))


def kurtosis_excess(x):
    x = np.asarray(x)
    n = len(x)
    if n < 4:
        return np.nan
    mu = np.mean(x)
    s = np.std(x)
    if s < 1e-12:
        return 0.0
    m4 = np.mean((x - mu) ** 4)
    return float(m4 / (s ** 4) - 3.0)


def von_neumann_eta(x):
    x = np.asarray(x)
    n = len(x)
    if n < 3:
        return np.nan
    v = np.var(x)
    if v < 1e-12:
        return 0.0
    dif = np.diff(x)
    return float(np.mean(dif ** 2) / v)


def max_slope(t, f):
    t = np.asarray(t)
    f = np.asarray(f)
    if len(t) < 3:
        return np.nan
    dt = np.diff(t)
    df = np.diff(f)
    good = dt > 0
    if not np.any(good):
        return np.nan
    slopes = df[good] / dt[good]
    return float(np.max(np.abs(slopes)))


def median_abs_slope(t, f):
    t = np.asarray(t)
    f = np.asarray(f)
    if len(t) < 3:
        return np.nan
    dt = np.diff(t)
    df = np.diff(f)
    good = dt > 0
    if not np.any(good):
        return np.nan
    slopes = df[good] / dt[good]
    return float(np.median(np.abs(slopes)))


def linear_slope(t, f):
    t = np.asarray(t)
    f = np.asarray(f)
    if len(t) < 3:
        return np.nan
    try:
        a, b = np.polyfit(t, f, 1)
        return float(a)
    except Exception:
        return np.nan


def chi2_to_constant(f, ferr):
    f = np.asarray(f)
    ferr = np.asarray(ferr)
    n = len(f)
    if n < 3:
        return np.nan
    mu = np.median(f)
    denom = (ferr + EPS) ** 2
    chi2 = np.sum((f - mu) ** 2 / denom)
    dof = max(1, n - 1)
    return float(chi2 / dof)


def interp_flux_at_time(tb, fb, t0):
    tb = np.asarray(tb)
    fb = np.asarray(fb)
    if len(tb) < 2:
        return np.nan
    if (t0 < tb.min()) or (t0 > tb.max()):
        return np.nan
    return float(np.interp(t0, tb, fb))


def interp_err_at_time(tb, eb, t0):
    tb = np.asarray(tb)
    eb = np.asarray(eb)
    if len(tb) < 2:
        return np.nan
    if (t0 < tb.min()) or (t0 > tb.max()):
        return np.nan
    return float(np.interp(t0, tb, eb))


def fractional_variability(f, ferr):
    f = np.asarray(f, float)
    ferr = np.asarray(ferr, float)
    n = len(f)
    if n < 3:
        return np.nan
    mu = np.mean(f)
    if np.abs(mu) < 1e-8:
        return np.nan
    s2 = np.var(f, ddof=1)
    mean_err2 = np.mean(ferr ** 2)
    excess = max(0.0, s2 - mean_err2)
    return float(np.sqrt(excess) / np.abs(mu))


def stetson_J_consecutive(t, f, ferr):
    t = np.asarray(t)
    f = np.asarray(f)
    ferr = np.asarray(ferr)
    n = len(t)
    if n < 4:
        return np.nan
    mu = np.mean(f)
    scale = np.sqrt(n / max(1, n - 1))
    delta = scale * (f - mu) / (ferr + EPS)
    vals = []
    for i in range(n - 1):
        P = delta[i] * delta[i + 1]
        vals.append(np.sign(P) * np.sqrt(np.abs(P)))
    return float(np.mean(vals))


def pre_peak_baseline(tb, fb, eb, frac=PRE_BASE_FRAC):
    tb = np.asarray(tb)
    fb = np.asarray(fb)
    eb = np.asarray(eb)
    n = len(tb)
    if n < 3:
        return np.nan, np.nan, np.nan
    k = max(2, int(np.ceil(frac * n)))
    k = min(k, n)
    base = float(np.median(fb[:k]))
    mad_pre = median_abs_dev(fb[:k])
    mederr_pre = float(np.median(eb[:k])) if k > 0 else np.nan
    return base, mad_pre, mederr_pre


def count_significant_peaks(tb, fb, eb, baseline_pre, k_sigma=PEAK_SIGMA_K):
    tb = np.asarray(tb)
    fb = np.asarray(fb)
    eb = np.asarray(eb)
    n = len(fb)
    if n < 5:
        return 0
    mederr = float(np.median(eb)) if np.isfinite(np.median(eb)) else 0.0
    thresh = baseline_pre + k_sigma * mederr
    peaks = 0
    for i in range(1, n - 1):
        if (fb[i] > fb[i - 1]) and (fb[i] > fb[i + 1]) and (fb[i] > thresh):
            peaks += 1
    return int(peaks)


def postpeak_monotonicity(tb, fb, pidx):
    tb = np.asarray(tb)
    fb = np.asarray(fb)
    if pidx is None or pidx >= len(fb) - 2:
        return np.nan
    t2 = tb[pidx:]
    f2 = fb[pidx:]
    if len(f2) < 3:
        return np.nan
    dt = np.diff(t2)
    df = np.diff(f2)
    good = dt > 0
    if not np.any(good):
        return np.nan
    frac_neg = float(np.mean((df[good] / dt[good]) < 0))
    return frac_neg


def count_rebrighten(tb, fb, baseline_pre, amp, pidx, frac=REBRIGHT_FRAC):
    if pidx is None or pidx >= len(fb) - 2:
        return 0
    level = baseline_pre + frac * amp
    post = fb[pidx:]
    if len(post) < 3:
        return 0
    above = post > level
    crossings = np.sum((~above[:-1]) & (above[1:]))
    return int(crossings)


def fall_time_to_level(tb, fb, baseline_pre, amp, pidx, frac):
    if amp <= 0 or pidx is None:
        return np.nan
    level = baseline_pre + frac * amp
    t_dec = tb[pidx:]
    f_dec = fb[pidx:]
    if len(f_dec) < 2:
        return np.nan
    idx = np.where(f_dec <= level)[0]
    if len(idx) == 0:
        return np.nan
    return float(t_dec[idx[0]] - t_dec[0])


def rise_time_to_level(tb, fb, baseline_pre, amp, pidx, frac):
    if amp <= 0 or pidx is None or pidx < 2:
        return np.nan
    level = baseline_pre + frac * amp
    t_pre = tb[:pidx + 1]
    f_pre = fb[:pidx + 1]
    idx = np.where(f_pre >= level)[0]
    if len(idx) == 0:
        return np.nan
    return float(t_pre[-1] - t_pre[idx[0]])


def decay_powerlaw_fit(tb, fb, baseline_pre, pidx, tmax=300.0):
    if pidx is None or pidx >= len(fb) - 3:
        return np.nan, np.nan, 0
    t0 = tb[pidx]
    t_dec = tb[pidx:]
    f_dec = fb[pidx:]
    dt = t_dec - t0
    m = (dt > 0.0) & (dt <= tmax)
    dt = dt[m]
    fd = f_dec[m] - baseline_pre
    m2 = fd > 0.0
    dt = dt[m2]
    fd = fd[m2]
    if len(dt) < 4:
        return np.nan, np.nan, int(len(dt))
    x = np.log(dt + EPS)
    y = np.log(fd + EPS)
    try:
        b, a = np.polyfit(x, y, 1)
    except Exception:
        return np.nan, np.nan, int(len(dt))
    yhat = a + b * x
    ss_res = float(np.sum((y - yhat) ** 2))
    ss_tot = float(np.sum((y - np.mean(y)) ** 2)) + EPS
    r2 = 1.0 - ss_res / ss_tot
    return float(b), float(r2), int(len(dt))


def signed_log1p(x):
    x = float(x)
    return float(np.sign(x) * np.log1p(np.abs(x)))


def deextinct_band(flux, flux_err, ebv, band, r_v=R_V):
    if ebv is None or (isinstance(ebv, float) and np.isnan(ebv)):
        return flux, flux_err
    A_V = float(ebv) * float(r_v)
    wave = np.array([EFF_WL_AA[band]], dtype=float)
    A_lambda = float(fitzpatrick99(wave, A_V, r_v=r_v, unit="aa")[0])
    fac = 10.0 ** (0.4 * A_lambda)
    return flux * fac, flux_err * fac


def deextinct_lightcurve(lc, ebv):
    flux = lc["Flux"].to_numpy().astype(float)
    ferr = lc["Flux_err"].to_numpy().astype(float)
    filt = lc["Filter"].to_numpy()
    flux_corr = flux.copy()
    ferr_corr = ferr.copy()
    for b in FILTERS:
        m = (filt == b)
        if not np.any(m):
            continue
        flux_corr[m], ferr_corr[m] = deextinct_band(flux_corr[m], ferr_corr[m], ebv, b)
    return flux_corr, ferr_corr


def band_corr(tt_a, ff_a, tt_b, ff_b, n_grid=30):
    tt_a = np.asarray(tt_a, float)
    ff_a = np.asarray(ff_a, float)
    tt_b = np.asarray(tt_b, float)
    ff_b = np.asarray(ff_b, float)

    if len(tt_a) < 3 or len(tt_b) < 3:
        return np.nan

    tmin = max(tt_a.min(), tt_b.min())
    tmax = min(tt_a.max(), tt_b.max())
    if (tmax - tmin) < 5.0:
        return np.nan

    grid = np.linspace(tmin, tmax, n_grid)
    fa = np.interp(grid, tt_a, ff_a)
    fb = np.interp(grid, tt_b, ff_b)

    sa = np.std(fa)
    sb = np.std(fb)
    if sa < 1e-12 or sb < 1e-12:
        return 0.0
    return float(np.corrcoef(fa, fb)[0, 1])

In [3]:
def seasonality_features(tb):
    tb = np.asarray(tb, float)
    if len(tb) < 2:
        return np.nan, np.nan, np.nan
    dt = np.diff(tb)
    breaks = np.where(dt > SEASON_GAP_DAYS)[0]
    seg_starts = [0] + (breaks + 1).tolist()
    seg_ends = breaks.tolist() + [len(tb) - 1]
    spans = []
    for s, e in zip(seg_starts, seg_ends):
        spans.append(tb[e] - tb[s])
    spans = np.asarray(spans, float)
    n_seasons = float(len(spans))
    return n_seasons, float(np.max(spans)), float(np.mean(spans))


def structure_function_lags(tb, fb, lags=SF_LAGS):
    tb = np.asarray(tb, float)
    fb = np.asarray(fb, float)
    n = len(tb)
    out = {}
    if n < 6:
        for lag in lags:
            out[f"sf_medabs_{int(lag)}"] = np.nan
            out[f"sf_n_{int(lag)}"] = 0.0
        return out

    for lag in lags:
        tol = max(2.0, 0.2 * lag)
        vals = []
        for i in range(n - 1):
            dt = tb[i + 1:] - tb[i]
            m = (dt >= (lag - tol)) & (dt <= (lag + tol))
            if np.any(m):
                dif = np.abs(fb[i + 1:][m] - fb[i])
                vals.extend(dif.tolist())
        if len(vals) == 0:
            out[f"sf_medabs_{int(lag)}"] = np.nan
            out[f"sf_n_{int(lag)}"] = 0.0
        else:
            out[f"sf_medabs_{int(lag)}"] = float(np.median(vals))
            out[f"sf_n_{int(lag)}"] = float(len(vals))
    return out


def peak_vs_wavelength_slope(tpeak_by_band, val_by_band, z=0.0):
    xs = []
    ys = []
    for b in FILTERS:
        v = val_by_band.get(b, np.nan)
        t = tpeak_by_band.get(b, np.nan)
        if np.isfinite(v):
            lam = float(EFF_WL_AA[b] / (1.0 + float(z)))
            xs.append(lam)
            ys.append(float(v))
    xs = np.asarray(xs, float)
    ys = np.asarray(ys, float)
    if len(xs) < 2:
        return np.nan, np.nan, np.nan
    try:
        slope, intercept = np.polyfit(xs, ys, 1)
        yhat = slope * xs + intercept
        ss_res = float(np.sum((ys - yhat) ** 2))
        ss_tot = float(np.sum((ys - np.mean(ys)) ** 2)) + EPS
        r2 = 1.0 - ss_res / ss_tot
        return float(slope), float(intercept), float(r2)
    except Exception:
        return np.nan, np.nan, np.nan


def sed_logflux_vs_loglambda_at_time(band_tb, band_fb, band_eb, t0, z=0.0):
    xs = []
    ys = []
    ws = []
    for b in FILTERS:
        tb = band_tb.get(b, None)
        fb = band_fb.get(b, None)
        eb = band_eb.get(b, None)
        if tb is None or fb is None or eb is None:
            continue
        f = interp_flux_at_time(tb, fb, t0)
        e = interp_err_at_time(tb, eb, t0)
        if not np.isfinite(f) or not np.isfinite(e):
            continue
        if f <= 0:
            continue
        lam_rest = float(EFF_WL_AA[b] / (1.0 + float(z)))
        xs.append(np.log(lam_rest + EPS))
        ys.append(np.log(f + EPS))
        ws.append(1.0 / ((e / (f + EPS)) ** 2 + EPS))  # weight by relative error
    if len(xs) < 2:
        return np.nan, np.nan, np.nan, float(len(xs))
    xs = np.asarray(xs, float)
    ys = np.asarray(ys, float)
    ws = np.asarray(ws, float)

    try:
        W = np.sum(ws)
        xbar = np.sum(ws * xs) / (W + EPS)
        ybar = np.sum(ws * ys) / (W + EPS)
        cov = np.sum(ws * (xs - xbar) * (ys - ybar))
        var = np.sum(ws * (xs - xbar) ** 2) + EPS
        slope = cov / var
        intercept = ybar - slope * xbar
        yhat = slope * xs + intercept
        ss_res = float(np.sum(ws * (ys - yhat) ** 2))
        ss_tot = float(np.sum(ws * (ys - ybar) ** 2)) + EPS
        r2 = 1.0 - ss_res / ss_tot
        return float(slope), float(intercept), float(r2), float(len(xs))
    except Exception:
        return np.nan, np.nan, np.nan, float(len(xs))

Had to add bounds to Bazin cause it was overflowing. Pain in the ass. Most of the formulas in here are new to me. Astronomy is not my forte. My entire strategy for this competition was to add quality features, which turned into just creating a plethora of features. Hard to manage. But hey if you are reading this you for some reason decided to look through my projects and are probably doubting whether or not I understand all of this. You are correct. I have a very strong understanding of ML, but when it comes to astronomy I am clueless. I followed a lead in a discussion to focus on what classifies a TDE from the SpecType feature available in the train set, but not in the test. My whole model is to create a seperate model to predict SpecType, and then use that to create even more features with the help of Chat GPT barreling through thousands of research papers and finding hundreds of formulas. I then throw XGB and run that shit through optuna overnight to get a decent model. It's worked. haven't ran this one yet, but I currently sit ~130th out of 800. Should move up to top 50 we will see. 2026/01/24 1:57 AM.

In [4]:
from scipy.optimize import curve_fit

EPS = 1e-8

def sigmoid(x):
    x = np.clip(x, -60.0, 60.0)
    return 1.0 / (1.0 + np.exp(-x))


def bazin_stable(t, A, t0, trise, tfall, B, eps=EPS):
    """
    Numerically stable Bazin-like function:
        f(t) = A * exp(-(t-t0)/tfall) * sigmoid((t-t0)/trise) + B
    """
    trise = np.maximum(trise, eps)
    tfall = np.maximum(tfall, eps)

    x = (t - t0) / trise
    exp_term = np.exp(np.clip(-(t - t0) / tfall, -60.0, 60.0))
    return A * exp_term * sigmoid(x) + B


def should_fit_bazin(tb, fb, eb, min_points=8, amp_sigma=3.0):
    """
    Gate: only fit when there is enough data and a detectable transient-like signal.
    """
    tb = np.asarray(tb, float)
    fb = np.asarray(fb, float)
    eb = np.asarray(eb, float)

    if len(tb) < min_points:
        return False

    mederr = float(np.median(eb)) if np.isfinite(np.median(eb)) else np.inf
    if not np.isfinite(mederr) or mederr <= 0:
        return False

    amp = float(np.percentile(fb, 95) - np.percentile(fb, 5))
    if not np.isfinite(amp) or amp < amp_sigma * mederr:
        return False

    if float(np.std(fb)) < 1e-10:
        return False

    return True


def fit_bazin(tb, fb, eb):
    """
    Returns (A, t0, trise, tfall, B, chi2_red) on success.
    Returns (nan...nan) on failure or if scipy missing / gate fails.
    """
    nan_out = (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)


    tb = np.asarray(tb, float)
    fb = np.asarray(fb, float)
    eb = np.asarray(eb, float)

    order = np.argsort(tb)
    t = tb[order]
    f = fb[order]
    e = eb[order]

    m = np.isfinite(t) & np.isfinite(f) & np.isfinite(e)
    t, f, e = t[m], f[m], e[m]
    if len(t) < 3:
        return nan_out

    e = np.maximum(e, 1e-6)

    if not should_fit_bazin(t, f, e, min_points=8, amp_sigma=3.0):
        return nan_out

    B0 = float(np.median(f))
    A0 = float(max(1e-6, np.percentile(f, 95) - B0))
    t0_0 = float(t[int(np.argmax(f))])
    tr0 = 20.0
    tf0 = 60.0
    p0 = [A0, t0_0, tr0, tf0, B0]

    tmin, tmax = float(t.min()), float(t.max())
    iqr = float(np.percentile(f, 75) - np.percentile(f, 25))
    amp = float(max(1e-6, np.percentile(f, 95) - np.percentile(f, 5)))

    lo = [0.0, tmin - 50.0, 0.5, 1.0, B0 - 5.0 * (iqr + 1e-6)]
    hi = [10.0 * amp, tmax + 50.0, 200.0, 600.0, B0 + 5.0 * (iqr + 1e-6)]

    try:
        popt, _ = curve_fit(
            bazin_stable, t, f,
            p0=p0,
            sigma=e,
            absolute_sigma=True,
            bounds=(lo, hi),
            maxfev=5000
        )

        fhat = bazin_stable(t, *popt)
        resid = (f - fhat) / e
        chi2 = float(np.sum(resid * resid))
        dof = max(1, len(t) - len(popt))
        chi2_red = chi2 / dof

        A, t0, trise, tfall, B = [float(x) for x in popt]
        return (A, t0, trise, tfall, B, float(chi2_red))

    except Exception:
        return nan_out

In [5]:
def extract_features_for_object(lc_raw, z, z_err, ebv):
    feats = {}
    lc = lc_raw.sort_values("Time (MJD)").reset_index(drop=True)

    t = lc["Time (MJD)"].to_numpy().astype(float)
    filt = lc["Filter"].to_numpy()

    if len(t) == 0:
        feats["n_obs"] = 0
        return feats

    z = safe_float(z, default=0.0)
    z_err = safe_float(z_err, default=0.0)
    ebv = safe_float(ebv, default=np.nan)

    t_rel = t - t.min()
    t_rest = t_rel / (1.0 + z)

    flux_raw = lc["Flux"].to_numpy().astype(float)
    err_raw = lc["Flux_err"].to_numpy().astype(float)
    flux_corr, err_corr = deextinct_lightcurve(lc, ebv)

    feats["n_obs"] = int(len(t))
    feats["total_time_obs"] = float(t_rel.max() - t_rel.min())
    feats["total_time_rest"] = float(t_rest.max() - t_rest.min())

    feats["flux_mean"] = float(np.mean(flux_corr))
    feats["flux_median"] = float(np.median(flux_corr))
    feats["flux_std"] = float(np.std(flux_corr))
    feats["flux_min"] = float(np.min(flux_corr))
    feats["flux_max"] = float(np.max(flux_corr))

    feats["flux_mad"] = median_abs_dev(flux_corr)
    feats["flux_iqr"] = iqr(flux_corr)
    feats["flux_skew"] = skewness(flux_corr)
    feats["flux_kurt_excess"] = kurtosis_excess(flux_corr)

    p5, p25, p75, p95 = np.percentile(flux_corr, [5, 25, 75, 95])
    feats["flux_p5"] = float(p5)
    feats["flux_p25"] = float(p25)
    feats["flux_p75"] = float(p75)
    feats["flux_p95"] = float(p95)
    feats["robust_amp_global"] = float(p95 - p5)

    feats["neg_flux_frac"] = float(np.mean(flux_corr < 0))

    snr = np.abs(flux_corr) / (err_corr + EPS)
    feats["snr_median"] = float(np.median(snr))
    feats["snr_max"] = float(np.max(snr))

    feats["flux_mean_raw"] = float(np.mean(flux_raw))
    feats["flux_std_raw"] = float(np.std(flux_raw))
    feats["snr_max_raw"] = float(np.max(np.abs(flux_raw) / (err_raw + EPS)))
    feats["fvar_raw"] = fractional_variability(flux_raw, err_raw)

    feats["flux_mean_deext_minus_raw"] = float(feats["flux_mean"] - feats["flux_mean_raw"])
    feats["snrmax_deext_minus_raw"] = float(feats["snr_max"] - feats["snr_max_raw"])

    if len(t_rel) >= 2:
        dt = np.diff(t_rel)
        feats["median_dt"] = float(np.median(dt))
        feats["max_gap"] = float(np.max(dt))
        feats["n_seasons_global"] = float(np.sum(dt > SEASON_GAP_DAYS) + 1)
        feats["gap_frac_gt90"] = float(np.mean(dt > SEASON_GAP_DAYS))
        feats["gap_frac_gt30"] = float(np.mean(dt > 30.0))
    else:
        feats["median_dt"] = np.nan
        feats["max_gap"] = np.nan
        feats["n_seasons_global"] = np.nan
        feats["gap_frac_gt90"] = np.nan
        feats["gap_frac_gt30"] = np.nan

    feats["eta_von_neumann"] = von_neumann_eta(flux_corr)
    feats["chi2_const_global"] = chi2_to_constant(flux_corr, err_corr)
    feats["stetsonJ_global_obs"] = stetson_J_consecutive(t_rel, flux_corr, err_corr)
    feats["stetsonJ_global_rest"] = stetson_J_consecutive(t_rest, flux_corr, err_corr)

    feats["max_slope_global_obs"] = max_slope(t_rel, flux_corr)
    feats["max_slope_global_rest"] = max_slope(t_rest, flux_corr)

    feats["med_abs_slope_global_obs"] = median_abs_slope(t_rel, flux_corr)
    feats["med_abs_slope_global_rest"] = median_abs_slope(t_rest, flux_corr)

    feats["slope_global_obs"] = linear_slope(t_rel, flux_corr)
    feats["slope_global_rest"] = linear_slope(t_rest, flux_corr)

    feats["fvar_global"] = fractional_variability(flux_corr, err_corr)

    feats["Z"] = float(z)
    feats["log1pZ"] = float(np.log1p(max(0.0, z)))
    feats["Z_err"] = float(max(0.0, z_err))
    feats["log1pZerr"] = float(np.log1p(max(0.0, feats["Z_err"])))
    feats["EBV"] = ebv

    feats["n_filters_present"] = 0
    feats["total_obs"] = 0

    band_tpeak_obs = {}
    band_tpeak_rest = {}
    band_peak_flux = {}

    band_tb_obs = {}
    band_tb_rest = {}
    band_fb = {}
    band_eb = {}

    for b in FILTERS:
        m = (filt == b)
        nb = int(np.sum(m))
        feats[f"n_{b}"] = nb
        feats["total_obs"] += nb

        keys = [
            f"amp_{b}",
            f"amp_pre_{b}",
            f"baseline_pre_{b}",
            f"robust_amp_{b}",
            f"tpeak_{b}_obs",
            f"tpeak_{b}_rest",
            f"width50_{b}_obs",
            f"width50_{b}_rest",
            f"width80_{b}_obs",
            f"width80_{b}_rest",
            f"auc_pos_{b}_obs",
            f"auc_pos_{b}_rest",
            f"snrmax_{b}",
            f"eta_{b}",
            f"chi2_const_{b}",
            f"slope_{b}_obs",
            f"slope_{b}_rest",
            f"maxslope_{b}_obs",
            f"maxslope_{b}_rest",
            f"stetsonJ_{b}_obs",
            f"stetsonJ_{b}_rest",
            f"p5_{b}",
            f"p25_{b}",
            f"p75_{b}",
            f"p95_{b}",
            f"mad_{b}",
            f"iqr_{b}",
            f"mad_over_std_{b}",
            f"fvar_{b}",
            f"t_fall50_{b}_obs",
            f"t_fall20_{b}_obs",
            f"t_fall50_{b}_rest",
            f"t_fall20_{b}_rest",
            f"t_rise50_{b}_obs",
            f"t_rise20_{b}_obs",
            f"t_rise50_{b}_rest",
            f"t_rise20_{b}_rest",
            f"asym50_{b}_obs",
            f"asym50_{b}_rest",
            f"sharp50_{b}_obs",
            f"sharp50_{b}_rest",
            f"peak_dominance_{b}",
            f"std_ratio_prepost_{b}",
            f"n_peaks_{b}",
            f"postpeak_monotone_frac_{b}",
            f"n_rebrighten_{b}",
            f"decay_pl_slope_{b}_obs",
            f"decay_pl_r2_{b}_obs",
            f"decay_pl_npts_{b}_obs",
            f"decay_pl_slope_{b}_rest",
            f"decay_pl_r2_{b}_rest",
            f"decay_pl_npts_{b}_rest",

            # seasonality and structure function per band
            f"n_seasons_{b}",
            f"season_maxspan_{b}",
            f"season_meanspan_{b}",
            f"sf_medabs_5_{b}",
            f"sf_n_5_{b}",
            f"sf_medabs_10_{b}",
            f"sf_n_10_{b}",
            f"sf_medabs_20_{b}",
            f"sf_n_20_{b}",
            f"sf_medabs_50_{b}",
            f"sf_n_50_{b}",
            f"sf_medabs_100_{b}",
            f"sf_n_100_{b}",

            # Bazin shape fit
            f"bazin_A_{b}",
            f"bazin_t0_{b}_obs",
            f"bazin_trise_{b}_obs",
            f"bazin_tfall_{b}_obs",
            f"bazin_B_{b}",
            f"bazin_chi2red_{b}_obs",
            f"bazin_trise_{b}_rest",
            f"bazin_tfall_{b}_rest",
        ]
        for k in keys:
            feats[k] = np.nan

        if nb == 0:
            continue

        feats["n_filters_present"] += 1

        tb_obs = t_rel[m]
        fb = flux_corr[m]
        eb = err_corr[m]

        order = np.argsort(tb_obs)
        tb_obs = tb_obs[order]
        fb = fb[order]
        eb = eb[order]
        tb_rest = tb_obs / (1.0 + z)

        band_tb_obs[b] = tb_obs
        band_tb_rest[b] = tb_rest
        band_fb[b] = fb
        band_eb[b] = eb

        ns, maxsp, meansp = seasonality_features(tb_obs)
        feats[f"n_seasons_{b}"] = ns
        feats[f"season_maxspan_{b}"] = maxsp
        feats[f"season_meanspan_{b}"] = meansp

        sf = structure_function_lags(tb_obs, fb, lags=SF_LAGS)
        for lag in SF_LAGS:
            feats[f"sf_medabs_{int(lag)}_{b}"] = sf.get(f"sf_medabs_{int(lag)}", np.nan)
            feats[f"sf_n_{int(lag)}_{b}"] = sf.get(f"sf_n_{int(lag)}", 0.0)

        p5b, p25b, p75b, p95b = np.percentile(fb, [5, 25, 75, 95])
        feats[f"p5_{b}"] = float(p5b)
        feats[f"p25_{b}"] = float(p25b)
        feats[f"p75_{b}"] = float(p75b)
        feats[f"p95_{b}"] = float(p95b)
        feats[f"robust_amp_{b}"] = float(p95b - p5b)

        feats[f"mad_{b}"] = median_abs_dev(fb)
        feats[f"iqr_{b}"] = iqr(fb)
        stdb = float(np.std(fb))
        feats[f"mad_over_std_{b}"] = float(feats[f"mad_{b}"] / (stdb + EPS))

        base_pre, mad_pre, mederr_pre = pre_peak_baseline(tb_obs, fb, eb, frac=PRE_BASE_FRAC)
        feats[f"baseline_pre_{b}"] = float(base_pre) if np.isfinite(base_pre) else np.nan

        pidx = int(np.argmax(fb))
        peak_flux = float(fb[pidx])
        tpeak_obs = float(tb_obs[pidx])
        tpeak_rest = float(tb_rest[pidx])

        amp_median = peak_flux - float(np.median(fb))
        amp_pre = peak_flux - base_pre if np.isfinite(base_pre) else np.nan

        feats[f"amp_{b}"] = float(amp_median)
        feats[f"amp_pre_{b}"] = float(amp_pre) if np.isfinite(amp_pre) else np.nan

        feats[f"tpeak_{b}_obs"] = tpeak_obs
        feats[f"tpeak_{b}_rest"] = tpeak_rest
        feats[f"snrmax_{b}"] = float(np.max(np.abs(fb) / (eb + EPS)))

        feats[f"eta_{b}"] = von_neumann_eta(fb)
        feats[f"chi2_const_{b}"] = chi2_to_constant(fb, eb)

        feats[f"slope_{b}_obs"] = linear_slope(tb_obs, fb)
        feats[f"slope_{b}_rest"] = linear_slope(tb_rest, fb)

        feats[f"maxslope_{b}_obs"] = max_slope(tb_obs, fb)
        feats[f"maxslope_{b}_rest"] = max_slope(tb_rest, fb)

        feats[f"stetsonJ_{b}_obs"] = stetson_J_consecutive(tb_obs, fb, eb)
        feats[f"stetsonJ_{b}_rest"] = stetson_J_consecutive(tb_rest, fb, eb)

        feats[f"fvar_{b}"] = fractional_variability(fb, eb)

        A, t0, trise, tfall, B, chi2 = fit_bazin(tb_obs, fb, eb)
        feats[f"bazin_A_{b}"] = A
        feats[f"bazin_t0_{b}_obs"] = t0
        feats[f"bazin_trise_{b}_obs"] = trise
        feats[f"bazin_tfall_{b}_obs"] = tfall
        feats[f"bazin_B_{b}"] = B
        feats[f"bazin_chi2red_{b}_obs"] = chi2

        feats[f"bazin_trise_{b}_rest"] = trise / (1.0 + z) if np.isfinite(trise) else np.nan
        feats[f"bazin_tfall_{b}_rest"] = tfall / (1.0 + z) if np.isfinite(tfall) else np.nan

        if np.isfinite(amp_pre) and amp_pre > 0:
            feats[f"peak_dominance_{b}"] = float(amp_pre / (mad_pre + EPS))

            pre_seg = fb[:max(2, pidx)]
            post_seg = fb[pidx:]
            std_pre = float(np.std(pre_seg)) if len(pre_seg) >= 2 else np.nan
            std_post = float(np.std(post_seg)) if len(post_seg) >= 2 else np.nan
            if np.isfinite(std_pre) and np.isfinite(std_post):
                feats[f"std_ratio_prepost_{b}"] = float(std_pre / (std_post + EPS))

            feats[f"postpeak_monotone_frac_{b}"] = float(postpeak_monotonicity(tb_obs, fb, pidx))
            feats[f"n_peaks_{b}"] = float(count_significant_peaks(tb_obs, fb, eb, base_pre, k_sigma=PEAK_SIGMA_K))
            feats[f"n_rebrighten_{b}"] = float(count_rebrighten(tb_obs, fb, base_pre, amp_pre, pidx, frac=REBRIGHT_FRAC))

            feats[f"t_fall50_{b}_obs"] = float(fall_time_to_level(tb_obs, fb, base_pre, amp_pre, pidx, frac=0.50))
            feats[f"t_fall20_{b}_obs"] = float(fall_time_to_level(tb_obs, fb, base_pre, amp_pre, pidx, frac=0.20))
            feats[f"t_fall50_{b}_rest"] = float(fall_time_to_level(tb_rest, fb, base_pre, amp_pre, pidx, frac=0.50))
            feats[f"t_fall20_{b}_rest"] = float(fall_time_to_level(tb_rest, fb, base_pre, amp_pre, pidx, frac=0.20))

            feats[f"t_rise50_{b}_obs"] = float(rise_time_to_level(tb_obs, fb, base_pre, amp_pre, pidx, frac=0.50))
            feats[f"t_rise20_{b}_obs"] = float(rise_time_to_level(tb_obs, fb, base_pre, amp_pre, pidx, frac=0.20))
            feats[f"t_rise50_{b}_rest"] = float(rise_time_to_level(tb_rest, fb, base_pre, amp_pre, pidx, frac=0.50))
            feats[f"t_rise20_{b}_rest"] = float(rise_time_to_level(tb_rest, fb, base_pre, amp_pre, pidx, frac=0.20))

            tr50o = feats[f"t_rise50_{b}_obs"]
            tf50o = feats[f"t_fall50_{b}_obs"]
            tr50r = feats[f"t_rise50_{b}_rest"]
            tf50r = feats[f"t_fall50_{b}_rest"]
            feats[f"asym50_{b}_obs"] = float(tf50o / (tr50o + EPS)) if np.isfinite(tf50o) and np.isfinite(tr50o) else np.nan
            feats[f"asym50_{b}_rest"] = float(tf50r / (tr50r + EPS)) if np.isfinite(tf50r) and np.isfinite(tr50r) else np.nan

            feats[f"auc_pos_{b}_obs"] = float(trapz_safe(np.maximum(fb - base_pre, 0.0), tb_obs))
            feats[f"auc_pos_{b}_rest"] = float(trapz_safe(np.maximum(fb - base_pre, 0.0), tb_rest))

            def width_at_level(tt, ff, base, amp, frac):
                if amp <= 0 or len(ff) < 3:
                    return np.nan
                level = base + frac * amp
                above = ff >= level
                if not np.any(above):
                    return np.nan
                idx = np.where(above)[0]
                return float(tt[idx[-1]] - tt[idx[0]])

            w50_obs = width_at_level(tb_obs, fb, base_pre, amp_pre, 0.50)
            w80_obs = width_at_level(tb_obs, fb, base_pre, amp_pre, 0.80)
            w50_rest = width_at_level(tb_rest, fb, base_pre, amp_pre, 0.50)
            w80_rest = width_at_level(tb_rest, fb, base_pre, amp_pre, 0.80)

            feats[f"width50_{b}_obs"] = w50_obs
            feats[f"width80_{b}_obs"] = w80_obs
            feats[f"width50_{b}_rest"] = w50_rest
            feats[f"width80_{b}_rest"] = w80_rest

            feats[f"sharp50_{b}_obs"] = float(amp_pre / (w50_obs + EPS)) if np.isfinite(w50_obs) else np.nan
            feats[f"sharp50_{b}_rest"] = float(amp_pre / (w50_rest + EPS)) if np.isfinite(w50_rest) else np.nan

            b_obs, r2_obs, npts_obs = decay_powerlaw_fit(tb_obs, fb, base_pre, pidx, tmax=300.0)
            b_rest, r2_rest, npts_rest = decay_powerlaw_fit(tb_rest, fb, base_pre, pidx, tmax=300.0)

            feats[f"decay_pl_slope_{b}_obs"] = b_obs
            feats[f"decay_pl_r2_{b}_obs"] = r2_obs
            feats[f"decay_pl_npts_{b}_obs"] = float(npts_obs)

            feats[f"decay_pl_slope_{b}_rest"] = b_rest
            feats[f"decay_pl_r2_{b}_rest"] = r2_rest
            feats[f"decay_pl_npts_{b}_rest"] = float(npts_rest)

        band_tpeak_obs[b] = tpeak_obs
        band_tpeak_rest[b] = tpeak_rest
        band_peak_flux[b] = peak_flux

    tpeaks_obs = np.array([band_tpeak_obs.get(b, np.nan) for b in FILTERS], float)
    tpeaks_rest = np.array([band_tpeak_rest.get(b, np.nan) for b in FILTERS], float)
    tpeaks_obs = np.array([x for x in tpeaks_obs if np.isfinite(x)], float)
    tpeaks_rest = np.array([x for x in tpeaks_rest if np.isfinite(x)], float)
    feats["tpeak_std_obs"] = float(np.std(tpeaks_obs)) if len(tpeaks_obs) >= 2 else np.nan
    feats["tpeak_std_rest"] = float(np.std(tpeaks_rest)) if len(tpeaks_rest) >= 2 else np.nan

    pairs = [("u", "g"), ("g", "r"), ("r", "i"), ("i", "z"), ("z", "y")]
    for a, b in pairs:
        ta_obs = band_tpeak_obs.get(a, np.nan)
        tb_obs2 = band_tpeak_obs.get(b, np.nan)
        ta_rest = band_tpeak_rest.get(a, np.nan)
        tb_rest2 = band_tpeak_rest.get(b, np.nan)
        pa = band_peak_flux.get(a, np.nan)
        pb = band_peak_flux.get(b, np.nan)

        feats[f"tpeakdiff_{a}{b}_obs"] = (ta_obs - tb_obs2) if (np.isfinite(ta_obs) and np.isfinite(tb_obs2)) else np.nan
        feats[f"tpeakdiff_{a}{b}_rest"] = (ta_rest - tb_rest2) if (np.isfinite(ta_rest) and np.isfinite(tb_rest2)) else np.nan
        feats[f"peakratio_{a}{b}"] = (pa / (pb + EPS)) if (np.isfinite(pa) and np.isfinite(pb)) else np.nan

    def ratio_feature(name, num, den):
        if np.isfinite(num) and np.isfinite(den):
            feats[name] = float(num / (den + EPS))
        else:
            feats[name] = np.nan

    for a, b in pairs:
        ratio_feature(f"amppreratio_{a}{b}", feats.get(f"amp_pre_{a}", np.nan), feats.get(f"amp_pre_{b}", np.nan))
        ratio_feature(f"aucratio_{a}{b}_obs", feats.get(f"auc_pos_{a}_obs", np.nan), feats.get(f"auc_pos_{b}_obs", np.nan))
        ratio_feature(f"width50ratio_{a}{b}_obs", feats.get(f"width50_{a}_obs", np.nan), feats.get(f"width50_{b}_obs", np.nan))
        ratio_feature(f"asym50ratio_{a}{b}_obs", feats.get(f"asym50_{a}_obs", np.nan), feats.get(f"asym50_{b}_obs", np.nan))

    for a, b in [("g", "r"), ("r", "i"), ("i", "z")]:
        if (a in band_tb_obs) and (b in band_tb_obs):
            feats[f"corr_{a}{b}_obs"] = band_corr(
                band_tb_obs[a], band_fb[a],
                band_tb_obs[b], band_fb[b]
            )
        else:
            feats[f"corr_{a}{b}_obs"] = np.nan

    slope_t, intercept_t, r2_t = peak_vs_wavelength_slope(band_tpeak_obs, band_tpeak_obs, z=z)
    feats["tpeak_vs_lambda_slope_obs"] = slope_t
    feats["tpeak_vs_lambda_intercept_obs"] = intercept_t
    feats["tpeak_vs_lambda_r2_obs"] = r2_t

    slope_pf, intercept_pf, r2_pf = peak_vs_wavelength_slope(band_tpeak_obs, band_peak_flux, z=z)
    feats["peakflux_vs_lambda_slope"] = slope_pf
    feats["peakflux_vs_lambda_intercept"] = intercept_pf
    feats["peakflux_vs_lambda_r2"] = r2_pf

    tpr_obs = feats.get("tpeak_r_obs", np.nan)
    if np.isfinite(tpr_obs):
        def colors_at_time(t0):
            fr = interp_flux_at_time(band_tb_obs.get("r", np.array([])), band_fb.get("r", np.array([])), t0)
            fg = interp_flux_at_time(band_tb_obs.get("g", np.array([])), band_fb.get("g", np.array([])), t0)
            fi = interp_flux_at_time(band_tb_obs.get("i", np.array([])), band_fb.get("i", np.array([])), t0)
            cgr = (signed_log1p(fg) - signed_log1p(fr)) if (np.isfinite(fg) and np.isfinite(fr)) else np.nan
            cri = (signed_log1p(fr) - signed_log1p(fi)) if (np.isfinite(fr) and np.isfinite(fi)) else np.nan
            return cgr, cri

        cgr0, cri0 = colors_at_time(tpr_obs)
        feats["color_gr_at_rpeak_obs"] = cgr0
        feats["color_ri_at_rpeak_obs"] = cri0

        cgr20, cri20 = colors_at_time(tpr_obs + 20.0)
        cgr40, cri40 = colors_at_time(tpr_obs + 40.0)

        feats["color_gr_rpeak_p20_obs"] = cgr20
        feats["color_ri_rpeak_p20_obs"] = cri20
        feats["color_gr_rpeak_p40_obs"] = cgr40
        feats["color_ri_rpeak_p40_obs"] = cri40

        def slope(c1, c2, dt):
            if np.isfinite(c1) and np.isfinite(c2):
                return float((c2 - c1) / dt)
            return np.nan

        feats["color_gr_slope20_obs"] = slope(cgr0, cgr20, 20.0)
        feats["color_ri_slope20_obs"] = slope(cri0, cri20, 20.0)
        feats["color_gr_slope40_obs"] = slope(cgr0, cgr40, 40.0)
        feats["color_ri_slope40_obs"] = slope(cri0, cri40, 40.0)
    else:
        feats["color_gr_at_rpeak_obs"] = np.nan
        feats["color_ri_at_rpeak_obs"] = np.nan
        feats["color_gr_rpeak_p20_obs"] = np.nan
        feats["color_ri_rpeak_p20_obs"] = np.nan
        feats["color_gr_rpeak_p40_obs"] = np.nan
        feats["color_ri_rpeak_p40_obs"] = np.nan
        feats["color_gr_slope20_obs"] = np.nan
        feats["color_ri_slope20_obs"] = np.nan
        feats["color_gr_slope40_obs"] = np.nan
        feats["color_ri_slope40_obs"] = np.nan

    if np.isfinite(tpr_obs):
        sed_slope, sed_int, sed_r2, sed_n = sed_logflux_vs_loglambda_at_time(
            band_tb_obs, band_fb, band_eb, tpr_obs, z=z
        )
        feats["sed_logflux_loglambda_slope_rpeak"] = sed_slope
        feats["sed_logflux_loglambda_r2_rpeak"] = sed_r2
        feats["sed_logflux_loglambda_nbands_rpeak"] = sed_n

        sed_slope20, sed_int20, sed_r2_20, sed_n20 = sed_logflux_vs_loglambda_at_time(
            band_tb_obs, band_fb, band_eb, tpr_obs + 20.0, z=z
        )
        feats["sed_slope_rpeak_p20"] = sed_slope20
        feats["sed_r2_rpeak_p20"] = sed_r2_20
        feats["sed_nbands_rpeak_p20"] = sed_n20
    else:
        feats["sed_logflux_loglambda_slope_rpeak"] = np.nan
        feats["sed_logflux_loglambda_r2_rpeak"] = np.nan
        feats["sed_logflux_loglambda_nbands_rpeak"] = np.nan
        feats["sed_slope_rpeak_p20"] = np.nan
        feats["sed_r2_rpeak_p20"] = np.nan
        feats["sed_nbands_rpeak_p20"] = np.nan

    return feats

In [6]:
def build_lightcurve_cache(splits, base_dir="data", kind="train"):
    lc_cache = {}
    idx_cache = {}
    for s in splits:
        path = f"{base_dir}/{s}/{kind}_full_lightcurves.csv"
        lc = pd.read_csv(path)
        lc["object_id"] = lc["object_id"].astype(str)
        groups = lc.groupby("object_id").indices
        lc_cache[s] = lc
        idx_cache[s] = groups
    return lc_cache, idx_cache


def get_lightcurve(lc_cache, idx_cache, split, object_id):
    object_id = str(object_id)
    idx = idx_cache[split].get(object_id, None)
    if idx is None:
        return None
    return lc_cache[split].iloc[idx]

def build_feature_table(
    log_df,
    lc_cache,
    idx_cache,
    augment_photoz=False,
    test_zerr_pool=None,
    n_aug=2,
    seed=6
):
    rng = np.random.default_rng(seed)
    rows = []

    if test_zerr_pool is not None:
        test_zerr_pool = np.asarray(test_zerr_pool, float)
        test_zerr_pool = test_zerr_pool[np.isfinite(test_zerr_pool)]
        test_zerr_pool = test_zerr_pool[test_zerr_pool > 0]

    for i in range(len(log_df)):
        r = log_df.iloc[i]
        obj = str(r["object_id"])
        split = r["split"]

        lc = get_lightcurve(lc_cache, idx_cache, split, obj)
        if lc is None:
            feats = {"n_obs": 0}
            feats["object_id"] = obj
            feats["split"] = split
            feats["photoz_aug"] = 0
            if "target" in log_df.columns:
                feats["target"] = int(r["target"])
            rows.append(feats)
            continue

        feats = extract_features_for_object(
            lc_raw=lc,
            z=r["Z"],
            z_err=r.get("Z_err", 0.0),
            ebv=r["EBV"],
        )
        feats["object_id"] = obj
        feats["split"] = split
        feats["photoz_aug"] = 0
        if "target" in log_df.columns:
            feats["target"] = int(r["target"])
        rows.append(feats)

        if augment_photoz and ("target" in log_df.columns) and (test_zerr_pool is not None) and (len(test_zerr_pool) > 0):
            z0 = safe_float(r["Z"], default=0.0)
            for _ in range(n_aug):
                sigma = float(rng.choice(test_zerr_pool))
                z_sim = max(0.0, z0 + float(rng.normal(0.0, sigma)))
                feats2 = extract_features_for_object(
                    lc_raw=lc,
                    z=z_sim,
                    z_err=sigma,
                    ebv=r["EBV"],
                )
                feats2["object_id"] = obj
                feats2["split"] = split
                feats2["target"] = int(r["target"])
                feats2["photoz_aug"] = 1
                rows.append(feats2)

    return pd.DataFrame(rows)

def clean_features(df, drop_cols, add_missing_flags=True):
    X = df.drop(columns=drop_cols).copy()
    X = X.replace([np.inf, -np.inf], np.nan)

    if add_missing_flags:
        miss = X.isna().astype(np.uint8)
        miss.columns = [c + "_isnan" for c in miss.columns]
        X = pd.concat([X, miss], axis=1)

    return X

In [7]:
def best_threshold_f1(y_true, probs):
    ths = np.linspace(0.01, 0.99, 401)
    f1s = [f1_score(y_true, probs > t, zero_division=0) for t in ths]
    j = int(np.argmax(f1s))
    return float(ths[j]), float(f1s[j])


def make_splitter(n_splits, random_state=6):
    return StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

In [8]:
def add_spectype_teacher_features(train_feat, train_log, test_feat, n_splits=10, seed=6):

    df = train_feat.merge(train_log[["object_id", "SpecType"]], on="object_id", how="left")
    spec = df["SpecType"].fillna("Unknown").astype(str)

    def map_group(s):
        s2 = s.strip()
        if s2 == "TDE":
            return "TDE"
        if s2 == "AGN":
            return "AGN"
        if "SLSN" in s2:
            return "SLSN"
        if s2 == "SN Ia" or s2.startswith("SN Ia"):
            return "SNIa"
        if s2.startswith("SN II") or ("SN II" in s2):
            return "SNII"
        if s2.startswith("SN"):
            return "SNother"
        return "Other"

    spec_group = spec.map(map_group).astype(str)

    classes = sorted(spec_group.unique())
    class_to_idx = {c: i for i, c in enumerate(classes)}
    y_mc = spec_group.map(class_to_idx).to_numpy()

    X_tr = clean_features(df, drop_cols=["object_id", "split", "target", "SpecType"], add_missing_flags=True)
    X_te = clean_features(test_feat, drop_cols=["object_id", "split"], add_missing_flags=True)

    groups = df["split"].to_numpy()

    splitter = make_splitter(n_splits, random_state=seed)
    split_iter = splitter.split(X_tr, y_mc, groups)

    oof = np.zeros((len(X_tr), len(classes)), dtype=float)

    base = dict(
        objective="multiclass",
        num_class=len(classes),
        metric="multi_logloss",
        n_estimators=20000,
        learning_rate=0.03,
        num_leaves=63,
        min_child_samples=5,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=0.0,
        n_jobs=-1,
        random_state=seed,
        verbosity=-1,
        force_col_wise=True
    )

    for fold, (tr_idx, va_idx) in enumerate(split_iter, 1):
        model = LGBMClassifier(**base)
        model.fit(
            X_tr.iloc[tr_idx], y_mc[tr_idx],
            eval_set=[(X_tr.iloc[va_idx], y_mc[va_idx])],
            eval_metric="multi_logloss",
            callbacks=[lgb.early_stopping(200, verbose=False)]
        )
        oof[va_idx] = model.predict_proba(X_tr.iloc[va_idx], num_iteration=model.best_iteration_)

    model_full = LGBMClassifier(**base)
    model_full.fit(X_tr, y_mc)
    p_test = model_full.predict_proba(X_te)

    def entropy(p):
        p = np.clip(p, 1e-12, 1.0)
        return -np.sum(p * np.log(p), axis=1)

    for i, c in enumerate(classes):
        train_feat[f"p_spec_{c}"] = oof[:, i]
        test_feat[f"p_spec_{c}"] = p_test[:, i]

    train_feat["spec_entropy"] = entropy(oof)
    test_feat["spec_entropy"] = entropy(p_test)

    train_feat["spec_topprob"] = np.max(oof, axis=1)
    test_feat["spec_topprob"] = np.max(p_test, axis=1)

    return train_feat, test_feat

In [9]:
def feature_select_gain_topk(train_feat, k=350, n_splits=10, seed=6):
    y = train_feat["target"].astype(int).to_numpy()
    groups = train_feat["split"].to_numpy()
    X = clean_features(train_feat, drop_cols=["object_id", "split", "target"], add_missing_flags=True)

    splitter = make_splitter(n_splits, random_state=seed)
    split_iter = splitter.split(X, y, groups)

    gains = {c: 0.0 for c in X.columns}

    base_params = dict(
        objective="binary:logistic",
        eval_metric="aucpr",
        random_state=seed,
        n_jobs=-1,
        tree_method="hist",
        device="cuda",
        n_estimators=6000,
        learning_rate=0.02,
        max_depth=6,
        min_child_weight=10,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=2.0,
        reg_lambda=2.0,
        gamma=0.0,
        max_bin=256,
    )

    for tr_idx, va_idx in split_iter:
        X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
        neg = np.sum(y_tr == 0)
        pos = np.sum(y_tr == 1)
        spw = float(neg / max(1, pos))

        model = XGBClassifier(**{**base_params, "scale_pos_weight": spw})
        model.fit(X_tr, y_tr, verbose=False)

        score = model.get_booster().get_score(importance_type="gain")
        for feat, g in score.items():
            if feat in gains:
                gains[feat] += float(g)

    ranked = sorted(gains.items(), key=lambda x: x[1], reverse=True)
    top = [f for f, _ in ranked[:k]]
    return top

In [10]:
def run_optuna_xgb_f1(train_feat, feature_cols, n_folds_tune=10, timeout_sec=28800, seed=6):
    y = train_feat["target"].astype(int).to_numpy()
    groups = train_feat["split"].to_numpy()

    X_all = clean_features(train_feat, drop_cols=["object_id", "split", "target"], add_missing_flags=True)
    X = X_all[feature_cols].copy()

    splitter = make_splitter(n_folds_tune, random_state=seed)
    split_iter_all = list(splitter.split(X, y, groups))

    def objective(trial):
        params = {
            "objective": "binary:logistic",
            "eval_metric": "aucpr",
            "random_state": seed,
            "n_jobs": -1,
            "tree_method": "hist",
            "device": "cuda",
            "n_estimators": trial.suggest_int("n_estimators", 1500, 14000),
            "learning_rate": trial.suggest_float("learning_rate", 0.002, 0.08, log=True),
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 80),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
            "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
            "max_bin": trial.suggest_int("max_bin", 128, 512),
            "gamma": trial.suggest_float("gamma", 0.0, 12.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 35.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.05, 50.0),
            "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        }

        if params["grow_policy"] == "lossguide":
            params["max_leaves"] = trial.suggest_int("max_leaves", 16, 512)

        oof = np.zeros(len(X), dtype=float)
        f1_progress = []

        for fold, (tr_idx, va_idx) in enumerate(split_iter_all, 1):
            X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
            X_va, y_va = X.iloc[va_idx], y[va_idx]

            neg = np.sum(y_tr == 0)
            pos = np.sum(y_tr == 1)
            spw = float(neg / max(1, pos))

            model = XGBClassifier(**{**params, "scale_pos_weight": spw})
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_va, y_va)],
                verbose=False,
                callbacks=[xgb.callback.EarlyStopping(rounds=200, save_best=True)]
            )

            oof[va_idx] = model.predict_proba(X_va)[:, 1]
            th_fold, f1_fold = best_threshold_f1(y_va, oof[va_idx])
            f1_progress.append(f1_fold)

            trial.report(float(np.mean(f1_progress)), step=fold)
            if trial.should_prune():
                raise optuna.TrialPruned()

        th, f1 = best_threshold_f1(y, oof)
        return float(f1)

    sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True, group=True)
    pruner = optuna.pruners.MedianPruner(n_startup_trials=40, n_warmup_steps=3)

    study = optuna.create_study(
        direction="maximize",
        sampler=sampler,
        pruner=pruner,
        study_name="xgb_oof_f1_splitcv_gpu_selected",
        storage="sqlite:///optuna_xgb_oof_f1_gpu_selected.db",
        load_if_exists=True
    )

    study.optimize(objective, n_trials=999999, timeout=timeout_sec)

    print("\nOptuna best OOF F1:", study.best_value)
    print("Best params:")
    for k, v in study.best_params.items():
        print(k, "=", v)

    return study.best_params

change

In [11]:
def xgb_fit_kaggle(model, X_tr, y_tr, X_va=None, y_va=None, verbose=False):
    if X_va is not None and y_va is not None:
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=verbose)
    else:
        model.fit(X_tr, y_tr, verbose=verbose)
    return model

change

In [12]:
def predict_xgb_multiseed(train_feat, test_feat, best_params, feature_cols, n_splits_oof=20, seeds=(6, 67, 6767)):
    y = train_feat["target"].astype(int).to_numpy()
    groups = train_feat["split"].to_numpy()

    X_all = clean_features(train_feat, drop_cols=["object_id", "split", "target"], add_missing_flags=True)
    X_test_all = clean_features(test_feat, drop_cols=["object_id", "split"], add_missing_flags=True)

    X = X_all[feature_cols].copy()
    X_test = X_test_all[feature_cols].copy()

    splitter = make_splitter(n_splits_oof, random_state=6)
    split_iter = list(splitter.split(X, y, groups))

    oof = np.zeros(len(X), dtype=float)

    for fold, (tr_idx, va_idx) in enumerate(split_iter, 1):
        X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
        X_va, y_va = X.iloc[va_idx], y[va_idx]

        neg = np.sum(y_tr == 0)
        pos = np.sum(y_tr == 1)
        spw = float(neg / max(1, pos))

        probs_va = []
        for sd in seeds:
            model = XGBClassifier(
                objective="binary:logistic",
                eval_metric="logloss",
                random_state=sd,
                n_jobs=-1,
                tree_method="hist",
                device="cuda",
                scale_pos_weight=spw,
                **best_params
            )
            xgb_fit_kaggle(model, X_tr, y_tr, X_va, y_va, verbose=False)
            probs_va.append(model.predict_proba(X_va)[:, 1])

        oof[va_idx] = np.mean(probs_va, axis=0)

    best_th, best_f1 = best_threshold_f1(y, oof)
    ap = average_precision_score(y, oof)
    print("\nOOF multiseed best threshold:", best_th)
    print("OOF multiseed best F1:", best_f1)
    print("OOF AP (aucpr-ish):", ap)

    probs_test = []
    neg_full = np.sum(y == 0)
    pos_full = np.sum(y == 1)
    spw_full = float(neg_full / max(1, pos_full))

    for sd in seeds:
        model = XGBClassifier(
            objective="binary:logistic",
            eval_metric="aucpr",
            random_state=sd,
            n_jobs=-1,
            tree_method="hist",
            device="cuda",
            scale_pos_weight=spw_full,
            **best_params
        )
        xgb_fit_kaggle(model, X, y, verbose=False)
        probs_test.append(model.predict_proba(X_test)[:, 1])

    p_test = np.mean(probs_test, axis=0)
    return p_test, best_th

In [13]:
N_AUG = 2
FS_TOPK = 380
FS_FOLDS = 10
OPTUNA_FOLDS = 10
OPTUNA_TIMEOUT_SEC = 28800
FINAL_OOF_FOLDS = 20
SEEDS = (6, 67, 6767)

from pathlib import Path

ROOT = Path.cwd().parents[1]
DATA_DIR = ROOT / "data"

train_log = pd.read_csv(DATA_DIR / "train_log.csv")
test_log  = pd.read_csv(DATA_DIR / "test_log.csv")

train_log["object_id"] = train_log["object_id"].astype(str)
test_log["object_id"] = test_log["object_id"].astype(str)

if "Z_err" not in train_log.columns:
    train_log["Z_err"] = 0.0
train_log["Z_err"] = train_log["Z_err"].fillna(0.0)

if "Z_err" not in test_log.columns:
    test_log["Z_err"] = 0.0
test_log["Z_err"] = test_log["Z_err"].fillna(0.0)

train_splits = sorted(train_log["split"].unique())
test_splits = sorted(test_log["split"].unique())


train_lc_cache, train_idx_cache = build_lightcurve_cache(
    train_splits, base_dir=DATA_DIR, kind="train"
)

test_lc_cache, test_idx_cache = build_lightcurve_cache(
    test_splits, base_dir=DATA_DIR, kind="test"
)

test_zerr_pool = test_log["Z_err"].dropna().values

train_feat = build_feature_table(
    train_log, train_lc_cache, train_idx_cache,
    augment_photoz=True,
    test_zerr_pool=test_zerr_pool,
    n_aug=N_AUG,
    seed=6
)

test_feat = build_feature_table(
    test_log, test_lc_cache, test_idx_cache,
    augment_photoz=False
)
print(f"train shape: {train_feat.shape}")
print(f"test shape: {test_feat.shape}")


train shape: (9129, 559)
test shape: (7135, 558)


In [14]:
train_feat, test_feat = add_spectype_teacher_features(train_feat, train_log, test_feat, n_splits=10, seed=6)
selected_cols = feature_select_gain_topk(train_feat, k=FS_TOPK, n_splits=FS_FOLDS, seed=6)

In [None]:
best_params = run_optuna_xgb_f1(
    train_feat,
    feature_cols=selected_cols,
    n_folds_tune=OPTUNA_FOLDS,
    timeout_sec=OPTUNA_TIMEOUT_SEC,
    seed=6
)

In [20]:
best_params = {'n_estimators': 9476,
               'learning_rate': 0.0024306289953670325,
               'max_depth': 7, 'min_child_weight': 6,
               'subsample': 0.5344962939912224,
               'colsample_bytree': 0.464696420753079,
               'colsample_bylevel': 0.8146569410634974,
               'colsample_bynode': 0.7285475291884695,
               'max_bin': 181, 'gamma': 8.476938947246458,
               'reg_alpha': 0.44957196104419117,
               'reg_lambda': 5.23806334613521,
               'max_delta_step': 0,
               'grow_policy': 'depthwise'}

In [21]:
p_test, best_th = predict_xgb_multiseed(
    train_feat,
    test_feat,
    best_params,
    feature_cols=selected_cols,
    n_splits_oof=min(FINAL_OOF_FOLDS, len(train_splits)),
    seeds=(99, 999, 909)
)


OOF multiseed best threshold: 0.419
OOF multiseed best F1: 0.6243705941591138
OOF AP (aucpr-ish): 0.5164263302782434


OOF multiseed best threshold: 0.402
OOF multiseed best F1: 0.6187624750499002
OOF AP (aucpr-ish): 0.5153549152691365

In [None]:
"""train_feat = pd.read_parquet("artifacts/train_feat.parquet")
test_feat  = pd.read_parquet("artifacts/test_feat.parquet")"""

In [22]:
test_pred = (p_test > best_th).astype(int)

sub = pd.DataFrame({
    "object_id": test_feat["object_id"].values,
    "target": test_pred
})
out_name = "XGB_multiseed_teacher5.csv"
sub.to_csv(out_name, index=False)
print("Saved", out_name, " threshold:", best_th)

Saved XGB_multiseed_teacher5.csv  threshold: 0.419


In [19]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

BEST_PARAMS = {'n_estimators': 11766, 'learning_rate': 0.024381791324748325, 'max_depth': 4, 'min_child_weight': 23, 'subsample': 0.6004305768627155, 'colsample_bytree': 0.594854201978887, 'colsample_bylevel': 0.8241023733283193, 'colsample_bynode': 0.5372150805015081, 'max_bin': 406, 'gamma': 11.596780399929514, 'reg_alpha': 4.092494873246832, 'reg_lambda': 10.592102344523834, 'max_delta_step': 8, 'grow_policy': 'lossguide', 'max_leaves': 16}

THRESH = 0.402

SEEDS = (563,)

y = train_feat["target"].astype(int).to_numpy()

X_all = clean_features(train_feat, drop_cols=["object_id", "split", "target"], add_missing_flags=True)
X_test_all = clean_features(test_feat, drop_cols=["object_id", "split"], add_missing_flags=True)

X = X_all.reindex(columns=selected_cols, fill_value=0.0)
X_test = X_test_all.reindex(columns=selected_cols, fill_value=0.0)

neg = np.sum(y == 0)
pos = np.sum(y == 1)
spw_full = float(neg / max(1, pos))

probs_test = []

for sd in SEEDS:
    model = XGBClassifier(
        objective="binary:logistic",
        eval_metric="aucpr",
        random_state=sd,
        n_jobs=-1,
        tree_method="hist",
        device="cuda",
        scale_pos_weight=spw_full,
        **BEST_PARAMS
    )

    if "xgb_fit_kaggle" in globals() and callable(xgb_fit_kaggle):
        xgb_fit_kaggle(model, X, y, verbose=False)
    else:
        model.fit(X, y, verbose=False)

    probs_test.append(model.predict_proba(X_test)[:, 1])

p_test = np.mean(probs_test, axis=0)

test_pred = (p_test > THRESH).astype(int)

sub = pd.DataFrame({
    "object_id": test_feat["object_id"].values,
    "target": test_pred
})

out_name = "XGB_multiseed_teacher.csv"
sub.to_csv(out_name, index=False)

print("Saved", out_name, " threshold:", THRESH)
print("Predicted positive rate:", test_pred.mean())


Saved XGB_multiseed_teacher.csv  threshold: 0.402
Predicted positive rate: 0.05199719691660827


F1: 0.81742
ROC-AUC: 0.99906
PR-AUC: 0.98116
Accuracy: 0.97842
Threshold: 0.42
OOF positive rate: 0.06955854967685399

Saved XGB_multiseed_teacher4.csv
Test positive rate: 0.051156271899089

Saved XGB_multiseed_teacher.csv  threshold: 0.42
Predicted positive rate: 0.051296426068675544