In [1]:
import os, glob, gc, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats

import lightgbm as lgb

from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, precision_score, recall_score
# Seed NumPy's legacy global RNG so any np.random.* call in this session is repeatable.
# Note: the LightGBM models trained later take their random_state from the SEEDS list,
# not from this constant.
SEED = 42
np.random.seed(SEED)


In [2]:
# Galactic extinction coefficients, one per LSST band.
# apply_extinction_vec multiplies the band's coefficient by the object's EBV
# to build the flux correction factor 10 ** (0.4 * coeff * EBV).
EXTINCTION_COEFFS = dict(
    u=4.239,
    g=3.303,
    r=2.285,
    i=1.698,
    z=1.263,
    y=1.088,
)


In [3]:
# LSST filters in a fixed order; per-band features are generated in this order.
FILTERS = list("ugrizy")


# MALLORN - Lightcurve-specialized LGBM (v2)
- Faster feature build (groupby)
- asinh-flux + early/late slopes + cross-band peak features
- robust submission mapping
- Target: raise the leaderboard (LB) score above 0.6, building on the 0.578 baseline


In [4]:
import os, glob
import pandas as pd

def find_file_in_kaggle_input(filename: str, root: str = "/kaggle/input") -> str:
    """Locate ``filename`` anywhere below ``root`` and return the shallowest match.

    Parameters
    ----------
    filename : str
        Basename (or glob pattern) to look for.
    root : str, default "/kaggle/input"
        Directory that is searched recursively.

    Returns
    -------
    str
        The matching path with the fewest path separators; remaining ties are
        broken by shorter length and then alphabetically, so the result does
        not depend on the order in which the filesystem lists entries.

    Raises
    ------
    FileNotFoundError
        If nothing below ``root`` matches ``filename``.
    """
    hits = glob.glob(os.path.join(root, "**", filename), recursive=True)
    if not hits:
        raise FileNotFoundError(f"Không tìm thấy {filename} trong {root}. Bạn kiểm tra đã add dataset competition chưa.")
    # Prefer the file closest to the dataset root (fewest separators, then the
    # shortest path); the path itself is the final, deterministic tie-breaker.
    return min(hits, key=lambda p: (p.count(os.sep), len(p), p))

# Resolve the three competition metadata files (in this order), report where
# they were found, then load them.
train_log_path, test_log_path, sample_sub_path = (
    find_file_in_kaggle_input(name)
    for name in ("train_log.csv", "test_log.csv", "sample_submission.csv")
)

for label, found_path in (
    ("train_log:", train_log_path),
    ("test_log :", test_log_path),
    ("sample   :", sample_sub_path),
):
    print(label, found_path)

train_log, test_log, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (train_log_path, test_log_path, sample_sub_path)
)

train_log.head()


train_log: /kaggle/input/mallorn-dataset/train_log.csv
test_log : /kaggle/input/mallorn-dataset/test_log.csv
sample   : /kaggle/input/mallorn-dataset/sample_submission.csv


Unnamed: 0,object_id,Z,Z_err,EBV,SpecType,English Translation,split,target
0,Dornhoth_fervain_onodrim,3.049,,0.11,AGN,Trawn Folk (Dwarfs) + northern + Ents (people),split_01,0
1,Dornhoth_galadh_ylf,0.4324,,0.058,SN II,Trawn Folk (Dwarfs) + tree + drinking vessel,split_01,0
2,Elrim_melethril_thul,0.4673,,0.577,AGN,Elves + lover (fem.) + breath,split_01,0
3,Ithil_tobas_rodwen,0.6946,,0.012,AGN,moon + roof + noble maiden,split_01,0
4,Mirion_adar_Druadan,0.4161,,0.058,AGN,"jewel, Silmaril + father + Wild Man",split_01,0


In [5]:
def find_split_root(root: str = "/kaggle/input", min_splits: int = 10) -> str:
    """Return the first dataset directory under ``root`` that holds the split folders.

    Parameters
    ----------
    root : str, default "/kaggle/input"
        Directory whose immediate children are inspected.
    min_splits : int, default 10
        Minimum number of ``split_*`` entries a child directory must contain
        to be accepted.

    Returns
    -------
    str
        Path of the first qualifying child. Children are visited in sorted
        order so the choice is reproducible across filesystems.

    Raises
    ------
    FileNotFoundError
        If no child directory contains at least ``min_splits`` ``split_*`` entries.
    """
    for candidate in sorted(glob.glob(os.path.join(root, "*"))):
        if not os.path.isdir(candidate):
            continue
        n_found = len(glob.glob(os.path.join(candidate, "split_*")))
        if n_found >= min_splits:
            return candidate
    raise FileNotFoundError(f"Không tìm thấy thư mục chứa split_* trong {root}. Kiểm tra dataset đã add đúng competition.")

# Detect the dataset directory and show the first few split folders as a sanity check.
SPLIT_ROOT = find_split_root()
print("SPLIT_ROOT:", SPLIT_ROOT)
example_splits = sorted(glob.glob(os.path.join(SPLIT_ROOT, "split_*")))
print("Example splits:", example_splits[:3])


SPLIT_ROOT: /kaggle/input/mallorn-dataset
Example splits: ['/kaggle/input/mallorn-dataset/split_01', '/kaggle/input/mallorn-dataset/split_02', '/kaggle/input/mallorn-dataset/split_03']


In [6]:
# Load logs (re-read from disk so this cell can be re-run safely even though
# it adds a column to the frames below)
train_log, test_log, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (train_log_path, test_log_path, sample_sub_path)
)


# Add the numeric part of the split name as split_id (used later as the CV group)
for log_frame in (train_log, test_log):
    log_frame['split_id'] = log_frame['split'].str.extract(r'(\d+)').astype(int)

print(f"Train objects: {len(train_log)} | Pos(TDE): {train_log['target'].sum()} ({train_log['target'].mean()*100:.2f}%)")
print(f"Test objects:  {len(test_log)}")

Train objects: 3043 | Pos(TDE): 148 (4.86%)
Test objects:  7135


In [7]:
import glob, os

# NOTE(review): this redefines find_split_root from an earlier cell of this
# notebook; the later definition silently shadows the earlier one, so keep the
# two in sync or delete one of them.
def find_split_root(root: str = "/kaggle/input", min_splits: int = 10) -> str:
    """Return the first dataset directory under ``root`` that holds the split folders.

    Parameters
    ----------
    root : str, default "/kaggle/input"
        Directory whose immediate children are inspected.
    min_splits : int, default 10
        Minimum number of ``split_*`` entries a child directory must contain
        to be accepted.

    Returns
    -------
    str
        Path of the first qualifying child. Children are visited in sorted
        order so the choice is reproducible across filesystems.

    Raises
    ------
    FileNotFoundError
        If no child directory contains at least ``min_splits`` ``split_*`` entries.
    """
    for candidate in sorted(glob.glob(os.path.join(root, "*"))):
        if not os.path.isdir(candidate):
            continue
        n_found = len(glob.glob(os.path.join(candidate, "split_*")))
        if n_found >= min_splits:
            return candidate
    raise FileNotFoundError(f"Không tìm thấy split_* trong {root}. Kiểm tra đã Add competition data chưa.")

SPLIT_ROOT = find_split_root()
print("SPLIT_ROOT =", SPLIT_ROOT)

# The lightcurve loader below reads from DATA_PATH; point it at the detected SPLIT_ROOT.
DATA_PATH = SPLIT_ROOT

# Load all lightcurves: each split folder may hold a train file, a test file, or both.
train_lc_list, test_lc_list = [], []
lightcurve_files = (
    ("train_full_lightcurves.csv", train_lc_list),
    ("test_full_lightcurves.csv", test_lc_list),
)
for i in tqdm(range(1, 21), desc="Loading splits"):
    split_folder = f"split_{i:02d}"
    for csv_name, bucket in lightcurve_files:
        csv_path = os.path.join(DATA_PATH, split_folder, csv_name)
        if os.path.exists(csv_path):
            bucket.append(pd.read_csv(csv_path))

# Stack the per-split frames and drop rows that have no flux measurement.
train_lc = pd.concat(train_lc_list, ignore_index=True).dropna(subset=['Flux'])
test_lc  = pd.concat(test_lc_list,  ignore_index=True).dropna(subset=['Flux'])

# The per-split frames are no longer needed once concatenated.
del train_lc_list, test_lc_list
gc.collect()

print(f"Train LC points: {len(train_lc):,} | objects: {train_lc['object_id'].nunique()}")
print(f"Test  LC points: {len(test_lc):,}  | objects: {test_lc['object_id'].nunique()}")

SPLIT_ROOT = /kaggle/input/mallorn-dataset


Loading splits: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  8.96it/s]


Train LC points: 478,493 | objects: 3043
Test  LC points: 1,143,103  | objects: 7135


In [8]:
def apply_extinction_vec(flux: np.ndarray, ebv: float, filt_arr: np.ndarray) -> np.ndarray:
    """Scale fluxes by the per-band extinction factor.

    Computes ``flux * 10 ** (0.4 * R_band * ebv)`` element-wise, where
    ``R_band`` is looked up in ``EXTINCTION_COEFFS`` for each entry of
    ``filt_arr``.

    Parameters
    ----------
    flux : np.ndarray
        Flux values, one per observation.
    ebv : float
        E(B-V) value applied to every observation.
    filt_arr : np.ndarray
        Filter name of each observation, same length as ``flux``.

    Returns
    -------
    np.ndarray
        Corrected fluxes. ``flux`` itself is returned (no copy) when ``ebv``
        is not strictly positive, which includes NaN.
    """
    # `not ebv > 0` (rather than `ebv <= 0`) also catches NaN, which would
    # otherwise turn every corrected flux into NaN.
    if not ebv > 0:
        return flux
    # Explicit lookup with a 0.0 default: unknown filters get a factor of 1.
    # np.vectorize(dict.get) infers a float output type from the first element
    # and then fails on a later None, and it raises on empty input.
    coeff = np.array([EXTINCTION_COEFFS.get(b, 0.0) for b in filt_arr], dtype=float)
    return flux * np.power(10.0, 0.4 * coeff * ebv)

def stat_feats(x: np.ndarray, prefix: str) -> dict:
    """Summary statistics of ``x`` as a dict whose keys all start with ``prefix``.

    Returns an empty dict when ``x`` is None or has fewer than 3 values.
    Otherwise the keys are, in insertion order: mean, std, median, min, max,
    range, p5/p10/p25/p75/p90/p95, iqr, skew, kurt, mad, robust_std, amplitude.
    Skew and kurtosis are reported as 0.0 when there are 4 or fewer values.
    """
    if x is None or len(x) < 3:
        return {}

    vals = x.astype(float)

    def key(name):
        return prefix + name

    out = {
        key('mean'): np.mean(vals),
        key('std'): np.std(vals),
        key('median'): np.median(vals),
        key('min'): np.min(vals),
        key('max'): np.max(vals),
    }
    out[key('range')] = out[key('max')] - out[key('min')]

    out.update({f"{prefix}p{p}": np.percentile(vals, p) for p in (5, 10, 25, 75, 90, 95)})
    out[key('iqr')] = out[key('p75')] - out[key('p25')]

    # Higher moments are only computed when there are more than 4 samples.
    enough_for_moments = len(vals) > 4
    out[key('skew')] = stats.skew(vals) if enough_for_moments else 0.0
    out[key('kurt')] = stats.kurtosis(vals) if enough_for_moments else 0.0

    # Median absolute deviation and the derived robust standard deviation.
    abs_dev = np.abs(vals - out[key('median')])
    out[key('mad')] = np.median(abs_dev)
    out[key('robust_std')] = 1.4826 * out[key('mad')]

    # Amplitude: mean of the top 5% minus mean of the bottom 5% (at least one sample each).
    ordered = np.sort(vals)
    n_tail = max(1, int(0.05 * len(ordered)))
    out[key('amplitude')] = np.mean(ordered[-n_tail:]) - np.mean(ordered[:n_tail])
    return out

def _window_slope_mean(t: np.ndarray, x: np.ndarray, window: np.ndarray, fallback: np.ndarray):
    """Return ``(slope, mean)`` of ``x`` inside the boolean mask ``window``.

    With at least 3 points in the window the slope comes from a linear
    regression of ``x`` on ``t`` (0.0 if the regression fails) and the mean is
    taken over the window. With fewer points the slope is 0.0 and the mean is
    taken over ``fallback`` instead.
    """
    if window.sum() < 3:
        return 0.0, np.mean(fallback)
    mean = np.mean(x[window])
    try:
        slope = stats.linregress(t[window], x[window]).slope
    except Exception:
        # Best effort: a failed fit yields a flat slope. `Exception` (not a
        # bare except) so KeyboardInterrupt/SystemExit still stop a long run.
        slope = 0.0
    return slope, mean


def temporal_feats(t: np.ndarray, x: np.ndarray, prefix: str) -> dict:
    """Time-domain features of a light curve, keyed with ``prefix``.

    Parameters
    ----------
    t : np.ndarray
        Observation times; they are sorted internally.
    x : np.ndarray
        Values observed at ``t`` (same length).
    prefix : str
        String prepended to every feature name.

    Returns
    -------
    dict
        Empty when fewer than 5 observations are given. Otherwise contains
        duration and cadence statistics, peak timing, rise/decay rates,
        early/late window slopes and means, and gradient statistics.
    """
    f = {}
    if len(t) < 5:
        return f

    order = np.argsort(t)
    t = t[order]
    x = x[order]
    dt = np.diff(t)

    # Time span and cadence (gaps between consecutive observations).
    f[prefix+'duration'] = t[-1] - t[0]
    f[prefix+'cad_mean'] = np.mean(dt)
    f[prefix+'cad_std']  = np.std(dt)
    f[prefix+'cad_min']  = np.min(dt)
    f[prefix+'cad_max']  = np.max(dt)

    # Peak: index of the maximum value and where it falls within the span.
    pk = int(np.argmax(x))
    f[prefix+'t_peak'] = t[pk]
    f[prefix+'time_to_peak'] = t[pk] - t[0]
    f[prefix+'peak_pos'] = f[prefix+'time_to_peak'] / (f[prefix+'duration'] + 1e-9)

    # Rise (first point -> peak) and decay (peak -> last point); zero when the
    # peak sits on the corresponding edge. The 1e-9 terms avoid division by zero.
    if pk > 0:
        f[prefix+'rise_rate'] = (x[pk] - x[0]) / (t[pk] - t[0] + 1e-9)
        f[prefix+'rise_flux'] = (x[pk] - x[0])
    else:
        f[prefix+'rise_rate'] = 0.0
        f[prefix+'rise_flux'] = 0.0
    if pk < len(x) - 1:
        f[prefix+'decay_rate'] = (x[pk] - x[-1]) / (t[-1] - t[pk] + 1e-9)
        f[prefix+'decay_flux'] = (x[pk] - x[-1])
    else:
        f[prefix+'decay_rate'] = 0.0
        f[prefix+'decay_flux'] = 0.0
    f[prefix+'rise_decay_ratio'] = f[prefix+'rise_rate'] / (f[prefix+'decay_rate'] + 1e-9)

    # Early/late slopes over the first and last 20% of the normalized time span;
    # the fallbacks use the first/last fifth of the samples (at least one).
    dur = f[prefix+'duration'] + 1e-9
    tn = (t - t[0]) / dur
    n_edge = max(1, len(x) // 5)
    f[prefix+'slope_early'], f[prefix+'mean_early'] = _window_slope_mean(t, x, tn <= 0.2, x[:n_edge])
    f[prefix+'slope_late'], f[prefix+'mean_late'] = _window_slope_mean(t, x, tn >= 0.8, x[-n_edge:])
    f[prefix+'early_late_diff'] = f[prefix+'mean_late'] - f[prefix+'mean_early']

    # Point-to-point gradients.
    g = np.diff(x) / (dt + 1e-9)
    f[prefix+'grad_mean'] = np.mean(g)
    f[prefix+'grad_std']  = np.std(g)
    f[prefix+'grad_max']  = np.max(g)
    f[prefix+'grad_min']  = np.min(g)
    return f

def build_features(lc_df: pd.DataFrame, log_df: pd.DataFrame, is_train: bool) -> pd.DataFrame:
    """Build one feature row per object from its light curve and log metadata.

    Parameters
    ----------
    lc_df : pd.DataFrame
        Light-curve points; the columns read here are ``object_id``, ``Flux``,
        ``Flux_err``, ``Filter`` and ``Time (MJD)``.
    log_df : pd.DataFrame
        Per-object metadata with ``object_id``, ``Z``, ``Z_err``, ``EBV``,
        ``split_id`` and, when ``is_train`` is true, ``target``.
    is_train : bool
        Whether to carry ``target`` over into the output.

    Returns
    -------
    pd.DataFrame
        One row per object that appears in both inputs. Objects whose
        ``object_id`` is missing from ``log_df`` are skipped.
    """
    # Metadata lookup table indexed by object_id.
    wanted_cols = ['Z', 'Z_err', 'EBV', 'split_id']
    if is_train:
        wanted_cols = wanted_cols + ['target']
    meta = log_df.set_index('object_id')[wanted_cols]

    grouped = lc_df.groupby('object_id', sort=False)
    n_objects = lc_df['object_id'].nunique()

    rows = []
    for oid, obj in tqdm(grouped, total=n_objects, desc="Feature build"):
        if oid not in meta.index:
            continue
        m = meta.loc[oid]

        # Missing redshift / EBV are treated as 0.
        z = 0.0 if pd.isna(m['Z']) else float(m['Z'])
        ebv = 0.0 if pd.isna(m['EBV']) else float(m['EBV'])
        zid = int(m['split_id'])
        z_err = float(m['Z_err']) if 'Z_err' in m else np.nan

        f = {
            'object_id': oid,
            'Z': z,
            'Z_err': z_err,
            'EBV': ebv,
            'split_id': zid,
            'logZ': np.log10(z+0.01),
            'Z_EBV': z*ebv,
        }

        # Raw per-observation arrays.
        flux = obj['Flux'].values.astype(float)
        ferr = obj['Flux_err'].values.astype(float)  # read but not used below
        band = obj['Filter'].values.astype(str)
        time = obj['Time (MJD)'].values.astype(float)

        # Extinction-corrected flux and its asinh transform (asinh handles negatives).
        fluxc = apply_extinction_vec(flux, ebv, band)
        flux_asinh = np.arcsinh(fluxc)

        # Features over all bands together.
        f.update(stat_feats(fluxc, 'all_'))
        f.update(stat_feats(flux_asinh, 'all_asinh_'))
        f.update(temporal_feats(time, fluxc, 'all_'))

        # Per-band features; bands with fewer than 5 observations are skipped.
        per_band = {}
        for b in FILTERS:
            in_band = (band == b)
            if in_band.sum() < 5:
                continue
            tb, xb = time[in_band], fluxc[in_band]
            f.update(stat_feats(xb, f'{b}_'))
            f.update(stat_feats(np.arcsinh(xb), f'{b}_asinh_'))
            f.update(temporal_feats(tb, xb, f'{b}_'))
            per_band[b] = dict(
                mean=np.mean(xb),
                max=np.max(xb),
                t_peak=tb[np.argmax(xb)],
            )

        # Cross-band peak-ratio, peak-time and mean-difference features.
        if 'g' in per_band and 'r' in per_band:
            g_sum, r_sum = per_band['g'], per_band['r']
            f['gr_peak_ratio'] = g_sum['max'] / (r_sum['max'] + 1e-9)
            f['gr_tpeak_diff'] = g_sum['t_peak'] - r_sum['t_peak']
            f['gr_mean_diff']  = g_sum['mean'] - r_sum['mean']
        if 'u' in per_band and 'g' in per_band:
            f['ug_mean_diff'] = per_band['u']['mean'] - per_band['g']['mean']

        # Observation counts.
        f['n_filters'] = len(set(band.tolist()))
        f['total_obs'] = len(obj)

        if is_train:
            f['target'] = int(m['target'])
        rows.append(f)

    return pd.DataFrame(rows)

In [9]:
# Build the train and test feature tables (one row per object).
train_features = build_features(lc_df=train_lc, log_df=train_log, is_train=True)
test_features = build_features(lc_df=test_lc, log_df=test_log, is_train=False)

print("train_features:", train_features.shape, " test_features:", test_features.shape)

Feature build: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3043/3043 [01:42<00:00, 29.68it/s]
Feature build: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7135/7135 [04:02<00:00, 29.45it/s]


train_features: (3043, 420)  test_features: (7135, 419)


In [10]:
# Prepare X/y
train_features = train_features.drop_duplicates('object_id')
test_features  = test_features.drop_duplicates('object_id')

y = train_features['target'].astype(int).values

# IMPORTANT: Do NOT use split_id as a model feature.
# Use it only as a GROUP for cross-validation to avoid leakage.
groups = train_features['split_id'].astype(int).values

drop_cols = ['object_id','target','split_id']
X = train_features.drop(columns=drop_cols)

# Align the test matrix to the training columns BY NAME. Both tables are built
# from lists of per-object dicts, so their column order follows whichever
# object first produced each feature and can differ between train and test.
# LightGBM's default prediction path does not validate DataFrame feature names,
# so a different order would feed features into the wrong slots without any error.
# reindex() also adds any feature missing from test (as NaN, filled below) and
# drops any feature the model was never trained on.
X_test = test_features.drop(columns=['object_id','split_id']).reindex(columns=X.columns)

# clean inf/nan: both become the sentinel -999
X = X.replace([np.inf,-np.inf], np.nan).fillna(-999)
X_test = X_test.replace([np.inf,-np.inf], np.nan).fillna(-999)

# Class-imbalance weight for the positive class: negatives per positive.
pos = int(y.sum()); neg = int(len(y)-pos)
spw = neg / max(pos,1)
print("pos:",pos,"neg:",neg,"scale_pos_weight:",spw)


pos: 148 neg: 2895 scale_pos_weight: 19.56081081081081


In [11]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, precision_score, recall_score
import lightgbm as lgb
import numpy as np

# =========================
# CONFIG
# =========================
N_SPLITS = 5
SEEDS = [42, 7, 2025]   # 3-seed bagging (stable, and still runs in under 30 minutes)

# LightGBM settings shared by every fold/seed; random_state is added per model.
lgb_params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "learning_rate": 0.03,
    "n_estimators": 12000,
    "num_leaves": 127,
    "max_depth": -1,
    "min_child_samples": 20,
    "feature_fraction": 0.80,
    "bagging_fraction": 0.80,
    "bagging_freq": 1,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    "scale_pos_weight": spw,
    "n_jobs": -1,
    "verbose": -1,
}

# =========================
# GROUP K-FOLD
# =========================
# Objects from the same split never appear on both sides of a fold.
gkf = GroupKFold(n_splits=N_SPLITS)
folds = [(tr_idx, va_idx) for tr_idx, va_idx in gkf.split(X, y, groups=groups)]
fold_sizes = [len(va_idx) for _, va_idx in folds]
print("Fold sizes:", fold_sizes)

# =========================
# TRAIN (OOF + TEST)
# =========================
# Out-of-fold probabilities are accumulated per row (sum + count) across seeds;
# test probabilities are averaged over every (seed, fold) model.
n_train_rows = len(X)
oof_sum = np.zeros(n_train_rows, dtype=float)
oof_cnt = np.zeros(n_train_rows, dtype=int)
test_prob = np.zeros(len(X_test), dtype=float)
n_models = len(SEEDS) * N_SPLITS

for seed in SEEDS:
    print(f"\n=== Seed {seed} ===")
    for fold, (tr_idx, va_idx) in enumerate(folds):
        X_va, y_va = X.iloc[va_idx], y[va_idx]

        # Same hyper-parameters for every model, distinct random_state per (seed, fold).
        model = lgb.LGBMClassifier(**{**lgb_params, "random_state": seed + fold})
        model.fit(
            X.iloc[tr_idx], y[tr_idx],
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(400, verbose=False)],
        )

        # OOF
        oof_sum[va_idx] += model.predict_proba(X_va)[:, 1]
        oof_cnt[va_idx] += 1

        # TEST
        test_prob += model.predict_proba(X_test)[:, 1] / n_models

# =========================
# FINAL OOF
# =========================
# Mean out-of-fold probability per training row (count clamped to avoid 0/0).
oof_avg = oof_sum / np.maximum(oof_cnt, 1)

# =========================
# THRESHOLD - STEP 1: OOF F1
# =========================
# Scan 200 thresholds in [0.05, 0.95] and keep the one with the best OOF F1.
ths = np.linspace(0.05, 0.95, 200)
f1s = np.fromiter(
    (f1_score(y, (oof_avg >= t).astype(int)) for t in ths),
    dtype=float,
    count=len(ths),
)

best_idx = int(np.argmax(f1s))
best_t = float(ths[best_idx])
best_f1 = float(f1s[best_idx])

oof_pred_best = (oof_avg >= best_t).astype(int)
print("OOF best F1:", best_f1)
print("best_t:", best_t)
print("OOF precision:", precision_score(y, oof_pred_best, zero_division=0))
print("OOF recall:", recall_score(y, oof_pred_best, zero_division=0))


# =========================
# THRESHOLD - STEP 2: LB-ORIENTED POS-RATE
# =========================
# Aim for a positive rate 20% above the training rate, capped at 25%
# (the author's preferred setting for MALLORN).
train_pos_rate = float(y.mean())
target_rate = min(0.25, train_pos_rate * 1.20)

# OOF probability above which `target_rate` of the training rows fall.
cand_t = float(np.quantile(oof_avg, 1.0 - target_rate))

# Blend the rate-based and F1-based thresholds (more stable than either alone).
t_final = 0.7 * cand_t + 0.3 * best_t

print("\ntrain_pos_rate:", train_pos_rate)
print("target_rate:", target_rate)
print("cand_t:", cand_t)
print("FINAL threshold:", t_final)

expected_pos_rate = float((test_prob >= t_final).mean())
print("Expected submission pos-rate:",
      expected_pos_rate)


# =========================
# MAKE SUBMISSION
# =========================
pred_bin = (test_prob >= t_final).astype(int)

# Join predictions to the sample submission by object_id; ids without a
# prediction default to 0.
pred_dict = dict(zip(test_features["object_id"], pred_bin))
sub_final = sample_sub.assign(
    prediction=lambda frame: frame["object_id"].map(pred_dict).fillna(0).astype(int)
)

print("Final submission pos-rate:", sub_final["prediction"].mean())

sub_final.to_csv("submission_lgbm_lightcurve_v3_plus.csv", index=False)
print("Saved submission_lgbm_lightcurve_v3_plus.csv")


Fold sizes: [613, 600, 611, 608, 611]

=== Seed 42 ===

=== Seed 7 ===

=== Seed 2025 ===
OOF best F1: 0.4336569579288026
best_t: 0.08165829145728642
OOF precision: 0.4161490683229814
OOF recall: 0.4527027027027027

train_pos_rate: 0.04863621426224121
target_rate: 0.05836345711468945
cand_t: 0.0707101690343783
FINAL threshold: 0.07399460576125073
Expected submission pos-rate: 0.06601261387526279
Final submission pos-rate: 0.06601261387526279
Saved submission_lgbm_lightcurve_v3_plus.csv


In [12]:
# =========================
# MICRO RATE SWEEP (fine-tuning the positive rate for score)
# =========================
# For each candidate positive rate, report the probability threshold that
# flags roughly that fraction of the test set.
N_TEST = len(test_prob)

for RATE in [0.045, 0.047, 0.048, 0.05, 0.052, 0.055, 0.058]:
    # Clamp k to [1, N_TEST]: int() truncation can yield k == 0, and then
    # np.partition(a, -0)[-0] is the MINIMUM, i.e. every object would be flagged.
    k = min(max(int(RATE * N_TEST), 1), N_TEST)
    thr = np.partition(test_prob, -k)[-k]   # k-th highest probability
    pr = (test_prob >= thr).mean()
    print(f"RATE={RATE:.3f} -> pos_rate={pr:.4f}, thr={thr:.4f}")


RATE=0.045 -> pos_rate=0.0450, thr=0.1796
RATE=0.047 -> pos_rate=0.0470, thr=0.1653
RATE=0.048 -> pos_rate=0.0479, thr=0.1604
RATE=0.050 -> pos_rate=0.0499, thr=0.1494
RATE=0.052 -> pos_rate=0.0520, thr=0.1356
RATE=0.055 -> pos_rate=0.0549, thr=0.1210
RATE=0.058 -> pos_rate=0.0579, thr=0.1043


In [13]:
# =========================
# RANKING-BASED SUBMISSION (fine-tuning the positive rate for score)
# =========================

def _rank_threshold(probs: np.ndarray, rate: float) -> float:
    """Return the k-th highest value of ``probs`` with k = int(rate * len(probs)).

    k is clamped to [1, len(probs)]: with k == 0, ``np.partition(a, -0)[-0]``
    would return the minimum and every object would be flagged positive.
    """
    k = min(max(int(rate * len(probs)), 1), len(probs))
    return np.partition(probs, -k)[-k]


# Number of predicted TDEs - per the author, tuning this is what moves the score;
# the author's suggested sweet spot for MALLORN is 4%-7% of the test set.
N_TEST = len(test_prob)

for rate in [0.04, 0.05, 0.06, 0.07]:
    thr = _rank_threshold(test_prob, rate)
    pr = (test_prob >= thr).mean()
    print(f"rate={rate:.2f} -> threshold={thr:.4f}, pos_rate={pr:.4f}")

# rate = 0.05 is the author's first recommendation
RATE = 0.05
thr_rank = _rank_threshold(test_prob, RATE)

print("\nFINAL ranking threshold:", thr_rank)

pred = (test_prob >= thr_rank).astype(int)

# Join predictions to the sample submission by object_id; ids without a
# prediction default to 0.
pred_dict = dict(zip(test_features["object_id"], pred))
submission = sample_sub.copy()
submission["prediction"] = submission["object_id"].map(pred_dict).fillna(0).astype(int)

print("Submission rows:", len(submission),
      "pos rate:", float(submission["prediction"].mean()))

submission.to_csv("submission_lgbm_lightcurve_v5_rank.csv", index=False)
print("Saved: submission_lgbm_lightcurve_v5_rank.csv")


rate=0.04 -> threshold=0.2362, pos_rate=0.0399
rate=0.05 -> threshold=0.1494, pos_rate=0.0499
rate=0.06 -> threshold=0.0925, pos_rate=0.0600
rate=0.07 -> threshold=0.0659, pos_rate=0.0699

FINAL ranking threshold: 0.14944622773289634
Submission rows: 7135 pos rate: 0.04989488437281009
Saved: submission_lgbm_lightcurve_v5_rank.csv
