In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so later cells can read and write
# files under /content/drive/MyDrive (uses the google.colab `drive` helper).
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Location of the 'RADIAL' folder inside the mounted Drive.
main_path = '/content/drive/MyDrive/nasa_exoplanet/Radial-Velocity/RADIAL'

# Report whether the folder is reachable; a missing folder only prints a
# message, nothing is raised.
if os.path.exists(main_path):
    print(f"Main path set to: {main_path}")
else:
    print(f"Error: The folder '{main_path}' does not exist. Please check the folder name and location in your Google Drive.")

# 'main_path' can now be used to access files and folders within the RADIAL folder.

Main path set to: /content/drive/MyDrive/nasa_exoplanet/RADIAL


In [None]:
# === Exoplanet RV: download + prep + balanced-train (RADIAL + HARPS) ===
# - Downloads into /content/drive/MyDrive/nasa_exoplanet/Radial-Velocity/ (RADIAL_raw/, HARPS_raw/)
# - Reads RADIAL .tbl with IPAC parser (fixes the tiny-positives issue)
# - Builds tidy series: time, rv, rv_err, star_id, source, label
# - Extracts per-star features (periodogram peaks + stats)
# - Stratified 90/10 split by star
# - Balanced training (undersample negatives to match positives) + LightGBM
# - Saves artifacts to processed/

import os, re, io, gzip, time, warnings, sys, subprocess
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

def pip_install(pkgs):
    """Quietly pip-install `pkgs` (a list of package specifiers).

    Runs `<current interpreter> -m pip install -q ...` and raises
    subprocess.CalledProcessError when pip exits with a non-zero status.
    """
    base_cmd = [sys.executable, "-m", "pip", "install", "-q"]
    subprocess.check_call(base_cmd + pkgs)

# deps: import each third-party dependency, installing it on demand.
# Only ImportError triggers an install: a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and then start a pip install.
try:
    import requests, pandas as pd, numpy as np
except ImportError:
    pip_install(["requests","pandas","numpy"])
    import requests, pandas as pd, numpy as np

try:
    from astropy.io import ascii as astro_ascii
    from astropy.timeseries import LombScargle
except ImportError:
    pip_install(["astropy"])
    from astropy.io import ascii as astro_ascii
    from astropy.timeseries import LombScargle

try:
    import lightgbm as lgb
except ImportError:
    pip_install(["lightgbm"])
    import lightgbm as lgb

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, precision_recall_curve

warnings.filterwarnings("ignore")

# ====================== CONFIG ======================
# All inputs and outputs of this cell live under BASE_DIR on the mounted Drive.
BASE_DIR   = Path("/content/drive/MyDrive/nasa_exoplanet/Radial-Velocity")
RADIAL_DIR = BASE_DIR / "RADIAL_raw"   # downloaded RADIAL files
HARPS_DIR  = BASE_DIR / "HARPS_raw"    # downloaded HARPS rvbank file
PROC_DIR   = BASE_DIR / "processed"    # parquet / CSV / model artifacts
# Side effect when the cell runs: create the three folders if they are missing.
for d in (PROC_DIR, RADIAL_DIR, HARPS_DIR): d.mkdir(parents=True, exist_ok=True)

# NASA Exoplanet Archive endpoints
WGET_RADIAL_URL = "https://exoplanetarchive.ipac.caltech.edu/bulk_data_download/wget_RADIAL.bat"
# TAP query returning the distinct `hostname` values of table `pscomppars` as CSV.
CONFIRMED_HOSTS_CSV = (
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?"
    "query=select+distinct+hostname+from+pscomppars&format=csv"
)

# HARPS rvbank mirrors (Trifonov+2020 SERVAL)
# fetch_harps() tries these in order and keeps the first successful download.
HARPS_URLS = [
    "https://cdsarc.cds.unistra.fr/ftp/J/A+A/636/A74/rvbank.dat.gz",
    "https://cdsarc.u-strasbg.fr/ftp/J/A+A/636/A74/rvbank.dat.gz",
    "https://cdsarc.cds.unistra.fr/ftp/J/A+A/636/A74/rvbank.dat",
]

# knobs
MAX_WORKERS     = 12     # thread-pool size for downloads and file parsing
TIMEOUT         = 60     # passed as `timeout=` to the requests calls in this cell
RANDOM_STATE    = 42     # seed for the split, the undersampling and LightGBM
TEST_FRAC       = 0.10   # ~10% by star
MIN_OBS         = 3      # keep short series to retain positives
BALANCE_RATIO   = 1.0    # undersample neg:pos ≈ 1.0 => 1:1
# =====================================================

def make_session():
    """Return a new requests.Session that sends the pipeline's User-Agent."""
    session = requests.Session()
    custom_headers = {"User-Agent":"RV-Pipeline/3.0"}
    session.headers.update(custom_headers)
    return session

def stream_download(s, url, dest: Path):
    """Download `url` to `dest` through session `s`, streaming 64 KiB chunks.

    The payload is written to a sibling "<dest>.part" file and renamed onto
    `dest` only after the transfer finished, so `dest` never holds a partial
    download. If the request or the write fails, the partial file is removed
    and the original exception is re-raised.
    """
    tmp = dest.with_suffix(dest.suffix+".part")
    try:
        with s.get(url, stream=True, timeout=TIMEOUT) as r:
            r.raise_for_status()
            with open(tmp,"wb") as f:
                for chunk in r.iter_content(1024*64):
                    if chunk: f.write(chunk)
        tmp.replace(dest)
    except BaseException:
        # Do not leave stale "*.part" files behind after a failed transfer.
        try:
            tmp.unlink()
        except OSError:
            pass  # nothing was written, or the file is already gone
        raise

# ---------------- 1) RADIAL (download + parse IPAC) ----------------
def fetch_radial(s):
    """Download the RADIAL data files listed in the archive's wget script.

    Fetches WGET_RADIAL_URL through session `s`, saves a copy of the script in
    RADIAL_DIR, extracts every .tbl/.csv/.fits(.gz) URL (with its `-O` output
    name when present) and downloads the files not already on disk with a
    thread pool. Individual download failures do not abort the run; they are
    counted and reported. An HTTP error status on the script itself raises
    (via `raise_for_status`) instead of being parsed as if it were the script.
    """
    print("Fetching RADIAL wget script …")
    resp = s.get(WGET_RADIAL_URL, timeout=TIMEOUT)
    resp.raise_for_status()  # never save/parse an HTTP error page as the script
    bat = resp.text
    (RADIAL_DIR/"wget_RADIAL.bat").write_text(bat, encoding="utf-8")

    # url -> output name, deduplicated; a later named occurrence of the same
    # URL fills in an earlier unnamed one. Dict order keeps first-seen order.
    seen = {}
    for line in bat.splitlines():
        line = line.strip()
        if not line or line.lower().startswith(("rem","::")): continue
        m = re.search(r'(https?://[^\s"\']+)', line)
        if not m: continue
        url = m.group(1)
        if not re.search(r'\.(tbl|csv|fits)(\.gz)?$', url, re.I):  # keep only data
            continue
        m2 = re.search(r'-O\s+("?)([^"\s]+)\1', line)
        out_name = m2.group(2).strip() if m2 else None
        if url not in seen or (seen[url] is None and out_name): seen[url]=out_name

    tasks={}    # future -> url, so failures can be attributed
    failed=[]
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        for url, out_name in seen.items():
            raw_name = out_name or url.rstrip("/").split("/")[-1].split("?")[0]
            # The name comes from a remote script: keep only its last path
            # component so it cannot point outside RADIAL_DIR.
            fname = Path(raw_name).name
            if fname in ("", ".", ".."): continue
            dest  = RADIAL_DIR/fname
            if dest.exists(): continue
            tasks[ex.submit(stream_download, s, url, dest)] = url
        for i, fut in enumerate(as_completed(tasks),1):
            try: fut.result()
            except Exception as exc: failed.append((tasks[fut], exc))
            if i%25==0: print(f"  … {i}/{len(tasks)} RADIAL files")
    if failed:
        first_url, first_exc = failed[0]
        print(f"WARN: {len(failed)} RADIAL downloads failed (first: {first_url}: {first_exc})")
    print(f"RADIAL: have {len(list(RADIAL_DIR.glob('*.tbl')))} .tbl files.")

# robust IPAC column mapping
def _map_ipac_cols(df):
    cl = {c.lower().strip(): c for c in df.columns}
    def pick(cands):
        for k in cands:
            if k in cl: return cl[k]
        # fallback: contains
        for k in cl:
            if any(sub in k for sub in cands): return cl[k]
        return None
    time_col = pick(['bjd_tdb','bjd','time','jd','jd_utc','mjd'])
    rv_col   = pick(['rv','radial_velocity','vrad','mnvel','vel','velocity','v_r'])
    err_col  = pick(['rv_err','sigma_rv','e_rv','erv','rv_error','sig_rv','stdev','unc_rv'])
    return time_col, rv_col, err_col

def read_radial_file(fp: Path):
    """Read one RADIAL table into a tidy per-observation DataFrame.

    The file is parsed with Astropy's IPAC reader; if that raises, a
    whitespace-delimited pandas parse is attempted. Returns None when the file
    cannot be parsed, is empty, has no recognisable time/RV columns, or has no
    row with both a numeric time and a numeric RV.

    The result has columns time, rv, rv_err (NaN when no uncertainty column
    was found), star_id, source ("RADIAL") and label. `star_id` is the part of
    the file stem before "_RVC_" (or the whole stem), so several files can
    share one star_id. Every RADIAL row is labelled 1.
    """
    # IPAC reader first; fallback to whitespace
    try:
        tab = astro_ascii.read(str(fp), format='ipac', guess=True, fast_reader=False)
        df = tab.to_pandas()
    except Exception:
        try:
            # sep=r"\s+" replaces the deprecated `delim_whitespace=True`; on
            # pandas versions without that keyword the old call would raise and
            # the blanket except below would silently drop the file.
            df = pd.read_csv(fp, sep=r"\s+", comment="#")
        except Exception:
            return None
    if df.empty: return None
    tcol, rcol, ecol = _map_ipac_cols(df)
    if tcol is None or rcol is None:
        return None
    out = pd.DataFrame({
        "time": pd.to_numeric(df[tcol], errors="coerce"),
        "rv":   pd.to_numeric(df[rcol], errors="coerce"),
    })
    if ecol and ecol in df.columns:
        out["rv_err"] = pd.to_numeric(df[ecol], errors="coerce")
    else:
        out["rv_err"] = np.nan
    out = out.dropna(subset=["time","rv"])
    if out.empty: return None
    # star_id from filename family (UID_xxx_RVC_###)
    m = re.match(r'^(.*)_RVC_', fp.stem)
    star_id = m.group(1) if m else fp.stem
    out["star_id"] = star_id
    out["source"]  = "RADIAL"
    out["label"]   = 1
    return out

def load_radial_ipac():
    """Parse every *.tbl file in RADIAL_DIR and concatenate the results.

    Files are read in parallel but combined in sorted-filename order, so the
    row order (and therefore the star order seen by the seeded split further
    down) is the same on every run. Collecting results in thread-completion
    order made it vary between runs.

    Returns a DataFrame with columns time, rv, rv_err, star_id, source, label;
    it is empty when there are no files or none could be parsed.
    """
    cols = ["time","rv","rv_err","star_id","source","label"]
    fps = sorted(RADIAL_DIR.glob("*.tbl"))
    if not fps: return pd.DataFrame(columns=cols)
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        # Executor.map yields results in submission order, not completion order.
        frames = [df for df in ex.map(read_radial_file, fps) if df is not None]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=cols)

# ---------------- 2) HARPS (download + parse) ----------------
def fetch_harps(s):
    """Make sure the HARPS rvbank file is on disk and return its path.

    If HARPS_DIR already holds "rvbank.dat" or "rvbank.dat.gz" that file is
    returned without any download (the uncompressed one first). Otherwise the
    mirrors in HARPS_URLS are tried in order through session `s`; the reason
    for each failed mirror is printed. Returns None when every mirror failed.
    """
    existing = list(HARPS_DIR.glob("rvbank.dat")) + list(HARPS_DIR.glob("rvbank.dat.gz"))
    if existing:
        print("HARPS: file present:", existing[0].name)
        return existing[0]
    for url in HARPS_URLS:
        try:
            print("Trying HARPS:", url)
            dest = HARPS_DIR / url.split("/")[-1]
            stream_download(s, url, dest)
            print("HARPS: downloaded", dest.name)
            return dest
        except Exception as exc:
            # Say why the mirror was skipped instead of failing silently.
            print(f"  HARPS mirror failed ({type(exc).__name__}: {exc})")
            continue
    print("HARPS: download failed.")
    return None

def load_harps(rvbank_path: Path|None):
    if rvbank_path is None:
        return pd.DataFrame(columns=["time","rv","rv_err","star_id","source"])
    if rvbank_path.suffix==".gz":
        with gzip.open(rvbank_path,"rt",encoding="utf-8",errors="replace") as f:
            text=f.read()
    else:
        text=rvbank_path.read_text(encoding="utf-8",errors="replace")
    rows=[]
    for ln in text.splitlines():
        if not ln or ln.startswith("#"): continue
        parts = re.split(r"\s+", ln.strip())
        if len(parts)<4: continue
        try:
            t=float(parts[1]); rv=float(parts[2]); er=float(parts[3])
        except:
            continue
        rows.append((t,rv,er,parts[0]))
    if not rows:
        return pd.DataFrame(columns=["time","rv","rv_err","star_id","source"])
    df=pd.DataFrame(rows,columns=["time","rv","rv_err","star_id"])
    df["source"]="HARPS"
    return df

def fetch_confirmed_hosts(s):
    """Return the host names from CONFIRMED_HOSTS_CSV, stripped and lower-cased.

    The CSV is fetched through session `s`. Any failure (network, parsing,
    missing `hostname` column) prints a warning and yields an empty set, which
    makes label_harps() mark every HARPS star as a negative.
    """
    try:
        response = s.get(CONFIRMED_HOSTS_CSV, timeout=TIMEOUT)
        table = pd.read_csv(io.StringIO(response.text))
        normalized = table["hostname"].astype(str).str.strip().str.lower()
        return set(normalized.tolist())
    except Exception:
        print("WARN: failed to fetch confirmed host list; treating HARPS all as negatives.")
        return set()

def label_harps(df_harps, confirmed_set):
    """Add a 0/1 `label` column: 1 when the star is in `confirmed_set`.

    Names are compared case-insensitively with all whitespace and underscores
    removed on both sides. `star_id` is a single whitespace-delimited token of
    the rvbank file and can therefore never contain a space, whereas host
    names from the archive can (e.g. "HD 10180"); a plain lower-case
    comparison would label such hosts 0.
    NOTE(review): assumes rvbank identifiers are the archive host names with
    the spaces dropped -- spot-check a few known hosts.

    Returns `df_harps` itself when it is empty, otherwise a labelled copy.
    """
    if df_harps.empty: return df_harps

    def _norm(name):
        return re.sub(r"[\s_]+", "", str(name)).lower()

    confirmed_norm = {_norm(h) for h in confirmed_set}
    df=df_harps.copy()
    df["label"]=df["star_id"].map(_norm).isin(confirmed_norm).astype(int)
    return df

# ---------------- 3) Cleaning + series sanity ----------------
def clean_series(df, min_obs=MIN_OBS):
    """Return the tidy series restricted to usable rows and stars.

    Keeps only the columns time, rv, rv_err, star_id, source, label (missing
    ones are added as NaN), coerces time/rv/rv_err to numeric, drops rows
    without a numeric time or rv, and finally drops every star_id with fewer
    than `min_obs` remaining rows. An empty input is returned unchanged.

    The caller's frame is never modified: missing columns are added to a
    re-indexed copy instead of being assigned onto `df`.
    """
    if df.empty: return df
    keep=["time","rv","rv_err","star_id","source","label"]
    # reindex returns a new frame; absent columns come back filled with NaN
    out=df.reindex(columns=keep)
    for c in ("time","rv","rv_err"):
        out[c]=pd.to_numeric(out[c], errors="coerce")
    out=out.dropna(subset=["time","rv"])
    # require min_obs per star
    counts=out.groupby("star_id")["time"].count()
    good=counts[counts>=min_obs].index
    return out[out["star_id"].isin(good)]

# ---------------- 4) Features ----------------
def features_of_one(df):
    """Compute the feature dict for one star from its `time` / `rv` columns.

    Returns None when there are fewer than 3 points, the RVs have zero
    standard deviation, or the periodogram comes back empty. If the
    periodogram / sinusoid fit raises, the dict is still returned with
    period=NaN and amp/power1..3 set to 0.

    NOTE(review): every statistic below is computed on the normalised series
    (median removed, divided by the standard deviation), so `amp` and `mad`
    are in units of the series' own scatter and `rms` is ~1.0 for every star.
    NOTE(review): power1..3 are the three largest periodogram samples, which
    may be neighbouring points of one peak rather than three distinct peaks.
    """
    t=df["time"].values.astype(float)
    y=df["rv"].values.astype(float)
    if len(t)<3 or np.std(y)==0: return None
    # detrend-light: normalize
    y=(y-np.median(y))/(np.std(y)+1e-9)

    try:
        # unweighted Lomb-Scargle on an automatically chosen frequency grid
        freq,power=LombScargle(t,y).autopower()
        if len(freq)==0: return None
        idx=np.argsort(power)[-3:]  # indices of the 3 largest power samples, ascending
        topf=freq[idx]; topp=power[idx]
        bestf=topf[-1]  # frequency of the single largest sample
        period = 1.0/bestf if bestf>0 else np.nan
        # sinusoid amplitude at bestf
        # least-squares fit of a*sin + b*cos + c at the best frequency
        phi=2*np.pi*bestf*t
        A=np.vstack([np.sin(phi), np.cos(phi), np.ones_like(phi)]).T
        coef, *_ = np.linalg.lstsq(A, y, rcond=None)
        amp=float(np.sqrt(coef[0]**2+coef[1]**2))
        p1=float(topp[-1])
        p2=float(topp[-2]) if len(topp)>1 else 0.0
        p3=float(topp[-3]) if len(topp)>2 else 0.0
    except Exception:
        # keep the star but mark the periodic features as unavailable
        period=np.nan; amp=0.0; p1=p2=p3=0.0

    # Key order here defines the feature column order of the feature table.
    return {
        "period": period,
        "power1": p1,
        "power2": p2,
        "power3": p3,
        "amp":    amp,
        "rms":    float(np.std(y)),
        "mad":    float(np.median(np.abs(y-np.median(y)))),
        "skew":   float(pd.Series(y).skew()),
        "n_obs":  int(len(y)),
        "span_days": float(t.max()-t.min()),
    }

def build_feature_table(series_df):
    """Build one feature row per star_id from the tidy series table.

    Stars are visited in order of first appearance. Stars for which
    features_of_one() returns None are left out. Each row carries the star's
    id and the integer label of its first observation.
    """
    rows=[]
    grouped=series_df.groupby("star_id", sort=False)
    for star, chunk in grouped:
        row=features_of_one(chunk)
        if row is not None:
            row["star_id"]=star
            row["label"]=int(chunk["label"].iloc[0])
            rows.append(row)
    return pd.DataFrame(rows)

# ---------------- 5) Split (stratified by star) + balance ----------------
def stratified_group_split(feat_df, test_frac=TEST_FRAC, seed=RANDOM_STATE):
    """Split the feature table into train/test frames, stratified by label.

    Uses the first fold of a shuffled StratifiedGroupKFold grouped by star_id.
    The number of folds is round(1/test_frac) but never fewer than 3, so the
    test share is roughly 1/n_folds. Returns (train_df, test_df) as copies.
    """
    fold_count = max(3, int(round(1/test_frac)))
    splitter = StratifiedGroupKFold(n_splits=fold_count, shuffle=True, random_state=seed)
    folds = splitter.split(
        feat_df.drop(columns=["label","star_id"]),
        feat_df["label"].values,
        feat_df["star_id"].values,
    )
    train_idx, test_idx = next(folds)
    return feat_df.iloc[train_idx].copy(), feat_df.iloc[test_idx].copy()

def make_balanced_train(train_df, ratio=BALANCE_RATIO, seed=RANDOM_STATE):
    """Undersample negatives so that neg:pos is about `ratio`, then shuffle.

    All positives are kept. At least one negative is requested, and never more
    than are available. Raises RuntimeError when `train_df` has no positives.
    """
    positives = train_df[train_df.label==1]
    negatives = train_df[train_df.label==0]
    if len(positives)==0: raise RuntimeError("No positives in training after split.")
    wanted = int(max(1, round(len(positives)*ratio)))
    n_neg = min(wanted, len(negatives))
    sampled = negatives.sample(n=n_neg, random_state=seed, replace=False)
    combined = pd.concat([positives, sampled], ignore_index=True)
    return combined.sample(frac=1.0, random_state=seed)

# ---------------- 6) Train + eval ----------------
def train_lightgbm(train_df, test_df, seed=RANDOM_STATE):
    """Fit a LightGBM classifier on `train_df` and evaluate it on `test_df`.

    Every column except star_id and label is used as a feature. Prints ROC AUC,
    average precision, the best-F1 threshold with its precision/recall, a
    classification report at that threshold and the top-10 gain importances.

    Returns (clf, fi, pred, best_thr): the fitted classifier, the importance
    table sorted by gain, the test probabilities and the chosen threshold.

    NOTE(review): the threshold is selected on the same test set it is then
    reported on, so the printed P/R/F1 are optimistic.
    """
    features=[c for c in train_df.columns if c not in ("star_id","label")]
    Xtr, ytr = train_df[features].values, train_df["label"].values
    Xte, yte = test_df[features].values,  test_df["label"].values

    # NOTE(review): LightGBM's row subsampling is documented as active only
    # when a bagging frequency (subsample_freq) > 0 is set; as written,
    # subsample=0.9 may have no effect -- confirm against the LightGBM docs.
    clf = lgb.LGBMClassifier(
        n_estimators=900, learning_rate=0.03, num_leaves=31,
        subsample=0.9, colsample_bytree=0.9,
        min_child_samples=20, random_state=seed
    )
    clf.fit(Xtr, ytr)

    pred = clf.predict_proba(Xte)[:,1]  # probability of class 1
    auc = roc_auc_score(yte, pred)
    ap  = average_precision_score(yte, pred)
    print(f"\n=== Eval ===\nAUC: {auc:.3f} | AP: {ap:.3f}")

    # choose threshold by best F1 on PR curve
    prec, rec, thr = precision_recall_curve(yte, pred)
    f1 = 2*prec*rec/(prec+rec+1e-9)  # epsilon avoids 0/0 where prec=rec=0
    best_idx = int(np.argmax(f1))
    # prec/rec have one more entry than thr; fall back to 0.5 for that last point
    best_thr = float(thr[best_idx]) if best_idx < len(thr) else 0.5
    yhat = (pred>=best_thr).astype(int)
    print(f"Best F1 thr ~ {best_thr:.3f} | P={float(prec[best_idx]):.3f} R={float(rec[best_idx]):.3f}")
    print("\nReport @bestF1:\n", classification_report(yte, yhat, digits=3))

    fi = pd.DataFrame({
        "feature": features,
        "gain": clf.booster_.feature_importance(importance_type="gain")
    }).sort_values("gain", ascending=False)
    print("\nTop features:\n", fi.head(10).to_string(index=False))
    return clf, fi, pred, best_thr

# ========================== RUN ==========================
# End-to-end run: download -> parse -> label -> clean -> features -> split ->
# balance -> train -> save. Everything is written under PROC_DIR.
np.random.seed(RANDOM_STATE)

# One HTTP session is shared by all downloads and closed when the block ends.
with make_session() as s:
    # download
    fetch_radial(s)
    harps_path = fetch_harps(s)
    # load
    df_radial = load_radial_ipac()
    df_harps  = load_harps(harps_path)
    # label harps via confirmed hosts
    hosts = fetch_confirmed_hosts(s)
    df_harps = label_harps(df_harps, hosts)

# clean & keep short series too (>= MIN_OBS)
df_radial = clean_series(df_radial, MIN_OBS)
df_harps  = clean_series(df_harps,  MIN_OBS)

# merge (RADIAL positives + HARPS mixed)
# NOTE(review): every RADIAL star is labelled 1 and all negatives come from
# HARPS, so label and data source are strongly tied -- the model may partly
# learn the source (e.g. via n_obs / span_days); worth checking.
df_all = pd.concat([df_radial, df_harps], ignore_index=True)

# save series
PROC_DIR.mkdir(parents=True, exist_ok=True)
df_radial.to_parquet(PROC_DIR/"radial_series.parquet", index=False)
if not df_harps.empty: df_harps.to_parquet(PROC_DIR/"harps_series.parquet", index=False)
df_all.to_parquet(PROC_DIR/"rv_series_merged.parquet", index=False)

# sanity log
print("\nSeries rows by source:")
print(df_all.groupby("source").size())
print("\nStar counts by label:")
# one label per star (its first row), then count stars per label
star_lab = df_all.groupby("star_id")["label"].first().value_counts()
print(star_lab)

# features
# dropna() removes stars with any NaN feature (e.g. period=NaN after a failed
# periodogram in features_of_one).
feat_df = build_feature_table(df_all).dropna()
feat_df.to_parquet(PROC_DIR/"rv_features.parquet", index=False)
print("\nFeature table shape:", feat_df.shape)
print("Pos/Neg in features:", feat_df["label"].value_counts().to_dict())

# split (stratified by star) and balance train
train_df, test_df = stratified_group_split(feat_df, TEST_FRAC, RANDOM_STATE)
print(f"\nSplit → Train stars: {train_df['star_id'].nunique()} | Test stars: {test_df['star_id'].nunique()}")
print("Train pos/neg:", train_df["label"].sum(), "/", len(train_df)-train_df["label"].sum())
print("Test  pos/neg:", test_df["label"].sum(),  "/", len(test_df)-test_df["label"].sum())

# Only the training part is balanced; the test set keeps its natural ratio.
train_bal = make_balanced_train(train_df, ratio=BALANCE_RATIO, seed=RANDOM_STATE)
print("\nBalanced train size:", train_bal.shape, "→ pos/neg:",
      int(train_bal['label'].sum()), "/", int(len(train_bal)-train_bal['label'].sum()))

# train
model, fi, pred, best_thr = train_lightgbm(train_bal, test_df)

# save artifacts
# The booster text file is what the inference helpers load with lgb.Booster.
(model.booster_).save_model(str(PROC_DIR/"lightgbm_rv.txt"))
fi.to_csv(PROC_DIR/"feature_importances.csv", index=False)
# `pred` is aligned row-for-row with test_df (see train_lightgbm).
out = test_df[["star_id","label"]].copy()
out["pred_prob"]=pred
out["pred_label_bestF1"]=(pred>=best_thr).astype(int)
out.to_csv(PROC_DIR/"test_predictions.csv", index=False)

print("\n✅ Done. Artifacts:")
print("  -", PROC_DIR/"radial_series.parquet")
if not df_harps.empty: print("  -", PROC_DIR/"harps_series.parquet")
print("  -", PROC_DIR/"rv_series_merged.parquet")
print("  -", PROC_DIR/"rv_features.parquet")
print("  -", PROC_DIR/"feature_importances.csv")
print("  -", PROC_DIR/"lightgbm_rv.txt")
print("  -", PROC_DIR/"test_predictions.csv")


Fetching RADIAL wget script …




  … 25/1071 RADIAL files
  … 50/1071 RADIAL files
  … 75/1071 RADIAL files
  … 100/1071 RADIAL files
  … 125/1071 RADIAL files
  … 150/1071 RADIAL files
  … 175/1071 RADIAL files
  … 200/1071 RADIAL files
  … 225/1071 RADIAL files
  … 250/1071 RADIAL files
  … 275/1071 RADIAL files
  … 300/1071 RADIAL files
  … 325/1071 RADIAL files
  … 350/1071 RADIAL files
  … 375/1071 RADIAL files
  … 400/1071 RADIAL files
  … 425/1071 RADIAL files
  … 450/1071 RADIAL files
  … 475/1071 RADIAL files
  … 500/1071 RADIAL files
  … 525/1071 RADIAL files
  … 550/1071 RADIAL files
  … 575/1071 RADIAL files
  … 600/1071 RADIAL files
  … 625/1071 RADIAL files
  … 650/1071 RADIAL files
  … 675/1071 RADIAL files
  … 700/1071 RADIAL files
  … 725/1071 RADIAL files
  … 750/1071 RADIAL files
  … 775/1071 RADIAL files
  … 800/1071 RADIAL files
  … 825/1071 RADIAL files
  … 850/1071 RADIAL files
  … 875/1071 RADIAL files
  … 900/1071 RADIAL files
  … 925/1071 RADIAL files
  … 950/1071 RADIAL files
  … 975/1071 RA



RADIAL: have 1071 .tbl files.
Trying HARPS: https://cdsarc.cds.unistra.fr/ftp/J/A+A/636/A74/rvbank.dat.gz
HARPS: downloaded rvbank.dat.gz

Series rows by source:
source
HARPS     212552
RADIAL     43540
dtype: int64

Star counts by label:
label
0    2859
1     559
Name: count, dtype: int64

Feature table shape: (3416, 12)
Pos/Neg in features: {0: 2857, 1: 559}

Split → Train stars: 3074 | Test stars: 342
Train pos/neg: 503 / 2571
Test  pos/neg: 56 / 286

Balanced train size: (1006, 12) → pos/neg: 503 / 503
[LightGBM] [Info] Number of positive: 503, number of negative: 503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2433
[LightGBM] [Info] Number of data points in the train set: 1006, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

=== Eval ===
AUC: 0.935 | AP: 0.783
Best F1 thr ~ 0.9

In [None]:
import pandas as pd
from pathlib import Path
# Folder holding the parquet artifacts written by the training cell.
PROC_DIR = Path("/content/drive/MyDrive/nasa_exoplanet/Radial-Velocity/processed")

# Sanity check of the merged series: rows per source, then the distribution
# of observations per (source, star).
series = pd.read_parquet(PROC_DIR/"rv_series_merged.parquet")
print(series.groupby(["source"]).size().sort_values(ascending=False).head(10))
print(series.groupby(["source","star_id"]).size().describe())

# After feature build:
# class balance of the per-star feature table (the second print repeats the
# same counts and adds the total number of rows)
feat = pd.read_parquet(PROC_DIR/"rv_features.parquet")
print(feat.groupby("label").size())
print(feat.groupby(["label"]).size(), " total:", len(feat))


source
HARPS     209324
RADIAL        16
dtype: int64
count     2316.000000
mean        90.388601
std        510.448358
min          8.000000
25%         12.000000
50%         25.000000
75%         51.000000
max      15490.000000
dtype: float64
label
0    2269
1      45
dtype: int64
label
0    2269
1      45
dtype: int64  total: 2314


In [None]:
# How to use the trained model for inference

In [None]:
# rv_infer_standalone.py
# Standalone inference helpers for your RV model.
# - predict_from_arrays(model_path, time, rv, rv_err=None, threshold=0.5)
# - predict_from_files(model_path, file_paths, threshold=0.5)
#
# File parsing supports: IPAC .tbl (Astropy), CSV, or whitespace-separated text.

import sys, subprocess, io, re, warnings
from pathlib import Path

def _pip_install(pkgs):
    """Quietly pip-install `pkgs` (a list of package specifiers).

    Runs `<current interpreter> -m pip install -q ...`; a non-zero pip exit
    status raises subprocess.CalledProcessError.
    """
    pip_cmd = [sys.executable, "-m", "pip", "install", "-q"]
    subprocess.check_call(pip_cmd + pkgs)

warnings.filterwarnings("ignore")

# Deps (install if missing)
try:
    import numpy as np
    import pandas as pd
except Exception:
    _pip_install(["numpy", "pandas"])
    import numpy as np
    import pandas as pd

try:
    import lightgbm as lgb
except Exception:
    _pip_install(["lightgbm"])
    import lightgbm as lgb

try:
    from astropy.io import ascii as astro_ascii
    from astropy.timeseries import LombScargle
except Exception:
    _pip_install(["astropy"])
    from astropy.io import ascii as astro_ascii
    from astropy.timeseries import LombScargle


# ==== config: must match training features ====
# Names and order of the columns passed to the booster. They are the keys of
# the dict built by _features_from_series(), in the same order.
FEATURE_COLS = [
    "period","power1","power2","power3","amp",
    "rms","mad","skew","n_obs","span_days"
]


# ---------- core utils ----------
def load_model(model_path: str | Path) -> lgb.Booster:
    """Load a LightGBM booster from a saved model file.

    Raises FileNotFoundError when `model_path` does not exist.
    """
    path = Path(model_path)
    if path.exists():
        return lgb.Booster(model_file=str(path))
    raise FileNotFoundError(f"Model not found: {path}")

def _features_from_series(time: np.ndarray, rv: np.ndarray, rv_err: np.ndarray | None = None):
    """Build the model's feature dict from a single RV curve.

    Returns None when there are fewer than 3 points, the RVs have zero
    standard deviation, any time/RV value is non-finite, or the periodogram
    fails or is empty. Otherwise returns a dict with the keys listed in
    FEATURE_COLS.

    `rv_err` is accepted for interface symmetry but is not used.

    NOTE(review): all statistics are computed on the normalised series (median
    removed, divided by the standard deviation), so `rms` is ~1.0 for every
    input and `amp` / `mad` are in units of the series' own scatter.
    NOTE(review): power1..3 are the three largest periodogram samples, which
    may be neighbouring points of one peak rather than three distinct peaks.
    """
    t = np.asarray(time, dtype=float)
    y = np.asarray(rv, dtype=float)
    if len(t) < 3 or np.std(y) == 0 or np.any(~np.isfinite(t)) or np.any(~np.isfinite(y)):
        return None

    # normalize like training
    y = (y - np.median(y)) / (np.std(y) + 1e-9)

    # Lomb–Scargle periodogram (unweighted to mirror training)
    try:
        freq, power = LombScargle(t, y).autopower()
    except Exception:
        return None
    if len(freq) == 0:
        return None

    # indices of the 3 largest power samples, in ascending order of power
    idx = np.argsort(power)[-3:]
    topf = freq[idx]; topp = power[idx]
    bestf = topf[-1]  # frequency of the single largest sample
    period = 1.0 / bestf if bestf > 0 else np.nan

    # sinusoid amplitude at bestf (least squares)
    # fit a*sin + b*cos + c; the amplitude is sqrt(a^2 + b^2)
    phi = 2 * np.pi * bestf * t
    A = np.vstack([np.sin(phi), np.cos(phi), np.ones_like(phi)]).T
    coef, *_ = np.linalg.lstsq(A, y, rcond=None)
    amp = float(np.sqrt(coef[0]**2 + coef[1]**2))

    feats = {
        "period": float(period),
        "power1": float(topp[-1]),
        "power2": float(topp[-2]) if len(topp) > 1 else 0.0,
        "power3": float(topp[-3]) if len(topp) > 2 else 0.0,
        "amp":    amp,
        "rms":    float(np.std(y)),
        "mad":    float(np.median(np.abs(y - np.median(y)))),
        "skew":   float(pd.Series(y).skew()),
        "n_obs":  int(len(y)),
        "span_days": float(np.max(t) - np.min(t)),
    }
    return feats

def _map_cols(df: pd.DataFrame):
    """Find time/rv/rv_err column names in arbitrary tables."""
    cl = {c.lower().strip(): c for c in df.columns}
    def pick(cands):
        for k in cands:
            if k in cl: return cl[k]
        for k in cl:
            if any(sub in k for sub in cands): return cl[k]
        return None
    tcol = pick(['bjd_tdb','bjd','time','jd','jd_utc','mjd','date'])
    rcol = pick(['rv','radial_velocity','vrad','mnvel','vel','velocity','v_r'])
    ecol = pick(['rv_err','sigma_rv','e_rv','erv','rv_error','sig_rv','stdev','unc_rv'])
    return tcol, rcol, ecol

def _read_one_file(path: str | Path) -> tuple[pd.DataFrame, str]:
    """
    Read one RV file: IPAC .tbl preferred, else CSV, else whitespace.
    Returns (df, star_id_guess). df has columns [time, rv, rv_err], sorted by
    time, with rows lacking a numeric time or rv removed.

    Parsers are tried in order and the first one whose result is non-empty and
    has distinct time and RV columns wins. A comma-separated parse of a
    whitespace-delimited file usually "succeeds" with a single merged column,
    so moving on only when a parser raises would never reach the whitespace
    parser.

    Raises FileNotFoundError when the file is missing, and ValueError when no
    parser yields rows, no parse has time/RV columns, or fewer than 3 valid
    points remain.
    """
    fp = Path(path)
    if not fp.exists():
        raise FileNotFoundError(f"File not found: {fp}")

    def _ipac():
        tab = astro_ascii.read(str(fp), format="ipac", guess=True, fast_reader=False)
        return tab.to_pandas()

    def _csv():
        return pd.read_csv(fp)

    def _whitespace():
        # sep=r"\s+" replaces the deprecated `delim_whitespace=True`
        return pd.read_csv(fp, sep=r"\s+", comment="#")

    readers = [_csv, _whitespace]
    if fp.name.lower().endswith(".tbl"):
        readers.insert(0, _ipac)  # IPAC first for .tbl

    data = None
    tcol = rcol = ecol = None
    parsed_any = False
    for reader in readers:
        try:
            candidate = reader()
        except Exception:
            continue
        if candidate is None or candidate.empty:
            continue
        parsed_any = True
        c_t, c_r, c_e = _map_cols(candidate)
        # time and RV must be two different columns for the parse to be usable
        if c_t is not None and c_r is not None and c_t != c_r:
            data, tcol, rcol, ecol = candidate, c_t, c_r, c_e
            break

    if not parsed_any:
        raise ValueError(f"Could not parse any rows from: {fp}")
    if data is None:
        raise ValueError(f"Could not find time/rv columns in: {fp}")

    df = pd.DataFrame({
        "time": pd.to_numeric(data[tcol], errors="coerce"),
        "rv":   pd.to_numeric(data[rcol], errors="coerce"),
    })
    if ecol and ecol in data.columns:
        df["rv_err"] = pd.to_numeric(data[ecol], errors="coerce")
    else:
        df["rv_err"] = np.nan

    df = df.dropna(subset=["time","rv"]).sort_values("time")
    if len(df) < 3:
        raise ValueError(f"Need at least 3 valid (time, rv) points in: {fp}")

    # star_id guess from filename like UID_xxx_RVC_###.tbl
    m = re.match(r'^(.*)_RVC_', fp.stem, flags=re.IGNORECASE)
    star_id = m.group(1) if m else fp.stem
    return df, star_id


# ---------- public API ----------
def predict_from_arrays(model_path: str | Path,
                        time, rv, rv_err=None,
                        threshold: float = 0.5) -> dict:
    """
    Score a single star given as raw arrays.

    Loads the booster from `model_path`, builds the features and returns a
    dict. On success it holds ok=True, probability, pred_label (1 when
    probability >= threshold), threshold, features, n_obs and span_days. When
    no features can be computed it holds ok=False and a `msg`.
    """
    booster = load_model(model_path)
    t_arr = np.asarray(time, dtype=float)
    rv_arr = np.asarray(rv, dtype=float)
    if rv_err is not None:
        err_arr = np.asarray(rv_err, dtype=float)
    else:
        err_arr = np.full_like(rv_arr, np.nan, dtype=float)

    feats = _features_from_series(t_arr, rv_arr, err_arr)
    if feats is None:
        return {"ok": False, "msg": "Not enough signal/points to compute features (need ≥3 and non-zero std)."}

    feature_row = pd.DataFrame([feats])[FEATURE_COLS]
    prob = float(booster.predict(feature_row)[0])
    cutoff = float(threshold)
    result = {
        "ok": True,
        "probability": prob,
        "pred_label": int(prob >= cutoff),   # 1=planet, 0=not
        "threshold": cutoff,
        "features": feats,
        "n_obs": int(feats["n_obs"]),
        "span_days": float(feats["span_days"]),
    }
    return result

def predict_from_files(model_path: str | Path,
                       file_paths: list[str | Path],
                       threshold: float = 0.5) -> pd.DataFrame:
    """
    Predict for one or many files. Returns a DataFrame with:
    [file, star_id, prob, pred_label, n_obs, span_days, error]

    `file_paths` may be a list of paths or a single str/Path. Each file is
    scored independently. A file that cannot be read or scored still gets a
    row, with NaN results and the reason in `error`. An empty list yields an
    empty frame that still has the columns above.

    Raises FileNotFoundError when the model file itself is missing.
    """
    booster = load_model(model_path)
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, Path)):
        file_paths = [file_paths]

    columns = ["file", "star_id", "prob", "pred_label", "n_obs", "span_days", "error"]
    rows = []
    for path in file_paths:
        rec = {"file": str(path), "star_id": None,
               "prob": np.nan, "pred_label": np.nan,
               "n_obs": np.nan, "span_days": np.nan,
               "error": ""}
        try:
            df, sid = _read_one_file(path)
            rec["star_id"] = sid
            feats = _features_from_series(df["time"].values, df["rv"].values, df["rv_err"].values)
            if feats is None:
                rec["error"] = "insufficient data / zero variance"
            else:
                X = pd.DataFrame([feats])[FEATURE_COLS]
                prob = float(booster.predict(X)[0])
                rec["prob"] = prob
                rec["pred_label"] = int(prob >= float(threshold))
                rec["n_obs"] = int(feats["n_obs"])
                rec["span_days"] = float(feats["span_days"])
        except Exception as e:
            # keep going: one bad file must not abort the whole batch
            rec["error"] = str(e)
        rows.append(rec)

    out = pd.DataFrame(rows, columns=columns)
    return out


# ---------- example usage (optional) ----------
if __name__ == "__main__":
    # All example paths follow the training cell's layout
    # (<base>/processed for artifacts, <base>/RADIAL_raw for downloads), so
    # both examples load the same model file written by the training run.
    base_dir   = Path("/content/drive/MyDrive/nasa_exoplanet/Radial-Velocity")
    model_file = base_dir / "processed" / "lightgbm_rv.txt"
    radial_dir = base_dir / "RADIAL_raw"

    # Example 1: arrays input
    # Replace with your own quick test arrays
    t = [2450000.0, 2450020.0, 2450100.0, 2450300.0, 2450500.0]
    v = [10.2, -5.0, 15.1, -8.2, 12.3]
    res = predict_from_arrays(
        model_path=model_file,
        time=t, rv=v, rv_err=None, threshold=0.5  # or 0.984 if that's your chosen cutoff
    )
    print("arrays inference →", res)

    # Example 2: files input (supports multiple files)
    # Replace these with actual paths to your .tbl/.csv
    files = [
        radial_dir / "UID_0000522_RVC_001.tbl",
        radial_dir / "UID_0000522_RVC_002.tbl",
    ]
    df_preds = predict_from_files(
        model_path=model_file,
        file_paths=files,
        threshold=0.5  # or your picked threshold, e.g., 0.984
    )
    print(df_preds)


arrays inference → {'ok': True, 'probability': 2.950631051185226e-07, 'pred_label': 0, 'threshold': 0.5, 'features': {'period': 135.13513513513513, 'power1': 0.9939665207495667, 'power2': 0.9856172793291845, 'power3': 0.9814772392200638, 'amp': 1.567861280102445, 'rms': 0.9999999998953448, 'mad': 0.512810472683866, 'skew': -0.5384892867104116, 'n_obs': 5, 'span_days': 500.0}, 'n_obs': 5, 'span_days': 500.0}
                                                file      star_id      prob  \
0  /content/drive/MyDrive/nasa_exoplanet/RADIAL_r...  UID_0000522  0.299377   
1  /content/drive/MyDrive/nasa_exoplanet/RADIAL_r...  UID_0000522  0.999761   

   pred_label  n_obs    span_days error  
0           0     53  2801.237949        
1           1     27  1359.044500        


In [None]:

# Example: score one star from in-memory arrays with a custom threshold.
model_path = "/content/drive/MyDrive/nasa_exoplanet/Radial-Velocity/processed/lightgbm_rv.txt"
# define your own arrays
T_array   = [2450000.0, 2450020.0, 2450100.0, 2450300.0, 2450500.0]
RV_array  = [10.2, -5.0, 15.1, -8.2, 12.3]
ERR_array = [1.5, 1.2, 1.0, 1.3, 1.1]  # optional; can be None

# rv_err is accepted but not used by the feature builder, so it does not
# change the prediction.
res = predict_from_arrays(
    model_path=model_path,
    time=T_array, rv=RV_array, rv_err=ERR_array,
    threshold=0.984  # use the threshold you liked
)
print(res)


{'ok': True, 'probability': 2.950631051185226e-07, 'pred_label': 0, 'threshold': 0.984, 'features': {'period': 135.13513513513513, 'power1': 0.9939665207495667, 'power2': 0.9856172793291845, 'power3': 0.9814772392200638, 'amp': 1.567861280102445, 'rms': 0.9999999998953448, 'mad': 0.512810472683866, 'skew': -0.5384892867104116, 'n_obs': 5, 'span_days': 500.0}, 'n_obs': 5, 'span_days': 500.0}


In [None]:

# Example: parse one RV file yourself, then score its arrays.
model_path = "/content/drive/MyDrive/nasa_exoplanet/Radial-Velocity/processed/lightgbm_rv.txt"
# NOTE(review): this file sits under nasa_exoplanet/RADIAL_raw, while the
# training cell downloads into nasa_exoplanet/Radial-Velocity/RADIAL_raw --
# confirm which copy is intended.
path = "/content/drive/MyDrive/nasa_exoplanet/RADIAL_raw/UID_0000522_RVC_001.tbl"

df, star_id = _read_one_file(path)   # parses IPAC .tbl / csv / whitespace
# Only this one file is scored here; star_id is just the filename prefix.
res = predict_from_arrays(
    model_path=model_path,
    time=df["time"], rv=df["rv"], rv_err=df["rv_err"],
    threshold=0.984
)
print(star_id, res)


UID_0000522 {'ok': True, 'probability': 0.2993767721896214, 'pred_label': 0, 'threshold': 0.984, 'features': {'period': 28012.379489997402, 'power1': 0.5554498514772406, 'power2': 0.5516828204002343, 'power3': 0.5396734479828982, 'amp': 23.573162267889586, 'rms': 0.9999999999648725, 'mad': 0.6252664049935912, 'skew': -0.21970752994827183, 'n_obs': 53, 'span_days': 2801.23794899974}, 'n_obs': 53, 'span_days': 2801.23794899974}
