<a href="https://colab.research.google.com/github/NASA-Hackathon-Imaginarium-Team/AI-Team/blob/main/AI_Exoplanet_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
!pip -q install pandas==2.2.2 scikit-learn==1.5.2 lightgbm catboost xgboost joblib matplotlib


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [37]:
import glob, os, pandas as pd

# Tries to find your attached file in /content
candidates = sorted(glob.glob("/content/*.csv"))
# Prefer something with "Kepler" or "KOI" in the name
prio = [p for p in candidates if "kepler" in p.lower() or "koi" in p.lower()]
DATA_PATH = prio[0] if prio else (candidates[0] if candidates else None)
assert DATA_PATH is not None, "No CSV found in /content. Upload your file to the Colab Files panel."

df_raw = pd.read_csv(DATA_PATH, low_memory=False)
print(f"Using: {os.path.basename(DATA_PATH)}, shape={df_raw.shape}")
print("First 15 columns:", df_raw.columns[:15].tolist())


Using: Kepler Objects of Interest - Filtered.csv, shape=(7585, 44)
First 15 columns: ['kepid', 'koi_disposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1']


In [38]:
import numpy as np
from typing import List

def pick(cols: List[str], candidates: List[str]):
    m = {c.lower().strip(): c for c in cols}
    for cand in candidates:
        key = cand.lower().strip()
        if key in m:
            return m[key]
    return None

# Try to find label and star_id
label_col = pick(df_raw.columns, ["koi_disposition","Disposition Using Kepler Data","kepler_disposition","disposition"])
star_col  = pick(df_raw.columns, ["kepid","kep id","kepler id","kepler_id"])

print("Detected label column:", label_col)
print("Detected star_id column:", star_col)

# Candidate feature names typical in KOI
allowlist = [
    "koi_period","koi_duration","koi_depth","koi_prad","koi_ror","koi_dor","koi_impact",
    "koi_count","koi_snr","koi_model_snr","koi_steff","koi_slogg","koi_smet","koi_srad","koi_smass"
]
present_allow = [c for c in allowlist if c in df_raw.columns]

# Fallback: numeric columns with leakage-guards
num_cols = [c for c in df_raw.columns if pd.api.types.is_numeric_dtype(df_raw[c])]
bad_bits = ("disposition","pdisposition","fpflag","flag","tfop","vet","koi_score")
fallback = [c for c in num_cols if not any(b in c.lower() for b in bad_bits)]

# Use allowlist if we found at least 5; else fallback
FEATS = present_allow if len(present_allow) >= 5 else fallback
print(f"Selected {len(FEATS)} features.")
print(FEATS[:20])

# Build working frame
df = df_raw.copy()
if label_col is None:
    raise ValueError("Couldn't detect label column. Tell me the exact header name for the disposition/label.")

df["label"] = df[label_col].astype(str).str.upper().str.strip()
label_map = {
    "CONFIRMED":"CONFIRMED",
    "CANDIDATE":"CANDIDATE",
    "FALSE POSITIVE":"FALSE POSITIVE",
    "FALSE_POSITIVE":"FALSE POSITIVE",
    "FP":"FALSE POSITIVE"
}
df["label"] = df["label"].map(lambda x: label_map.get(x, x))
df = df[df["label"].isin(["CONFIRMED","CANDIDATE","FALSE POSITIVE"])].reset_index(drop=True)

# star_id (for GroupKFold); if missing, we’ll fallback to StratifiedKFold later
df["star_id"] = df[star_col].astype(str) if star_col is not None else "NA"

# Keep only needed columns; coerce to numeric
dfX = df[FEATS].apply(pd.to_numeric, errors="coerce")
# Drop rows with all-NaN features
df = df.loc[~dfX.isna().all(axis=1)].copy()
dfX = df[FEATS].apply(pd.to_numeric, errors="coerce")
print("Data after cleaning:", df.shape)
print("Label counts:\n", df["label"].value_counts())


Detected label column: koi_disposition
Detected star_id column: kepid
Selected 9 features.
['koi_period', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_impact', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad']
Data after cleaning: (7585, 46)
Label counts:
 label
FALSE POSITIVE    4839
CONFIRMED         2746
Name: count, dtype: int64


In [39]:
from sklearn.model_selection import GroupKFold, StratifiedKFold

use_groups = (star_col is not None) and (df["star_id"].nunique() > 1)
if use_groups:
    splitter = GroupKFold(n_splits=5)
    split_iter = list(splitter.split(dfX, df["label"], df["star_id"]))
    print("Using GroupKFold by star_id.")
else:
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    split_iter = list(splitter.split(dfX, df["label"]))
    print("Using StratifiedKFold (no star_id detected).")


Using GroupKFold by star_id.


In [40]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

def logish(A):
    A = np.asarray(A, dtype=float)
    A = np.clip(A, a_min=0, a_max=None)
    return np.log1p(A)

numeric_pre = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(logish, feature_names_out="one-to-one")),
])

numeric_pre_scaled = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(logish, feature_names_out="one-to-one")),
    ("sc", StandardScaler(with_mean=False))  # sparse-safe
])

def evaluate_oof(y_true, oof_pred, oof_proba, classes):
    print(classification_report(y_true, oof_pred, target_names=classes, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_true, oof_pred))
    # AUROC/AUPRC (OvR)
    y_bin = np.eye(len(classes))[y_true]
    auroc = {cls: roc_auc_score(y_bin[:,i], oof_proba[:,i]) for i,cls in enumerate(classes)}
    auprc = {cls: average_precision_score(y_bin[:,i], oof_proba[:,i]) for i,cls in enumerate(classes)}
    print("AUROC:", auroc)
    print("AUPRC:", auprc)
    return auroc, auprc

def cv_oof(model_pipeline, X, y, split_iter):
    oof_proba = np.zeros((len(X), len(np.unique(y))))
    oof_pred  = np.zeros(len(X), dtype=int)
    fold_models = []
    for fold, idx in enumerate(split_iter, 1):
        if len(idx) == 2:
            tr, va = idx
            groups = None
        else:
            tr, va, _ = idx  # not used
        model_pipeline.fit(X.iloc[tr], y[tr])
        p = model_pipeline.predict_proba(X.iloc[va])
        oof_proba[va] = p
        oof_pred[va]  = p.argmax(1)
        fold_models.append(model_pipeline)
        print(f"Fold {fold} ✓")
    return oof_proba, oof_pred, fold_models


In [41]:
le = LabelEncoder()
y = le.fit_transform(df["label"].values)
classes = list(le.classes_)
print("Classes:", classes)


Classes: ['CONFIRMED', 'FALSE POSITIVE']


In [42]:
import numpy as np

maj_idx = np.bincount(y).argmax()
oof_pred = np.full_like(y, maj_idx)
# Constant probabilities: 1.0 for majority class
oof_proba = np.zeros((len(y), len(classes)))
oof_proba[:, maj_idx] = 1.0

print("== Majority class baseline ==")
_ = evaluate_oof(y, oof_pred, oof_proba, classes)


== Majority class baseline ==
                precision    recall  f1-score   support

     CONFIRMED     0.0000    0.0000    0.0000      2746
FALSE POSITIVE     0.6380    1.0000    0.7790      4839

      accuracy                         0.6380      7585
     macro avg     0.3190    0.5000    0.3895      7585
  weighted avg     0.4070    0.6380    0.4970      7585

Confusion matrix:
 [[   0 2746]
 [   0 4839]]
AUROC: {'CONFIRMED': np.float64(0.5), 'FALSE POSITIVE': np.float64(0.5)}
AUPRC: {'CONFIRMED': np.float64(0.36203032300593274), 'FALSE POSITIVE': np.float64(0.6379696769940673)}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
from sklearn.linear_model import LogisticRegression

log_pre = ColumnTransformer([("num", numeric_pre_scaled, FEATS)], remainder="drop")
log_clf = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)

log_pipe = Pipeline([("pre", log_pre), ("clf", log_clf)])

print("== Logistic Regression ==")
log_oof_proba, log_oof_pred, log_models = cv_oof(log_pipe, dfX, y, split_iter)
_ = evaluate_oof(y, log_oof_pred, log_oof_proba, classes)


== Logistic Regression ==
Fold 1 ✓
Fold 2 ✓
Fold 3 ✓
Fold 4 ✓
Fold 5 ✓
                precision    recall  f1-score   support

     CONFIRMED     0.6020    0.8758    0.7135      2746
FALSE POSITIVE     0.9050    0.6714    0.7709      4839

      accuracy                         0.7454      7585
     macro avg     0.7535    0.7736    0.7422      7585
  weighted avg     0.7953    0.7454    0.7501      7585

Confusion matrix:
 [[2405  341]
 [1590 3249]]
AUROC: {'CONFIRMED': np.float64(0.8503878041170406), 'FALSE POSITIVE': np.float64(0.8503878041170406)}
AUPRC: {'CONFIRMED': np.float64(0.7374722719580048), 'FALSE POSITIVE': np.float64(0.9165374792091237)}


In [44]:
from sklearn.ensemble import RandomForestClassifier

rf_pre = ColumnTransformer([("num", numeric_pre, FEATS)], remainder="drop")
rf_clf = RandomForestClassifier(
    n_estimators=600, max_depth=None, min_samples_leaf=2,
    class_weight="balanced_subsample", n_jobs=-1, random_state=42
)
rf_pipe = Pipeline([("pre", rf_pre), ("clf", rf_clf)])

print("== Random Forest ==")
rf_oof_proba, rf_oof_pred, rf_models = cv_oof(rf_pipe, dfX, y, split_iter)
_ = evaluate_oof(y, rf_oof_pred, rf_oof_proba, classes)


== Random Forest ==
Fold 1 ✓
Fold 2 ✓
Fold 3 ✓
Fold 4 ✓
Fold 5 ✓
                precision    recall  f1-score   support

     CONFIRMED     0.8816    0.9086    0.8949      2746
FALSE POSITIVE     0.9472    0.9308    0.9389      4839

      accuracy                         0.9227      7585
     macro avg     0.9144    0.9197    0.9169      7585
  weighted avg     0.9235    0.9227    0.9230      7585

Confusion matrix:
 [[2495  251]
 [ 335 4504]]
AUROC: {'CONFIRMED': np.float64(0.9738242945044565), 'FALSE POSITIVE': np.float64(0.9738242945044565)}
AUPRC: {'CONFIRMED': np.float64(0.9507325942152898), 'FALSE POSITIVE': np.float64(0.9848989690254815)}


In [45]:
from lightgbm import LGBMClassifier

lgb_pre = ColumnTransformer([("num", numeric_pre, FEATS)], remainder="drop")
lgb_clf = LGBMClassifier(
    n_estimators=3000, learning_rate=0.02,
    subsample=0.9, colsample_bytree=0.9, max_depth=-1,
    min_child_samples=40, class_weight="balanced",
    n_jobs=-1, random_state=42
)
lgb_pipe = Pipeline([("pre", lgb_pre), ("clf", lgb_clf)])

print("== LightGBM ==")
lgb_oof_proba, lgb_oof_pred, lgb_models = cv_oof(lgb_pipe, dfX, y, split_iter)
_ = evaluate_oof(y, lgb_oof_pred, lgb_oof_proba, classes)


== LightGBM ==
[LightGBM] [Info] Number of positive: 3869, number of negative: 2199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004981 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 6068, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fold 1 ✓
[LightGBM] [Info] Number of positive: 3865, number of negative: 2203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 6068, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.00000

In [46]:
from catboost import CatBoostClassifier

cb_pre = ColumnTransformer([("num", numeric_pre, FEATS)], remainder="drop")
cb_clf = CatBoostClassifier(
    iterations=2000, learning_rate=0.03, depth=8,
    loss_function="MultiClass", eval_metric="TotalF1", auto_class_weights="Balanced",
    verbose=False, random_seed=42
)
cb_pipe = Pipeline([("pre", cb_pre), ("clf", cb_clf)])

print("== CatBoost ==")
cb_oof_proba, cb_oof_pred, cb_models = cv_oof(cb_pipe, dfX, y, split_iter)
_ = evaluate_oof(y, cb_oof_pred, cb_oof_proba, classes)


== CatBoost ==
Fold 1 ✓
Fold 2 ✓
Fold 3 ✓
Fold 4 ✓
Fold 5 ✓
                precision    recall  f1-score   support

     CONFIRMED     0.8828    0.9330    0.9072      2746
FALSE POSITIVE     0.9607    0.9297    0.9450      4839

      accuracy                         0.9309      7585
     macro avg     0.9218    0.9314    0.9261      7585
  weighted avg     0.9325    0.9309    0.9313      7585

Confusion matrix:
 [[2562  184]
 [ 340 4499]]
AUROC: {'CONFIRMED': np.float64(0.9782143054422319), 'FALSE POSITIVE': np.float64(0.9782143054422319)}
AUPRC: {'CONFIRMED': np.float64(0.9577404619078919), 'FALSE POSITIVE': np.float64(0.9871895668312323)}


In [47]:
from xgboost import XGBClassifier

xgb_pre = ColumnTransformer([("num", numeric_pre, FEATS)], remainder="drop")
xgb_clf = XGBClassifier(
    n_estimators=2000, learning_rate=0.03, max_depth=8,
    subsample=0.9, colsample_bytree=0.9,
    objective="multi:softprob", num_class=len(classes),
    tree_method="hist", reg_lambda=1.0, random_state=42
)
xgb_pipe = Pipeline([("pre", xgb_pre), ("clf", xgb_clf)])

print("== XGBoost ==")
xgb_oof_proba, xgb_oof_pred, xgb_models = cv_oof(xgb_pipe, dfX, y, split_iter)
_ = evaluate_oof(y, xgb_oof_pred, xgb_oof_proba, classes)


== XGBoost ==
Fold 1 ✓
Fold 2 ✓
Fold 3 ✓
Fold 4 ✓
Fold 5 ✓
                precision    recall  f1-score   support

     CONFIRMED     0.8969    0.9093    0.9031      2746
FALSE POSITIVE     0.9481    0.9407    0.9444      4839

      accuracy                         0.9293      7585
     macro avg     0.9225    0.9250    0.9237      7585
  weighted avg     0.9296    0.9293    0.9294      7585

Confusion matrix:
 [[2497  249]
 [ 287 4552]]
AUROC: {'CONFIRMED': np.float64(0.9775584076754377), 'FALSE POSITIVE': np.float64(0.9775585958166132)}
AUPRC: {'CONFIRMED': np.float64(0.9566834072162151), 'FALSE POSITIVE': np.float64(0.9875387432610079)}


In [48]:
from sklearn.isotonic import IsotonicRegression

base_proba = lgb_oof_proba  # <- change to cb_oof_proba / xgb_oof_proba if CatBoost/XGB is better
base_pred  = lgb_oof_pred
base_models = lgb_models

calibrators = []
proba_cal = np.zeros_like(base_proba)
for i, cls in enumerate(classes):
    iso = IsotonicRegression(out_of_bounds="clip")
    y_bin = (y == i).astype(int)
    iso.fit(base_proba[:, i], y_bin)
    proba_cal[:, i] = iso.transform(base_proba[:, i])
    calibrators.append(iso)

# renormalize to sum=1
row_sums = proba_cal.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1.0
proba_cal = proba_cal / row_sums
pred_cal = proba_cal.argmax(1)

print("== After isotonic calibration (LightGBM) ==")
_ = evaluate_oof(y, pred_cal, proba_cal, classes)


== After isotonic calibration (LightGBM) ==
                precision    recall  f1-score   support

     CONFIRMED     0.8927    0.9090    0.9008      2746
FALSE POSITIVE     0.9478    0.9380    0.9429      4839

      accuracy                         0.9275      7585
     macro avg     0.9203    0.9235    0.9218      7585
  weighted avg     0.9279    0.9275    0.9276      7585

Confusion matrix:
 [[2496  250]
 [ 300 4539]]
AUROC: {'CONFIRMED': np.float64(0.9775045240427114), 'FALSE POSITIVE': np.float64(0.9775045240427113)}
AUPRC: {'CONFIRMED': np.float64(0.9530834871072866), 'FALSE POSITIVE': np.float64(0.9863790395963317)}


In [49]:
import joblib, json, os, glob
os.makedirs("artifacts", exist_ok=True)

joblib.dump(le, "artifacts/label_encoder.joblib")
joblib.dump(FEATS, "artifacts/feature_list.joblib")
for k, m in enumerate(base_models):
    joblib.dump(m, f"artifacts/model_fold{k}.joblib")
joblib.dump(calibrators, "artifacts/calibrators.joblib")

card = {
    "dataset": os.path.basename(DATA_PATH),
    "classes": classes,
    "features": FEATS,
    "cv": "GroupKFold by star_id" if use_groups else "StratifiedKFold",
    "model": "LightGBM"  # change if you switched
}
with open("artifacts/model_card.json", "w") as f:
    json.dump(card, f, indent=2)

!ls -lh artifacts


total 51M
-rw-r--r-- 1 root root 3.1K Oct  4 11:11 calibrators.joblib
-rw-r--r-- 1 root root  131 Oct  4 11:11 feature_list.joblib
-rw-r--r-- 1 root root  502 Oct  4 11:11 label_encoder.joblib
-rw-r--r-- 1 root root  354 Oct  4 11:11 model_card.json
-rw-r--r-- 1 root root  11M Oct  4 11:11 model_fold0.joblib
-rw-r--r-- 1 root root  11M Oct  4 11:11 model_fold1.joblib
-rw-r--r-- 1 root root  11M Oct  4 11:11 model_fold2.joblib
-rw-r--r-- 1 root root  11M Oct  4 11:11 model_fold3.joblib
-rw-r--r-- 1 root root  11M Oct  4 11:11 model_fold4.joblib


In [50]:
import numpy as np, pandas as pd, joblib, glob

le2 = joblib.load("artifacts/label_encoder.joblib")
FEATS2 = joblib.load("artifacts/feature_list.joblib")
cal2 = joblib.load("artifacts/calibrators.joblib")
fold_paths = sorted(glob.glob("artifacts/model_fold*.joblib"))
models2 = [joblib.load(p) for p in fold_paths]
classes2 = list(le2.classes_)

def predict_rows(df_in: pd.DataFrame, confirmed_threshold: float|None=None):
    # schema check
    missing = [c for c in FEATS2 if c not in df_in.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    Xnew = df_in[FEATS2].apply(pd.to_numeric, errors="coerce")
    Xnew = Xnew.fillna(Xnew.median(numeric_only=True))
    # average probs
    probs = np.zeros((len(Xnew), len(classes2)))
    for m in models2:
        probs += m.predict_proba(Xnew)
    probs /= len(models2)
    # isotonic per class
    for i in range(len(classes2)):
        probs[:, i] = cal2[i].transform(probs[:, i])
    probs = probs / probs.sum(axis=1, keepdims=True)
    pred = probs.argmax(1)
    if confirmed_threshold is not None and "CONFIRMED" in classes2:
        idx = classes2.index("CONFIRMED")
        pred[probs[:, idx] >= confirmed_threshold] = idx
    return probs, [classes2[i] for i in pred]

# Demo on 5 rows from your file (drop any label/star columns if present)
demo_cols = [c for c in FEATS2 if c in df.columns]
demo_df = df[demo_cols].sample(5, random_state=0)
probs_demo, labels_demo = predict_rows(demo_df, confirmed_threshold=0.7)
pd.DataFrame({"pred": labels_demo, **{f"P({c})": probs_demo[:,i] for i,c in enumerate(classes2)}})


Unnamed: 0,pred,P(CONFIRMED),P(FALSE POSITIVE)
0,FALSE POSITIVE,0.000559,0.999441
1,FALSE POSITIVE,0.011019,0.988981
2,FALSE POSITIVE,0.000559,0.999441
3,CONFIRMED,0.95389,0.04611
4,CONFIRMED,0.95858,0.04142
