In [1]:
pip install lightgbm



In [3]:
pip install shap



In [2]:
import argparse
import os
import sys
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.inspection import permutation_importance

from lightgbm import LGBMClassifier

In [4]:
try:
    import shap
    _HAS_SHAP = True
except Exception:
    _HAS_SHAP = False

In [5]:
CSV_PATH = "cleaned_exoplanets.csv"      # path to your merged+cleaned csv
TEST_SIZE = 0.2
RANDOM_STATE = 42
SAVE_MODEL_PATH = "exo_lgbm_binary.joblib"

In [6]:
def load_data(path):
    df = pd.read_csv(path)
    return df

In [7]:
def prepare_data(df):
    # keep only CONFIRMED and FALSE POSITIVE rows
    df = df[df["disposition"].astype(str).str.upper().isin(["CONFIRMED", "FALSE POSITIVE"])].copy()

    # encode target: 1 = CONFIRMED, 0 = FALSE POSITIVE
    y = df["disposition"].astype(str).str.upper().map({
        "FALSE POSITIVE": 0,
        "CONFIRMED": 1
    }).astype(int)

    drop_cols = ["disposition", "src_rowid", "unified_id", "id_bundle", "host_name"]
    feature_cols = [c for c in df.columns if c not in drop_cols]
    cat_cols = ["mission"]
    num_cols = [c for c in feature_cols if c not in cat_cols]
    X = df[cat_cols + num_cols].copy()
    return X, y, cat_cols, num_cols


In [8]:
def build_pipeline(cat_cols, num_cols):
    pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ])

    clf = LGBMClassifier(
        n_estimators=600,
        learning_rate=0.03,
        max_depth=-1,
        num_leaves=63,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="binary",
        class_weight="balanced",
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    pipe = Pipeline([
        ("prep", pre),
        ("clf", clf)
    ])
    return pipe

In [9]:
def evaluate(model, X_test, y_test):
    preds = model.predict(X_test)
    print("\n=== Classification Report ===")
    print(classification_report(y_test, preds, target_names=["FALSE POSITIVE", "CONFIRMED"], digits=4))
    print("\n=== Confusion Matrix ===")
    print(pd.DataFrame(confusion_matrix(y_test, preds),
                       index=["FALSE POSITIVE", "CONFIRMED"],
                       columns=["FALSE POSITIVE", "CONFIRMED"]))


In [14]:
def _feature_names_from_preprocessor(prep):
    """
    Build feature names in the exact order produced by the ColumnTransformer.
    Works with OneHotEncoder + passthrough numeric columns.
    """
    names = []
    for name, trans, cols in prep.transformers_:
        if name == "remainder":
            # not used here
            continue
        if trans == "drop":
            continue
        if hasattr(trans, "get_feature_names_out"):
            # For OneHotEncoder
            try:
                base = cols
                if isinstance(base, (list, tuple, np.ndarray)):
                    pass
                else:
                    base = [base]
                names.extend(list(trans.get_feature_names_out(base)))
            except Exception:
                names.extend(list(trans.get_feature_names_out()))
        else:
            # passthrough numeric cols
            if isinstance(cols, (list, tuple, np.ndarray)):
                names.extend(list(cols))
            else:
                names.append(cols)
    return names


def explain_permutation(model, X_test, y_test, n_repeats=10, random_state=0):
    print("\n=== Permutation Importance ===")
    r = permutation_importance(model, X_test, y_test,
                               n_repeats=n_repeats, random_state=random_state)

    # Try to get feature names in the exact transformed order
    try:
        prep = model.named_steps["prep"]
        feat_names = _feature_names_from_preprocessor(prep)
    except Exception:
        # Fallback to whatever ColumnTransformer can provide, or generic names
        try:
            feat_names = model.named_steps["prep"].get_feature_names_out().tolist()
        except Exception:
            feat_names = [f"f{i}" for i in range(len(r.importances_mean))]

    # Hard-align lengths to avoid mismatch crashes
    n = len(r.importances_mean)
    if len(feat_names) != n:
        print(f"[warn] feature_names({len(feat_names)}) != importances({n}); aligning by truncation.")
        feat_names = feat_names[:n]

    imp = pd.DataFrame({
        "feature": feat_names,
        "mean_importance": r.importances_mean,
        "std": r.importances_std
    }).sort_values("mean_importance", ascending=False)

    print(imp.head(20).to_string(index=False))
    return imp


In [15]:
def main():
    df = load_data(CSV_PATH)
    X, y, cat_cols, num_cols = prepare_data(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    model = build_pipeline(cat_cols, num_cols)
    model.fit(X_train, y_train)

    evaluate(model, X_test, y_test)
    explain_permutation(model, X_test, y_test)

    if SAVE_MODEL_PATH:
        import joblib
        joblib.dump(model, SAVE_MODEL_PATH)
        print(f"[info] model saved to {SAVE_MODEL_PATH}")



In [16]:

if __name__ == "__main__":
    main()

[LightGBM] [Info] Number of positive: 3210, number of negative: 4907
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2809
[LightGBM] [Info] Number of data points in the train set: 8117, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

=== Classification Report ===
                precision    recall  f1-score   support

FALSE POSITIVE     0.9296    0.8826    0.9055      1227
     CONFIRMED     0.8335    0.8979    0.8645       803

      accuracy                         0.8887      2030
     macro avg     0.8816    0.8903    0.8850      2030
  weighted avg     0.8916    0.8887    0.8893      2030


=== Confusion Matrix ===
                FALSE POSITIVE  CONFIRMED
FALSE POSITIVE            1083        144
CONFIRMED                   82 