In [None]:
!pip install pandas numpy scikit-learn lightgbm matplotlib joblib




In [None]:
# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import joblib
import re

# Modeling + ML
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    brier_score_loss, roc_curve, auc
)
from lightgbm import LGBMClassifier

# Global configuration: one shared seed for every stochastic step, plus a
# wider pandas display so all columns of the frame are shown.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

pd.options.display.max_columns = 100


In [None]:
def fetch_csv(url):
    """Load a CSV with pandas and report where it came from and its size.

    Parameters
    ----------
    url : anything accepted by ``pd.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    print(f"Downloading from:\n  {url}")
    frame = pd.read_csv(url)
    n_rows, n_cols = frame.shape
    print(f" -> Got {n_rows} rows, {n_cols} cols")
    return frame


def map_disp_to_label(s):
    """Convert a disposition value into a binary training label.

    Matching is case-insensitive. Returns 1 when the text contains
    "CONFIRMED" or is exactly "CP" (ignoring surrounding whitespace),
    0 when it contains "FALSE" or is exactly "FP", and NaN for missing
    values, candidates ("CAND..." / "PC") and anything else.
    """
    if pd.isna(s):
        return np.nan

    text = str(s).upper()
    code = text.strip()

    if code == "CP" or "CONFIRMED" in text:
        return 1
    if code == "FP" or "FALSE" in text:
        return 0

    # Candidates and unrecognised dispositions carry no usable label.
    return np.nan


def print_class_balance(y, label="data"):
    """Print each distinct value in `y` with its count and share of `len(y)`.

    Parameters
    ----------
    y : array-like of labels.
    label : str
        Name shown in the header line.
    """
    classes, freqs = np.unique(y, return_counts=True)
    print(f"\nClass balance ({label}):")
    for idx in range(len(classes)):
        share = freqs[idx] / len(y)
        print(f"  {classes[idx]}: {freqs[idx]} ({share:.1%})")


In [None]:
# --- Dataset configuration ---------------------------------------------------
# These names are used below and by later cells but were not defined anywhere
# before this point, so a fresh kernel failed with NameError. The values
# reproduce the URLs and columns shown in this cell's recorded output.
LABEL_COL = "tfopwg_disp"
FEATURES = [
    "pl_rade", "pl_trandep", "pl_orbper", "pl_trandurh", "pl_insol", "pl_eqt",
    "st_rad", "st_logg", "st_teff", "st_tmag", "st_dist",
]
_TAP_QUERY = (
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query="
    "select+" + ",".join([LABEL_COL] + FEATURES) + "+from+toi"
)
USER_URL = _TAP_QUERY + f"+where+{LABEL_COL}='CP'&format=csv"  # CP rows only
FALLBACK_URL = _TAP_QUERY + "&format=csv"                       # every row


def _load_labeled(url):
    """Download `url`, check LABEL_COL is present, and add `label_raw`/`label`."""
    frame = fetch_csv(url)
    if LABEL_COL not in frame.columns:
        raise ValueError(f"Column {LABEL_COL} not found.")
    frame["label_raw"] = frame[LABEL_COL]
    frame["label"] = frame["label_raw"].apply(map_disp_to_label)
    return frame


# Step 1: Try user URL (CP only)
df = _load_labeled(USER_URL)

# If only one class -> fallback to the unfiltered query
if df["label"].dropna().nunique() < 2:
    print("\n⚠️ Only one class found — downloading full dataset instead.")
    df = _load_labeled(FALLBACK_URL)
    # A single-class dataset cannot be used for stratified splits or ROC AUC,
    # so stop here instead of failing later with a less obvious error.
    if df["label"].dropna().nunique() < 2:
        raise ValueError("Fallback dataset also contains fewer than two classes.")

# Keep only the feature columns + label, coerced to numeric
df = df[FEATURES + ["label"]]
df = df.apply(pd.to_numeric, errors="coerce")
# Drop rows without a label, then rows where every feature is missing
df = df.dropna(subset=["label"])
df = df.dropna(how="all", subset=FEATURES).reset_index(drop=True)

print(f"\nFinal cleaned dataset: {df.shape}")
print_class_balance(df["label"], "Full dataset")
df.head()


Downloading from:
  https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+tfopwg_disp,pl_rade,pl_trandep,pl_orbper,pl_trandurh,pl_insol,pl_eqt,st_rad,st_logg,st_teff,st_tmag,st_dist+from+toi+where+tfopwg_disp='CP'&format=csv
 -> Got 684 rows, 12 cols

⚠️ Only one class found — downloading full dataset instead.
Downloading from:
  https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+tfopwg_disp,pl_rade,pl_trandep,pl_orbper,pl_trandurh,pl_insol,pl_eqt,st_rad,st_logg,st_teff,st_tmag,st_dist+from+toi&format=csv
 -> Got 7703 rows, 12 cols

Final cleaned dataset: (1881, 12)

Class balance (Full dataset):
  0.0: 1197 (63.6%)
  1.0: 684 (36.4%)


Unnamed: 0,pl_rade,pl_trandep,pl_orbper,pl_trandurh,pl_insol,pl_eqt,st_rad,st_logg,st_teff,st_tmag,st_dist,label
0,3.062985,358.41578,9.139804,3.091294,243.011763,1006.993283,1.58,4.34,5958.2,9.0197,129.804,1.0
1,13.088532,1105.760867,5.742625,2.860293,,,3.86959,3.26574,5664.0,8.8849,263.729,0.0
2,2.43432,513.0,15.507786,4.565,88.0718,853.0,1.17,4.30881,6122.0,8.4362,88.4343,1.0
3,3.576683,1199.754148,17.471308,4.428482,53.370032,689.356881,0.974669,4.49925,5783.54,8.0888,57.2651,1.0
4,2.28501,257.347459,0.540933,1.674337,,,1.21935,4.25342,5525.0,9.7345,133.171,0.0


In [None]:
X = df[FEATURES]
y = df["label"].astype(int)

# Stage 1: hold out 10% of all rows as the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Stage 2: validation should be 20% of the *original* data, which is
# 0.2 / 0.9 of the 90% that remains after the test split.
val_frac = 0.2 / 0.9
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=val_frac,
    stratify=y_trainval,
    random_state=RANDOM_SEED,
)

split_sizes = {"Train": len(X_train), "Val": len(X_val), "Test": len(X_test)}
print(" | ".join(f"{split_name}: {size}" for split_name, size in split_sizes.items()))
for split_name, split_y in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print_class_balance(split_y, split_name)


Train: 1316 | Val: 376 | Test: 189

Class balance (Train):
  0: 838 (63.7%)
  1: 478 (36.3%)

Class balance (Val):
  0: 239 (63.6%)
  1: 137 (36.4%)

Class balance (Test):
  0: 120 (63.5%)
  1: 69 (36.5%)


In [None]:
from sklearn.compose import ColumnTransformer

# Median-impute missing values, then standardise. The transformer is fitted on
# the training split only and then applied unchanged to validation and test.
preprocessing_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

X_train_p = preprocessor.fit_transform(X_train)
X_val_p, X_test_p = (preprocessor.transform(part) for part in (X_val, X_test))


In [None]:
from sklearn.model_selection import RandomizedSearchCV

def run_random_search(estimator, param_distributions, n_iter, X, y):
    """Fit a 5-fold randomized search scored by ROC AUC and return it.

    Parameters
    ----------
    estimator : unfitted scikit-learn compatible classifier.
    param_distributions : dict mapping parameter names to candidate values.
    n_iter : int, number of parameter settings sampled.
    X, y : training features and labels passed to ``fit``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (``best_estimator_``, ``best_params_``, ...).
    """
    search = RandomizedSearchCV(
        estimator, param_distributions=param_distributions, n_iter=n_iter,
        scoring="roc_auc", cv=5, random_state=RANDOM_SEED, n_jobs=-1
    )
    search.fit(X, y)
    return search


# LightGBM tuning
param_lgb = {
    "n_estimators": [200, 400, 800],
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [15, 31, 63],
    "max_depth": [-1, 5, 10],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0],
}
# subsample_freq=1: LightGBM only applies row subsampling (bagging) when the
# bagging frequency is non-zero; with the default of 0 the `subsample` values
# searched above would have no effect on the model.
# verbose=-1: silence the per-fit [Info] log lines.
lgb = LGBMClassifier(random_state=RANDOM_SEED, subsample_freq=1, verbose=-1)
rs_lgb = run_random_search(lgb, param_lgb, n_iter=20, X=X_train_p, y=y_train)
best_lgb = rs_lgb.best_estimator_
print("\nBest LGB params:")
pprint(rs_lgb.best_params_)

# Gradient Boosting tuning
param_gb = {
    "n_estimators": [100, 200, 400],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5],
    "subsample": [0.8, 1.0]
}
gb = GradientBoostingClassifier(random_state=RANDOM_SEED)
rs_gb = run_random_search(gb, param_gb, n_iter=15, X=X_train_p, y=y_train)
best_gb = rs_gb.best_estimator_
print("\nBest GB params:")
pprint(rs_gb.best_params_)


[LightGBM] [Info] Number of positive: 478, number of negative: 838
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 1316, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.363222 -> initscore=-0.561407
[LightGBM] [Info] Start training from score -0.561407

Best LGB params:
{'colsample_bytree': 0.9,
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 400,
 'num_leaves': 15,
 'subsample': 0.7}

Best GB params:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}


In [None]:
# Stack the two tuned boosters under a logistic-regression final estimator,
# fitted on the preprocessed training split.
base_learners = [("lgbm", best_lgb), ("gb", best_gb)]
meta_learner = LogisticRegression(max_iter=5000)

stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)
stack.fit(X_train_p, y_train)


In [None]:
def evaluate_model(y_true, y_pred, y_proba, tag="Validation"):
    """Print and return binary-classification metrics.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted hard labels.
    y_proba : array-like of scores/probabilities for the positive class,
        used for ROC AUC and the Brier score.
    tag : str
        Name of the evaluated split, shown in the printed header.

    Returns
    -------
    dict
        Keys ``acc``, ``prec``, ``rec``, ``f1``, ``auc`` and ``brier``.
    """
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 is applied uniformly so that a degenerate prediction
    # (e.g. no positive predictions) yields 0 without warnings for every
    # precision/recall-derived metric, not just precision and recall.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc_ = roc_auc_score(y_true, y_proba)
    brier = brier_score_loss(y_true, y_proba)
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n=== {tag} Metrics ===")
    print(f"Accuracy : {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall   : {rec:.3f}")
    print(f"F1 Score : {f1:.3f}")
    print(f"ROC AUC  : {auc_:.3f}")
    print(f"Brier    : {brier:.3f}")
    print("\nConfusion Matrix:\n", cm)
    print(
        "\nClassification Report:\n",
        classification_report(y_true, y_pred, zero_division=0),
    )
    return dict(acc=acc, prec=prec, rec=rec, f1=f1, auc=auc_, brier=brier)

# Score the stacked model on the validation split: column 1 of predict_proba
# is used as the score for ROC AUC / Brier, hard predictions for the rest.
y_val_proba = stack.predict_proba(X_val_p)[:, 1]
y_val_pred = stack.predict(X_val_p)
val_metrics = evaluate_model(y_true=y_val, y_pred=y_val_pred, y_proba=y_val_proba)



=== Validation Metrics ===
Accuracy : 0.846
Precision: 0.776
Recall   : 0.810
F1 Score : 0.793
ROC AUC  : 0.925
Brier    : 0.120

Confusion Matrix:
 [[207  32]
 [ 26 111]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88       239
           1       0.78      0.81      0.79       137

    accuracy                           0.85       376
   macro avg       0.83      0.84      0.83       376
weighted avg       0.85      0.85      0.85       376





In [None]:
# Refit on train + validation (already preprocessed) before the final test.
# Features and labels are stacked in the same order (train rows, then val rows).
# Note: this rebinds `y_trainval` from the earlier split cell.
X_trainval_p = np.concatenate([X_train_p, X_val_p], axis=0)
y_trainval = np.concatenate([y_train, y_val])

final_base_learners = [("lgbm", best_lgb), ("gb", best_gb)]
stack_final = StackingClassifier(
    estimators=final_base_learners,
    final_estimator=LogisticRegression(max_iter=5000),
    cv=5,
    n_jobs=-1,
)
stack_final.fit(X_trainval_p, y_trainval)

# Evaluate on the held-out test split
y_test_proba = stack_final.predict_proba(X_test_p)[:, 1]
y_test_pred = stack_final.predict(X_test_p)
test_metrics = evaluate_model(y_test, y_test_pred, y_test_proba, tag="Test")



=== Test Metrics ===
Accuracy : 0.836
Precision: 0.771
Recall   : 0.783
F1 Score : 0.777
ROC AUC  : 0.927
Brier    : 0.118

Confusion Matrix:
 [[104  16]
 [ 15  54]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       120
           1       0.77      0.78      0.78        69

    accuracy                           0.84       189
   macro avg       0.82      0.82      0.82       189
weighted avg       0.84      0.84      0.84       189





In [None]:
# Bundle the fitted preprocessor and the final stacked model into a single
# pipeline object and persist it with joblib.
MODEL_PATH = "toi_stacking_pipeline.joblib"

final_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", stack_final)]
)

joblib.dump(final_pipeline, MODEL_PATH)
print(f"✅ Saved model pipeline to '{MODEL_PATH}'")


✅ Saved model pipeline to 'toi_stacking_pipeline.joblib'


Generated 100 rows of testing data

Data shape: (10000, 11)

First few rows:
    pl_rade  pl_trandep  pl_orbper  pl_trandurh   pl_insol       pl_eqt  \
0  2.453146    0.016742   2.520983          NaN  51.406386   239.599766   
1  1.476078    0.006048   1.305151     4.918091   1.700402  1777.796878   
2  2.768071    0.004519   8.185811     2.191484  10.269201          NaN   
3  5.575734         NaN        NaN     4.743789   5.060509  1247.911513   
4  1.367080    0.029561   3.249797     4.061615  29.347424  1789.635825   

     st_rad   st_logg      st_teff    st_tmag     st_dist  
0  1.048725  4.932559  3215.820518  11.229046  590.065367  
1  1.402005  4.819418  6995.665006  12.476056  142.176942  
2  0.935091  4.988666          NaN  13.422004   81.946035  
3  1.197339  4.641754  5218.812909  15.317126   96.202304  
4  0.769992  4.031526  5083.667368  14.371584   55.263951  

Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 

In [None]:
import requests

url = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+*+from+toi&format=csv"
# A timeout prevents the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=120)
# Raise on 4xx/5xx so an error page is never written to disk as if it were data.
response.raise_for_status()

# Save to file
with open("toi_data.csv", "wb") as f:
    f.write(response.content)

print("✅ File saved as toi_data.csv")


✅ File saved as toi_data.csv
