<a href="https://colab.research.google.com/github/NASA-Hackathon-Imaginarium-Team/AI-Team/blob/main/LightGBM_TESS_PC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [112]:
from lightgbm import LGBMClassifier
import pandas as pd

In [185]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# read + drop irrelevant error-columns
url = "https://raw.githubusercontent.com/NASA-Hackathon-Imaginarium-Team/Data-Team/refs/heads/main/data_without_candidates/TESS%20Objects%20of%20Interest%20-%20Filtered.csv"
df = pd.read_csv(url)

df = df.drop([
    "ra", "st_pmra", "st_pmraerr1", "st_pmraerr2", "st_pmralim",
    "st_pmdec", "st_pmdecerr1", "st_pmdecerr2", "pl_tranmiderr1",
    "pl_tranmiderr2", "pl_orbpererr1", "pl_orbpererr2",
    "pl_trandurherr1", "pl_trandurherr2", "pl_trandeperr1", "pl_trandeperr2",
    "pl_radeerr1", "pl_radeerr2", "st_tmagerr1", "st_tmagerr2",
    "st_disterr1", "st_disterr2", "st_tefferr1", "st_tefferr2",
    "st_loggerr1", "st_loggerr2", "st_raderr1", "st_raderr2"
], axis=1)

# clean + map target robustly
df["tfopwg_disp"] = df["tfopwg_disp"].astype(str).str.strip()
df["tfopwg_disp"] = df["tfopwg_disp"].map({"KP": 1, "FP": 0})

print("rows before dropping missing target:", len(df))
df = df.dropna(subset=["tfopwg_disp"])
df["tfopwg_disp"] = df["tfopwg_disp"].astype(int)
print("rows after dropping missing target:", len(df))

# inspect missingness and drop extremely sparse columns
missing_frac = df.isna().mean().sort_values(ascending=False)
print("Top missing fractions:\n", missing_frac.head(15))

# Choose a threshold to remove columns with too many NaNs (adjust threshold if needed)
threshold = 0.5
cols_keep = missing_frac[missing_frac < threshold].index.tolist()
# Ensure target is kept
if "tfopwg_disp" not in cols_keep:
    cols_keep.append("tfopwg_disp")
df = df[cols_keep].copy()

# separate feature types (exclude target)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if "tfopwg_disp" in num_cols:
    num_cols.remove("tfopwg_disp")
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# simple imputation for features (no leaking)
num_imp = SimpleImputer(strategy="median")
cat_imp = SimpleImputer(strategy="most_frequent")

if num_cols:
    df[num_cols] = num_imp.fit_transform(df[num_cols])
if cat_cols:
    df[cat_cols] = cat_imp.fit_transform(df[cat_cols])

# final X / y split ready for modelling
y_train = df["tfopwg_disp"]
X_train = df.drop(columns=["tfopwg_disp"])

print("Final shape X:", X_train.shape, "y distribution:\n", y_train.value_counts())

# example pipeline (preprocessing + classifier) you can use directly
num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipeline = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

clf_pipeline = Pipeline([
    ("pre", preprocessor),
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=42))
])



rows before dropping missing target: 2562
rows after dropping missing target: 1780
Top missing fractions:
 st_logg           0.112360
st_rad            0.067416
pl_rade           0.066854
pl_eqt            0.047191
pl_insol          0.029213
st_dist           0.023596
st_teff           0.019663
st_pmdeclim       0.016292
pl_orbper         0.011798
pl_trandurhlim    0.000000
pl_trandurh       0.000000
pl_orbperlim      0.000000
pl_tranmidlim     0.000000
dec               0.000000
pl_tranmid        0.000000
dtype: float64
Final shape X: (1780, 24) y distribution:
 tfopwg_disp
0    1197
1     583
Name: count, dtype: int64


In [147]:
from sklearn.model_selection import train_test_split

# first split: train + temp (test + cv)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_train, y_train, test_size=0.4, random_state=42, shuffle=False
)

# second split: temp -> test + cv
X_test, X_cv, y_test, y_cv = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)


In [161]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np
import lightgbm as lgb

# Define the base model (minimal change: removed incorrect num_class argument)
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    random_state=42
)

# Define the parameter search space
param_dist = {
    'num_leaves': randint(20, 80),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.09),
    'n_estimators': randint(200, 800),
    'min_child_samples': randint(20, 200),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 0.5),
    'reg_lambda': uniform(0.5, 1.5),
}

# Run randomized hyperparameter search
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("Running hyperparameter search...")
random_search.fit(X_train, y_train)

print("\nBest hyperparameters found:")
print(random_search.best_params_)

best_model = random_search.best_estimator_


Running hyperparameter search...
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Number of positive: 243, number of negative: 825
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3302
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227528 -> initscore=-1.222322
[LightGBM] [Info] Start training from score -1.222322

Best hyperparameters found:
{'colsample_bytree': np.float64(0.989465534702127), 'learning_rate': np.float64(0.03555288772637191), 'max_depth': 8, 'min_child_samples': 101, 'n_estimators': 362, 'num_leaves': 35, 'reg_alpha': np.float64(0.49722873130541034), 'reg_lambda': np.float64(0.7638878790160181), 'subsample': np.float64(0.6072301454462083)}


In [182]:
from lightgbm import LGBMClassifier

# create LightGBM model
lgb_model = LGBMClassifier(
    objective='binary',
    n_estimators=803,
    learning_rate=0.16889517350680194,
    num_leaves=56,
    max_depth=4,
    subsample=0.6394356762960909,
    colsample_bytree=0.7776004057997312,
    reg_lambda = 0.18870710834137938,
    random_state=42,
    n_jobs=-1
)

# fit on training data
lgb_model.fit(X_train, y_train)

# predict probabilities on test set
y_test_pred = lgb_model.predict_proba(X_test)[:, 1]
y_train_pred = lgb_model.predict_proba(X_train)[:, 1]
y_cv_pred = lgb_model.predict_proba(X_cv)[:, 1]



[LightGBM] [Info] Number of positive: 243, number of negative: 825
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3302
[LightGBM] [Info] Number of data points in the train set: 1068, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.227528 -> initscore=-1.222322
[LightGBM] [Info] Start training from score -1.222322


In [183]:
from sklearn.metrics import log_loss

train_loss = log_loss(y_train, y_train_pred)
cv_loss = log_loss(y_cv, y_cv_pred)

print(f"Train Loss: {train_loss:.4f}")
print(f"Cross validation Loss: {cv_loss:.4f}")


Train Loss: 0.0003
Cross validation Loss: 0.8307


In [184]:
from sklearn.metrics import accuracy_score

# predict labels
y_label_pred = lgb_model.predict(X_test)

# compute accuracy
accuracy = accuracy_score(y_test, y_label_pred)
print("Accuracy:", accuracy * 100)


Accuracy: 75.84269662921348


In [145]:
from sklearn.metrics import classification_report

def print_classification_reports(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict_proba(X_train)[:, 1]
    y_test_pred = model.predict_proba(X_test)[:, 1]
    y_train_pred_class = (y_train_pred >= 0.5).astype(int)
    y_test_pred_class = (y_test_pred >= 0.5).astype(int)

    print("Training classification report:")
    print(classification_report(y_train, y_train_pred_class))
    print("\nTesting classification report:")
    print(classification_report(y_test, y_test_pred_class))

print_classification_reports(lgb_model, X_train, y_train, X_test, y_test)

Training classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       825
           1       1.00      1.00      1.00       243

    accuracy                           1.00      1068
   macro avg       1.00      1.00      1.00      1068
weighted avg       1.00      1.00      1.00      1068


Testing classification report:
              precision    recall  f1-score   support

           0       0.66      0.93      0.77       162
           1       0.91      0.60      0.72       194

    accuracy                           0.75       356
   macro avg       0.79      0.77      0.75       356
weighted avg       0.80      0.75      0.75       356



In [120]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)