<a href="https://colab.research.google.com/github/NASA-Hackathon-Imaginarium-Team/AI-Team/blob/main/lightGBM_model_kepler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from lightgbm import LGBMClassifier
import pandas as pd

In [2]:
# Reading the csv file
url = "https://raw.githubusercontent.com/NASA-Hackathon-Imaginarium-Team/Data-Team/refs/heads/main/data_without_candidates/Kepler%20Objects%20of%20Interest%20-%20Filtered.csv"
df = pd.read_csv(url)

df = df.drop(["ra", "koi_steff", "koi_steff_err1", "koi_steff_err2", "koi_slogg",
              "koi_slogg_err1", "koi_slogg_err2", "koi_srad_err1", "koi_kepmag",
              "koi_kepmag", "koi_tce_plnt_num", "koi_insol_err1",
              "koi_insol_err2", "koi_insol", "koi_teq", "koi_prad_err2", "koi_prad_err1",
              "koi_prad", "koi_depth_err1", "koi_duration_err2", "koi_duration_err1",
              "koi_impact_err2", "koi_impact_err1", "koi_time0bk_err2", "koi_time0bk_err1",
              "koi_time0bk", "koi_period_err1", "koi_period_err1"], axis=1)

# Making the target values 1 and 0
df["koi_disposition"] = df["koi_disposition"].map({
    "CONFIRMED": 1,
    "FALSE POSITIVE": 0
})

print("Number of rows before drop na:", len(df))
df = df.dropna()  # remove any row with missing values
print("Number of rows before after na:", len(df))
# Making an x y split
y_train = df["koi_disposition"]
X_train = df.drop(columns=["koi_disposition"])

Number of rows before drop na: 7585
Number of rows before after na: 6462


In [3]:
from sklearn.model_selection import train_test_split

# first split: train + temp (test + cv)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_train, y_train, test_size=0.4, random_state=42, shuffle=False
)

# second split: temp -> test + cv
X_test, X_cv, y_test, y_cv = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)


In [59]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import numpy as np
import lightgbm as lgb

# Define the base model
num_classes = len(np.unique(y_train))
lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=num_classes,
    boosting_type='gbdt',
    random_state=42
)

# Define the parameter search space
param_dist = {
    'num_leaves': randint(20, 150),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.3),
    'n_estimators': randint(500, 1000),
    'min_child_samples': randint(10, 100),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
}

# Run randomized hyperparameter search
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=30,              # try 30 random combinations (increase for better tuning)
    scoring='accuracy',
    cv=3,                   # 3-fold cross-validation
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("Running hyperparameter search...")
random_search.fit(X_train, y_train)

# Get the best model
print("\nBest hyperparameters found:")
print(random_search.best_params_)

best_model = random_search.best_estimator_


Running hyperparameter search...
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2712
[LightGBM] [Info] Number of data points in the train set: 3877, number of used features: 15
[LightGBM] [Info] Start training from score -1.033703
[LightGBM] [Info] Start training from score -0.439571

Best hyperparameters found:
{'colsample_bytree': np.float64(0.5806106436270022), 'learning_rate': np.float64(0.2889092957027719), 'max_depth': 11, 'min_child_samples': 15, 'n_estimators': 783, 'num_leaves': 47, 'reg_alpha': np.float64(0.2184404372168336), 'reg_lambda': np.float64(0.4165099478703662), 'subsample': np.float64(0.9416401294594341)}


In [91]:
from lightgbm import LGBMClassifier

# create LightGBM model
lgb_model = LGBMClassifier(
    objective='binary',
    n_estimators=783,
    learning_rate=0.2889092957027719,
    num_leaves=47,
    max_depth=11,
    subsample=0.9416401294594341,
    colsample_bytree=0.5806106436270022,
    random_state=42,
    reg_lambda =0.4165099478703662,
    n_jobs=-1
)

# fit on training data
lgb_model.fit(X_train, y_train)

# predict probabilities on test set
y_test_pred = lgb_model.predict_proba(X_test)[:, 1]
y_train_pred = lgb_model.predict_proba(X_train)[:, 1]
y_cv_pred = lgb_model.predict_proba(X_cv)[:, 1]

[LightGBM] [Info] Number of positive: 2498, number of negative: 1379
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2712
[LightGBM] [Info] Number of data points in the train set: 3877, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.644313 -> initscore=0.594132
[LightGBM] [Info] Start training from score 0.594132


In [92]:
from sklearn.metrics import log_loss

train_loss = log_loss(y_train, y_train_pred)
cv_loss = log_loss(y_cv, y_cv_pred)

print(f"Train Loss: {train_loss:.4f}")
print(f"Cross validation Loss: {cv_loss:.4f}")


Train Loss: 0.0000
Cross validation Loss: 0.0009


In [93]:
from sklearn.metrics import accuracy_score

# predict labels
y_label_pred = lgb_model.predict(X_test)

# compute accuracy
accuracy = accuracy_score(y_test, y_label_pred)
print("Accuracy:", accuracy * 100)


Accuracy: 99.84520123839009


In [94]:
from sklearn.metrics import classification_report

def print_classification_reports(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict_proba(X_train)[:, 1]
    y_test_pred = model.predict_proba(X_test)[:, 1]
    y_train_pred_class = (y_train_pred >= 0.5).astype(int)
    y_test_pred_class = (y_test_pred >= 0.5).astype(int)

    print("Training classification report:")
    print(classification_report(y_train, y_train_pred_class))
    print("\nTesting classification report:")
    print(classification_report(y_test, y_test_pred_class))

print_classification_reports(lgb_model, X_train, y_train, X_test, y_test)

Training classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1379
           1       1.00      1.00      1.00      2498

    accuracy                           1.00      3877
   macro avg       1.00      1.00      1.00      3877
weighted avg       1.00      1.00      1.00      3877


Testing classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1081
           1       0.99      1.00      1.00       211

    accuracy                           1.00      1292
   macro avg       1.00      1.00      1.00      1292
weighted avg       1.00      1.00      1.00      1292



In [96]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)