<a href="https://colab.research.google.com/github/NASA-Hackathon-Imaginarium-Team/AI-Team/blob/main/lightGBM_model_kepler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from lightgbm import LGBMClassifier
import pandas as pd

In [2]:
# Reading the csv file
url = "https://raw.githubusercontent.com/NASA-Hackathon-Imaginarium-Team/Data-Team/refs/heads/main/data_without_candidates/Kepler%20Objects%20of%20Interest%20-%20Filtered.csv"
df = pd.read_csv(url)

df = df.drop(["ra", "koi_steff", "koi_steff_err1", "koi_steff_err2", "koi_slogg",
              "koi_slogg_err1", "koi_slogg_err2", "koi_srad_err1", "koi_kepmag",
              "koi_kepmag", "koi_tce_plnt_num", "koi_insol_err1",
              "koi_insol_err2", "koi_insol", "koi_teq", "koi_prad_err2", "koi_prad_err1",
              "koi_prad", "koi_depth_err1", "koi_duration_err2", "koi_duration_err1",
              "koi_impact_err2", "koi_impact_err1", "koi_time0bk_err2", "koi_time0bk_err1",
              "koi_time0bk", "koi_period_err1", "koi_period_err1"], axis=1)

# Making the target values 1 and 0
df["koi_disposition"] = df["koi_disposition"].map({
    "CONFIRMED": 1,
    "FALSE POSITIVE": 0
})

print("Number of rows before drop na:", len(df))
df = df.dropna()  # remove any row with missing values
print("Number of rows before after na:", len(df))
# Making an x y split
y_train = df["koi_disposition"]
X_train = df.drop(columns=["koi_disposition"])

Number of rows before drop na: 7585
Number of rows before after na: 6462


In [3]:
from sklearn.model_selection import train_test_split

# first split: train + temp (test + cv)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_train, y_train, test_size=0.4, random_state=42, shuffle=False
)

# second split: temp -> test + cv
X_test, X_cv, y_test, y_cv = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)


In [43]:
from lightgbm import LGBMClassifier

# create LightGBM model
lgb_model = LGBMClassifier(
    objective='binary',
    n_estimators=1000,
    learning_rate=0.04,
    num_leaves=127,
    max_depth=12,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# fit on training data
lgb_model.fit(X_train, y_train)

# predict probabilities on test set
y_test_pred = lgb_model.predict_proba(X_test)[:, 1]
y_train_pred = lgb_model.predict_proba(X_train)[:, 1]
y_cv_pred = lgb_model.predict_proba(X_cv)[:, 1]



[LightGBM] [Info] Number of positive: 2498, number of negative: 1379
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2712
[LightGBM] [Info] Number of data points in the train set: 3877, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.644313 -> initscore=0.594132
[LightGBM] [Info] Start training from score 0.594132


In [44]:
from sklearn.metrics import log_loss

train_loss = log_loss(y_train, y_train_pred)
cv_loss = log_loss(y_cv, y_cv_pred)

print(f"Train Loss: {train_loss:.4f}")
print(f"Cross validation Loss: {cv_loss:.4f}")


Train Loss: 0.0000
Cross validation Loss: 0.0032


In [45]:
from sklearn.metrics import accuracy_score

# predict labels
y_label_pred = lgb_model.predict(X_test)

# compute accuracy
accuracy = accuracy_score(y_test, y_label_pred)
print("Accuracy:", accuracy * 100)


Accuracy: 99.84520123839009


In [46]:
from sklearn.metrics import classification_report

def print_classification_reports(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict_proba(X_train)[:, 1]
    y_test_pred = model.predict_proba(X_test)[:, 1]
    y_train_pred_class = (y_train_pred >= 0.5).astype(int)
    y_test_pred_class = (y_test_pred >= 0.5).astype(int)

    print("Training classification report:")
    print(classification_report(y_train, y_train_pred_class))
    print("\nTesting classification report:")
    print(classification_report(y_test, y_test_pred_class))

print_classification_reports(lgb_model, X_train, y_train, X_test, y_test)

Training classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1379
           1       1.00      1.00      1.00      2498

    accuracy                           1.00      3877
   macro avg       1.00      1.00      1.00      3877
weighted avg       1.00      1.00      1.00      3877


Testing classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1081
           1       0.99      1.00      1.00       211

    accuracy                           1.00      1292
   macro avg       1.00      1.00      1.00      1292
weighted avg       1.00      1.00      1.00      1292



In [8]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)