<a href="https://colab.research.google.com/github/NASA-Hackathon-Imaginarium-Team/AI-Team/blob/main/lightGBM_model_kepler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
from lightgbm import LGBMClassifier
import pandas as pd

In [47]:
# Reading the csv file
url = "https://raw.githubusercontent.com/NASA-Hackathon-Imaginarium-Team/Data-Team/refs/heads/main/data_without_candidates/Kepler%20Objects%20of%20Interest%20-%20Filtered.csv"
df = pd.read_csv(url)

# Making the target values 1 and 0
df["koi_disposition"] = df["koi_disposition"].map({
    "CONFIRMED": 1,
    "FALSE POSITIVE": 0
})

print("Number of rows before drop na:", len(df))
df = df.dropna()  # remove any row with missing values
print("Number of rows before after na:", len(df))
# Making an x y split
y_train = df["koi_disposition"]
X_train = df.drop(columns=["koi_disposition"])

Number of rows before drop na: 7585
Number of rows before after na: 6447


In [48]:
from sklearn.model_selection import train_test_split

# split into train and test only
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, shuffle = False
)

In [49]:
from lightgbm import LGBMClassifier

# create LightGBM model
lgb_model = LGBMClassifier(
    objective='binary',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=127,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# fit on training data
lgb_model.fit(X_train, y_train)

# predict probabilities on test set
y_pred = lgb_model.predict_proba(X_test)[:, 1]

[LightGBM] [Info] Number of positive: 2696, number of negative: 2461
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8926
[LightGBM] [Info] Number of data points in the train set: 5157, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.522785 -> initscore=0.091201
[LightGBM] [Info] Start training from score 0.091201


In [50]:
from sklearn.metrics import accuracy_score

# predict labels
y_label_pred = lgb_model.predict(X_test)

# compute accuracy
accuracy = accuracy_score(y_test, y_label_pred)
print("Accuracy:", accuracy * 100)

Accuracy: 100.0


In [51]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)