In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import joblib
import pickle

from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)


In [8]:
data=pd.read_csv('data/df_KOI_full_outliers.csv')
data.head()

Unnamed: 0,koi_period,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_steff,koi_slogg,koi_srad,koi_kepmag,koi_disposition,koi_prad_log,koi_depth_log,koi_teq_log,koi_insol_log,koi_model_snr_log
0,9.488036,2.9575,615.8,2.26,793.0,93.59,35.8,0,0,0,0,5455.0,4.467,0.927,15.347,CONFIRMED,1.181727,6.424545,6.677083,4.549552,3.605498
1,54.418383,4.507,874.8,2.83,443.0,9.11,25.8,0,0,0,0,5455.0,4.467,0.927,15.347,CONFIRMED,1.342865,6.775138,6.095825,2.313525,3.288402
2,19.89914,1.7822,10829.0,14.6,638.0,39.3,76.3,0,0,0,0,5853.0,4.544,0.868,15.436,CANDIDATE,2.747271,9.290075,6.459904,3.696351,4.347694
3,1.736952,2.40641,8079.2,33.46,1395.0,891.96,505.6,0,1,0,0,5805.0,4.564,0.791,15.597,FALSE POSITIVE,3.539799,8.997172,7.241366,6.794542,6.227722
4,2.525592,1.6545,603.3,2.75,1406.0,926.16,40.9,0,0,0,0,6031.0,4.438,1.046,15.509,CONFIRMED,1.321756,6.404071,7.249215,6.832126,3.735286


In [9]:
data.columns

Index(['koi_period', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq',
       'koi_insol', 'koi_model_snr', 'koi_fpflag_nt', 'koi_fpflag_ss',
       'koi_fpflag_co', 'koi_fpflag_ec', 'koi_steff', 'koi_slogg', 'koi_srad',
       'koi_kepmag', 'koi_disposition', 'koi_prad_log', 'koi_depth_log',
       'koi_teq_log', 'koi_insol_log', 'koi_model_snr_log'],
      dtype='object')

In [10]:
data['koi_disposition'] = data['koi_disposition'].map({
    'FALSE POSITIVE': 0,
    'CANDIDATE': 1,
    'CONFIRMED': 2
})


In [11]:
# Define target
target_col = "koi_disposition"

# Original cols to drop (since log versions exist)
orig_cols = ["koi_prad", "koi_depth", "koi_teq", 
             "koi_insol", "koi_model_snr"]

# Build feature matrix and target
X = data.drop(columns=[target_col] + orig_cols)
y = data[target_col]

X

Unnamed: 0,koi_period,koi_duration,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_steff,koi_slogg,koi_srad,koi_kepmag,koi_prad_log,koi_depth_log,koi_teq_log,koi_insol_log,koi_model_snr_log
0,9.488036,2.95750,0,0,0,0,5455.0,4.467,0.927,15.347,1.181727,6.424545,6.677083,4.549552,3.605498
1,54.418383,4.50700,0,0,0,0,5455.0,4.467,0.927,15.347,1.342865,6.775138,6.095825,2.313525,3.288402
2,19.899140,1.78220,0,0,0,0,5853.0,4.544,0.868,15.436,2.747271,9.290075,6.459904,3.696351,4.347694
3,1.736952,2.40641,0,1,0,0,5805.0,4.564,0.791,15.597,3.539799,8.997172,7.241366,6.794542,6.227722
4,2.525592,1.65450,0,0,0,0,6031.0,4.438,1.046,15.509,1.321756,6.404071,7.249215,6.832126,3.735286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9558,0.527699,3.22210,0,1,1,0,5638.0,4.529,0.903,14.082,3.412797,7.365307,7.644441,8.412173,6.118758
9559,1.739849,3.11400,0,0,0,0,6119.0,4.444,1.031,14.757,0.542324,3.901973,7.383368,7.369481,2.451005
9560,0.681402,0.86500,0,0,1,0,6173.0,4.447,1.041,15.385,0.727549,4.650144,7.704812,8.650746,2.587764
9561,333.486169,3.19900,0,0,0,0,4989.0,2.992,7.824,10.998,3.010621,6.461624,6.324359,3.164631,2.708050


In [12]:
data.describe()

Unnamed: 0,koi_period,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_steff,koi_slogg,koi_srad,koi_kepmag,koi_disposition,koi_prad_log,koi_depth_log,koi_teq_log,koi_insol_log,koi_model_snr_log
count,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0,9563.0
mean,75.677541,5.621791,22906.64,99.087369,1077.563317,7491.296,250.926153,0.159992,0.232772,0.197532,0.120046,5709.158946,4.314988,1.701141,14.264589,0.781031,1.822796,6.625411,6.753558,4.959849,3.705344
std,1334.813701,6.471867,80794.1,3018.881075,840.905757,156524.0,781.906019,0.366617,0.42262,0.398158,0.325032,781.696883,0.425036,6.011691,1.385444,0.863242,1.369629,2.316183,0.680769,2.559853,1.579078
min,0.241843,0.052,0.0,0.08,25.0,0.0,0.0,0.0,0.0,0.0,0.0,2661.0,0.047,0.109,6.966,0.0,0.076961,0.0,3.258097,0.0,0.0
25%,2.733379,2.4375,166.8,1.43,553.0,22.15,12.3,0.0,0.0,0.0,0.0,5333.5,4.2325,0.8355,13.44,0.0,0.887891,5.122773,6.317165,3.141994,2.587764
50%,9.751921,3.7922,421.1,2.39,878.0,141.6,23.0,0.0,0.0,0.0,0.0,5767.0,4.438,1.0,14.52,0.0,1.22083,6.045242,6.778785,4.960044,3.178054
75%,40.715305,6.277,1342.55,13.115,1353.0,806.955,71.15,0.0,0.0,0.0,0.0,6099.0,4.539,1.313,15.322,2.0,2.647238,7.20307,7.210818,6.694506,4.278747
max,129995.7784,138.54,1541400.0,200346.0,14667.0,10947550.0,9054.7,1.0,1.0,1.0,1.0,15896.0,5.364,229.908,20.003,2.0,12.207806,14.248202,9.593424,16.208627,9.11115


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

# 3. Transform test data (DON'T fit again!)
X_test_scaled = scaler.transform(X_test)




pickle.dump(scaler, open("scaler.pkl", "wb"))

In [15]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [16]:
from xgboost import XGBClassifier  # <-- Step 1: Import

# === Define Models & Hyperparameters ===
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)  # <-- Step 2: Add model
}

params = {
    "Random Forest": {
        'n_estimators': [100, 200],
        'max_depth': [None, 20],
        'max_features': ['sqrt', 'log2']
    },

    "Decision Tree": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },

    "Gradient Boosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 5]
    },

    "AdaBoost": {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 1.0]
    },

    "Logistic Regression": {
        'C': [0.1, 1.0, 10.0],
        'solver': ['lbfgs', 'liblinear']
    },

    "XGBoost": {  # <-- Step 3: Add hyperparameters
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 5, 7]
    }
}

# === Train and Evaluate ===
best_model = None
best_score = 0
model_scores = {}

for name, model in models.items():
    print(f"\n🔧 Training {name}...")
    grid = GridSearchCV(model, params[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"✅ {name} Results:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"Best Params: {grid.best_params_}")
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    model_scores[name] = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'best_params': grid.best_params_
    }

    if acc > best_score:
        best_score = acc
        best_model = grid.best_estimator_

# === Save Best Model ===
print(f"\n🏆 Best Model: {type(best_model).__name__} with Accuracy: {best_score:.4f}")
pickle.dump(best_model, open("best_model.pkl", "wb"))



🔧 Training Random Forest...


✅ Random Forest Results:
Accuracy:  0.9252
Precision: 0.9018
Recall:    0.8977
F1-Score:  0.8996
Best Params: {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 200}
[[947   9   3]
 [  9 307  66]
 [  6  50 516]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       959
           1       0.84      0.80      0.82       382
           2       0.88      0.90      0.89       572

    accuracy                           0.93      1913
   macro avg       0.90      0.90      0.90      1913
weighted avg       0.92      0.93      0.92      1913


🔧 Training Decision Tree...
✅ Decision Tree Results:
Accuracy:  0.9090
Precision: 0.8831
Recall:    0.8708
F1-Score:  0.8750
Best Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5}
[[944  11   4]
 [  8 274 100]
 [  6  45 521]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       959
           1       0.83      0.72      0.77

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

✅ Logistic Regression Results:
Accuracy:  0.8944
Precision: 0.8604
Recall:    0.8540
F1-Score:  0.8560
Best Params: {'C': 10.0, 'solver': 'liblinear'}
[[940  10   9]
 [  0 269 113]
 [  5  65 502]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       959
           1       0.78      0.70      0.74       382
           2       0.80      0.88      0.84       572

    accuracy                           0.89      1913
   macro avg       0.86      0.85      0.86      1913
weighted avg       0.90      0.89      0.89      1913


🔧 Training AdaBoost...
✅ AdaBoost Results:
Accuracy:  0.9049
Precision: 0.8732
Recall:    0.8711
F1-Score:  0.8716
Best Params: {'learning_rate': 1.0, 'n_estimators': 100}
[[938  15   6]
 [  0 286  96]
 [  5  60 507]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       959
           1       0.79      0.75      0.77       382
           2       0.83      0.89      0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


✅ XGBoost Results:
Accuracy:  0.9279
Precision: 0.9032
Recall:    0.9032
F1-Score:  0.9031
Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
[[945  11   3]
 [  4 314  64]
 [  4  52 516]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       959
           1       0.83      0.82      0.83       382
           2       0.89      0.90      0.89       572

    accuracy                           0.93      1913
   macro avg       0.90      0.90      0.90      1913
weighted avg       0.93      0.93      0.93      1913


🏆 Best Model: XGBClassifier with Accuracy: 0.9279


In [17]:
y_pred

array([1, 1, 2, ..., 2, 0, 0], shape=(1913,))