In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import joblib
import pickle

from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)


In [38]:
data=pd.read_csv('data/df_KOI_capped.csv')
data.head()

Unnamed: 0,koi_period,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_steff,koi_slogg,koi_srad,koi_kepmag,koi_disposition
0,9.488036,2.9575,615.8,2.26,793.0,93.59,35.8,0,0,0,0,5455.0,4.467,0.927,15.347,CONFIRMED
1,54.418383,4.507,874.8,2.83,443.0,9.11,25.8,0,0,0,0,5455.0,4.467,0.927,15.347,CONFIRMED
2,19.89914,1.7822,3104.2375,14.6,638.0,39.3,76.3,0,0,0,0,5853.0,4.544,0.868,15.436,CANDIDATE
3,1.736952,2.40641,3104.2375,30.63625,1395.0,891.96,159.3625,0,1,0,0,5805.0,4.564,0.791,15.597,FALSE POSITIVE
4,2.525592,1.6545,603.3,2.75,1406.0,926.16,40.9,0,0,0,0,6031.0,4.438,1.046,15.509,CONFIRMED


In [39]:
data['koi_disposition'] = data['koi_disposition'].map({
    'FALSE POSITIVE': 0,
    'CANDIDATE': 1,
    'CONFIRMED': 2
})


In [40]:
X = data.drop('koi_disposition', axis=1)
y=data['koi_disposition']



In [41]:
data.describe()

Unnamed: 0,koi_period,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_steff,koi_slogg,koi_srad,koi_kepmag,koi_disposition
count,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0,9564.0
mean,28.20943,4.841954,983.51153,8.798627,1026.311637,541.574355,51.898302,0.208595,0.232748,0.197512,0.120033,5700.771644,4.358883,1.129717,14.276929,0.781159
std,35.53913,3.252243,1135.99106,11.429235,627.129609,733.548542,55.983546,4.76729,0.422605,0.398142,0.325018,647.930473,0.262971,0.434656,1.336585,0.863287
min,0.241843,0.052,0.0,0.08,25.0,0.0,0.0,0.0,0.0,0.0,0.0,4184.0,3.773375,0.119875,10.617,0.0
25%,2.733684,2.43775,166.8,1.43,553.0,22.16,12.3,0.0,0.0,0.0,0.0,5333.0,4.23275,0.83575,13.44,0.0
50%,9.752831,3.7926,421.1,2.39,878.0,141.6,23.0,0.0,0.0,0.0,0.0,5767.0,4.438,1.0,14.52,0.0
75%,40.715178,6.2765,1341.775,13.1125,1352.5,806.7975,71.125,0.0,0.0,0.0,0.0,6099.0,4.539,1.313,15.322,2.0
max,97.687418,12.034625,3104.2375,30.63625,2551.75,1983.75375,159.3625,465.0,1.0,1.0,1.0,7248.0,4.998375,2.028875,18.145,2.0


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

# 3. Transform test data (DON'T fit again!)
X_test_scaled = scaler.transform(X_test)




pickle.dump(scaler, open("scaler.pkl", "wb"))

In [44]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [45]:
# === Define Models & Hyperparameters ===
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

params = {
    "Random Forest": {
        'n_estimators': [100, 200],
        'max_depth': [None, 20],
        'max_features': ['sqrt', 'log2']
    },

    "Decision Tree": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },

    "Gradient Boosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 5]
    },

    "AdaBoost": {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 1.0]
    },

    "Logistic Regression": {
        'C': [0.1, 1.0, 10.0],
        'solver': ['lbfgs', 'liblinear']
    }
}


# === Train and Evaluate ===
best_model = None
best_score = 0
model_scores = {}

for name, model in models.items():
    print(f"\n🔧 Training {name}...")
    grid = GridSearchCV(model, params[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"✅ {name} Results:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"Best Params: {grid.best_params_}")
    
    # Assuming y_true and y_pred are available
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    

    model_scores[name] = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'best_params': grid.best_params_
    }

    if acc > best_score:
        best_score = acc
        best_model = grid.best_estimator_

# === Save Best Model ===
print(f"\n🏆 Best Model: {type(best_model).__name__} with Accuracy: {best_score:.4f}")
pickle.dump(best_model, open("best_model.pkl", "wb"))


🔧 Training Random Forest...
✅ Random Forest Results:
Accuracy:  0.9111
Precision: 0.8857
Recall:    0.8832
F1-Score:  0.8842
Best Params: {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 100}
[[921  15   3]
 [ 10 315  80]
 [  5  57 507]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       939
           1       0.81      0.78      0.80       405
           2       0.86      0.89      0.87       569

    accuracy                           0.91      1913
   macro avg       0.89      0.88      0.88      1913
weighted avg       0.91      0.91      0.91      1913


🔧 Training Decision Tree...
✅ Decision Tree Results:
Accuracy:  0.8913
Precision: 0.8614
Recall:    0.8536
F1-Score:  0.8560
Best Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5}
[[920  13   6]
 [ 10 283 112]
 [  5  62 502]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       939
           1  

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

✅ Logistic Regression Results:
Accuracy:  0.8055
Precision: 0.7720
Recall:    0.7381
F1-Score:  0.7373
Best Params: {'C': 1.0, 'solver': 'liblinear'}
[[884   9  46]
 [ 41 166 198]
 [ 20  58 491]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       939
           1       0.71      0.41      0.52       405
           2       0.67      0.86      0.75       569

    accuracy                           0.81      1913
   macro avg       0.77      0.74      0.74      1913
weighted avg       0.81      0.81      0.79      1913


🔧 Training AdaBoost...
✅ AdaBoost Results:
Accuracy:  0.8840
Precision: 0.8538
Recall:    0.8425
F1-Score:  0.8451
Best Params: {'learning_rate': 1.0, 'n_estimators': 100}
[[918  11  10]
 [  2 269 134]
 [  5  60 504]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       939
           1       0.79      0.66      0.72       405
           2       0.78      0.89      0.

In [46]:
y_pred

array([2, 1, 2, ..., 2, 1, 0], shape=(1913,))