In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('NASA_Exoplanet_transit.csv')

In [3]:
le = LabelEncoder()
df['koi_pdisposition_encoded'] = le.fit_transform(df['koi_pdisposition'])

In [7]:

Y = df['koi_pdisposition_encoded']

# drop labels
df.drop(columns=['kepid', 'kepoi_name', 'koi_disposition', 'koi_pdisposition', 'koi_pdisposition_encoded'])

features = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_model_snr', 'koi_teq']

X = df[features]

scaler = RobustScaler()
scaled_X = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(scaled_X, Y, random_state=100, test_size=.3, shuffle=True)

X_train = pd.DataFrame(X_train)
Y_train = pd.Series(Y_train)


In [4]:
def train_model_w_kfold(model):
    print(model)

    accuraies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    kf = KFold(n_splits=10)

    for train_index, test_index in kf.split(X_train):
        X_train_kfold, X_test_kfold = X_train.iloc[train_index], X_train.iloc[test_index]
        Y_train_kfold, Y_test_kfold = Y_train.iloc[train_index], Y_train.iloc[test_index]
        # X_train_kfold, X_test_kfold = X_train[train_index], X_train[test_index]
        # Y_train_kfold, Y_test_kfold = Y_train[train_index], Y_train[test_index]


        model.fit(X_train_kfold, Y_train_kfold)

        Y_prediction_kfold = model.predict(X_test_kfold)

        accuracy = accuracy_score(Y_test_kfold, Y_prediction_kfold)
        accuraies.append(accuracy)

        precision = precision_score(Y_test_kfold, Y_prediction_kfold)
        precisions.append(precision)

        recall = recall_score(Y_test_kfold, Y_prediction_kfold)
        recalls.append(recall)

        f1 = f1_score(Y_test_kfold, Y_prediction_kfold)
        f1_scores.append(f1)

        # print(f"Cross-Validation Accuracy: {accuracy}")

    print(f"Cross-Validation Average Accuracy: {(sum(accuraies) / len(accuraies)) * 100 :.2f}")
    print(f"Cross-Validation Average Accuracy: {(sum(precisions) / len(precisions)) * 100 :.2f}")
    print(f"Cross-Validation Average Accuracy: {(sum(recalls) / len(recalls)) * 100 :.2f}")
    print(f"Cross-Validation Average f1_score: {(sum(f1_scores) / len(f1_scores)) * 100 :.2f}")

    model.fit(X_train, Y_train)
    Y_prediction = model.predict(X_test)
    final_accuracy = accuracy_score(Y_test, Y_prediction)
    final_precision = precision_score(Y_test, Y_prediction)
    final_recall = recall_score(Y_test, Y_prediction)
    final_f1 = f1_score(Y_test, Y_prediction)

    print(f"Test Accuracy:  {final_accuracy * 100 :.2f}")
    print(f"Test precision: {final_precision * 100 :.2f}")
    print(f"Test recall:    {final_recall * 100 :.2f}")
    print(f"Test F1_score:  {final_f1 * 100 :.2f}")

In [5]:
Logistic_Regression_model = LogisticRegression(solver='newton-cg', max_iter=1000)
KNeighbors_model = KNeighborsClassifier(n_neighbors=5)
Decision_Tree_model = DecisionTreeClassifier()
BernoulliNB_model = BernoulliNB()
Random_Forest_model = RandomForestClassifier()

models = [Logistic_Regression_model, KNeighbors_model, Decision_Tree_model, BernoulliNB_model, Random_Forest_model]

In [8]:
for model in models:
    train_model_w_kfold(model)
    print()

LogisticRegression(max_iter=1000, solver='newton-cg')
Cross-Validation Average Accuracy: 91.19
Cross-Validation Average Accuracy: 93.26
Cross-Validation Average Accuracy: 89.02
Cross-Validation Average f1_score: 91.08
Test Accuracy:  91.64
Test precision: 94.50
Test recall:    88.84
Test F1_score:  91.58

KNeighborsClassifier()
Cross-Validation Average Accuracy: 96.39
Cross-Validation Average Accuracy: 96.55
Cross-Validation Average Accuracy: 96.27
Cross-Validation Average f1_score: 96.41
Test Accuracy:  96.97
Test precision: 97.92
Test recall:    96.12
Test F1_score:  97.01

DecisionTreeClassifier()
Cross-Validation Average Accuracy: 98.69
Cross-Validation Average Accuracy: 98.67
Cross-Validation Average Accuracy: 98.73
Cross-Validation Average f1_score: 98.69
Test Accuracy:  98.71
Test precision: 98.97
Test recall:    98.50
Test F1_score:  98.74

BernoulliNB()
Cross-Validation Average Accuracy: 97.70
Cross-Validation Average Accuracy: 97.18
Cross-Validation Average Accuracy: 98.29
Cr