In [1]:
import time
import random
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Parse the CSV once and keep that frame untouched; all later work happens on
# an independent deep copy bound to `df`.
raw_csv_data = pd.read_csv(filepath_or_buffer='koi_data.csv')
df = raw_csv_data.copy(deep=True)

In [4]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,kepoi_name,koi_disposition,koi_period,koi_impact,koi_duration,koi_depth,koi_ror,koi_srho,koi_prad,koi_sma,...,koi_fwm_srao,koi_fwm_sdeco,koi_fwm_prao,koi_fwm_pdeco,koi_dicco_mra,koi_dicco_mdec,koi_dicco_msky,koi_dikco_mra,koi_dikco_mdec,koi_dikco_msky
0,K00752.01,CONFIRMED,9.48804,0.146,2.9575,615.8,0.02234,3.20796,2.26,0.0853,...,0.43,0.94,-0.0002,-0.00055,-0.01,0.2,0.2,0.08,0.31,0.32
1,K00752.02,CONFIRMED,54.41838,0.586,4.507,874.8,0.02795,3.02368,2.83,0.2734,...,-0.63,1.23,0.00066,-0.00105,0.39,0.0,0.39,0.49,0.12,0.5
2,K00754.01,FALSE POSITIVE,1.73695,1.276,2.40641,8079.2,0.38739,0.2208,33.46,0.0267,...,-0.111,0.002,0.00302,-0.00142,-0.249,0.147,0.289,-0.257,0.099,0.276
3,K00755.01,CONFIRMED,2.52559,0.701,1.6545,603.3,0.02406,1.98635,2.75,0.0374,...,-0.01,0.23,8e-05,-7e-05,0.03,-0.09,0.1,0.07,0.02,0.07
4,K00114.01,FALSE POSITIVE,7.36179,1.169,5.022,233.7,0.18339,0.00485,39.21,0.082,...,-13.45,24.09,0.00303,-0.00555,-4.506,7.71,8.93,-4.537,7.713,8.948


In [5]:
# Rows per disposition class: pick the 'kepoi_name' column on the grouped
# frame first, then count its non-null entries within each group.
display(df.groupby('koi_disposition')[['kepoi_name']].count())

Unnamed: 0_level_0,kepoi_name
koi_disposition,Unnamed: 1_level_1
CONFIRMED,2104
FALSE POSITIVE,3098


In [6]:
# Build the indexed frame from the untouched raw data instead of mutating `df`
# in place. With `inplace=True`, running this cell a second time raised
# KeyError because 'kepoi_name' had already been moved out of the columns;
# this form yields the same `df` no matter how often the cell is executed.
df = raw_csv_data.set_index('kepoi_name')
df.shape

(5202, 42)

In [7]:
# Separate the target column from the features.
# The original dropped the column with `inplace=True`, so a second run of this
# cell failed with AttributeError on `df.koi_disposition`. The guard makes the
# cell safe to re-run: the labels are extracted only while the column is still
# present, and `df` is rebound to a label-free frame rather than mutated.
if 'koi_disposition' in df.columns:
    y = df['koi_disposition'].values
    df = df.drop(columns='koi_disposition')

# After the split, every remaining column of `df` is used as a feature.
X = df.values
print(X.shape, y.shape)

(5202, 41) (5202,)


In [8]:
def run_classifier(clf, X_train, X_test, y_train, y_test):
    """Train ``clf`` and score it on held-out data.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted by this call.
        X_train: Training features passed to ``clf.fit``.
        X_test: Features passed to ``clf.predict``.
        y_train: Training labels passed to ``clf.fit``.
        y_test: True labels compared against the predictions.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return accuracy_score(y_test, predictions)

In [11]:
# Stratified 5-fold cross-validation comparing Gaussian Naive Bayes with a
# decision tree; the splitter is shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies_gnb, accuracies_dt = [], []

for train_index, test_index in cv.split(X, y):
    # Slice out this fold's training and test partitions.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Fresh, unfitted estimators for every fold.
    gnb = GaussianNB()
    dt = DecisionTreeClassifier(random_state=42)

    accuracies_gnb.append(run_classifier(gnb, X_train, X_test, y_train, y_test))
    accuracies_dt.append(run_classifier(dt, X_train, X_test, y_train, y_test))

# Report the mean accuracy over the folds for each model.
mean_gnb = sum(accuracies_gnb) / len(accuracies_gnb)
mean_dt = sum(accuracies_dt) / len(accuracies_dt)
print(f'Acurácia Média GNB: {mean_gnb:.5f}')
print(f'Acurácia Média DT: {mean_dt:.5f}')

Acurácia Média GNB: 0.79815
Acurácia Média DT: 0.94483
