In [1]:
%run "00-common"

import sklearn as sk
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

import sklearn.model_selection as cv
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis

df = pd.read_csv('../feature_selection.csv')
df.head()

Unnamed: 0,koi_period,koi_period_err1,koi_period_err2,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_duration_err1,koi_duration_err2,koi_depth,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,dec,koi_disposition
0,101.110701,0.000953,-0.000953,0.00806,-0.00806,0.046,0.21,-0.21,878.1,1.61,0.1,-0.08,229.0,0.65,0.14,-0.11,24.1,4133.0,74.0,-82.0,0.023,-0.033,0.561,0.033,41.452209,1
1,4.800654,4e-06,-4e-06,0.000745,-0.000745,0.785,0.055,-0.055,15304.0,16.91,6.01,-2.0,1347.0,778.45,793.72,-255.78,298.0,6715.0,161.0,-241.0,0.056,-0.224,1.241,0.441,38.999008,0
2,39.593105,0.000615,-0.000615,0.014,-0.014,0.0044,0.598,-0.598,156.6,1.22,0.51,-0.11,546.0,21.06,26.65,-6.05,5.7,6046.0,172.0,-218.0,0.044,-0.298,0.972,0.411,41.659611,0
3,31.158825,5.7e-05,-5.7e-05,0.00138,-0.00138,0.029,0.0504,-0.0504,959.0,3.56,0.57,-0.46,639.0,39.46,17.49,-11.96,73.6,5951.0,107.0,-119.0,0.12,-0.12,1.155,0.184,38.710232,1
4,613.82905,0.0326,-0.0326,0.0319,-0.0319,0.2616,1.4,-1.4,148.2,2.85,1.25,-1.41,307.0,2.1,2.78,-1.62,10.2,5636.0,173.0,-148.0,0.459,-0.224,2.357,1.027,43.824032,0


In [2]:
y = df['koi_disposition']
X = df.drop('koi_disposition', axis=1)
(X_train, X_test, y_train, y_test) = cv.train_test_split(X, y, test_size=.3, stratify=y, random_state=1)

In [3]:
cv = 50

clf1 = Pipeline(steps=[
    ('Scaler', preprocessing.PowerTransformer() ),
    ('NCA', NeighborhoodComponentsAnalysis(random_state=1)),
    ('NB', GaussianNB())])
clf2 = Pipeline(steps=[
    ('Scaler', preprocessing.StandardScaler()),
    ('KNN', KNeighborsClassifier(n_neighbors=7, weights='distance')),
     ])
clf3 = DecisionTreeClassifier(criterion='entropy')

for clf, label in zip([clf1, clf2, clf3], ['Naive Bayes', 'Knn (3)', 'Dec. Tree', ]):
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1)
    
    print("F1 score: %0.3f [%s]" % (scores.mean(), label))

F1 score: 0.832 [Naive Bayes]
F1 score: 0.787 [Knn (3)]
F1 score: 0.807 [Dec. Tree]


In [4]:
eclf = VotingClassifier(estimators=[('nb', clf1), ('knn3', clf2), ('dt', clf3)], voting='hard')
scores = cross_val_score(eclf, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1)

print("F1 score: %0.3f [%s]" % (scores.mean() , "Majority Voting"))

F1 score: 0.850 [Majority Voting]


In [5]:
eclf.fit(X_train, y_train)
pred = eclf.predict(X_test)

print("\nConfusion matrix on test set:\n", sk.metrics.confusion_matrix(y_test, pred))
print("\nAccuracy on test set: ", sk.metrics.accuracy_score(y_test, pred))
print("\nF1 score on test set: ", sk.metrics.f1_score(y_test, pred))


Confusion matrix on test set:
 [[377  31]
 [ 23 169]]

Accuracy on test set:  0.91

F1 score on test set:  0.8622448979591837


In [6]:
eclf2 = VotingClassifier(estimators=[('nb', clf1), ('knn3', clf2), ('dt', clf3)], voting='soft', weights=[2,1,2])
scores = cross_val_score(eclf, X, y, cv=cv, scoring='f1', n_jobs=-1)

print("F1 score: %0.3f [%s]" % (scores.mean(), "Weighted Voting"))

F1 score: 0.868 [Weighted Voting]


In [7]:
eclf2.fit(X_train, y_train)
pred = eclf2.predict(X_test)

print("\nConfusion matrix on test set:\n", sk.metrics.confusion_matrix(y_test, pred))
print("\nAccuracy on test set: ", sk.metrics.accuracy_score(y_test, pred))
print("\nF1 score on test set: ", sk.metrics.f1_score(y_test, pred))


Confusion matrix on test set:
 [[371  37]
 [ 23 169]]

Accuracy on test set:  0.9

F1 score on test set:  0.8492462311557789


In [9]:
import pickle

with open('pickles/voting.pkl', 'wb') as out:
    pickle.dump(eclf, out)

with open('pickles/voting-weighted.pkl', 'wb') as out:
    pickle.dump(eclf2, out)