In [1]:
# Execute the shared "00-common" file with IPython's %run magic before anything else.
# NOTE(review): its contents are not visible from this notebook — confirm what it sets up.
%run "00-common"

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (f1_score, confusion_matrix)
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier, NeighborhoodComponentsAnalysis)
from sklearn.pipeline import (make_pipeline, Pipeline)
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the feature table from disk and preview its first rows.
csv_path = '../feature_selection.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,koi_period,koi_period_err1,koi_period_err2,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_duration_err1,koi_duration_err2,koi_depth,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,dec,koi_disposition
0,101.110701,0.000953,-0.000953,0.00806,-0.00806,0.046,0.21,-0.21,878.1,1.61,0.1,-0.08,229.0,0.65,0.14,-0.11,24.1,4133.0,74.0,-82.0,0.023,-0.033,0.561,0.033,41.452209,1
1,4.800654,4e-06,-4e-06,0.000745,-0.000745,0.785,0.055,-0.055,15304.0,16.91,6.01,-2.0,1347.0,778.45,793.72,-255.78,298.0,6715.0,161.0,-241.0,0.056,-0.224,1.241,0.441,38.999008,0
2,39.593105,0.000615,-0.000615,0.014,-0.014,0.0044,0.598,-0.598,156.6,1.22,0.51,-0.11,546.0,21.06,26.65,-6.05,5.7,6046.0,172.0,-218.0,0.044,-0.298,0.972,0.411,41.659611,0
3,31.158825,5.7e-05,-5.7e-05,0.00138,-0.00138,0.029,0.0504,-0.0504,959.0,3.56,0.57,-0.46,639.0,39.46,17.49,-11.96,73.6,5951.0,107.0,-119.0,0.12,-0.12,1.155,0.184,38.710232,1
4,613.82905,0.0326,-0.0326,0.0319,-0.0319,0.2616,1.4,-1.4,148.2,2.85,1.25,-1.41,307.0,2.1,2.78,-1.62,10.2,5636.0,173.0,-148.0,0.459,-0.224,2.357,1.027,43.824032,0


In [4]:
# Exclusive upper bound for the neighbour counts tried by the search cell
# (it iterates over range(1, n)).
n = 50
# Seed shared by the train/test split and the stochastic reducers.
random_state = 1

# Feature matrix: everything except the target. "Unnamed: 0" is dropped as well
# because it is not a measurement — in the preview it simply counts 0, 1, 2, ...,
# which looks like a row index that was written into the CSV. Keeping it would
# feed an arbitrary row number to the scaler, the reducers and the distance
# computation. errors="ignore" keeps this cell working if the CSV is later
# re-exported without that column.
X = df.drop(columns=["koi_disposition", "Unnamed: 0"], errors="ignore")
# Target labels (0/1 in the previewed rows).
y = df["koi_disposition"]

In [5]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Compare plain and distance-weighted KNN on top of three supervised /
# unsupervised embeddings (PCA, LDA, NCA) for every k in range(1, n), and keep
# the per-k F1 scores in `res` for plotting.
#
# NOTE(review): k and the embedding are selected using F1 on (X_test, y_test),
# so `best` is an optimistic estimate — consider selecting on a validation
# split or with cross-validation on the training data instead.
best = 0      # highest F1 seen so far
bestn = 1     # k that produced `best`
bestm = ""    # method name that produced `best` (prefixed "Weighted " if applicable)
res = {}      # method key -> list of F1 scores, one entry per k (in increasing k)

# None of the reducers depends on k, so build and fit them ONCE instead of on
# every iteration of the k loop (NCA in particular is expensive to fit). With
# the fixed random_state the fitted transforms are the same as refitting each time.
pca = make_pipeline(StandardScaler(),
                    PCA(random_state=random_state))
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis())
nca = make_pipeline(StandardScaler(),
                    NeighborhoodComponentsAnalysis(random_state=random_state))
# Make a list of the methods to be compared
dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]

# Cache the embedded train/test sets so each one is computed a single time.
embedded = {}
for name, model in dim_reduction_methods:
    model.fit(X_train, y_train)
    embedded[name] = (model.transform(X_train), model.transform(X_test))
    res[name] = []
    res['W' + name] = []

# k stays the outer loop and the method the inner loop so that ties on the
# best score are resolved exactly as before (first hit in (k, method) order).
for n_neighbors in range(1, n):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    wknn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')

    for name, _ in dim_reduction_methods:
        Z_train, Z_test = embedded[name]

        knn.fit(Z_train, y_train)
        wknn.fit(Z_train, y_train)
        f1_knn = f1_score(y_test, knn.predict(Z_test))
        f1_wknn = f1_score(y_test, wknn.predict(Z_test))

        score = max(f1_knn, f1_wknn)
        if score > best:
            best = score
            bestn = n_neighbors
            bestm = name
            if f1_wknn > f1_knn:
                bestm = "Weighted " + bestm

        res[name].append(f1_knn)
        res['W' + name].append(f1_wknn)
print("The best value was found with ", bestm, " and parameter k = ", bestn, " with a f1 score of ", best)

In [None]:
# Plot F1 against k for every method stored in `res` and save the figure.
xaxis = range(1, n)

# (key in `res`, matplotlib format string, legend label) — one colour per
# embedding, solid for uniform KNN and dashed for distance-weighted KNN.
curves = [
    ('PCA', 'r-', 'PCA'),
    ('LDA', 'g-', 'LDA'),
    ('NCA', 'b-', 'NCA'),
    ('WPCA', 'r--', 'Weighted PCA'),
    ('WLDA', 'g--', 'Weighted LDA'),
    ('WNCA', 'b--', 'Weighted NCA'),
]
for key, fmt, label in curves:
    plt.plot(xaxis, res[key], fmt, label=label)

plt.xlabel("K value")
plt.ylabel("F1 Score")
plt.grid(True)
plt.legend()
plt.savefig("../figures/knn.pdf")

In [None]:
# Standardize the features and classify with a 1-nearest-neighbour model
# (no dimensionality-reduction step here), then inspect the test predictions.
scaler_step = ('Scaler', StandardScaler())
knn_step = ('KNN', KNeighborsClassifier(n_neighbors=1))
knn = Pipeline(steps=[scaler_step, knn_step])
knn.fit(X_train, y_train)

pred = knn.predict(X_test)
print(pred)
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_test, pred)