In [1]:
import pandas as pd
import numpy as np
import sklearn, sklearn.ensemble, sklearn.decomposition, sklearn.neighbors, sklearn.model_selection, sklearn.svm
from sklearn.preprocessing import StandardScaler
import baikal, baikal.sklearn

In [2]:
RandomForest = baikal.make_step(sklearn.ensemble.RandomForestClassifier)

In [3]:
KNN = baikal.make_step(sklearn.neighbors.KNeighborsClassifier)

In [4]:
SVM = baikal.make_step(sklearn.svm.SVC)

In [5]:
PCA = baikal.make_step(sklearn.decomposition.PCA)

In [6]:
Scaler = baikal.make_step(StandardScaler)

In [7]:
X = np.load('X.npy')

In [8]:
Y = np.load('Y.npy')

In [9]:
X.size

16032

In [10]:
SEED = 5321

In [11]:
def build():
    x = baikal.Input()
    y_t = baikal.Input()
    x_s = Scaler(name="scaler")(x)
    y_p = SVM(random_state=SEED, name="classifier")(x_s, y_t)
    model = baikal.Model(x, y_p, y_t)
    return model

def build_pca():
    x = baikal.Input()
    y_t = baikal.Input()
    x_s = Scaler(name="scaler")(x)
    h = PCA(random_state=SEED, name="pca")(x_s)
    y_p = SVM(random_state=SEED, name="classifier")(h, y_t)
    model = baikal.Model(x, y_p, y_t)
    return model

In [12]:
param_grid = [
    {
        "classifier": [
            SVM(random_state=SEED)
        ],
        "classifier__C": [0.01, 0.1, 1],
        "classifier__kernel": ["rbf", "poly", "sigmoid"],
    },
    {
        "classifier": [
            KNN()
        ],
        "classifier__n_neighbors": [3, 4, 5, 6, 7],
        "classifier__weights": ["distance", "uniform"],
    },
    {
        "classifier": [RandomForest(random_state=SEED)],
        "classifier__n_estimators": [10, 50, 100, 150, 200, 250],
    },
]

In [25]:
param_grid_pca = [
    {
        "classifier": [
            SVM(random_state=SEED)
        ],
        "classifier__C": [0.01, 0.1, 1],
        "classifier__kernel": ["rbf", "poly", "sigmoid"],
        "pca__n_components": [1, 2, 3, 4, 5],
    },
    {
        "classifier": [
            KNN()
        ],
        "classifier__n_neighbors": [3, 4, 5, 6, 7],
        "classifier__weights": ["distance", "uniform"],
        "pca__n_components": [1, 2, 3, 4, 5],
    },
    {
        "classifier": [RandomForest(random_state=SEED)],
        "classifier__n_estimators": [10, 50, 100, 150, 200, 250],
        "pca__n_components": [1, 2, 3, 4, 5],
    },
]

In [26]:
cv = sklearn.model_selection.StratifiedKFold(n_splits=3, random_state=SEED)



In [27]:
raw_model = baikal.sklearn.SKLearnWrapper(build)
pca_model = baikal.sklearn.SKLearnWrapper(build_pca)

In [16]:
raw_baikal = sklearn.model_selection.GridSearchCV(
    raw_model,
    param_grid,
    cv=cv,
    scoring="accuracy",
    return_train_score=True
)

In [17]:
raw_baikal.fit(X, Y)
print("Best score:", raw_baikal.best_score_)
print("Best parameters", raw_baikal.best_params_)

Best score: 0.9925149700598803
Best parameters {'classifier': RandomForestClassifier(name='classifier', n_outputs=1,
bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None, oob_score=False, random_state=5321, verbose=0, warm_start=False), 'classifier__n_estimators': 200}


In [28]:
pca_baikal = sklearn.model_selection.GridSearchCV(
    pca_model,
    param_grid_pca,
    cv=cv,
    scoring="accuracy",
    return_train_score=True
)

In [29]:
pca_baikal.fit(X, Y)
print("Best score:", pca_baikal.best_score_)
print("Best parameters", pca_baikal.best_params_)

Best score: 0.9805389221556887
Best parameters {'classifier': SVC(name='classifier', n_outputs=1,
C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid', max_iter=-1, probability=False, random_state=5321, shrinking=True, tol=0.001, verbose=False), 'classifier__C': 0.1, 'classifier__kernel': 'sigmoid', 'pca__n_components': 5}


In [30]:
pca_baikal.best_estimator_.model.predict(X[:1])

array([0])

In [31]:
pd.DataFrame(pca_baikal.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__kernel,param_pca__n_components,param_classifier__n_neighbors,param_classifier__weights,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.084395,0.002980,0.032323,0.001901,"SVC(name='classifier', n_outputs=1,\nC=0.1, br...",0.01,rbf,1,,,...,0.609281,0.666168,0.648703,0.027935,103,0.677395,0.699102,0.671407,0.682635,0.011898
1,0.067659,0.004856,0.030775,0.002199,"SVC(name='classifier', n_outputs=1,\nC=0.1, br...",0.01,rbf,2,,,...,0.970060,0.920659,0.958583,0.027504,75,0.954341,0.957335,0.983533,0.965070,0.013112
2,0.066872,0.005893,0.027699,0.001083,"SVC(name='classifier', n_outputs=1,\nC=0.1, br...",0.01,rbf,3,,,...,0.956587,0.929641,0.954591,0.019608,82,0.965569,0.961826,0.975299,0.967565,0.005679
3,0.074334,0.003756,0.028145,0.001763,"SVC(name='classifier', n_outputs=1,\nC=0.1, br...",0.01,rbf,4,,,...,0.983533,0.953593,0.972555,0.013464,16,0.979042,0.976796,0.991766,0.982535,0.006592
4,0.079304,0.004570,0.030030,0.001234,"SVC(name='classifier', n_outputs=1,\nC=0.1, br...",0.01,rbf,5,,,...,0.988024,0.952096,0.973054,0.015267,15,0.979790,0.982036,0.991766,0.984531,0.005198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,0.490487,0.023547,0.031768,0.000420,RandomForestClassifier(name='RandomForestClass...,,,1,,,...,0.649701,0.517964,0.596307,0.056601,118,1.000000,1.000000,1.000000,1.000000,0.000000
121,0.397853,0.005448,0.022130,0.000212,RandomForestClassifier(name='RandomForestClass...,,,2,,,...,0.979042,0.896707,0.952595,0.039538,86,1.000000,1.000000,1.000000,1.000000,0.000000
122,0.424987,0.006832,0.022218,0.000213,RandomForestClassifier(name='RandomForestClass...,,,3,,,...,0.979042,0.943114,0.970060,0.019403,40,1.000000,1.000000,1.000000,1.000000,0.000000
123,0.512545,0.105902,0.020851,0.000151,RandomForestClassifier(name='RandomForestClass...,,,4,,,...,0.979042,0.914671,0.957086,0.029998,77,1.000000,1.000000,1.000000,1.000000,0.000000


In [42]:
def classifier_latex(df, with_pca=False):
    pf = "param_classifier__"
    pc = "param_pca__"
    classifiers = []
    for row in pd.DataFrame(df).iterrows():
        r = row[1]
        classifier = str(row[1]['param_classifier'])
        if "SVC" in classifier:
            text = ["SVM", f"C: {r[pf + 'C']}", f"Kernel: {r[pf + 'kernel']}"]
        if "KNeighbors" in classifier:
            text = ["KNN", f"N: {r[pf + 'n_neighbors']}", f"Weights: {r[pf + 'weights']}"]
        if "Forest" in classifier:
            text = ["RF", f"N: {r[pf + 'n_estimators']}", ""]
        if with_pca:
            text.append(f"K: {r[pc + 'n_components']}")

        classifiers.append(text + [f"{r['mean_test_score']:.3f} $\pm$ {r['std_test_score']:.3f}"])
        
    columns = ["Classifier", "Parameters", "", "Score"]
    if with_pca:
        columns.insert(2, "PCA components")
    return pd.DataFrame(classifiers, columns=columns).to_latex(escape=False, index=False)

In [43]:
print(classifier_latex(raw_baikal.cv_results_))

\begin{tabular}{llll}
\toprule
Classifier & \multicolumn{2}{l}{Parameters} &              Score \\
\midrule
       SVM &    C: 0.01 &        Kernel: rbf &  0.975 $\pm$ 0.013 \\
       SVM &    C: 0.01 &       Kernel: poly &  0.879 $\pm$ 0.035 \\
       SVM &    C: 0.01 &    Kernel: sigmoid &  0.975 $\pm$ 0.011 \\
       SVM &     C: 0.1 &        Kernel: rbf &  0.989 $\pm$ 0.003 \\
       SVM &     C: 0.1 &       Kernel: poly &  0.975 $\pm$ 0.009 \\
       SVM &     C: 0.1 &    Kernel: sigmoid &  0.989 $\pm$ 0.009 \\
       SVM &       C: 1 &        Kernel: rbf &  0.985 $\pm$ 0.007 \\
       SVM &       C: 1 &       Kernel: poly &  0.985 $\pm$ 0.008 \\
       SVM &       C: 1 &    Kernel: sigmoid &  0.989 $\pm$ 0.004 \\
       KNN &       N: 3 &  Weights: distance &  0.983 $\pm$ 0.007 \\
       KNN &       N: 3 &   Weights: uniform &  0.984 $\pm$ 0.006 \\
       KNN &       N: 4 &  Weights: distance &  0.984 $\pm$ 0.006 \\
       KNN &       N: 4 &   Weights: uniform &  0.985 $\pm$ 0.00

In [44]:
print(classifier_latex(pca_baikal.cv_results_, True))

\begin{tabular}{lllll}
\toprule
Classifier & Parameters & \multicolumn{2}{l}{PCA components} &              Score \\
\midrule
       SVM &    C: 0.01 &        Kernel: rbf &  K: 1 &  0.649 $\pm$ 0.028 \\
       SVM &    C: 0.01 &        Kernel: rbf &  K: 2 &  0.959 $\pm$ 0.028 \\
       SVM &    C: 0.01 &        Kernel: rbf &  K: 3 &  0.955 $\pm$ 0.020 \\
       SVM &    C: 0.01 &        Kernel: rbf &  K: 4 &  0.973 $\pm$ 0.013 \\
       SVM &    C: 0.01 &        Kernel: rbf &  K: 5 &  0.973 $\pm$ 0.015 \\
       SVM &    C: 0.01 &       Kernel: poly &  K: 1 &  0.594 $\pm$ 0.042 \\
       SVM &    C: 0.01 &       Kernel: poly &  K: 2 &  0.833 $\pm$ 0.058 \\
       SVM &    C: 0.01 &       Kernel: poly &  K: 3 &  0.876 $\pm$ 0.040 \\
       SVM &    C: 0.01 &       Kernel: poly &  K: 4 &  0.881 $\pm$ 0.041 \\
       SVM &    C: 0.01 &       Kernel: poly &  K: 5 &  0.884 $\pm$ 0.041 \\
       SVM &    C: 0.01 &    Kernel: sigmoid &  K: 1 &  0.488 $\pm$ 0.015 \\
       SVM &    C: 0.01 &  

In [46]:
from sklearn.metrics import classification_report
print(classification_report(Y, raw_baikal.best_estimator_.predict(X)))
print(classification_report(Y, pca_baikal.best_estimator_.predict(X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       501
           1       1.00      1.00      1.00       501
           2       1.00      1.00      1.00       501
           3       1.00      1.00      1.00       501

    accuracy                           1.00      2004
   macro avg       1.00      1.00      1.00      2004
weighted avg       1.00      1.00      1.00      2004

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       501
           1       0.98      1.00      0.99       501
           2       0.98      1.00      0.99       501
           3       1.00      0.99      1.00       501

    accuracy                           0.99      2004
   macro avg       0.99      0.99      0.99      2004
weighted avg       0.99      0.99      0.99      2004

