# Model selection on standard deviation columns

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/train_data.csv')

X = df.drop(['class4', 'class2'], axis=1)
X = X.loc[:, X.columns[range(1, X.shape[1] - 1, 2)]]

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)

y_class2 = df['class2']
y_class4 = df['class4']

## Classifiers

In [3]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('kNeighbour', KNeighborsClassifier(3)),
    ('svcLinear', SVC(kernel="linear", C=0.025, probability=True)),
    ('svc', SVC(gamma=2, C=1, probability=True)),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('decissionTree', DecisionTreeClassifier(max_depth=5)),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000)),
    ('ada', AdaBoostClassifier()),
    ('gaussianNB', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis())]

## Dimensionality reduction techinques

In [4]:
# Variance boundry for VarianceThreshold
# More info: https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold
p = 0.7
variance = p * (1 - p)

In [5]:
dimension_reductions_y2 = [
    ('iso', Isomap(n_components=20)),
    ('lle', LocallyLinearEmbedding(n_components=20)), 
    ('llemodified', LocallyLinearEmbedding(n_components=20, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=20)),
    ('lda', LinearDiscriminantAnalysis(n_components=1)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=20)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=20)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=20))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=20))]

dimension_reductions_y4 = [
    ('iso', Isomap(n_components=20)),
    ('lle', LocallyLinearEmbedding(n_components=20)), 
    ('llemodified', LocallyLinearEmbedding(n_components=20, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=20)),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=20)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=20)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=20))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=30))]

## Computations

### Functions

In [6]:
def k_fold_cross_validation(ml_pipeline, X, y, n=5, k=10, score='accuracy'):
    """Perform N repeated K-fold cross-validation

    Keyword arguments:
    ml_pipeline -- Intance of scikit-learn's Pipeline
    X -- Data to perform cross-validation
    y -- Labels of the data
    n -- Amount of times cross-validation is repeated (default is 5)
    k -- Amount of folds that the data is splitted to perform 
         cross-validation (default is 10)
    score -- Scoring type as a string for scikit-learn's 
             cross_val_score method (default is accuracy)
    
    Return:
    Two element numpy array where first value is mean of cross-validation scores
    and second is standard deviation of cross-validation scores.
    """
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

### Process

In [7]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

#### Binary

In [8]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data

In [9]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.794651,0.040920,0.799535,0.039161
logistic_iso,0.756977,0.037661,0.798140,0.032704
logistic_lle,0.742093,0.066175,0.795581,0.039882
logistic_llemodified,0.811163,0.037826,0.804651,0.036401
logistic_svd,0.797907,0.041529,0.800698,0.038355
...,...,...,...,...
qda_kbest,0.786279,0.036393,0.786279,0.036393
qda_kbestmutual,0.780465,0.036671,0.781163,0.036748
qda_select,0.728837,0.052542,0.786977,0.049956
qda_selecttree,0.793488,0.043604,0.800930,0.043423


In [16]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,154.0,154.0,154.0,154.0
mean,0.739355,0.039629,0.759388,0.040741
std,0.09404,0.012922,0.072847,0.007756
min,0.5,0.0,0.508372,0.013164
25%,0.739128,0.038052,0.757674,0.037708
50%,0.776977,0.041728,0.785116,0.041372
75%,0.797384,0.045572,0.80093,0.043855
max,0.824419,0.076599,0.826977,0.089011


#### multi-class

In [11]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data

In [12]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.582093,0.050653,0.604419,0.038389
logistic_iso,0.538372,0.045587,0.570465,0.048135
logistic_lle,0.499302,0.003610,0.500465,0.002279
logistic_llemodified,0.502093,0.005547,0.505581,0.007442
logistic_svd,0.584419,0.046209,0.596744,0.038596
...,...,...,...,...
qda_kbest,0.565116,0.044552,0.565116,0.044552
qda_kbestmutual,0.565814,0.042857,0.566047,0.042846
qda_select,0.483023,0.045762,0.579070,0.053539
qda_selecttree,0.585581,0.036010,0.571860,0.043613


In [20]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,154.0,154.0,154.0,154.0
mean,0.546684,0.040622,0.551942,0.037932
std,0.052517,0.021281,0.066422,0.016512
min,0.294419,0.0,0.107907,0.0
25%,0.506686,0.03835,0.527442,0.03355
50%,0.555349,0.042963,0.565581,0.040474
75%,0.584186,0.048629,0.595349,0.046277
max,0.626977,0.163924,0.623023,0.10694


### Save results

In [21]:
statistics_transpose_y2.to_csv('../data/experiment_first_round/CV_binary_std_data.csv', index_label="model_name")
statistics_transpose_y4.to_csv('../data/experiment_first_round/CV_multinomial_std_data.csv', index_label="model_name")