# Model selection on mean columns

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/train_data.csv')

X = df.drop(['class4', 'class2'], axis=1)
X = X.loc[:, X.columns[range(2, X.shape[1], 2)]]

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)

y_class2 = df['class2']
y_class4 = df['class4']

## Classifiers

In [3]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('kNeighbour', KNeighborsClassifier(3)),
    ('svcLinear', SVC(kernel="linear", C=0.025, probability=True)),
    ('svc', SVC(gamma=2, C=1, probability=True)),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('decissionTree', DecisionTreeClassifier(max_depth=5)),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000)),
    ('ada', AdaBoostClassifier()),
    ('gaussianNB', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis())]

## Dimensionality reduction techinques

In [4]:
# Variance boundry for VarianceThreshold
# More info: https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold
p = 0.7
variance = p * (1 - p)

In [5]:
dimension_reductions_y2 = [
    ('iso', Isomap(n_components=30)),
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=10)),
    ('lda', LinearDiscriminantAnalysis(n_components=1)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=20))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=20))]

dimension_reductions_y4 = [
    ('iso', Isomap(n_components=10)),
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=5)),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=10))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10))]

## Computations

### Functions

In [6]:
def k_fold_cross_validation(ml_pipeline, X, y, n=5, k=10, score='accuracy'):
    """Perform N repeated K-fold cross-validation

    Keyword arguments:
    ml_pipeline -- Intance of scikit-learn's Pipeline
    X -- Data to perform cross-validation
    y -- Labels of the data
    n -- Amount of times cross-validation is repeated (default is 5)
    k -- Amount of folds that the data is splitted to perform 
         cross-validation (default is 10)
    score -- Scoring type as a string for scikit-learn's 
             cross_val_score method (default is accuracy)
    
    Return:
    Two element numpy array where first value is mean of cross-validation scores
    and second is standard deviation of cross-validation scores.
    """
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

### Process

In [7]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

#### Binary

In [8]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data

In [9]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.825581,0.035038,0.869535,0.031741
logistic_iso,0.769767,0.043817,0.853256,0.037129
logistic_lle,0.776512,0.044992,0.855116,0.032741
logistic_llemodified,0.838140,0.034871,0.862326,0.035678
logistic_svd,0.823023,0.029443,0.852558,0.035149
...,...,...,...,...
qda_kbest,0.778372,0.044272,0.778372,0.044272
qda_kbestmutual,0.776744,0.045155,0.776744,0.045155
qda_select,0.779302,0.050658,0.790698,0.046103
qda_selecttree,0.854419,0.031992,0.852791,0.031598


In [16]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,154.0,154.0,154.0,154.0
mean,0.765642,0.036723,0.819561,0.038127
std,0.109837,0.015545,0.070775,0.009319
min,0.5,0.0,0.516512,0.01919
25%,0.758953,0.033761,0.798023,0.032444
50%,0.801395,0.038148,0.840698,0.036199
75%,0.835581,0.043005,0.860988,0.041962
max,0.871163,0.103419,0.891163,0.083387


#### multi-class

In [11]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data

In [12]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.635581,0.043978,0.673488,0.039249
logistic_iso,0.559535,0.048484,0.651395,0.038035
logistic_lle,0.500000,0.000000,0.502093,0.005547
logistic_llemodified,0.500000,0.004028,0.501163,0.006262
logistic_svd,0.596279,0.042249,0.636279,0.045515
...,...,...,...,...
qda_kbest,0.574186,0.040541,0.574186,0.040541
qda_kbestmutual,0.570000,0.048027,0.570000,0.048027
qda_select,0.570465,0.049738,0.590233,0.051718
qda_selecttree,0.634884,0.046044,0.626977,0.043435


In [20]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,154.0,154.0,154.0,154.0
mean,0.552711,0.044736,0.5973,0.041696
std,0.107747,0.039614,0.069127,0.021864
min,0.060465,0.0,0.164419,0.0
25%,0.527674,0.034998,0.567965,0.036437
50%,0.572093,0.042734,0.613837,0.041241
75%,0.614186,0.049166,0.640814,0.047607
max,0.662791,0.257627,0.676047,0.15696


### Save results

In [21]:
statistics_transpose_y2.to_csv('../data/experiment_first_round/CV_binary_mean_data.csv', index_label="model_name")
statistics_transpose_y4.to_csv('../data/experiment_first_round/CV_multinomial_mean_data.csv', index_label="model_name")