# Model selection on entire data set

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/train_data.csv')

X = df.drop(['class4', 'class2'], axis=1)
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)

y_class2 = df['class2']
y_class4 = df['class4']

## Classifiers

In [3]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('kNeighbour', KNeighborsClassifier(3)),
    ('svcLinear', SVC(kernel="linear", C=0.025, probability=True)),
    ('svc', SVC(gamma=2, C=1, probability=True)),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('decissionTree', DecisionTreeClassifier(max_depth=5)),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000)),
    ('ada', AdaBoostClassifier()),
    ('gaussianNB', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis())]

## Dimensionality reduction techinques

In [4]:
# Variance boundry for VarianceThreshold
# More info: https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold
p = 0.7
variance = p * (1 - p)

In [5]:
dimension_reductions_y2 = [
    ('iso', Isomap(n_components=35)),
    ('lle', LocallyLinearEmbedding(n_components=31)), 
    ('llemodified', LocallyLinearEmbedding(n_components=31, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=28)),
    ('lda', LinearDiscriminantAnalysis(n_components=1)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=63)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=90)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=45))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=70))]

dimension_reductions_y4 = [
    ('iso', Isomap(n_components=70)),
    ('lle', LocallyLinearEmbedding(n_components=50)), 
    ('llemodified', LocallyLinearEmbedding(n_components=50, method='modified', n_neighbors=90)),
    ('svd', TruncatedSVD(n_components=60)),
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('pca', PCA()),
    ('kpca', KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=1)),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=20)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=40)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=70))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=50))]

## Computations

### Functions

In [6]:
def k_fold_cross_validation(ml_pipeline, X, y, n=5, k=10, score='accuracy'):
    """Perform N repeated K-fold cross-validation

    Keyword arguments:
    ml_pipeline -- Intance of scikit-learn's Pipeline
    X -- Data to perform cross-validation
    y -- Labels of the data
    n -- Amount of times cross-validation is repeated (default is 5)
    k -- Amount of folds that the data is splitted to perform 
         cross-validation (default is 10)
    score -- Scoring type as a string for scikit-learn's 
             cross_val_score method (default is accuracy)
    
    Return:
    Two element numpy array where first value is mean of cross-validation scores
    and second is standard deviation of cross-validation scores.
    """
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

### Process

In [7]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

#### Binary

In [8]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data

In [9]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.858605,0.039009,0.891395,0.035772
logistic_iso,0.754651,0.046526,0.842791,0.041421
logistic_lle,0.767674,0.049085,0.858837,0.029417
logistic_llemodified,0.858372,0.032282,0.875581,0.030165
logistic_svd,0.854651,0.037300,0.878140,0.033235
...,...,...,...,...
qda_kbest,0.830698,0.039064,0.830698,0.039064
qda_kbestmutual,0.813953,0.040011,0.813953,0.040011
qda_select,0.825116,0.040478,0.816744,0.043728
qda_selecttree,0.864884,0.039184,0.870930,0.033073


In [10]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,154.0,154.0,154.0,154.0
mean,0.760394,0.037059,0.802134,0.035324
std,0.121822,0.015213,0.112894,0.011185
min,0.5,0.0,0.5,0.005696
25%,0.754302,0.034294,0.799302,0.030682
50%,0.80814,0.039828,0.84314,0.035765
75%,0.848779,0.045036,0.871279,0.041516
max,0.871628,0.087425,0.895116,0.073633


#### multi-class

In [11]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data

In [12]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.653023,0.034756,0.657907,0.046633
logistic_iso,0.504651,0.049552,0.564419,0.048460
logistic_lle,0.501860,0.009114,0.507209,0.010629
logistic_llemodified,0.513488,0.011478,0.509535,0.009781
logistic_svd,0.656047,0.042507,0.658372,0.047767
...,...,...,...,...
qda_kbest,0.574884,0.047896,0.574884,0.047896
qda_kbestmutual,0.616744,0.051477,0.617442,0.052583
qda_select,0.618372,0.040793,0.624651,0.043198
qda_selecttree,0.647442,0.045952,0.640930,0.043485


In [13]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,154.0,154.0,154.0,154.0
mean,0.533928,0.039952,0.583799,0.039429
std,0.135123,0.031063,0.070312,0.022714
min,0.060465,0.0,0.226047,0.0
25%,0.5,0.031599,0.523081,0.03297
50%,0.568605,0.042488,0.604535,0.041541
75%,0.61,0.047865,0.643023,0.046688
max,0.660465,0.257138,0.674186,0.150952


### Save results

In [14]:
statistics_transpose_y2.to_csv('../data/experiment_first_round/CV_binary_all_data.csv', index_label="model_name")
statistics_transpose_y4.to_csv('../data/experiment_first_round/CV_multinomial_all_data.csv', index_label="model_name")