# Second round of model selection

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier

from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/train_data.csv')

X = df.drop(['class4', 'class2'], axis=1)
X_means = X.loc[:, X.columns[range(2, X.shape[1], 2)]]

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns = X.columns)
X_scaled_means = X_scaled.loc[:, X.columns[range(2, X.shape[1], 2)]]

y_class2 = df['class2']
y_class4 = df['class4']

## Classifiers

In [3]:
classifiers = [
    ('logistic', LogisticRegression()),
    ('gaussian', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('rfc', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ('mlp', MLPClassifier(alpha=1, max_iter=1000))]

## Functions

In [4]:
def k_fold_cross_validation(ml_pipeline, X, y, n=10, k=10, score='accuracy'):
    """Perform N repeated K-fold cross-validation

    Keyword arguments:
    ml_pipeline -- Intance of scikit-learn's Pipeline
    X -- Data to perform cross-validation
    y -- Labels of the data
    n -- Amount of times cross-validation is repeated (default is 10)
    k -- Amount of folds that the data is splitted to perform 
         cross-validation (default is 10)
    score -- Scoring type as a string for scikit-learn's 
             cross_val_score method (default is accuracy)
    
    Return:
    Two element numpy array where first value is mean of cross-validation scores
    and second is standard deviation of cross-validation scores.
    """
    cv = RepeatedStratifiedKFold(n_splits = n, 
                                 n_repeats = k, 
                                 random_state = 1)
    n_scores = cross_val_score(ml_pipeline, X, y, 
                               scoring = score, cv = cv, 
                               n_jobs = -1)
    
    return(np.array([np.mean(n_scores), np.std(n_scores)]))

## Entire data set

### Dimensionality reduction techinques

In [5]:
# Variance boundry for VarianceThreshold
# More info: https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold
p = 0.7
variance = p * (1 - p)

In [6]:
dimension_reductions_y2 = [
    ('lle', LocallyLinearEmbedding(n_components=31)), 
    ('llemodified', LocallyLinearEmbedding(n_components=31, method='modified', n_neighbors=90)),
    ('pca', PCA()),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=63)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=90)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selectree', SelectFromModel(ExtraTreesClassifier(n_estimators=45))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=70))]

dimension_reductions_y4 = [
    ('lle', LocallyLinearEmbedding(n_components=50)), 
    ('llemodified', LocallyLinearEmbedding(n_components=50, method='modified', n_neighbors=90)),
    ('pca', PCA()),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=20)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=40)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=70))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=50))]

### Computations

In [7]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

#### Binary

In [8]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data

In [9]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.861395,0.057899,0.894419,0.045384
logistic_lle,0.784884,0.059325,0.853721,0.052032
logistic_llemodified,0.866512,0.052686,0.875349,0.04928
logistic_pca,0.868372,0.054562,0.894419,0.045384
logistic_sel,0.863488,0.054007,0.894419,0.045384
logistic_kbest,0.86814,0.058377,0.891395,0.046634
logistic_kbestmutual,0.863023,0.055219,0.89814,0.045901
logistic_select,0.85907,0.054423,0.878372,0.05209
logistic_selectree,0.858837,0.053565,0.882326,0.049203
logistic_rfe,0.862558,0.052011,0.877674,0.049401


In [28]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,40.0,40.0,40.0,40.0
mean,0.795506,0.059512,0.849314,0.053695
std,0.082385,0.009737,0.050325,0.008952
min,0.555814,0.039943,0.638837,0.042843
25%,0.776453,0.05394,0.824709,0.045384
50%,0.817442,0.06045,0.864302,0.053127
75%,0.848488,0.064225,0.885872,0.058902
max,0.868372,0.082545,0.896977,0.085413


#### multi-class

In [11]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data

In [12]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.655814,0.062054,0.663721,0.059964
logistic_lle,0.500465,0.015937,0.505814,0.015205
logistic_llemodified,0.512558,0.02078,0.510465,0.018126
logistic_pca,0.625116,0.06049,0.663488,0.059877
logistic_sel,0.653488,0.062791,0.663488,0.059967
logistic_kbest,0.588837,0.067151,0.614419,0.059125
logistic_kbestmutual,0.650465,0.061481,0.665116,0.064448
logistic_select,0.634651,0.056898,0.656047,0.072898
logistic_selecttree,0.64186,0.060911,0.677442,0.066241
logistic_rfe,0.649767,0.062818,0.65907,0.068533


In [27]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,40.0,40.0,40.0,40.0
mean,0.535413,0.059917,0.630791,0.056971
std,0.1759,0.038465,0.048365,0.01259
min,0.060465,0.011393,0.497907,0.01326
25%,0.563372,0.056249,0.614826,0.054751
50%,0.586744,0.060505,0.643953,0.059586
75%,0.634942,0.066114,0.663488,0.062778
max,0.655814,0.258139,0.682558,0.071528


#### Save results

In [34]:
statistics_transpose_y2.to_csv('../data/experiment_second_round/CV_binary_all_data.csv', index_label="model_name")
statistics_transpose_y4.to_csv('../data/experiment_second_round/CV_multinomial_all_data.csv', index_label="model_name")

## Mean data set

### Dimensionality reduction techinques

In [15]:
# Variance boundry for VarianceThreshold
# More info: https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold
p = 0.7
variance = p * (1 - p)

In [16]:
dimension_reductions_y2 = [
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('pca', PCA()),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=20))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=20))]

dimension_reductions_y4 = [
    ('lle', LocallyLinearEmbedding(n_components=10)), 
    ('llemodified', LocallyLinearEmbedding(n_components=10, method='modified', n_neighbors=90)),
    ('pca', PCA()),
    ('sel', VarianceThreshold(threshold=variance)),
    ('kbest', SelectKBest(f_classif, k=10)), 
    ('kbestmutual', SelectKBest(mutual_info_classif, k=10)),
    ('select', SelectFromModel(LinearSVC(penalty="l2"))),
    ('selecttree', SelectFromModel(ExtraTreesClassifier(n_estimators=10))),
    ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10))]

### Computations

In [17]:
columns = ['accuracy_mean', 'accuracy_std', 
           'accuracy_scaled_mean', 'accuracy_scaled_std']
statistics_y2 = pd.DataFrame(index = columns)
statistics_y4 = pd.DataFrame(index = columns)

#### Binary

In [18]:
y = y_class2

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y2[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y2:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y2[ column ] = data

In [19]:
statistics_transpose_y2 = statistics_y2.transpose(copy=True)
statistics_transpose_y2

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.861395,0.057899,0.894419,0.045384
logistic_lle,0.766977,0.064531,0.845581,0.056442
logistic_llemodified,0.833953,0.051059,0.864884,0.051866
logistic_pca,0.868372,0.054562,0.894419,0.045384
logistic_sel,0.863488,0.054007,0.894419,0.045384
logistic_kbest,0.797209,0.061272,0.807907,0.058058
logistic_kbestmutual,0.789302,0.06009,0.794419,0.06203
logistic_select,0.857442,0.051966,0.87814,0.051175
logistic_selecttree,0.856047,0.049519,0.876744,0.053082
logistic_rfe,0.857442,0.057405,0.863721,0.054937


In [20]:
statistics_transpose_y2.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,40.0,40.0,40.0,40.0
mean,0.795506,0.059512,0.849314,0.053695
std,0.082385,0.009737,0.050325,0.008952
min,0.555814,0.039943,0.638837,0.042843
25%,0.776453,0.05394,0.824709,0.045384
50%,0.817442,0.06045,0.864302,0.053127
75%,0.848488,0.064225,0.885872,0.058902
max,0.868372,0.082545,0.896977,0.085413


#### multi-class

In [21]:
y = y_class4

for model_used in classifiers:
    model = Pipeline([model_used])

    not_scaled = k_fold_cross_validation(model, X, y)
    scaled = k_fold_cross_validation(model, X_scaled, y)

    data = np.concatenate((not_scaled, scaled))
    statistics_y4[ model_used[0] ] = data

    for feature_selection in dimension_reductions_y4:
        model = Pipeline([feature_selection, model_used])

        not_scaled = k_fold_cross_validation(model, X, y)
        scaled = k_fold_cross_validation(model, X_scaled, y)

        column = model_used[0] + '_' + feature_selection[0]
        data = np.concatenate((not_scaled, scaled))
        statistics_y4[ column ] = data

In [22]:
statistics_transpose_y4 = statistics_y4.transpose(copy=True)
statistics_transpose_y4

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
logistic,0.655814,0.062054,0.663721,0.059964
logistic_lle,0.5,0.011628,0.501628,0.01326
logistic_llemodified,0.503488,0.014477,0.51,0.01809
logistic_pca,0.625116,0.06049,0.663488,0.059877
logistic_sel,0.653488,0.062791,0.663488,0.059967
logistic_kbest,0.587442,0.060521,0.585349,0.057912
logistic_kbestmutual,0.60093,0.067219,0.623488,0.070897
logistic_select,0.637907,0.056032,0.658372,0.070515
logistic_selecttree,0.643721,0.060544,0.676512,0.061273
logistic_rfe,0.623721,0.057096,0.648605,0.063082


In [36]:
statistics_transpose_y4.describe()

Unnamed: 0,accuracy_mean,accuracy_std,accuracy_scaled_mean,accuracy_scaled_std
count,40.0,40.0,40.0,40.0
mean,0.535413,0.059917,0.630791,0.056971
std,0.1759,0.038465,0.048365,0.01259
min,0.060465,0.011393,0.497907,0.01326
25%,0.563372,0.056249,0.614826,0.054751
50%,0.586744,0.060505,0.643953,0.059586
75%,0.634942,0.066114,0.663488,0.062778
max,0.655814,0.258139,0.682558,0.071528


#### Save results

In [35]:
statistics_transpose_y2.to_csv('../data/experiment_second_round/CV_binary_mean_data.csv', index_label="model_name")
statistics_transpose_y4.to_csv('../data/experiment_second_round/CV_multinomial_mean_data.csv', index_label="model_name")