We consider two operations:

 * Selecting best predictors
 * Splitting the feature matrix into train and test sets
 
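We show that these two operations do not commute: the order in which they are applied changes the cross-validation error of a classifier. Selecting predictors before splitting lets the test rows influence the selection, so the measured error is optimistically biased.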

In [30]:
import numpy as np
from sklearn.covariance import empirical_covariance as cov
from sklearn import datasets
from sklearn import preprocessing
from sklearn.svm import SVC

**Datasets**

In [69]:
# Synthetic data sets drawn from NumPy's global random generator.
# No seed is set here, so every run produces different values.

# 1000 samples, uniform on [-1, 1).
uniform_vector = np.random.uniform(low=-1, high=1, size=1000)
# 50 x 2000 matrix, uniform on [0, 1); rand() takes the dimensions as positional arguments.
uniform_matrix = np.random.rand(50, 2000)
# 50 x 2000 matrix, standard normal; randn() takes the dimensions as positional arguments.
standart_normal_matrix = np.random.randn(50, 2000)
# 50 samples, normal with mean 0.5 and standard deviation 2 (scale is a std, not a variance).
normal_vector = np.random.normal(loc=0.5, scale=2, size=50)
# 50 x 2000 matrix, normal with mean 0 and standard deviation 1.
normal_matrix = np.random.normal(loc=0, scale=1, size=(50, 2000))

In [70]:
# Use the normal matrix as the feature matrix: one row per sample, one column per predictor.
predictors = normal_matrix
y_size, x_size = normal_matrix.shape  # y_size = number of rows, x_size = number of columns

# Turn the continuous vector into a single-column matrix and binarize it at 0.5
# to obtain class labels (sklearn's binarize maps values above the threshold
# to 1 and the rest to 0).
target = preprocessing.binarize(normal_vector.reshape(-1, 1), threshold=0.5)

In [74]:
def find_indices_of_best_predictors(predictors, target, number_of_predictors=100):
    """Return the column indices of the predictors that co-vary most with the target.

    Every column of ``predictors`` is scored by the absolute value of its
    sample covariance with ``target``; the columns with the largest scores
    are selected.

    Parameters
    ----------
    predictors : array-like, shape (n_samples, n_features)
        Feature matrix, one column per candidate predictor.
    target : array-like with n_samples elements
        Target values. The array is flattened, so both ``(n_samples,)`` and
        ``(n_samples, 1)`` layouts are accepted.
    number_of_predictors : int, default 100
        How many column indices to return.

    Returns
    -------
    numpy.ndarray of int
        Selected column indices ordered by increasing absolute covariance
        (the strongest predictor is last). Contains at most
        ``number_of_predictors`` entries: fewer when the matrix has fewer
        columns, none when ``number_of_predictors`` is not positive.
    """
    predictors = np.asarray(predictors, dtype=float)
    target = np.asarray(target, dtype=float).ravel()

    centered_predictors = predictors - predictors.mean(axis=0)
    centered_target = target - target.mean()

    # Sum of products of deviations, i.e. the sample covariance without its
    # 1 / (n_samples - 1) factor. That factor is the same positive constant
    # for every column, so it cannot change the ranking.
    co_moments = centered_target.dot(centered_predictors)

    # A stable sort keeps tied columns in their original order, exactly as
    # Python's sorted() would.
    indices_of_sorted = np.argsort(np.abs(co_moments), kind="stable")

    # Clamp explicitly: a plain [-n:] slice would return *everything* for n == 0.
    total = len(indices_of_sorted)
    keep = max(0, min(int(number_of_predictors), total))
    # Take the tail *including* its last element - the best predictor.
    return indices_of_sorted[total - keep:]

In [72]:
def divide_on_train_and_test(predictors, target, fraction = 0.7):
    """Randomly split the rows of ``predictors`` and ``target`` into train and test parts.

    Each row is assigned to the train part independently with probability
    ``fraction`` (drawn from NumPy's global random generator), so the actual
    part sizes vary from call to call and either part may turn out empty.

    Parameters
    ----------
    predictors : numpy.ndarray, 2-D
        Feature matrix, one row per sample.
    target : numpy.ndarray, 2-D with a single column
        Target values; it is indexed with two axes and then flattened.
    fraction : float, default 0.7
        Probability for a row to land in the train part.

    Returns
    -------
    tuple
        ``((train_matrix, train_target), (test_matrix, test_target))`` where
        the targets are 1-D arrays.
    """
    row_count = predictors.shape[0]
    # One independent True/False draw per row: True -> train, False -> test.
    in_train = np.random.choice([True, False], row_count, p = [fraction, 1 - fraction])
    in_test = ~in_train

    train_part = (
        predictors[in_train, :],
        target[in_train, :].reshape(in_train.sum(),),
    )
    test_part = (
        predictors[in_test, :],
        target[in_test, :].reshape(in_test.sum(),),
    )
    return (train_part, test_part)

In [73]:
def classify_and_check(train_matrix, train_target, test_matrix, test_target):
    """Fit a default-parameter SVC on the train part and return its test error rate.

    Parameters
    ----------
    train_matrix : feature matrix used for fitting.
    train_target : training labels; reshaped to 1-D with one entry per row.
    test_matrix : feature matrix to predict on.
    test_target : true labels for ``test_matrix``; compared element-wise
        with the predictions.

    Returns
    -------
    The number of mispredicted test labels divided by ``len(test_target)``.
    """
    flat_train_labels = train_target.reshape(train_target.shape[0],)

    classifier = SVC()
    classifier.fit(train_matrix, flat_train_labels)

    predicted_labels = classifier.predict(test_matrix)
    mismatches = predicted_labels != test_target
    return sum(mismatches) / len(test_target)

In [68]:
# Experiment 1: select the best predictors on the FULL data set, then split.
# The test rows take part in the feature selection here.
tries = 100
measures_of_error = np.zeros(tries)

# The selection depends only on (predictors, target), which do not change
# between tries, so it is computed once instead of on every iteration.
features_matrix = predictors[ : , find_indices_of_best_predictors(predictors, target)]

for one_try in range(tries):
    # Split the already-reduced matrix. The original code read an undefined
    # name (selected_features_matrix) here instead of the matrix built above.
    (train_matrix, train_target), (test_matrix, test_target) = divide_on_train_and_test(features_matrix, target)
    measures_of_error[one_try] = classify_and_check(train_matrix, train_target, test_matrix, test_target)

# Mean error rate over all tries (displayed as the cell result).
measures_of_error.mean()

0.048483147485469467

In [75]:
# Experiment 2: split first, then select the best predictors using the
# train part only; the same columns are then taken from the test part.
tries = 100
measures_of_error = np.zeros(tries)
for one_try in range(tries):
    train_part, test_part = divide_on_train_and_test(predictors, target)
    train_matrix, train_target = train_part
    test_matrix, test_target = test_part

    # Feature selection sees only the training rows and labels.
    indices_of_best_predictors = find_indices_of_best_predictors(train_matrix, train_target)
    train_matrix, test_matrix = (
        train_matrix[: , indices_of_best_predictors],
        test_matrix[: , indices_of_best_predictors],
    )

    measures_of_error[one_try] = classify_and_check(train_matrix, train_target, test_matrix, test_target)

# Mean error rate over all tries (displayed as the cell result).
measures_of_error.mean()

0.65241005712524069