In [1]:
import numpy as np
from sklearn.covariance import empirical_covariance as cov
from sklearn import datasets
from sklearn import preprocessing
from sklearn.svm import SVC

**First, check the average fraction of unique elements in a bootstrapped sample relative to the total number of elements.**

In [6]:
# Monte-Carlo estimate of the share of distinct values in a bootstrap resample.
tries = 1000
datasample = np.random.rand(100, 1)
measures = np.zeros(tries)
for one_try in range(tries):
    # Draw as many row indices as there are rows, with replacement.
    bootstrapped = np.random.randint(0, len(datasample), size=len(datasample))
    # np.unique flattens its input, so .size is the number of distinct values drawn.
    measures[one_try] = np.unique(datasample[bootstrapped]).size / bootstrapped.size

print(measures.mean(), " approximately equal to 1 - 1/e = ", 1 - 1/np.e)

0.63299  approximately equal to 1 - 1/e =  0.6321205588285577


**Check that the bootstrap is not applicable for estimating the conditional error rate E[X|T]**

In [25]:
# Synthetic data. The draws are kept in their original order so that the
# global NumPy random stream is consumed exactly as before.
uniform_vector = np.random.uniform(-1, 1, size=1000)
uniform_matrix = np.random.rand(50, 2000)  # uniform on [0, 1); dimensions are given as d0, d1, ...
standart_normal_matrix = np.random.randn(50, 2000)  # standard normal; dimensions are given as d0, d1, ...
normal_vector = np.random.normal(0.5, 2, 200)  # normal with mean 0.5 and standard deviation 2
normal_matrix = np.random.normal(0, 1, (200, 2000))  # normal with mean 0 and standard deviation 1

# Rows are samples, columns are features.
y_size, x_size = normal_matrix.shape
predictors = normal_matrix
# Labels are drawn independently of the predictors: a column vector thresholded
# at 0.5 (values above the threshold become 1, the rest 0).
target = preprocessing.binarize(normal_vector.reshape(-1, 1), threshold=0.5)

In [26]:
def classify_and_check(train_matrix, train_target, test_matrix, test_target):
    """Fit a default ``SVC`` on the training data and score it on the test data.

    Parameters
    ----------
    train_matrix, test_matrix : 2-D arrays, one row per sample.
    train_target, test_target : arrays holding exactly one label per row of the
        matching matrix; they are reshaped to 1-D before use.

    Returns
    -------
    The fraction of test rows whose predicted label differs from ``test_target``.
    """
    # Flatten the labels to shape (n_rows,); reshape raises if the sizes disagree.
    labels_for_fit = train_target.reshape(train_matrix.shape[0],)
    labels_for_check = test_target.reshape(test_matrix.shape[0],)

    classifier = SVC()
    classifier.fit(train_matrix, labels_for_fit)

    predicted = classifier.predict(test_matrix)
    # Mean of the boolean mismatch mask == (number of mismatches) / (number of test rows).
    return np.mean(predicted != labels_for_check)

In [27]:
def split_according_to_bootstrap(predictors, target):
    """Split rows into in-bag and out-of-bag sets using one bootstrap draw.

    ``predictors.shape[0]`` row indices are drawn uniformly with replacement
    from the global NumPy random state. Rows drawn at least once form the
    training part; rows never drawn form the test part.

    NOTE: each drawn row appears only once in the training part (duplicates
    from the draw are dropped), and rows keep their original ascending order.

    Parameters
    ----------
    predictors : 2-D array, one row per sample.
    target : array with the same number of rows as ``predictors``
        (1-D or 2-D; it is indexed along the first axis only).

    Returns
    -------
    ((train_matrix, train_target), (test_matrix, test_target))
    """
    n_rows = predictors.shape[0]
    drawn_indices = np.random.randint(0, n_rows, size=n_rows)

    # Boolean membership mask: O(n) instead of scanning the index array once
    # per row to find the rows that were never drawn.
    in_bag = np.zeros(n_rows, dtype=bool)
    in_bag[drawn_indices] = True
    out_of_bag = ~in_bag

    random_train_matrix = predictors[in_bag]
    random_train_target = target[in_bag]

    random_test_matrix = predictors[out_of_bag]
    random_test_target = target[out_of_bag]

    return ((random_train_matrix, random_train_target), (random_test_matrix, random_test_target))

**Try to apply the bootstrap and evaluate the error on the whole dataset, including the samples that are in the bootstrapped dataset**

In [28]:
# Train on a bootstrap resample (rows drawn with replacement) and score on the
# full dataset, which also contains the rows the model was trained on.
tries = 100
measures_of_error = np.zeros(tries)
for one_try in range(tries):
    indices_of_bootstrapped_samples = np.random.randint(
        0, len(predictors), size=len(predictors)
    )
    train_matrix = predictors[indices_of_bootstrapped_samples]
    train_target = target[indices_of_bootstrapped_samples]
    measures_of_error[one_try] = classify_and_check(
        train_matrix, train_target, predictors, target
    )

measures_of_error.mean()

0.17749999999999999

**Now try to evaluate the error only on those samples that are not present in the bootstrapped dataset (BD)**

In [32]:
# Train on the rows drawn by the bootstrap and score only on the rows that
# were never drawn.
tries = 400
measures_of_error = np.zeros(tries)
for one_try in range(tries):
    (train_matrix, train_target), (test_matrix, test_target) = (
        split_according_to_bootstrap(predictors, target)
    )
    measures_of_error[one_try] = classify_and_check(
        train_matrix, train_target, test_matrix, test_target
    )

measures_of_error.mean()

0.47913317094185998