In [1]:
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer as Vectorizer
from concurrent.futures import ProcessPoolExecutor
from classifiers import TransparentMultinomialNB as Classifier
from utils import ce_squared, load_imdb, ClassifierArchive
from itertools import starmap, repeat, chain
from time import time
import numpy as np
import pickle

In [2]:
def produce_modifications(X_train, y_train, train_indices, target_indices, X_val, y_val_na):
    """Lazily generate candidate edits of the training set, two per target index.

    For every ``i`` in ``target_indices``:

    * if ``i`` is currently in ``train_indices``: first a candidate whose label
      copy has entry ``i`` flipped (``1 - y``) with the index collection passed
      through as-is, then a candidate with the original labels and ``i``
      removed from a list copy of the indices;
    * otherwise: first a candidate with the original labels and ``i`` appended
      to a list copy of the indices, then a candidate that reuses that same
      extended list together with a label copy in which entry ``i`` is flipped.

    Each candidate is the tuple ``(X_train, labels, indices, X_val, y_val_na)``;
    ``X_train``, ``X_val`` and ``y_val_na`` are forwarded untouched. Neither
    ``y_train`` nor ``train_indices`` is modified in place.
    """

    def flipped_at(position):
        # Fresh copy of the labels with the single entry at `position` inverted.
        labels = np.copy(y_train)
        labels[position] = 1 - labels[position]
        return labels

    for idx in target_indices:
        currently_used = idx in train_indices

        if not currently_used:
            # Sample is outside the training subset: try adding it, first with
            # its current label and then with the flipped one.
            extended = list(train_indices)
            extended.append(idx)
            yield X_train, y_train, extended, X_val, y_val_na
            yield X_train, flipped_at(idx), extended, X_val, y_val_na
            continue

        # Sample is inside the training subset: try flipping its label ...
        yield X_train, flipped_at(idx), train_indices, X_val, y_val_na

        # ... and try dropping it from the subset altogether.
        reduced = list(train_indices)
        reduced.remove(idx)
        yield X_train, y_train, reduced, X_val, y_val_na

In [3]:
def test_modification(test):
    """Fit a fresh classifier on one candidate and score it on the validation set.

    ``test`` is a 5-tuple ``(X_train, y_train, train_indices, X_val, y_val_na)``
    as produced by ``produce_modifications``. A new ``Classifier`` is fitted on
    the rows of ``X_train`` / ``y_train`` selected by ``train_indices``, and its
    ``predict_proba`` output on ``X_val`` is compared against ``y_val_na`` with
    ``ce_squared``.

    Returns ``(error, y_train, train_indices)`` so the caller can keep the
    labels and indices that belong to the lowest error.
    """
    features, labels, selected_rows, X_val, y_val_na = test

    model = Classifier()
    model.fit(features[selected_rows], labels[selected_rows])
    probabilities = model.predict_proba(X_val)

    return ce_squared(y_val_na, probabilities), labels, selected_rows

In [20]:
# Load and vectorize the IMDB corpus, then fit the control classifier on it.
# t0 marks the start of the wall-clock measurement that is reported after the
# train/validation split.
t0 = time()

# Unigram raw-count features (binary=False); terms that occur in fewer than
# 5 documents are dropped (min_df=5), no upper document-frequency cut-off.
vect = Vectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))

# NOTE(review): shuffle=True is requested and no seed is visible in this
# notebook - confirm whether load_imdb shuffles deterministically.
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)

# Two-column target for ce_squared: column 0 holds y, column 1 holds 1 - y.
# NOTE(review): confirm this column order matches what ce_squared expects
# relative to the column order of predict_proba.
y_test_na = y_test[:, np.newaxis]
y_test_na = np.append(y_test_na, 1-y_test_na, axis=1)

# Control model: fitted on the full training set as loaded here (before any
# split); ctrl_clf keeps a reference to it because `clf` is rebound later.
clf = Classifier()
clf.fit(X_train, y_train)
ctrl_clf = clf
ctrl_error = ce_squared(y_test_na, clf.predict_proba(X_test))
ctrl_acc = clf.score(X_test, y_test)

Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 4.8937928676605225s
(25000, 27315)
n_samples: 25000, n_features: 27315 

Extracting features from the test dataset using the same vectorizer
done in 4.719479084014893s
n_samples: 25000, n_features: 27315 



In [21]:
# Split the train dataset in 2 for validation: the first half stays the
# training pool, the second half becomes the validation set.
# NOTE(review): this cell rebinds X_train / y_train to their first half, so
# running it a second time halves the training pool again. The control
# classifier from the loading cell was fitted on the full, unsplit set.
split = int(X_train.shape[0] / 2)
#split = 200

# The validation slices must be taken before X_train / y_train are rebound.
X_val = csr_matrix(X_train[split:])
y_val = np.copy(y_train[split:])

X_train = csr_matrix(X_train[:split])
y_train = np.copy(y_train[:split])

# Two-column validation target: column 0 holds y, column 1 holds 1 - y
# (same construction as y_test_na).
y_val_na = y_val[:, np.newaxis]
y_val_na = np.append(y_val_na, 1-y_val_na, axis=1)

# Elapsed time since t0, which was set at the top of the loading cell, so the
# figure also includes fitting and scoring the control classifier.
duration = time() - t0
print("Loading the dataset took {:0.2f}s.".format(duration), '\n')

Loading the dataset took 214.04s. 



In [22]:
# Batch cursor for the search loop: samples [start_ind, end_ind) are examined
# together, batch_size samples at a time.
start_ind = 0
batch_size = 10
end_ind = start_ind + batch_size

# Baseline for the search: a classifier fitted on the (split) training pool
# with its original labels, scored on the validation set. This rebinds `clf`;
# the control model stays reachable through ctrl_clf.
clf = Classifier()
clf.fit(X_train, y_train)
best_error = ce_squared(y_val_na, clf.predict_proba(X_val))
# Incumbent solution: a private copy of the labels and the list of all row
# indices of the training pool.
best_y_train = np.copy(y_train)
best_train_indices = list(range(X_train.shape[0]))

In [23]:
# Validation error of the incumbent solution (best_error) at this point.
print(best_error)

0.137557676597


In [24]:
# Round 0 of the greedy search. The training pool is walked in consecutive
# batches of `batch_size` samples; for each batch every candidate produced by
# produce_modifications is fitted and scored, and the candidate with the
# lowest validation error replaces the incumbent.
#
# The loop always starts at sample 0 and derives its own batch bounds, so the
# cell can be re-run without first resetting start_ind / end_ind elsewhere.
n_train = X_train.shape[0]
for start_ind in range(0, n_train, batch_size):
    # Clamp the upper bound so that a trailing partial batch (when n_train is
    # not a multiple of batch_size) is still examined instead of skipped.
    end_ind = min(start_ind + batch_size, n_train)
    target_indices = range(start_ind, end_ind)
    mods = produce_modifications(X_train, best_y_train, best_train_indices, target_indices, X_val, y_val_na)
    test_results = list(map(test_modification, mods))
    # The incumbent competes as well, so best_error never increases. It is
    # appended last and min() returns the first minimum, hence a candidate
    # that exactly ties the incumbent replaces it.
    test_results.append((best_error, best_y_train, best_train_indices))
    best_error, best_y_train, best_train_indices = min(test_results, key=lambda x: x[0])
    print('Training round: 0,\tProcessed: {:5d} samples,\tcurrent error is {:0.4f}'.format(end_ind, best_error))

# Refit on the selected subset / labels and compare against the control
# classifier on the held-out test set.
best_clf = Classifier()
best_clf.fit(X_train[best_train_indices], best_y_train[best_train_indices])
test_acc = best_clf.score(X_test, y_test)
print('Training round: 0,\tTest accuracy is {:0.3f},\tControl accuracy is {:0.3f}'.format(test_acc, ctrl_acc))

Training round: 0,	Processed:    10 samples,	current error is 0.1375
Training round: 0,	Processed:    20 samples,	current error is 0.1373
Training round: 0,	Processed:    30 samples,	current error is 0.1372
Training round: 0,	Processed:    40 samples,	current error is 0.1371
Training round: 0,	Processed:    50 samples,	current error is 0.1371
Training round: 0,	Processed:    60 samples,	current error is 0.1370
Training round: 0,	Processed:    70 samples,	current error is 0.1368
Training round: 0,	Processed:    80 samples,	current error is 0.1368
Training round: 0,	Processed:    90 samples,	current error is 0.1367
Training round: 0,	Processed:   100 samples,	current error is 0.1366
Training round: 0,	Processed:   110 samples,	current error is 0.1366
Training round: 0,	Processed:   120 samples,	current error is 0.1365
Training round: 0,	Processed:   130 samples,	current error is 0.1364
Training round: 0,	Processed:   140 samples,	current error is 0.1363
Training round: 0,	Processed:   15

In [25]:
# Bundle the control classifier, the classifier refitted after the first
# search round, that round's selected indices and labels, and the vectorizer.
# NOTE(review): how ClassifierArchive stores these is defined in utils - confirm there.
clf_arch = ClassifierArchive(ctrl_clf, best_clf, best_train_indices, best_y_train, vect)

In [26]:
# Persist the archive built so far, then run search rounds 2..10. Each round
# walks the whole training pool again, starting from the incumbent left by the
# previous round, and appends the refitted classifier to the archive on disk.
archive_path = 'clf8-mb.arch'

with open(archive_path, 'wb') as f:
    pickle.dump(clf_arch, f)

n_train = X_train.shape[0]

for i in range(2, 11):
    for start_ind in range(0, n_train, batch_size):
        # Clamp the upper bound so that a trailing partial batch (when n_train
        # is not a multiple of batch_size) is still examined, not skipped.
        end_ind = min(start_ind + batch_size, n_train)
        target_indices = range(start_ind, end_ind)
        mods = produce_modifications(X_train, best_y_train, best_train_indices, target_indices, X_val, y_val_na)
        test_results = list(map(test_modification, mods))
        # The incumbent competes as well, so best_error never increases. It is
        # appended last and min() returns the first minimum, hence a candidate
        # that exactly ties the incumbent replaces it.
        test_results.append((best_error, best_y_train, best_train_indices))
        best_error, best_y_train, best_train_indices = min(test_results, key=lambda x: x[0])

        print('Training round: {},\tProcessed: {:5d} samples,\tcurrent error is {:0.6f}'.format(i, end_ind, best_error))

    # Refit on the subset / labels selected in this round and compare with the
    # control classifier on the held-out test set.
    best_clf = Classifier()
    best_clf.fit(X_train[best_train_indices], best_y_train[best_train_indices])
    test_acc = best_clf.score(X_test, y_test)
    print('Training round: {},\tTest accuracy is {:0.3f},\tControl accuracy is {:0.3f}'.format(i, test_acc, ctrl_acc))

    # NOTE(review): the archive file is written by this notebook itself, so the
    # reload looks redundant with the in-memory clf_arch - confirm before
    # removing it. pickle.load must never be pointed at an untrusted file.
    with open(archive_path, 'rb') as f:
        clf_arch = pickle.load(f)

    clf_arch.add_classifier(best_clf, best_train_indices, best_y_train, i)

    # Rewrite the archive after every round so finished rounds survive an
    # interrupted run.
    with open(archive_path, 'wb') as f:
        pickle.dump(clf_arch, f)

print('Experiment is done.')

Training round: 2,	Processed:    10 samples,	current error is 0.065156
Training round: 2,	Processed:    20 samples,	current error is 0.065150
Training round: 2,	Processed:    30 samples,	current error is 0.065132
Training round: 2,	Processed:    40 samples,	current error is 0.065101
Training round: 2,	Processed:    50 samples,	current error is 0.065051
Training round: 2,	Processed:    60 samples,	current error is 0.065051
Training round: 2,	Processed:    70 samples,	current error is 0.065008
Training round: 2,	Processed:    80 samples,	current error is 0.065004
Training round: 2,	Processed:    90 samples,	current error is 0.064985
Training round: 2,	Processed:   100 samples,	current error is 0.064969
Training round: 2,	Processed:   110 samples,	current error is 0.064939
Training round: 2,	Processed:   120 samples,	current error is 0.064930
Training round: 2,	Processed:   130 samples,	current error is 0.064894
Training round: 2,	Processed:   140 samples,	current error is 0.064881
Traini

In [33]:
# Size of the train-index collection stored for each archive entry.
for i in range(len(clf_arch)):
    kept_rows = clf_arch.train_indices[i]
    print(len(kept_rows))

12402
11984
11354
10704
10114
9654
9294
9015
8852
8722


In [34]:
# Sum over the full stored label vector of each archive entry.
for i in range(len(clf_arch)):
    stored_labels = clf_arch.modified_labels[i]
    print(np.sum(stored_labels))

6250
6246
6214
6215
6220
6233
6231
6220
6194
6186


In [35]:
# Sum of the stored labels restricted to the rows each archive entry trains on.
for i in range(len(clf_arch)):
    stored_labels = clf_arch.modified_labels[i]
    kept_rows = clf_arch.train_indices[i]
    print(np.sum(stored_labels[kept_rows]))

6187
5999
5651
5325
5066
4857
4670
4528
4451
4388


In [36]:
# ce_squared on the validation set for every archived classifier.
for i in range(len(clf_arch)):
    archived_clf = clf_arch.classifiers[i]
    val_proba = archived_clf.predict_proba(X_val)
    print(ce_squared(y_val_na, val_proba))

0.0651951916206
0.0429136353848
0.0319437852425
0.0254964752084
0.0219368172648
0.0194672002715
0.0176191440747
0.0163348128063
0.015533781584
0.0149415340372


In [37]:
# Test-set score (via .score) of every archived classifier.
for i in range(len(clf_arch)):
    archived_clf = clf_arch.classifiers[i]
    print(archived_clf.score(X_test, y_test))

0.86132
0.85372
0.85104
0.84768
0.84636
0.8448
0.84196
0.84252
0.8432
0.8416
