In [1]:
from classifiers import TransparentMultinomialNB
from utils import ce_squared, load_imdb, ColoredWeightedDoc, TopInstances
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display, display_html
from time import time
import numpy as np

In [2]:
t0 = time()

# Unigram bag-of-words counts; min_df=5 drops terms seen in fewer than 5 documents.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))

# NOTE(review): shuffle=True is passed with no visible seed -- confirm that
# load_imdb shuffles deterministically, otherwise results differ between runs.
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer the replacement when it exists and fall back to the
# old method on older releases. list(...) keeps feature_names a plain list,
# as it was before.
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Two-column target for ce_squared: column 0 is y_test, column 1 is 1 - y_test
# (the complement only makes sense for binary 0/1 labels).
# NOTE(review): predict_proba conventionally puts class 0 in column 0, which
# would make this column order the reverse of a one-hot encoding -- confirm
# against what ce_squared expects.
y_test_na = y_test[:, np.newaxis]
y_test_na = np.append(y_test_na, 1-y_test_na, axis=1)

# Working copy of the training labels; the experiment flips entries in this
# copy so y_train itself stays untouched.
y_modified = np.copy(y_train)


duration = time() - t0

print("Loading took {:0.2f}s.\n".format(duration))

Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x7f2083734510>,
        vocabulary=None).
done in 55.95361614227295s
(25000, 29712)
n_samples: 25000, n_features: 29712

Extracting features from the test dataset using the same vectorizer
done in 68.23144555091858s
n_samples: 25000, n_features: 29712

Loading took 125.88s.



# Experiment

In [3]:
# Greedy instance/label selection.
#
# Start from the first 10 training documents, then walk through documents
# 10..999 one at a time. For each document, two candidate training sets are
# scored: one that adds it with its current label and one that adds it with
# the label flipped (1 - y, i.e. binary 0/1 labels). The document is kept, in
# whichever form scored better, only if that strictly improves on the best
# error so far; ties and non-improvements are skipped.
#
# NOTE(review): candidates are scored on X_test, so the printed error is the
# selection criterion itself, not a held-out estimate.


def _fit_and_score(indices, labels):
    """Fit a fresh TransparentMultinomialNB on the given rows and score it.

    indices -- list of row indices into X_train / labels to train on.
    labels  -- label array aligned with X_train (y_train or y_modified).

    Returns (classifier, error), where error is
    ce_squared(y_test_na, classifier.predict_proba(X_test)).
    """
    model = TransparentMultinomialNB()
    model.fit(X_train[indices], labels[indices])
    return model, ce_squared(y_test_na, model.predict_proba(X_test))


# Start from pristine labels so that re-running this cell gives the same
# result instead of building on flips left behind by a previous run.
y_modified = np.copy(y_train)

train_indices = list(range(10))
clf, current_error = _fit_and_score(train_indices, y_train)
best_clf = clf

for i in range(10, 1000):
    train_indices.append(i)

    # Candidate 1: document i with its current label.
    clf_original, y_error = _fit_and_score(train_indices, y_modified)

    # Candidate 2: document i with its label flipped.
    y_modified[i] = 1 - y_modified[i]
    clf_flipped, y0_error = _fit_and_score(train_indices, y_modified)

    if y_error < current_error and y_error < y0_error:
        # The original label wins: undo the trial flip and keep the model that
        # was already fitted on it (no need to fit a third time).
        y_modified[i] = 1 - y_modified[i]
        current_error = y_error
        clf = best_clf = clf_original
        print("i = {}\tnew error = {:0.5f}".format(i, y_error))

    elif y0_error < current_error and y0_error < y_error:
        # The flipped label wins: leave the flip in place.
        current_error = y0_error
        clf = best_clf = clf_flipped
        print("i = {}\tnew error = {:0.5f}".format(i, y0_error))

    else:
        # Neither candidate strictly improves: drop the document and restore
        # its label so y_modified only differs from y_train where a flip was
        # actually accepted.
        y_modified[i] = 1 - y_modified[i]
        train_indices.pop()

i = 10	new error = 0.49657
i = 11	new error = 0.48198
i = 12	new error = 0.46519
i = 13	new error = 0.40152
i = 14	new error = 0.36584
i = 15	new error = 0.33211
i = 135	new error = 0.33108
i = 149	new error = 0.32845
i = 245	new error = 0.32772
i = 264	new error = 0.32399
i = 279	new error = 0.32353
i = 280	new error = 0.32138
i = 294	new error = 0.31517
i = 299	new error = 0.30909
i = 310	new error = 0.30657
i = 312	new error = 0.30560
i = 316	new error = 0.29619
i = 329	new error = 0.29400
i = 331	new error = 0.29295
i = 378	new error = 0.29249
i = 408	new error = 0.29098
i = 467	new error = 0.28877
i = 525	new error = 0.28873
i = 824	new error = 0.28630
i = 882	new error = 0.28527


In [4]:
# Control model: trained on rows 0..999 with their unmodified labels, i.e. the
# same pool of documents the experiment loop draws from, without selection or
# label flipping.
control_rows = slice(1000)
ctrl_clf = TransparentMultinomialNB()
ctrl_clf.fit(X_train[control_rows], y_train[control_rows])

TransparentMultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [5]:
# Per-document absolute gap between the two classifiers' first probability
# column on the test set.
ctrl_proba = ctrl_clf.predict_proba(X_test)
best_proba = best_clf.predict_proba(X_test)
x = np.abs((ctrl_proba - best_proba)[:, 0])

# Index of the test document with the SMALLEST gap (argsort is ascending).
# NOTE(review): if the goal is to show where the classifiers disagree most,
# this should take the last element instead -- confirm the intent.
i = x.argsort()[0]

In [6]:
# Alternative way of choosing the document to show (disabled): take the top
# entry returned by TopInstances.most_negatives() for best_clf.
#neg_evi, pos_evi = best_clf.predict_evidences(X_test)
#i = TopInstances(neg_evi, pos_evi, best_clf.get_bias()).most_negatives()[0]

# Render test document i twice, once coloured with each classifier's weights,
# so the two can be compared.
for title, model in (("Best Classifier", best_clf), ("Control Classifier", ctrl_clf)):
    # The heading previously ended in a second "<b>", leaving the tag unclosed.
    display_html("<b>" + title + "</b>", raw=True)
    display(ColoredWeightedDoc(test_corpus[i], feature_names, model.get_weights()))