In [1]:
from classifiers import TransparentMultinomialNB
from utils import ce_squared, load_imdb, ColoredWeightedDoc, TopInstances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import display, display_html
from time import time
from pickle import load
import numpy as np


In [71]:
# Load the IMDB review corpus, vectorise it with raw term counts, and prepare
# the label arrays used by the experiment cells below.
t0 = time()

# Unigram counts; min_df=5 drops terms that occur in fewer than 5 documents.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))

# NOTE(review): shuffle=True is requested but no seed is visible here --
# confirm load_imdb shuffles deterministically if results must be reproducible.
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb(
    "./aclImdb", shuffle=True, vectorizer=vect
)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use the new accessor when available and fall back
# to the old one so the notebook runs on either version.
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Two-column target matrix: column 0 holds y, column 1 holds 1 - y.
# NOTE(review): sklearn-style predict_proba orders columns by class label
# (class 0 first); confirm this [y, 1 - y] layout is what ce_squared expects.
y_test_na = y_test[:, np.newaxis]
y_test_na = np.append(y_test_na, 1 - y_test_na, axis=1)

# Working copy of the training labels; the experiment flips entries in this
# array while y_train stays untouched for later comparison.
y_modified = np.copy(y_train)


duration = time() - t0

print("Loading took {:0.2f}s.\n".format(duration))

Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 8.166166305541992s
(25000, 27272)
n_samples: 25000, n_features: 27272

Extracting features from the test dataset using the same vectorizer
done in 6.553291082382202s
n_samples: 25000, n_features: 27272

Loading took 24.11s.



# Experiment

In [3]:
# Greedy sample selection with optional label flipping.
#
# Starting from the first 10 training documents, each of the documents
# 10..999 is tried twice -- once with its current label and once with the
# label flipped -- and is kept (in whichever form scored better) only if it
# strictly lowers the error of the resulting classifier.
#
# NOTE(review): the error is computed against X_test / y_test_na, so the
# selection is tuned on the test set; metrics later reported on the same test
# set are therefore optimistic.

# Start from pristine labels so re-running this cell gives the same result.
y_modified = np.copy(y_train)

train_indices = list(range(10))

clf = TransparentMultinomialNB()
clf.fit(X_train[train_indices], y_modified[train_indices])

best_clf = clf
current_error = ce_squared(y_test_na, clf.predict_proba(X_test))

for i in range(10, 1000):
    train_indices.append(i)

    # Candidate 1: add sample i with its current label.
    keep_clf = TransparentMultinomialNB()
    keep_clf.fit(X_train[train_indices], y_modified[train_indices])
    y_error = ce_squared(y_test_na, keep_clf.predict_proba(X_test))

    # Candidate 2: add sample i with its label flipped.
    y_modified[i] = 1 - y_modified[i]
    flip_clf = TransparentMultinomialNB()
    flip_clf.fit(X_train[train_indices], y_modified[train_indices])
    y0_error = ce_squared(y_test_na, flip_clf.predict_proba(X_test))

    if y_error < current_error and y_error < y0_error:
        # Unflipped label wins: switch the label back and reuse the classifier
        # already fitted for candidate 1 instead of fitting a third time.
        y_modified[i] = 1 - y_modified[i]
        current_error = y_error
        best_clf = keep_clf
        print("i = {}\tnew error = {:0.5f}".format(i, y_error))

    elif y0_error < current_error and y0_error < y_error:
        # Flipped label wins: y_modified[i] stays flipped.
        current_error = y0_error
        best_clf = flip_clf
        print("i = {}\tnew error = {:0.5f}".format(i, y0_error))

    else:
        # Neither candidate improves the error (or they tie): discard the
        # sample and restore its label so y_modified only differs from
        # y_train at accepted, flipped indices.
        y_modified[i] = 1 - y_modified[i]
        train_indices.pop()

i = 10	new error = 0.49657
i = 11	new error = 0.48198
i = 12	new error = 0.46519
i = 13	new error = 0.40152
i = 14	new error = 0.36584
i = 15	new error = 0.33211
i = 135	new error = 0.33108
i = 149	new error = 0.32845
i = 245	new error = 0.32772
i = 264	new error = 0.32399
i = 279	new error = 0.32353
i = 280	new error = 0.32138
i = 294	new error = 0.31517
i = 299	new error = 0.30909
i = 310	new error = 0.30657
i = 312	new error = 0.30560
i = 316	new error = 0.29619
i = 329	new error = 0.29400
i = 331	new error = 0.29295
i = 378	new error = 0.29249
i = 408	new error = 0.29098
i = 467	new error = 0.28877
i = 525	new error = 0.28873
i = 824	new error = 0.28630
i = 882	new error = 0.28527


In [4]:
# Control model: the same classifier type trained on the first 1000 training
# documents with their original, unmodified labels.
first_thousand = slice(None, 1000)
ctrl_clf = TransparentMultinomialNB()
ctrl_clf.fit(X_train[first_thousand], y_train[first_thousand])

TransparentMultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

sklearn.feature_extraction.text.CountVectorizer

In [16]:
# Restore a previously archived experiment run from disk.
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load 'clf.arch' if it was produced by a trusted run of this project.
# NOTE(review): unpickling presumably needs the archive's class to be
# importable in this kernel -- confirm where that class is defined.
with open('clf.arch', 'rb') as f:
    clf_arch = load(f)

# These assignments replace the ctrl_clf / best_clf / y_modified /
# train_indices produced by the experiment cells above with the archived ones;
# the last entry of each history list is taken as the final state.
ctrl_clf = clf_arch.ctrl_clf
best_clf = clf_arch.classifiers[-1]
y_modified = clf_arch.modified_labels[-1]
train_indices = clf_arch.train_indices[-1]


In [70]:
# Count the selected samples and collect those whose label was flipped.
print('Number of samples used :', len(train_indices))

selected_modified = y_modified[train_indices]
selected_original = y_train[train_indices]
was_flipped = selected_modified != selected_original

# One row per flipped sample: column 0 = modified label, column 1 = original
# label. column_stack keeps the (k, 2) shape even when k == 0, so column
# indexing in later cells does not fail if nothing was flipped.
changed_labels = np.column_stack(
    (selected_modified[was_flipped], selected_original[was_flipped])
)
print('Number of labels modified:', len(changed_labels))

Number of samples used : 381
Number of labels modified: 60


In [69]:
# Direction of each flip: modified label minus original label.
# A negative difference means the label went from 1 to 0, a positive one
# means it went from 0 to 1.
changes = changed_labels[:, 0] - changed_labels[:, 1]

n_one_to_zero = sum(1 for delta in changes if delta < 0)
n_zero_to_one = sum(1 for delta in changes if delta > 0)

print('1 to 0 :', n_one_to_zero)
print('0 to 1 :', n_zero_to_one)

1 to 0 : 37
0 to 1 : 23


# Visualization

In [6]:
# Show a test document on which the control and best classifiers disagree
# strongly, with each model's per-word weights highlighted.

# Predict once per model and reuse the result below.
ctrl_proba = ctrl_clf.predict_proba(X_test)
best_proba = best_clf.predict_proba(X_test)

# Absolute difference of the first probability column, sorted ascending.
x = np.absolute((ctrl_proba - best_proba)[:, 0])
inds = np.argsort(x)

# Second-largest disagreement; inds[-2] equals the former hard-coded
# inds[24998] for the 25000-document test set and works for other sizes.
i = inds[-2]

display_html("<b>" + 'Probability Matrix' + "</b>", raw=True)
print(ctrl_proba[i])
print(best_proba[i])
display_html("<b>" + 'Control Classifier' + "</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names, ctrl_clf.get_weights()))
display_html("<b>" + 'Best Classifier' + "</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names, best_clf.get_weights()))

[  2.32123686e-32   1.00000000e+00]
[  1.00000000e+00   7.51055188e-18]


In [7]:
# Compare the two classifiers on the test set with per-class
# precision / recall / F1.
# NOTE(review): if best_clf came from the selection loop, it was tuned against
# this same test set, so its numbers are optimistic.
best_pred = best_clf.predict(X_test)
ctrl_pred = ctrl_clf.predict(X_test)

# Headings use a proper closing </b> tag so the bold does not stay open.
display_html("<b>" + 'Control Classifier' + "</b>", raw=True)
print(classification_report(y_test, ctrl_pred))
display_html("<b>" + 'Best Classifier' + "</b>", raw=True)
print(classification_report(y_test, best_pred))

             precision    recall  f1-score   support

          0       0.78      0.88      0.83     12500
          1       0.86      0.75      0.80     12500

avg / total       0.82      0.82      0.82     25000



             precision    recall  f1-score   support

          0       0.84      0.83      0.84     12500
          1       0.83      0.85      0.84     12500

avg / total       0.84      0.84      0.84     25000



In [13]:
# List the ten highest- and ten lowest-weighted features of each classifier,
# together with the feature_count_ column for that feature.
best_weights = best_clf.get_weights()
ctrl_weights = ctrl_clf.get_weights()

# argsort is ascending: the head of each ordering holds the lowest weights,
# the tail holds the highest.
best_ws = np.argsort(best_weights)
ctrl_ws = np.argsort(ctrl_weights)

# One pass per model instead of two copy-pasted sections; the heading's
# closing tag is </b> so the bold markup is terminated.
for title, model, order in (
    ('Best Classifier', best_clf, best_ws),
    ('Control Classifier', ctrl_clf, ctrl_ws),
):
    display_html("<b>" + title + "</b>", raw=True)

    print("Top Positive")
    print(" ".join("{} ({})".format(feature_names[j], model.feature_count_[:, j])
                   for j in order[-10:][::-1]))

    print("\nTop Negative")
    print(" ".join("{} ({})".format(feature_names[j], model.feature_count_[:, j])
                   for j in order[:10]))

3.0920182951 -3.33122866843


Top Positive
excellent ([  0.  21.]) wonderful ([  0.  20.]) zorro ([  0.  10.]) definitely ([ 0.  9.]) superb ([ 0.  7.]) incredible ([ 0.  7.]) enjoyed ([ 0.  6.]) emotions ([ 0.  6.]) enjoy ([  1.  13.]) genius ([ 0.  6.])

Top Negative
worst ([ 27.   0.]) boring ([ 17.   0.]) bad ([ 83.   4.]) nothing ([ 15.   0.]) terrible ([ 14.   0.]) dibiase ([ 14.   0.]) awful ([ 13.   0.]) pin ([ 12.   0.]) hogan ([ 11.   0.]) predictable ([ 11.   0.])


Top Positive
edie ([   0.  109.]) antwone ([  0.  88.]) din ([  0.  82.]) gunga ([  0.  66.]) goldsworthy ([  0.  65.]) gypo ([  0.  60.]) yokai ([  0.  60.]) paulie ([   1.  118.]) flavia ([  0.  51.]) visconti ([  0.  51.])

Top Negative
boll ([ 143.    1.]) uwe ([ 101.    1.]) slater ([ 49.   0.]) tashan ([ 45.   0.]) hobgoblins ([ 45.   0.]) kareena ([ 41.   0.]) kornbluth ([ 39.   0.]) sarne ([ 37.   0.]) gram ([ 37.   0.]) delia ([ 36.   0.])
