# Experiment 8 Results

In [1]:
from pickle import load
from utils import ce_squared, load_imdb, ColoredWeightedDoc
from IPython.display import display, display_html
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from classifiers import TransparentMultinomialNB
from time import time
import numpy as np

## Load Data

In [2]:
# Time the whole load + vectorise step so the cost is visible in the output.
t0 = time()

# Unigram bag-of-words counts; terms seen in fewer than 5 documents are dropped.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))

(X_train, y_train,
 X_val, y_val,
 X_test, y_test,
 train_corpus, val_corpus, test_corpus) = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)

feature_names = vect.get_feature_names()

# Two-column targets for ce_squared: column 0 is the label itself, column 1 is
# its complement (1 - label).
y_test_na = np.column_stack((y_test, 1 - y_test))
y_val_na = np.column_stack((y_val, 1 - y_val))

duration = time() - t0

print("Loading took {:0.2f}s.\n".format(duration))

Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 5.492542266845703s
(12500, 18308)
n_samples: 12500, n_features: 18308 

Extracting features from the test dataset using the same vectorizer
done in 4.991715908050537s
n_samples: 25000, n_features: 18308 

[ 1766 11919  8909 ...,  5390   860  7270]
Loading took 12.12s.



In [3]:
# Sanity check: feature matrices first, then label vectors, one shape per line.
for split in (X_train, X_val, X_test, y_train, y_val, y_test):
    print(split.shape)

(12500, 18308)
(12500, 18308)
(25000, 18308)
(12500,)
(12500,)
(25000,)


In [4]:
# Restore the classifier archive saved by experiment 8.
# NOTE: unpickling can execute arbitrary code -- only load archives you trust.
with open('clf8.arch', 'rb') as arch_file:
    clf_arch = load(arch_file)

In [5]:
# Show the archive's summary; the captured output lists the classifier class,
# a list of integers (presumably the round tags -- confirm), and the vectorizer.
clf_arch.stats()

<class 'classifiers.TransparentMultinomialNB'> 

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [6]:
# Pull the control classifier and the final round's state out of the archive.
# "best_clf" is simply the last classifier stored, not one selected by a metric.
ctrl_clf = clf_arch.ctrl_clf
best_clf = clf_arch.classifiers[-1]

# Indices of the training samples used in the final round (list form and set form).
train_indices = clf_arch.train_indices[-1]
train_indices_set = set(train_indices)

# Copy the labels so later edits cannot alter the archive's own array.
y_modified = np.copy(clf_arch.modified_labels[-1])
round_tag = 1 + clf_arch.round_tags[-1]

In [7]:
# First classifier stored in the archive, kept to compare against the last one.
first_clf = clf_arch.classifiers[0]

## Visualizations

In [8]:
print('Number of samples used :', len(train_indices))

# Labels of the samples actually used for training: modified vs. original.
modified_used = y_modified[train_indices]
original_used = y_train[train_indices]
was_changed = modified_used != original_used

# One row per changed sample: column 0 = modified label, column 1 = original label.
# column_stack keeps the (k, 2) shape even when k == 0, so column indexing such as
# changed_labels[:, 0] stays valid when no label was modified (the previous
# filter/zip construction produced a shape-(0,) array in that case).
changed_labels = np.column_stack((modified_used[was_changed], original_used[was_changed]))
print('Number of labels modified:', len(changed_labels))

Number of samples used : 8347
Number of labels modified: 6029


In [9]:
# modified - original: negative means a label went 1 -> 0, positive means 0 -> 1.
modified_col = changed_labels[:, 0]
original_col = changed_labels[:, 1]
changes = modified_col - original_col

flipped_to_zero = np.count_nonzero(changes < 0)
flipped_to_one = np.count_nonzero(changes > 0)
print('1 to 0 :', flipped_to_zero)
print('0 to 1 :', flipped_to_one)

1 to 0 : 0
0 to 1 : 6029


In [10]:
# Accuracy / squared-error report for the control, initial, first and last classifiers.
# Accuracies are kept in val_acc / test_acc and errors in val_err / test_err; the
# errors used to be written into test_acc, which overwrote the accuracy with a value
# of a different meaning. Printed output is unchanged.

# Control classifier from the archive, scored on the test set.
test_acc = ctrl_clf.score(X_test, y_test)
print('Control test accuracy is {}'.format(test_acc), '\n')

# "Initial" baseline: a fresh classifier fitted on the unmodified training labels.
clf = TransparentMultinomialNB()
clf.fit(X_train, y_train)

val_acc = clf.score(X_val, y_val)
print('Initial validation accuracy is {}'.format(val_acc), '\n')

test_acc = clf.score(X_test, y_test)
print('Initial test accuracy is {}'.format(test_acc), '\n')

# First classifier stored in the archive.
val_acc = first_clf.score(X_val, y_val)
print('First validation accuracy is {}'.format(val_acc), '\n')

test_acc = first_clf.score(X_test, y_test)
print('First test accuracy is {}'.format(test_acc), '\n')

# Last classifier stored in the archive.
val_acc = best_clf.score(X_val, y_val)
print('Best validation accuracy is {}'.format(val_acc), '\n')

test_acc = best_clf.score(X_test, y_test)
print('Best test accuracy is {}'.format(test_acc), '\n')

# ce_squared errors on the predicted class probabilities.
val_err = ce_squared(y_val_na, first_clf.predict_proba(X_val))
print('First validation error is {}'.format(val_err), '\n')

test_err = ce_squared(y_test_na, first_clf.predict_proba(X_test))
print('First test error is {}'.format(test_err), '\n')

# NOTE(review): this refit on the modified labels is not used by anything visible
# in this notebook (the "Best" numbers below come from best_clf). It is kept so
# `clf` ends the cell bound to the same model as before -- confirm whether any
# later cell depends on it, otherwise it can be removed.
clf = TransparentMultinomialNB()
clf.fit(X_train[train_indices], y_modified[train_indices])

val_err = ce_squared(y_val_na, best_clf.predict_proba(X_val))
print('Best validation error is {}'.format(val_err), '\n')

test_err = ce_squared(y_test_na, best_clf.predict_proba(X_test))
print('Best test error is {}'.format(test_err))

Control test accuracy is 0.5 

Initial validation accuracy is 0.0 

Initial test accuracy is 0.5 

First validation accuracy is 0.59912 

First test accuracy is 0.69436 

Best validation accuracy is 0.99984 

Best test accuracy is 0.57748 

First validation error is 0.3500657418774514 

First test error is 0.26877112836601397 

Best validation error is 0.0001604393209642369 



  self.class_log_prior_ = (np.log(self.class_count_)


Best test error is 0.3923594148407452


In [11]:
# Hard predictions on the test set for the last and the control classifier.
best_pred = best_clf.predict(X_test)
ctrl_pred = ctrl_clf.predict(X_test)

# Headings use a properly closed <b>...</b> element; the earlier "<b>...<b>"
# form never closed the bold tag.
display_html("<b>Control Classifier</b>", raw=True)
print(classification_report(y_test, ctrl_pred))
display_html("<b>Best Classifier</b>", raw=True)
print(classification_report(y_test, best_pred))

             precision    recall  f1-score   support

          0       0.50      1.00      0.67     12500
          1       0.00      0.00      0.00     12500

avg / total       0.25      0.50      0.33     25000



  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.96      0.16      0.28     12500
          1       0.54      0.99      0.70     12500

avg / total       0.75      0.58      0.49     25000



In [12]:
# Per-feature weights of each classifier, and feature indices sorted by ascending
# weight (assumes get_weights() returns one weight per feature -- TODO confirm).
best_weights = best_clf.get_weights()
ctrl_weights = ctrl_clf.get_weights()

best_ws = np.argsort(best_weights)
ctrl_ws = np.argsort(ctrl_weights)

# Same report for both classifiers: the ten highest-weight and ten lowest-weight
# features, each followed by its column of feature_count_.
# Headings use a properly closed <b>...</b> element (previously "<b>...<b>").
for heading, model, order in (
    ("<b>Control Classifier</b>", ctrl_clf, ctrl_ws),
    ("<b>Best Classifier</b>", best_clf, best_ws),
):
    display_html(heading, raw=True)

    print("Top Positive")
    print(" ".join(["{} ({})".format(feature_names[i], model.feature_count_[:,i])
                    for i in order[-10:][::-1]]))

    print("\nTop Negative")
    print(" ".join(["{} ({})".format(feature_names[i], model.feature_count_[:,i])
                    for i in order[:10]]))

Top Positive
reappearing ([ 5.  0.]) chiefs ([ 5.  0.]) idaho ([ 5.  0.]) fanaticism ([ 5.  0.]) oxford ([ 5.  0.]) concur ([ 5.  0.]) medications ([ 5.  0.]) unimaginably ([ 5.  0.]) sidetracked ([ 5.  0.]) bloodletting ([ 5.  0.])

Top Negative
the ([ 163405.       0.]) and ([ 74393.      0.]) of ([ 69009.      0.]) to ([ 68975.      0.]) br ([ 52636.      0.]) is ([ 50085.      0.]) it ([ 48393.      0.]) in ([ 43755.      0.]) this ([ 40922.      0.]) that ([ 37640.      0.])


Top Positive
pia ([  0.  30.]) connery ([  0.  29.]) hamlet ([  0.  27.]) wolverine ([  0.  24.]) oprah ([  0.  24.]) tired ([   4.  119.]) spice ([  0.  22.]) fabulous ([  0.  22.]) gem ([  1.  44.]) funniest ([  2.  64.])

Top Negative
cortes ([ 12.   0.]) unisol ([ 11.   0.]) shen ([ 10.   0.]) ripley ([ 20.   1.]) jigsaw ([ 30.   2.]) ziering ([ 9.  0.]) pasolini ([ 26.   2.]) aztec ([ 25.   2.]) whaling ([ 7.  0.]) hartnett ([ 14.   1.])


In [13]:
# Score the test set once per classifier; the results are reused below instead of
# calling predict_proba(X_test) again for every print.
ctrl_proba = ctrl_clf.predict_proba(X_test)
best_proba = best_clf.predict_proba(X_test)

# Absolute gap between the two classifiers' first-column probabilities, then the
# test document where that gap is largest.
x = np.absolute(ctrl_proba[:, 0] - best_proba[:, 0])
inds = np.argsort(x)
i = inds[-1]

# Headings use a properly closed <b>...</b> element (previously "<b>...<b>").
display_html("<b>Probability Matrix</b>", raw=True)
print(ctrl_proba[i])
print(best_proba[i])
display_html("<b>Control Classifier</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names, ctrl_clf.get_weights()))
display_html("<b>Best Classifier</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names, best_clf.get_weights()))

[ 1.  0.]
[  2.27865237e-44   1.00000000e+00]
