# Experiment 3 Results

In [65]:
from pickle import load
from utils import ce_squared, load_imdb, ColoredWeightedDoc
from IPython.display import display, display_html
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from classifiers import TransparentMultinomialNB
from time import time
import numpy as np

## Load Data

In [2]:
# Wall-clock timer for the whole load + vectorise step.
t0 = time()

# Unigram tf-idf features with min_df=5 and no upper document-frequency cut.
vect = TfidfVectorizer(
    min_df=5,
    max_df=1.0,
    binary=False,
    ngram_range=(1, 1),
)

# NOTE(review): shuffle=True with no seed visible in this cell -- confirm that
# load_imdb shuffles deterministically, because later cells split by position
# and index into these arrays with indices stored in a pickled archive.
(X_train, y_train,
 X_test, y_test,
 train_corpus, test_corpus) = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)

feature_names = vect.get_feature_names()

# Two-column label matrix: column 0 holds y_test, column 1 holds 1 - y_test.
y_test_na = y_test[:, np.newaxis]
y_test_na = np.append(y_test_na, 1 - y_test_na, axis=1)

# Working copy of the training labels.
y_modified = np.copy(y_train)

duration = time() - t0

print("Loading took {:0.2f}s.\n".format(duration))

Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None).
done in 5.645135164260864s
(25000, 27272)
n_samples: 25000, n_features: 27272 

Extracting features from the test dataset using the same vectorizer
done in 5.47081995010376s
n_samples: 25000, n_features: 27272 

Loading took 12.76s.



In [3]:
# Hold out everything after the first N_TRAIN rows of the labelled data as a
# validation set and keep the first N_TRAIN rows for training.
N_TRAIN = 12500

# This cell rebinds X_train / y_train. Without this guard, running it a second
# time would slice the already-truncated arrays and silently produce an empty
# validation set, so fail loudly instead.
if X_train.shape[0] <= N_TRAIN:
    raise RuntimeError(
        "X_train has only {} rows, so nothing is left for validation; "
        "this cell may already have been run. Re-run the data loading "
        "cell first.".format(X_train.shape[0])
    )

X_val = csr_matrix(X_train[N_TRAIN:])
y_val = np.copy(y_train[N_TRAIN:])

X_train = csr_matrix(X_train[:N_TRAIN])
y_train = np.copy(y_train[:N_TRAIN])

# Two-column label matrix: column 0 holds y_val, column 1 holds 1 - y_val.
y_val_na = y_val[:, np.newaxis]
y_val_na = np.append(y_val_na, 1 - y_val_na, axis=1)

In [4]:
# Restore the pickled classifier archive from disk.
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# archives that come from a trusted source.
with open('clf4.arch', 'rb') as archive_file:
    clf_arch = load(archive_file)

In [5]:
# Print a summary of the loaded archive. Per the recorded output this shows the
# classifier class, a list of integers (meaning not visible here -- TODO confirm
# against the archive class), and the vectorizer configuration.
clf_arch.stats()

<class 'classifiers.TransparentMultinomialNB'> 

[1, 8, 2, 9, 3, 10, 4, 5, 6, 7, 8, 9, 10] 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


In [6]:
# Take the control classifier plus the final entry of every per-round list
# stored in the archive.
ctrl_clf = clf_arch.ctrl_clf
best_clf = clf_arch.classifiers[-1]

# Training indices of that final entry, as stored and as a set.
train_indices = clf_arch.train_indices[-1]
train_indices_set = set(train_indices)

# Copy the labels so edits made here never touch the archive's own array.
y_modified = np.copy(clf_arch.modified_labels[-1])

# Tag one past the last archived round tag.
round_tag = clf_arch.round_tags[-1] + 1

## Visualizations

In [7]:
# Compare the archived (modified) labels with the original training labels on
# the samples that were actually used.
print('Number of samples used :', len(train_indices))

modified_subset = y_modified[train_indices]
original_subset = y_train[train_indices]
differs = modified_subset != original_subset

# One row per changed sample: column 0 = modified label, column 1 = original.
# column_stack keeps the (k, 2) shape even when k == 0, so column indexing in
# later cells does not fail when no label was changed.
changed_labels = np.column_stack((modified_subset[differs], original_subset[differs]))
print('Number of labels modified:', len(changed_labels))

Number of samples used : 7759
Number of labels modified: 465


In [8]:
# modified - original: negative means the label went 1 -> 0, positive 0 -> 1.
changes = changed_labels[:, 0] - changed_labels[:, 1]
print('1 to 0 :', np.count_nonzero(changes < 0))
print('0 to 1 :', np.count_nonzero(changes > 0))

1 to 0 : 210
0 to 1 : 255


In [69]:
# Message templates shared by the prints below. The wording is unchanged, so
# the output lines are identified only by their order:
#   1-2: best_clf (validation, test)
#   3:   ctrl_clf (test)
#   4-5: clf, a fresh TransparentMultinomialNB fit on X_train (test, validation)
_VAL_MSG = 'Validation accuracy is {}'
_TEST_MSG = 'Test accuracy is {}'

# Last classifier from the archive.
val_acc = best_clf.score(X_val, y_val)
print(_VAL_MSG.format(val_acc))

test_acc = best_clf.score(X_test, y_test)
print(_TEST_MSG.format(test_acc))

# Baseline trained here on the unmodified training labels.
clf = TransparentMultinomialNB()
clf.fit(X_train, y_train)

# Control classifier from the archive.
test_acc = ctrl_clf.score(X_test, y_test)
print(_TEST_MSG.format(test_acc))

# Freshly trained baseline. val_acc / test_acc end up holding its scores.
test_acc = clf.score(X_test, y_test)
print(_TEST_MSG.format(test_acc))

val_acc = clf.score(X_val, y_val)
print(_VAL_MSG.format(val_acc))


Validation accuracy is 0.89664
Test accuracy is 0.86596
Test accuracy is 0.83216
Test accuracy is 0.8254
Validation accuracy is 0.85568


In [11]:
# Per-class precision / recall / F1 on the test set for both archived models.
best_pred = best_clf.predict(X_test)
ctrl_pred = ctrl_clf.predict(X_test)

# Headings use a properly closed <b>...</b> element (the closing tag was
# previously written as a second opening tag).
display_html("<b>Control Classifier</b>", raw=True)
print(classification_report(y_test, ctrl_pred))

display_html("<b>Best Classifier</b>", raw=True)
print(classification_report(y_test, best_pred))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84     12500
          1       0.87      0.78      0.82     12500

avg / total       0.84      0.83      0.83     25000



             precision    recall  f1-score   support

          0       0.86      0.87      0.87     12500
          1       0.87      0.86      0.87     12500

avg / total       0.87      0.87      0.87     25000



In [12]:
# Rank features by each classifier's weight vector (ascending order, so the
# smallest weights come first and the largest last).
best_weights = best_clf.get_weights()
ctrl_weights = ctrl_clf.get_weights()

best_ws = np.argsort(best_weights)
ctrl_ws = np.argsort(ctrl_weights)


def _format_features(model, indices):
    """Return 'name (feature_count column)' for each feature index, space-joined."""
    return " ".join(["{} ({})".format(feature_names[i], model.feature_count_[:, i])
                     for i in indices])


def _show_top_features(title, model, order, k=10):
    """Display a bold heading, then the k highest- and k lowest-weighted features.

    title -- heading text rendered in bold HTML
    model -- fitted classifier exposing ``feature_count_``
    order -- feature indices sorted by ascending weight
    k     -- number of features to list on each side
    """
    display_html("<b>" + title + "</b>", raw=True)

    print("Top Positive")
    # Largest weights sit at the end of `order`; reverse to list the top one first.
    print(_format_features(model, order[-k:][::-1]))

    print("\nTop Negative")
    print(_format_features(model, order[:k]))


_show_top_features('Control Classifier', ctrl_clf, ctrl_ws)
_show_top_features('Best Classifier', best_clf, best_ws)

Top Positive
paulie ([  0.0572581   11.62001835]) edie ([ 0.          9.50310828]) antwone ([ 0.          7.72889767]) matthau ([  0.78248313  13.0938614 ]) goldsworthy ([ 0.          6.79732636]) din ([ 0.          6.77401576]) victoria ([  1.0431436   13.36768916]) wonderfully ([  2.47759161  22.57390329]) felix ([ 0.48385479  9.01038775]) gundam ([ 0.19220119  7.0279382 ])

Top Negative
waste ([ 101.13224211    5.46404149]) worst ([ 145.77726278   10.81685456]) pointless ([ 34.66318443   2.24914225]) seagal ([ 13.48847522   0.32485711]) boll ([ 10.67268495   0.0988785 ]) unfunny ([ 21.58484167   1.14247772]) awful ([ 97.85823623   8.45586203]) mst3k ([ 15.01247428   0.64075814]) unwatchable ([ 10.95370109   0.24189979]) stinker ([ 10.64923928   0.23665215])


Top Positive
superb ([  1.10649782  16.6795776 ]) wonderful ([  2.96593022  32.14253751]) amazing ([  2.40158022  24.02243591]) wonderfully ([ 0.22295821  7.67755475]) loved ([  3.21779012  27.36536967]) favorite ([  2.55847621  21.98618713]) fantastic ([  1.86691541  17.22070786]) touching ([ 0.70935884  9.8430855 ]) gem ([ 0.62315025  9.09394098]) refreshing ([ 0.12773228  5.89928259])

Top Negative
waste ([ 35.53800166   0.48004901]) worst ([ 48.52448864   1.49461174]) awful ([ 33.24351215   0.79430814]) poorly ([ 17.60087299   0.31477437]) pointless ([ 12.93774176   0.23736166]) terrible ([ 28.46097263   1.63882491]) pathetic ([ 11.63340441   0.29058524]) lame ([ 16.20249451   0.87599945]) unfunny ([ 8.08817749  0.        ]) worse ([ 25.26927848   1.99183594])


In [58]:
# Shape sanity check for weighting one document's tf-idf row by the feature
# weights. This cell previously relied on `a` (never defined in the notebook)
# and `i` (only defined in a later cell), so it failed on a fresh kernel.
# The shapes do not depend on which row is inspected, so row 0 is used.
row_idx = 0
a = np.transpose(X_test[row_idx].todense())
print(a.shape)

# Transposing the weight vector leaves its shape unchanged; see the recorded output.
b = np.transpose(best_clf.get_weights()).shape

# The old `c = np.multiply(a, b)` multiplied by the shape tuple `b` rather than
# by the weights, so it has been removed. The element-wise weighting itself is
# done on the untransposed row where the documents are displayed.
b

(27272, 1)


(27272,)

In [43]:
# Find the test document on which the two classifiers disagree the most and
# show it highlighted by each classifier's per-feature contribution.

# Compute each probability matrix once and reuse it below.
ctrl_proba = ctrl_clf.predict_proba(X_test)
best_proba = best_clf.predict_proba(X_test)

# Absolute difference in the first probability column, sorted ascending.
x = ctrl_proba - best_proba
x = np.absolute(x[:, 0])
inds = np.argsort(x)
i = inds[-1]  # index of the largest disagreement

display_html("<b>Probability Matrix</b>", raw=True)
print(ctrl_proba[i])
print(best_proba[i])

# np.multiply is an element-wise product (not a dot product): each tf-idf value
# of the document is scaled by the corresponding feature weight.
display_html("<b>Control Classifier</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names,
                           np.multiply(X_test[i].todense(), ctrl_clf.get_weights())))
display_html("<b>Best Classifier</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names,
                           np.multiply(X_test[i].todense(), best_clf.get_weights())))

[ 0.57250962  0.42749038]
[ 0.29450805  0.70549195]
