In [1]:
from classifiers import TransparentMultinomialNB
from utils import ce_squared, load_imdb, ColoredWeightedDoc, TopInstances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import display, display_html
from time import time
from pickle import load
import numpy as np
from scipy.sparse import csr_matrix

In [4]:
# Load the IMDB review data and vectorize it with TF-IDF unigrams, timing the
# whole step. Terms appearing in fewer than 5 documents are dropped (min_df=5).
t0 = time()

vect = TfidfVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))

# NOTE(review): shuffle=True with no visible seed -- confirm whether load_imdb
# accepts a random_state so the split used below is reproducible.
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb(
    "../aclImdb", shuffle=True, vectorizer=vect
)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back on old versions.
# The result is kept as a plain list so feature_names[i] behaves as before.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

duration = time() - t0

print("Loading took {:0.2f}s.\n".format(duration))

Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None).
done in 5.735958814620972s
(25000, 27272)
n_samples: 25000, n_features: 27272 

Extracting features from the test dataset using the same vectorizer
done in 5.422744274139404s
n_samples: 25000, n_features: 27272 

Loading took 22.36s.



In [5]:
# Control (baseline) classifier: fitted on X_train / y_train as they exist at
# this point in the notebook, i.e. before the later cell that overwrites them
# with the first 12500 rows. Re-running this cell after that split would train
# on the reduced set instead.
ctrl_clf = TransparentMultinomialNB()
ctrl_clf.fit(X_train,y_train)
# Last expression of the cell: the test-set score is shown as the cell output.
ctrl_clf.score(X_test, y_test)

0.83216000000000001

In [6]:
# Split the loaded training data: rows from n_fit onward become the validation
# set, rows before it remain the training set.
# NOTE: X_train / y_train are overwritten here, so this cell is not idempotent;
# running it twice slices the already-reduced arrays.
n_fit = 12500

X_val, y_val = csr_matrix(X_train[n_fit:, :]), np.copy(y_train[n_fit:])
X_train, y_train = csr_matrix(X_train[:n_fit]), np.copy(y_train[:n_fit])

# Two-column target matrix for ce_squared: column 0 is y, column 1 is 1 - y.
# NOTE(review): predict_proba columns are normally ordered [class 0, class 1];
# confirm that ce_squared expects the targets in this (y, 1 - y) order.
y_val_col = y_val.reshape(-1, 1)
y_val_na = np.append(y_val_col, 1 - y_val_col, axis=1)

# Working copy of the training labels; the experiment loop flips entries of
# this array and leaves y_train untouched.
y_modified = np.copy(y_train)

In [7]:
# Quick sanity checks on the shapes / types of the loaded data.
print(X_test.shape)
c = X_train.shape
# np.copy() does not copy a scipy sparse matrix: it wraps the same object in a
# 0-d object array (which is why the shape could not be set afterwards). The
# matrix's own .copy() returns a real, independent copy with shape == c.
b = X_train.copy()
print(type(y_test))

(25000, 27272)
<class 'numpy.ndarray'>


# Experiment

In [20]:
# Timing breakdown of one iteration of the experiment loop. The timestamps
# t0..t7 are set inside that loop, so the experiment cell must be run first
# (on a fresh kernel this cell raises NameError otherwise).
# Printed in order: first fit, first validation error, label flip,
# classifier construction, second fit, whole iteration.
print(t1-t0,'\n',t2-t1,'\n',t3-t2,'\n',t4-t3,'\n',t5-t4,'\n',t7-t0)
# The partial intervals must add up to the total. Floating-point addition is
# not exact, so compare with a tolerance instead of ==.
assert np.isclose((t1-t0)+(t7-t1), t7-t0)

0.004069328308105469 
 0.019451141357421875 
 5.7220458984375e-06 
 1.52587890625e-05 
 0.003274679183959961 
 0.04511833190917969


In [17]:
# Greedy selection of training samples and labels.
#
# Starting from a small bootstrap set, each candidate sample i is tried twice:
# once with its current label and once with the label flipped. The variant
# giving the lowest validation error (as measured by ce_squared) is kept if it
# improves on the best error so far; otherwise the sample is rejected.
# t0..t7 time the individual stages and are read by the timing-report cell.
clf = TransparentMultinomialNB()
train_indices = list(range(10))  # bootstrap set: the first 10 training rows

clf.fit(X_train[train_indices], y_train[train_indices])

best_clf = clf
current_error = ce_squared(y_val_na, clf.predict_proba(X_val))

# NOTE(review): range(1) runs a single iteration (i = 0), which looks like a
# timing run; index 0 is already in the bootstrap set, so it is appended a
# second time. Confirm the intended candidate range for a full run.
for i in range(1):
    train_indices.append(i)

    # Variant A: sample i with its current label.
    t0 = time()
    clf_orig = TransparentMultinomialNB()
    clf_orig.fit(X_train[train_indices], y_modified[train_indices])
    t1 = time()
    y_error = ce_squared(y_val_na, clf_orig.predict_proba(X_val))
    t2 = time()

    # Variant B: sample i with its label flipped.
    y_modified[i] = 1 - y_modified[i]
    t3 = time()
    clf = TransparentMultinomialNB()
    t4 = time()
    clf.fit(X_train[train_indices], y_modified[train_indices])
    t5 = time()
    y0_error = ce_squared(y_val_na, clf.predict_proba(X_val))
    t6 = time()

    if y_error < current_error and y_error < y0_error:
        # Current label wins: switch the label back and keep the classifier
        # already fitted for variant A (refitting on the same data would
        # only repeat that work).
        current_error = y_error
        y_modified[i] = 1 - y_modified[i]
        clf = clf_orig
        best_clf = clf
    elif y0_error < current_error and y0_error < y_error:
        # Flipped label wins: keep the flip and the variant-B classifier.
        current_error = y0_error
        best_clf = clf
    else:
        # Neither variant improves the error: reject the sample. Undo the
        # trial flip as well, so y_modified only records accepted flips and a
        # rejected index that is still in train_indices keeps its label.
        y_modified[i] = 1 - y_modified[i]
        train_indices.pop()

    t7 = time()

In [38]:
# Score the selected classifier on the validation split, then on the test
# split; the cell output is the number of training indices that were kept.
for X_eval, y_eval in ((X_val, y_val), (X_test, y_test)):
    print(best_clf.score(X_eval, y_eval))
len(train_indices)

0.88112
0.86072


5337

In [2]:
def _load_pickled(path):
    """Return the object unpickled from the file at `path`.

    Unpickling can execute arbitrary code, so only use this on files that
    were produced locally and are trusted.
    """
    with open(path, 'rb') as fh:
        return load(fh)


# Restore previously saved classifiers; these replace any best_clf / ctrl_clf
# objects currently in the kernel.
best_clf = _load_pickled('best.clf')
ctrl_clf = _load_pickled('ctrl.clf')

# Visualization

In [39]:
# Show, for one test document, how the control and the selected classifier
# differ: class probabilities first, then the document coloured by each
# classifier's feature weights.

# Compute each probability matrix once and reuse it below.
ctrl_proba = ctrl_clf.predict_proba(X_test)
best_proba = best_clf.predict_proba(X_test)

# Absolute difference in the first probability column, sorted ascending.
x = np.absolute((ctrl_proba - best_proba)[:, 0])
inds = np.argsort(x)
# Document with the second-largest disagreement. Indexing from the end keeps
# this correct for any test-set size (24998 only worked for 25000 rows).
i = inds[-2]

# The bold headers need a closing </b>; a second "<b>" leaves the tag open.
display_html("<b>Probability Matrix</b>", raw=True)
print(ctrl_proba[i])
print(best_proba[i])
display_html("<b>Control Classifier</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names, ctrl_clf.get_weights()))
display_html("<b>Best Classifier</b>", raw=True)
display(ColoredWeightedDoc(test_corpus[i], feature_names, best_clf.get_weights()))

[ 0.57250962  0.42749038]
[ 0.27419539  0.72580461]


In [40]:
# Per-class precision / recall / F1 on the test set for both classifiers.
best_pred = best_clf.predict(X_test)
ctrl_pred = ctrl_clf.predict(X_test)

# The bold headers need a closing </b>; a second "<b>" leaves the tag open.
display_html("<b>Control Classifier</b>", raw=True)
print(classification_report(y_test, ctrl_pred))
display_html("<b>Best Classifier</b>", raw=True)
print(classification_report(y_test, best_pred))

             precision    recall  f1-score   support

          0       0.80      0.89      0.84     12500
          1       0.87      0.78      0.82     12500

avg / total       0.84      0.83      0.83     25000



             precision    recall  f1-score   support

          0       0.86      0.86      0.86     12500
          1       0.86      0.86      0.86     12500

avg / total       0.86      0.86      0.86     25000



In [41]:
# List the 10 highest- and 10 lowest-weighted features of each classifier,
# each followed by its column of the classifier's feature_count_ matrix.
best_weights = best_clf.get_weights()
ctrl_weights = ctrl_clf.get_weights()

# Feature indices ordered by ascending weight.
best_ws = np.argsort(best_weights)
ctrl_ws = np.argsort(ctrl_weights)


def _describe_features(clf, indices):
    """Return 'name (counts)' entries for the given feature indices, space-joined."""
    return " ".join("{} ({})".format(feature_names[j], clf.feature_count_[:, j])
                    for j in indices)


def _show_top_features(title, clf, order, k=10):
    """Print the k highest- and k lowest-weighted features under a bold title.

    `order` holds feature indices sorted by ascending weight, so the top
    positive features are its last k entries (reversed) and the top negative
    features are its first k entries.
    """
    # The bold header needs a closing </b>; a second "<b>" leaves the tag open.
    display_html("<b>" + title + "</b>", raw=True)

    print("Top Positive")
    print(_describe_features(clf, order[-k:][::-1]))

    print("\nTop Negative")
    print(_describe_features(clf, order[:k]))


_show_top_features('Best Classifier', best_clf, best_ws)
_show_top_features('Control Classifier', ctrl_clf, ctrl_ws)

Top Positive
wonderful ([  1.19551702  22.79785009]) superb ([  0.5528176   13.12307293]) excellent ([  1.98645737  24.17245081]) loved ([  1.50528922  19.84020344]) amazing ([  1.34020596  16.55996559]) favorite ([  1.30860228  16.29255888]) highly ([  1.33801318  14.29483079]) touching ([ 0.36013222  7.25338475]) wonderfully ([ 0.06938739  5.40624068]) paulie ([ 0.          4.56918776])

Top Negative
worst ([ 35.17669943   0.26319784]) awful ([ 23.74516724   0.2617469 ]) waste ([ 26.0352034    0.38849002]) terrible ([ 21.94523807   0.75216723]) stupid ([ 20.8121898    1.22352904]) worse ([ 17.94208567   0.9841256 ]) wasted ([ 10.39383326   0.265692  ]) poorly ([ 12.4327081    0.49691481]) boring ([ 23.62444942   1.74455798]) crap ([ 15.21951839   0.95344576])


Top Positive
paulie ([  0.0572581   11.62001835]) edie ([ 0.          9.50310828]) antwone ([ 0.          7.72889767]) matthau ([  0.78248313  13.0938614 ]) goldsworthy ([ 0.          6.79732636]) din ([ 0.          6.77401576]) victoria ([  1.0431436   13.36768916]) wonderfully ([  2.47759161  22.57390329]) felix ([ 0.48385479  9.01038775]) gundam ([ 0.19220119  7.0279382 ])

Top Negative
waste ([ 101.13224211    5.46404149]) worst ([ 145.77726278   10.81685456]) pointless ([ 34.66318443   2.24914225]) seagal ([ 13.48847522   0.32485711]) boll ([ 10.67268495   0.0988785 ]) unfunny ([ 21.58484167   1.14247772]) awful ([ 97.85823623   8.45586203]) mst3k ([ 15.01247428   0.64075814]) unwatchable ([ 10.95370109   0.24189979]) stinker ([ 10.64923928   0.23665215])
