In [1]:
%load_ext autoreload
%autoreload 2
from tox import *

In [2]:
# Input file and the document fields used as label / text.
bson_file = 'documents.bson'
label_key = 'document_type'
text_key = 'text'

# Process the raw data: load the documents and their labels from the BSON dump.
dp = DataProcessor()
docs, y_all, counts = dp.load_bson(bson_file, label_key)

# Time only the vectorization step. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by a
# system clock adjustment the way a time.time() difference can.
t0 = time.perf_counter()
vectorizer, X_all, feat_names = dp.vectorize(docs, text_key, min_df=2, max_ngram=2)
vec_time = time.perf_counter() - t0

# Partition into train / test / unlabelled subsets.
# NOTE(review): split=0.7 is presumably the train fraction of the labelled
# documents and seed=0 presumably fixes the shuffle -- confirm in DataProcessor.
y_train, X_train, y_test, X_test, y_unlab, X_unlab = dp.split_data(y_all, X_all, split=0.7, seed=0)

# Shared helper used by every model cell below to train, test and print scores.
me = ModelEvaluator()

print(vec_time)       # seconds spent in dp.vectorize
print(X_all.shape)    # shape of the full feature matrix
print(dp.label_dict)  # label name -> integer class id

150.45795798301697
(24085, 59099)
{'scientific_article_unpublished': 8, 'email': 0, 'scientific_article_published': 9, 'boardroom_minutes': 2, 'advertisement': 10, 'annual_report': 3, 'internal_memo': 1, 'newspaper_article': 6, 'deposition': 7, 'public_relations': 4, 'trade_association': 11, 'general_correspondance': 5}


In [3]:
# Multinomial Naive Bayes baseline with default hyperparameters.
MNB = MultinomialNB()

# Fit on the training split, then evaluate on the held-out test split.
MNB_train_acc, MNB_train_time = me.train(MNB, y_train, X_train)
MNB_test_acc, MNB_test_prec, MNB_test_rec, MNB_test_time = me.test(MNB, y_test, X_test)

# print() already separates its arguments with a space, so the label must not
# end in one; otherwise this cell prints "Time:  <t>" (two spaces) while the
# other model cells print "Time: <t>".
print('Time:', MNB_train_time)

# NOTE(review): the recorded run shows warnings from the precision computation
# and from the F1 expression 2/(1/prec[i] + 1/rec[i]) in print_scores; the
# latter divides by zero for classes whose precision or recall is 0.
me.print_scores(dp, MNB_test_acc, MNB_test_prec, MNB_test_rec)

Time:  0.06094193458557129
Accuracy: 0.686258935663
email 
 F1:  0.965, P:  0.940, R:  0.992, 
internal_memo 
 F1:  0.000, P:  0.000, R:  0.000, 
boardroom_minutes 
 F1:  0.000, P:  0.000, R:  0.000, 
annual_report 
 F1:  0.000, P:  0.000, R:  0.000, 
public_relations 
 F1:  0.000, P:  0.000, R:  0.000, 
general_correspondance 
 F1:  0.056, P:  0.667, R:  0.029, 
newspaper_article 
 F1:  0.596, P:  0.683, R:  0.528, 
deposition 
 F1:  0.179, P:  0.909, R:  0.099, 
scientific_article_unpublished 
 F1:  0.692, P:  0.564, R:  0.896, 
scientific_article_published 
 F1:  0.020, P:  1.000, R:  0.010, 




  'precision', 'predicted', average, warn_for)
  dp.inv_label_dict[i], 2/(1/prec[i] + 1/rec[i]), prec[i], rec[i]))


In [4]:
# Bernoulli Naive Bayes baseline with default hyperparameters.
BNB = BernoulliNB()

# Fit on the training split; the result is unpacked as (accuracy, time).
BNB_train_acc, BNB_train_time = me.train(BNB, y_train, X_train)

# Evaluate on the held-out test split.
(BNB_test_acc,
 BNB_test_prec,
 BNB_test_rec,
 BNB_test_time) = me.test(BNB, y_test, X_test)

# Report the training time followed by the accuracy and per-class scores.
# NOTE(review): the recorded run shows division warnings from print_scores for
# classes whose precision or recall is 0.
print('Time:', BNB_train_time)
me.print_scores(dp, BNB_test_acc, BNB_test_prec, BNB_test_rec)

Time: 0.057497501373291016
Accuracy: 0.676727561557
email 
 F1:  0.966, P:  0.959, R:  0.973, 
internal_memo 
 F1:  0.000, P:  0.000, R:  0.000, 
boardroom_minutes 
 F1:  0.000, P:  0.000, R:  0.000, 
annual_report 
 F1:  0.000, P:  0.000, R:  0.000, 
public_relations 
 F1:  0.000, P:  0.000, R:  0.000, 
general_correspondance 
 F1:  0.000, P:  0.000, R:  0.000, 
newspaper_article 
 F1:  0.574, P:  0.631, R:  0.527, 
deposition 
 F1:  0.000, P:  0.000, R:  0.000, 
scientific_article_unpublished 
 F1:  0.693, P:  0.563, R:  0.900, 
scientific_article_published 
 F1:  0.000, P:  0.000, R:  0.000, 




  'precision', 'predicted', average, warn_for)
  dp.inv_label_dict[i], 2/(1/prec[i] + 1/rec[i]), prec[i], rec[i]))


In [5]:
# LinearSVC (liblinear SVM implementation, one-vs-rest).
# liblinear shuffles the data with a pseudo-random generator while fitting, so
# without a fixed random_state two runs on the same split can give slightly
# different models. Pin it so the scores below are repeatable; this fitted
# model is also reused later for decision_function / predict on X_unlab.
SVMlin = LinearSVC(random_state=0)

# Fit on the training split, then evaluate on the held-out test split.
SVMlin_train_acc, SVMlin_train_time = me.train(SVMlin, y_train, X_train)
SVMlin_test_acc, SVMlin_test_prec, SVMlin_test_rec, SVMlin_test_time = me.test(SVMlin, y_test, X_test)

# Report the training time followed by the accuracy and per-class scores.
print('Time:', SVMlin_train_time)
me.print_scores(dp, SVMlin_test_acc, SVMlin_test_prec, SVMlin_test_rec)

Time: 0.3663311004638672
Accuracy: 0.769658459095
email 
 F1:  0.983, P:  0.980, R:  0.987, 
internal_memo 
 F1:  0.577, P:  0.757, R:  0.467, 
boardroom_minutes 
 F1:  0.679, P:  0.818, R:  0.581, 
annual_report 
 F1:  0.533, P:  1.000, R:  0.364, 
public_relations 
 F1:  0.286, P:  1.000, R:  0.167, 
general_correspondance 
 F1:  0.455, P:  0.514, R:  0.409, 
newspaper_article 
 F1:  0.696, P:  0.677, R:  0.715, 
deposition 
 F1:  0.623, P:  0.940, R:  0.465, 
scientific_article_unpublished 
 F1:  0.762, P:  0.708, R:  0.824, 
scientific_article_published 
 F1:  0.471, P:  0.821, R:  0.330, 




In [6]:
# LinearSVC, class-weighted: class_weight='balanced' reweights classes
# inversely to their frequency in y_train to counter the label imbalance.
# random_state is pinned because liblinear shuffles the data with a
# pseudo-random generator while fitting; without it the fit is not repeatable.
SVMlincw = LinearSVC(class_weight='balanced', random_state=0)

# Fit on the training split, then evaluate on the held-out test split.
SVMlincw_train_acc, SVMlincw_train_time = me.train(SVMlincw, y_train, X_train)
SVMlincw_test_acc, SVMlincw_test_prec, SVMlincw_test_rec, SVMlincw_test_time = me.test(SVMlincw, y_test, X_test)

# Report the training time followed by the accuracy and per-class scores.
print('Time:', SVMlincw_train_time)
me.print_scores(dp, SVMlincw_test_acc, SVMlincw_test_prec, SVMlincw_test_rec)

Time: 0.5032401084899902
Accuracy: 0.758538522637
email 
 F1:  0.980, P:  0.973, R:  0.987, 
internal_memo 
 F1:  0.512, P:  0.508, R:  0.517, 
boardroom_minutes 
 F1:  0.630, P:  0.548, R:  0.742, 
annual_report 
 F1:  0.762, P:  0.800, R:  0.727, 
public_relations 
 F1:  0.200, P:  0.250, R:  0.167, 
general_correspondance 
 F1:  0.436, P:  0.398, R:  0.482, 
newspaper_article 
 F1:  0.693, P:  0.703, R:  0.683, 
deposition 
 F1:  0.621, P:  0.833, R:  0.495, 
scientific_article_unpublished 
 F1:  0.754, P:  0.735, R:  0.773, 
scientific_article_published 
 F1:  0.528, P:  0.677, R:  0.433, 




In [7]:
# Pseudo-label the unlabelled pool with the fitted, unweighted LinearSVC.
lp = LabelPropagator()

# Raw decision scores and hard predictions for every unlabelled document;
# the predictions are reshaped into a single column.
hp_dists = SVMlin.decision_function(X_unlab)
y_pred = SVMlin.predict(X_unlab).reshape(-1, 1)

# Turn the decision scores into confidence scores and propagate labels using
# a confidence threshold of 1.5.
# NOTE(review): -1 presumably marks rows that were left unlabelled -- confirm
# in LabelPropagator.propagate_labels.
conf_scores = lp.confidence_scores(hp_dists)
y_est = lp.propagate_labels(y_pred, conf_scores, conf_threshold=1.5)

# Print: number of rows in y_est, number of entries that are not -1.
n_rows = y_est.shape[0]
n_not_minus_one = np.sum(y_est != -1)
print(n_rows, n_not_minus_one)

15693 663
