# 20newsgroups example

In [46]:
%load_ext autoreload
%autoreload 2

import collections, pandas as pd, numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from libact.models import LogisticRegression as LibActLogReg
from libact.query_strategies import UncertaintySampling

import sys
sys.path.append('/notebook')

from actleto import ActiveLearner, ActiveLearnerUiWidget, AnnotatorWidget, make_libact_strategy_ctor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load and prepare data

In [18]:
train_dataset = fetch_20newsgroups(subset='train')
test_dataset = fetch_20newsgroups(subset='test')

In [26]:
vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, sublinear_tf=True)
X_train = vectorizer.fit_transform(train_dataset.data)
Y_train = train_dataset.target
print(X_train.shape)

X_test = vectorizer.transform(test_dataset.data)
Y_test = test_dataset.target
print(X_test.shape)

(11314, 38842)
(7532, 38842)


## Train-test without active learning

In [36]:
clf = LogisticRegression()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
Y_pred_proba = clf.predict_proba(X_test)
print('Accuracy', accuracy_score(Y_test, Y_pred))
print('F1 macro', f1_score(Y_test, Y_pred, average='macro'))

Accuracy 0.841343600637
F1 macro 0.832177743559


## Train-test with active learning and human-in-the-loop

In [54]:
Y_seed = Y_train.copy().astype('O')
unknown_indexes = np.random.randint(Y_seed.shape[0], size = Y_seed.shape[0] - 20)
Y_seed[unknown_indexes] = None

In [64]:
active_learn_alg_ctor = make_libact_strategy_ctor(lambda trn_ds:
                                                  UncertaintySampling(trn_ds,
                                                                      model = LibActLogReg()))

active_learner = ActiveLearner(active_learn_alg_ctor = active_learn_alg_ctor,
                               y_dtype = 'int',
                               X_full_dataset = X_train, 
                               y_full_dataset = Y_seed,
                               X_test_dataset = X_test,
                               y_test_dataset = Y_test,
                               model_evaluate = LibActLogReg(),
                               eval_metrics = [accuracy_score, f1_score],
                               rnd_start_steps = 0)

X_helper = pd.DataFrame(data={ 'text' : train_dataset.data })

In [71]:
try:
    if active_learn_ui:
        active_learn_ui.stop()
except NameError:
    pass
    
active_learn_ui = ActiveLearnerUiWidget(active_learner = active_learner, 
                                        X_helper = X_helper,
                                        display_feature_table = False,
                                        drop_labels = [],
                                        visualize_columns = ['text'],
                                        y_labels = { label : i for i, label in enumerate(train_dataset.target_names) },
                                        save_path = './20ng_active_dump.npy',
                                        save_time = 120)

active_learn_ui

1!!!!!!!!!!!!!! ['text']


AttributeError: 'LogisticRegression' object has no attribute 'fit'