# 20newsgroups example

In [None]:
%load_ext autoreload
%autoreload 2

# Add the parent directory to the module search path.
# NOTE(review): presumably so the in-repo `actleto` package can be imported
# without installing it — confirm against the repository layout.
import sys
sys.path.append('../')

# Set CUDA_VISIBLE_DEVICES to an empty string before the remaining imports run.
# An empty value conventionally hides all GPUs from CUDA-based libraries.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# Initialization

In [None]:
import collections, pandas as pd, numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from libact.models import LogisticRegression as LibActLogReg
from libact.query_strategies import UncertaintySampling

from actleto import ActiveLearner, ActiveLearnerUiWidget, make_libact_strategy_ctor

# Load and prepare data

In [None]:
# Load the 'train' and 'test' subsets of the 20newsgroups corpus (in that order).
train_dataset, test_dataset = (
    fetch_20newsgroups(subset=split_name) for split_name in ('train', 'test')
)

In [None]:
# TF-IDF features: the vectorizer is fitted on the training texts only and the
# same fitted vectorizer is then reused to transform the test texts.
vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, sublinear_tf=True)

# Feature matrix and target labels for each split.
X_train, Y_train = vectorizer.fit_transform(train_dataset.data), train_dataset.target
X_test, Y_test = vectorizer.transform(test_dataset.data), test_dataset.target

# Train-test without active learning

In [None]:
# Baseline: logistic regression trained on the fully labeled training set.
clf = LogisticRegression().fit(X_train, Y_train)

# Hard predictions and class probabilities for the held-out test set.
Y_pred, Y_pred_proba = clf.predict(X_test), clf.predict_proba(X_test)

# Report test-set quality of the baseline.
print(
    'Accuracy',
    accuracy_score(Y_test, Y_pred),
)
print(
    'F1 macro',
    f1_score(Y_test, Y_pred, average='macro'),
)

# Train-test with active learning and human-in-the-loop

In [None]:
# Keep the labels of exactly 20 randomly chosen training examples as the seed
# set for active learning; every other label is hidden (set to None).
seed_size = 20
Y_seed = Y_train.copy().astype('O')  # object dtype so entries can hold None
# Sample the hidden indexes WITHOUT replacement. Drawing them with
# np.random.randint repeats indexes, so many positions are never hidden and
# far more than 20 labels would stay visible.
unknown_indexes = np.random.choice(Y_seed.shape[0],
                                   size=max(Y_seed.shape[0] - seed_size, 0),
                                   replace=False)
Y_seed[unknown_indexes] = None

In [None]:
# Query strategy: selects which examples of the unlabeled data are sent for
# annotation. libact's UncertaintySampling (driven by libact's logistic
# regression) is wrapped with make_libact_strategy_ctor, the adaptor whose
# result is handed to ActiveLearner below.
active_learn_alg_ctor = make_libact_strategy_ctor(
    lambda trn_ds: UncertaintySampling(trn_ds, model=LibActLogReg())
)

# ActiveLearner object that implements the AL logic. It receives the training
# features with the partially hidden labels (Y_seed) plus the test split and
# the metrics used for evaluation.
# NOTE(review): f1_score is passed with its defaults (scikit-learn's default is
# average='binary') while the target here has 20 classes — confirm how
# ActiveLearner invokes eval_metrics, or pass a macro-averaged metric instead.
active_learner = ActiveLearner(
    active_learn_alg_ctor=active_learn_alg_ctor,
    y_dtype='int',
    X_full_dataset=X_train,
    y_full_dataset=Y_seed,
    X_test_dataset=X_test,
    y_test_dataset=Y_test,
    model_evaluate=LibActLogReg(),  # libact logistic regression as the evaluation model
    eval_metrics=[accuracy_score, f1_score],
    rnd_start_steps=0,
)

# X_helper: DataFrame holding the raw training texts, used to visualize the
# dataset for the human annotator.
X_helper = pd.DataFrame(data={'text': train_dataset.data})

In [None]:
# Re-running this cell must first stop the autosave thread of the widget made
# by a previous run. On the very first run `active_learn_ui` does not exist
# yet, hence the NameError guard.
try:
    if active_learn_ui:
        active_learn_ui.stop()
except NameError:
    pass

# Create the active learner widget and configure it with active_learner and
# X_helper. The default visualizer previews visualize_columns with
# VisualizerTextArea.
active_learn_ui = ActiveLearnerUiWidget(
    active_learner=active_learner,
    X_helper=X_helper,
    display_feature_table=False,
    drop_labels=[],
    visualize_columns=['text'],
    # Map every newsgroup name to its position in target_names.
    y_labels={
        group_name: class_idx
        for class_idx, group_name in enumerate(train_dataset.target_names)
    },
    save_path='./20ng_active_dump.npy',
    save_time=120,
)

# Last expression of the cell: renders the widget as the cell output.
active_learn_ui