# Install dependencies for sequence tagging

In [None]:
!pip uninstall -y libact
!pip install git+https://github.com/IINemo/libact.git@seq

In [None]:
!pip uninstall -y actleto
!pip install git+https://github.com/IINemo/active_learning_toolbox.git@seq

In [None]:
!git clone https://github.com/IINemo/text_selector.git
!cd text_selector && git pull
!pip uninstall -y text_selector
!pip install -e ./text_selector

In [None]:
!jupyter nbextension install --py --symlink --sys-prefix text_selector
!jupyter nbextension enable --py --sys-prefix text_selector

In [None]:
!pip install flair
!pip install git+https://github.com/IINemo/bert_sequence_tagger.git

# Download dataset

In [None]:
# Fetch the two GENIA archives into ./data and unpack them there.
# `-nc` (no-clobber) skips the download when the archive is already present,
# so re-running this cell does not create `.1` duplicates.
!mkdir -p data
!cd data && wget -nc http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtraining.tar.gz
!cd data && wget -nc http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtest.tar.gz

!cd data && tar -xf ./Genia4ERtraining.tar.gz
!cd data && tar -xf ./Genia4ERtest.tar.gz

# Initialize

In [6]:
import collections, pandas as pd, numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from libact.query_strategies import UncertaintySampling

from actleto import ActiveLearner, ActiveLearnerUiWidget, make_libact_strategy_ctor
from actleto.annotator.visualizers.seq_annotation import SeqAnnotationVisualizer

# Load dataset

In [7]:
def find_in_between(offsets, start, end):
    """Return the indexes of all entries of ``offsets`` lying in ``[start, end]``.

    Both bounds are inclusive; indexes come back in ascending order.
    """
    return [idx for idx, value in enumerate(offsets) if start <= value <= end]


def convert_y_to_bio_format(X, y):
    """Convert character-span annotations into per-token BIO tag lists.

    Parameters
    ----------
    X : sequence of str
        Sentences; tokens are obtained by splitting on a single space.
    y : sequence
        ``y[i]`` is an iterable of dicts with keys ``'tag'``, ``'start'`` and
        ``'end'`` (character offsets into ``X[i]``).

    Returns
    -------
    list of list of str
        For every sentence one tag per token: ``'B-<tag>'`` on the first
        token whose start offset lies in ``[start, end]``, ``'I-<tag>'`` on
        the following ones, ``'O'`` everywhere else.
    """
    final_res = []
    for i, sent_y in enumerate(y):
        sent = X[i]

        # Character offset at which every token starts.
        offsets = []
        curr_offset = 0
        for word in sent.split(' '):
            offsets.append(curr_offset)
            curr_offset += len(word) + 1

        # One label per token (not per character of the sentence string).
        good_ys = ['O'] * len(offsets)
        for w_y in sent_y:
            positions = find_in_between(offsets, w_y['start'], w_y['end'])
            if not positions:
                # The span does not cover the start of any token; there is
                # nothing to label, so skip it instead of failing.
                continue

            good_ys[positions[0]] = 'B-' + w_y['tag']
            for pos in positions[1:]:
                good_ys[pos] = 'I-' + w_y['tag']

        final_res.append(good_ys)

    return final_res


def create_helper(X_train):
    """Build a one-column ``texts`` DataFrame with each token sequence joined by spaces."""
    sentences = list(map(' '.join, X_train))
    return pd.DataFrame(sentences, columns=['texts'])


def convert_y_to_dict_format(X, y):
    """Convert per-token BIO tags into character-span annotations.

    Parameters
    ----------
    X : iterable of sequences of str
        Tokenised sentences.
    y : iterable of sequences of str
        Tags parallel to ``X``: ``'O'``, ``'B-<type>'`` or ``'I-<type>'``.

    Returns
    -------
    list of list of dict
        For every sentence the list of entities as
        ``{'tag': <type>, 'start': <offset>, 'end': <offset>}``. Offsets are
        character positions in the sentence obtained by joining the tokens
        with single spaces; ``end`` is the offset just past the last
        character of the entity's last token.
    """
    dict_annots = []
    for sent_x, sent_y in zip(X, y):
        # Start offset of every token in the space-joined sentence.
        offsets = []
        curr_offset = 0
        for word in sent_x:
            offsets.append(curr_offset)
            curr_offset += len(word) + 1

        sent_dict_annots = []
        start_offset = -1  # -1 means "no entity is currently open"
        last_offset = -1
        entity_tag = ''
        for i, tag in enumerate(sent_y):
            # Split on the FIRST hyphen only, so entity types that contain
            # hyphens themselves (e.g. 'B-cell-type') are kept whole.
            prefix, _, label = tag.partition('-')

            # Both 'O' and 'B' close whatever entity is currently open.
            if prefix in ('O', 'B') and start_offset != -1:
                sent_dict_annots.append({'tag': entity_tag,
                                         'start': start_offset,
                                         'end': last_offset})

            if prefix == 'O':
                start_offset = -1
            elif prefix == 'B':
                start_offset = offsets[i]
                entity_tag = label
                last_offset = offsets[i] + len(sent_x[i])
            elif prefix == 'I':
                last_offset = offsets[i] + len(sent_x[i])

        # Flush an entity that runs up to the end of the sentence.
        if start_offset != -1:
            sent_dict_annots.append({'tag': entity_tag,
                                     'start': start_offset,
                                     'end': last_offset})

        dict_annots.append(sent_dict_annots)

    return dict_annots

In [None]:
from bert_sequence_tagger.bert_utils import make_bert_tag_dict_from_flair_corpus, prepare_flair_corpus
from flair.datasets import ColumnCorpus


def prepare_corpus(corpus):
    """Split the two-element items yielded by ``prepare_flair_corpus`` into parallel lists.

    Returns a tuple ``(X, y)`` where ``X`` collects the first element of
    every item and ``y`` the second, in iteration order.
    """
    pairs = list(prepare_flair_corpus(corpus))
    X = [first for first, _ in pairs]
    y = [second for _, second in pairs]
    return X, y


# Read the column-formatted files from ./data: column 0 is mapped to 'text'
# and column 1 to 'ner'.
data_folder = './data/'
corpus = ColumnCorpus(data_folder, {0 : 'text', 1 : 'ner'},
                                train_file='Genia4ERtask1.iob2',
                                test_file='Genia4EReval1.iob2',
                                dev_file='Genia4EReval1.iob2') # We do not need dev set

# Tag vocabulary derived from the corpus (used when constructing the model).
tags_vals, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)

# Parallel lists of inputs and labels for the train and test splits.
X_train, y_train = prepare_corpus(corpus.train)
X_test, y_test = prepare_corpus(corpus.test)


# Span-dict version of the training labels and a DataFrame with the
# space-joined training sentences in its 'texts' column.
y_train_dict = convert_y_to_dict_format(X_train, y_train)
X_helper = create_helper(X_train)

In [None]:
# Distinct entity types: every tag containing a hyphen contributes the part
# after its FIRST hyphen (so types with inner hyphens stay intact). Sorted so
# the order is the same on every run instead of depending on set iteration.
tags = sorted({tag.split('-', 1)[1] for tag in tags_vals if '-' in tag})
tags

In [10]:
# Sample seed elements for emulated training

import random

sample_size = 100
random_sample = random.sample(range(len(y_train)), sample_size)

# Only the sampled positions receive their annotation; every other entry
# stays None.
y_seed_dict = [None] * len(y_train_dict)
for seed_idx in random_sample:
    y_seed_dict[seed_idx] = y_train_dict[seed_idx]

# Create model and active learner

In [11]:
from libact.base.interfaces import ProbabilisticModel
from libact.base.dataset import Dataset
from libact.query_strategies import RandomSampling

import torch

import numpy as np
from sklearn.model_selection import train_test_split
from collections.abc import Iterable  
import gc


class LibActNN(ProbabilisticModel):
    """libact ``ProbabilisticModel`` adapter around a trainable sequence tagger.

    The wrapped model and its trainer are built through the two factory
    callbacks on the first call to :meth:`train`.

    Parameters
    ----------
    model_ctor : callable
        Zero-argument factory; the returned object must expose ``predict(X)``
        whose result is indexable (``[0]`` is returned by :meth:`predict`,
        ``[1]`` by :meth:`predict_proba`).
    trainer_ctor : callable
        Called as ``trainer_ctor(model, n_train, train_data, valid_data)``;
        the returned object must expose ``train(n_epochs)``.
    batch_size : int
        Stored on the instance; not used by the methods of this class.
    bs_pred : int
        Stored on the instance; not used by the methods of this class.
    retrain_epochs : int
        Number of epochs of a full (re)training pass.
    iter_retrain : int
        A full pass over the whole dataset is done on every
        ``iter_retrain``-th call to :meth:`train`; the other calls, when
        ``new_indexes`` is given, run one epoch on the new examples only.
    train_from_scratch : bool
        If true, ``model_ctor`` is invoked on every call to :meth:`train`;
        otherwise the existing model keeps being trained.
    valid_ratio : float
        Share of the examples split off as validation data and handed to
        ``trainer_ctor``; ``0`` disables the split.
    string_input : bool
        If true, inputs are space-joined sentences and labels are span dicts,
        which are converted to token lists / BIO tags before use.
    """

    def __init__(self,
                 model_ctor,
                 trainer_ctor,
                 batch_size=16,
                 bs_pred=256,
                 retrain_epochs=3,
                 iter_retrain=1,
                 train_from_scratch=True,
                 valid_ratio=0.25,
                 string_input=True):
        self._model_ctor = model_ctor
        self._trainer_ctor = trainer_ctor
        self._model = None
        self._trainer = None
        self._batch_size = batch_size
        self._bs_pred = bs_pred
        self._retrain_epochs = retrain_epochs
        self._iter_retrain = iter_retrain
        self._train_from_scratch = train_from_scratch
        self._valid_ratio = valid_ratio
        self._string_input = string_input

        # Number of completed calls to train().
        self._iter = 0

    def _predict_core(self, X):
        """Run the wrapped model on ``X`` and return its raw ``predict`` output."""
        if self._string_input:
            X = [sent.split(' ') for sent in X]

        torch.cuda.empty_cache()
        return self._model.predict(X)

    def predict_proba(self, X):
        """Return element ``[1]`` of the model output as a single-column array."""
        return np.asarray(self._predict_core(X)[1]).reshape(-1, 1)

    def predict(self, X):
        """Return element ``[0]`` of the model output."""
        return self._predict_core(X)[0]

    def train(self, libact_dataset, new_indexes=None):
        """Train the wrapped model on the labelled part of ``libact_dataset``.

        Parameters
        ----------
        libact_dataset : Dataset
            Dataset whose labelled entries are used for training.
        new_indexes : iterable of int, optional
            Indexes (into ``libact_dataset.data``) of newly labelled examples.
            When given and this call is not a scheduled full retraining, only
            these examples are used, for a single epoch.
        """
        torch.cuda.empty_cache()

        incremental = (new_indexes is not None) and (self._iter % self._iter_retrain) != 0
        if incremental:
            libact_dataset = Dataset([libact_dataset.data[i][0] for i in new_indexes],
                                     [libact_dataset.data[i][1] for i in new_indexes])
            n_epochs = 1
        else:
            n_epochs = self._retrain_epochs

        X, y = libact_dataset.format_sklearn()
        if self._string_input:
            y = convert_y_to_bio_format(X, y)
            X = [s.split(' ') for s in X]

        # train_test_split cannot hold anything out of a single example, so
        # in that case train on everything and skip validation.
        if self._valid_ratio > 0. and len(X) > 1:
            X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=self._valid_ratio)
            valid_data = list(zip(X_valid, y_valid))
        else:
            X_train, y_train = X, y
            valid_data = None

        train_data = list(zip(X_train, y_train))

        if (self._model is None) or self._train_from_scratch:
            self._model = self._model_ctor()

        # The training data reaches the trainer only through its constructor,
        # so the trainer is rebuilt on every call; otherwise a model that is
        # kept between calls would keep training on the data of the first call.
        self._trainer = self._trainer_ctor(self._model, len(X_train),
                                           train_data, valid_data)

        # Drop whatever the previous model/trainer left behind.
        gc.collect()
        torch.cuda.empty_cache()

        # Use the epoch count chosen above (1 for incremental updates).
        self._trainer.train(n_epochs)

        self._iter += 1

    def score(self):
        """Not implemented; returns ``None``."""
        pass

In [12]:
from bert_sequence_tagger import BertForTokenClassificationCustom, SequenceTaggerBert, ModelTrainerBert
from bert_sequence_tagger.bert_utils import get_parameters_without_decay, get_model_parameters
from bert_sequence_tagger.metrics import f1_entity_level

from pytorch_transformers import BertTokenizer, AdamW

from torch.optim.lr_scheduler import ReduceLROnPlateau


# Pretrained checkpoint and the directory passed as `cache_dir` when loading it.
BERT_MODEL = 'bert-base-cased'
CACHE_DIR = 'cache'

# Batch sizes handed to the trainer and to the tagger's prediction routine.
BATCH_SIZE = 16
PRED_BATCH_SIZE = 1500

# Optimiser / learning-rate schedule settings.
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
ANNEAL_FACTOR = 0.5
PATIENCE = 2
N_EPOCHS = 3

# Active-learning loop settings.
N_SAMPLES_PER_AL_ITER = 30
VALIDATION_RATIO = 0.1

# Further settings.
MAX_LEN = 100
EARLY_STOPPING = 1
random_state = 2019


# Lower-casing is switched on only when the checkpoint name ends in 'uncased'.
BERT_TOKENIZER = BertTokenizer.from_pretrained(
    BERT_MODEL,
    cache_dir=CACHE_DIR,
    do_lower_case=BERT_MODEL.endswith('uncased'),
)


def model_ctor():
    """Load a fresh ``BERT_MODEL`` token classifier, move it to CUDA and wrap it in a ``SequenceTaggerBert``."""
    bert = BertForTokenClassificationCustom.from_pretrained(
        BERT_MODEL,
        cache_dir=CACHE_DIR,
        num_labels=len(tag2idx),
    )
    bert = bert.cuda()

    return SequenceTaggerBert(
        bert,
        BERT_TOKENIZER,
        idx2tag=tags_vals,
        tag2idx=tag2idx,
        pred_batch_size=PRED_BATCH_SIZE,
    )


def trainer_ctor(seq_tagger, corpus_len, train_data, val_data):
    """Create a ``ModelTrainerBert`` for ``seq_tagger`` over ``train_data``.

    Parameters
    ----------
    seq_tagger
        Tagger to train; its ``_bert_model`` supplies the optimised parameters.
    corpus_len
        Not used by this function; kept so the signature matches the
        four-argument call made by the model wrapper.
    train_data
        Training examples, forwarded as ``train_dataset``.
    val_data
        Currently NOT forwarded: the trainer is created with
        ``val_dataset=None``.

    Returns
    -------
    ModelTrainerBert
    """
    # Use the shared WEIGHT_DECAY constant rather than repeating its value.
    optimizer = AdamW(get_model_parameters(seq_tagger._bert_model),
                      lr=LEARNING_RATE, betas=(0.9, 0.999),
                      eps=1e-6, weight_decay=WEIGHT_DECAY, correct_bias=True)

    lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=ANNEAL_FACTOR, patience=PATIENCE)

    # NOTE(review): validation_metrics is supplied although no validation
    # dataset is passed -- confirm that training without validation is intended.
    # Further trainer options that are left disabled here:
    # decision_metric=lambda metrics: metrics[0], restore_bm_on_lr_change=False
    trainer = ModelTrainerBert(model=seq_tagger,
                               optimizer=optimizer,
                               lr_scheduler=lr_scheduler,
                               train_dataset=train_data,
                               val_dataset=None,
                               validation_metrics=[f1_entity_level],
                               batch_size=BATCH_SIZE,
                               update_scheduler='ee',
                               keep_best_model=False,
                               max_grad_norm=1.)

    return trainer

In [None]:
def _make_query_strategy(trn_ds):
    """Return uncertainty sampling over ``trn_ds`` backed by a newly created ``LibActNN``."""
    tagger_model = LibActNN(model_ctor=model_ctor,
                            trainer_ctor=trainer_ctor,
                            valid_ratio=VALIDATION_RATIO,
                            retrain_epochs=N_EPOCHS)
    return UncertaintySampling(trn_ds, model=tagger_model)


active_learn_alg_ctor = make_libact_strategy_ctor(
    _make_query_strategy,
    max_samples_number=N_SAMPLES_PER_AL_ITER,
)

# The ActiveLearner object implements the AL logic: it gets every training
# sentence as input and the seed labels as the initial annotation.
active_learner = ActiveLearner(
    active_learn_alg_ctor=active_learn_alg_ctor,
    X_full_dataset=X_helper.texts.tolist(),
    y_full_dataset=y_seed_dict,
    rnd_start_steps=1,
)

active_learner.start()

# Creating widget for annotation

In [None]:
# This try/except block stops the autosave thread of a previously created
# widget when the cell is run more than once. On the first run the name
# `active_learn_ui` does not exist yet, hence the NameError handler.
try:
    if active_learn_ui:
        active_learn_ui.stop()
except NameError:
    pass

# Create the active learner widget itself and configure it with
# active_learner and X_helper. Annotations are saved to ./jnlpba.npy
# (save_time=120).
active_learn_ui = ActiveLearnerUiWidget(active_learner=active_learner,
                                        X_helper=X_helper,
                                        display_feature_table=False,
                                        drop_labels=[],
                                        y_labels=None,
                                        save_path='./jnlpba.npy',
                                        save_time=120, 
                                        visualizer=SeqAnnotationVisualizer(tags=tags))

# Last expression of the cell: displays the widget.
active_learn_ui