In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Make the 'active_learning_toolbox' directory importable. The path is
# relative, so it is resolved against the kernel's working directory.
sys.path.append('active_learning_toolbox')

In [22]:
import os
import pandas as pd
import numpy as np
import warnings
from argparse import ArgumentParser
from scipy.stats import spearmanr

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from compounds_utils import acquiring
from gensim.models import FastText

from compounds_utils import apply_distance, average_normalized, average_standard

from scipy.spatial import distance

# Set the pandas option 'display.html.border' to 2 for all HTML table output
# rendered later in this notebook.
pd.set_option('display.html.border', 2)

In [3]:
import logging

# Configure the 'actleto' logger: records of level INFO and above are written
# both to the file ./actleto.log and to the console, using one shared format.
logger = logging.getLogger('actleto')

logPath = './'
fileName = 'actleto.log'
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")

# This cell may be executed several times in one kernel session. Detach the
# handlers attached by a previous run so messages are not duplicated, and
# close them so the previously opened log file handle is released instead of
# being leaked.
for staleHandler in list(logger.handlers):
    logger.removeHandler(staleHandler)
    staleHandler.close()

fileHandler = logging.FileHandler(os.path.join(logPath, fileName))
fileHandler.setFormatter(logFormatter)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
logger.addHandler(consoleHandler)

logger.setLevel(logging.INFO)

In [4]:
def make_train_data(w1vecs, w2vecs, compvecs):
    """Build a standardized feature matrix from three collections of vectors.

    Each argument is converted to a NumPy array and the three arrays are
    joined along axis 1, so row i of the result is the concatenation of row i
    of every input. The shape of the joined matrix is printed.

    Returns:
        The joined matrix after ``StandardScaler().fit_transform``. A new
        scaler is fitted on every call, i.e. the scaling statistics come only
        from the data passed to that call.
    """
    parts = [np.array(vectors) for vectors in (w1vecs, w2vecs, compvecs)]
    features = np.concatenate(parts, axis=1)
    print('Classification data created with shape', features.shape)
    scaler = StandardScaler()
    return scaler.fit_transform(features)

# def make_train_data(w1vecs, w2vecs, compvecs):
#     dist = apply_distance(np.array(w1vecs), np.array(w2vecs), np.array(compvecs), distance.cosine, average_normalized)
#     dist = dist.reshape(-1, 1)
#     train = np.concatenate((np.array(w1vecs), np.array(w2vecs), np.array(compvecs), dist), axis=1)
#     return StandardScaler().fit_transform(train)

In [5]:
# CSV file with the annotated compounds; it is read in the next cell.
compounds_path = './workdir/annotation_small_selected.csv'
# Alternative FastText model path, currently disabled:
#model_path = './workdir/models/model_fasttext_300_mc5'
model_path = './workdir/models/model_fasttext_300_morecompounds'
# Path of a second FastText model. It is only needed when the commented-out
# FastText.load(model_words_path) line below is re-enabled.
model_words_path = './workdir/models/model_fasttext_nocompounds_300_mc5'

# Load the FastText model stored at model_path.
model = FastText.load(model_path)
# No separate word model is loaded: None is what gets passed as `model_words`
# to acquiring() in the cells below.
model_words = None
#model_words = FastText.load(model_words_path)

In [6]:
# Read the annotation table and hand it to acquiring() together with the
# FastText model and the name of the column 'Катя (short list)'.
# acquiring() returns five values; note that `comp` is rebound to the table it
# returns, so it no longer refers to the raw CSV contents.
# NOTE(review): presumably w1 / w2 / c are the vectors of the two parts and of
# the whole compound, and `true` holds the labels - confirm in compounds_utils.
comp = pd.read_csv(compounds_path)
w1, w2, c, true, comp = acquiring(comp, model, 'Катя (short list)', model_words=model_words)
# Standardized feature matrix built from the three vector collections.
vecs = make_train_data(w1, w2, c)

Number of examples:  201
Classification data created with shape (201, 900)


In [7]:
# Load the table used as the unlabeled pool and keep only its first
# N_UNLABELED rows. The first CSV column is used as the index.
unsupervised_path = './workdir/compounds_AN_top10000.csv'
N_UNLABELED = 5730  # number of leading rows kept from the source table

unlabeled_dataset = pd.read_csv(unsupervised_path, index_col=0)
unlabeled_dataset = unlabeled_dataset[:N_UNLABELED]

# Save the truncated table next to the source file.
# NOTE(review): the file name says "5000" while N_UNLABELED rows are written -
# confirm which of the two is intended.
unlabeled_dataset.to_csv('./workdir/compounds_5000.csv')

# The preview must be the last expression of the cell to be displayed; in the
# middle of the cell its result was silently discarded.
unlabeled_dataset.head()

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Run acquiring() on the unlabeled table with the column name
# 'Композициональность' and skip_invalid_labels=False. As with the labeled
# data, `unlabeled_dataset` is rebound to the table returned by acquiring().
# NOTE(review): presumably skip_invalid_labels=False keeps rows that have no
# usable label - confirm in compounds_utils.
w1_u, w2_u, c_u, true_u, unlabeled_dataset = acquiring(unlabeled_dataset, model, 'Композициональность', 
                                    model_words=model_words, skip_invalid_labels=False)
# Standardized feature matrix of the unlabeled pool. The scaler is fitted on
# this data only (see make_train_data), separately from the labeled matrix.
vecs_unlabeled = make_train_data(w1_u, w2_u, c_u)

Number of examples:  5730
Classification data created with shape (5730, 900)


In [9]:
def swap_classes(cl):
    """Flip a binary class label: 1 becomes 0 and 0 becomes 1.

    Args:
        cl: a label that compares equal to 0 or to 1.

    Returns:
        0. when ``cl == 1`` and 1. when ``cl == 0``.

    Raises:
        ValueError: if ``cl`` equals neither 0 nor 1.
    """
    if cl == 1.:
        return 0.
    if cl == 0.:
        return 1.
    # Falling through used to return None implicitly. In this notebook None is
    # the marker for unlabeled examples in y_train, so an unexpected label
    # would silently turn into "unlabeled". Fail loudly instead.
    raise ValueError('swap_classes expects a label equal to 0 or 1, got %r' % (cl,))

# Flip every label in `true` (0 <-> 1) and rebind the name to the new array.
# NOTE: this overwrites its own input, so running the cell a second time
# flips the labels back again.
true = np.array(list(map(swap_classes, true)))

In [10]:
# Split the labeled examples into two halves with a fixed seed. The row
# positions 0..n-1 are split together with the features and labels, so every
# train / test row can be traced back to its position in the labeled data.
random_state = 42
row_positions = list(range(len(comp)))
(X_train, X_test,
 y_train, y_test,
 index_train, index_test) = train_test_split(vecs, true, row_positions,
                                             test_size=.5, random_state=random_state)

In [11]:
# Extend the training data with the unlabeled pool. The appended rows get the
# label None.
# NOTE(review): presumably None is how ActiveLearner recognizes examples that
# are not annotated yet - confirm in actleto.
#
# X_train / y_train are rebuilt from themselves, so re-running this cell would
# append the pool a second time. index_train still has the length of the
# original labeled split, which lets us detect that the pool was already
# appended and skip the step.
if len(X_train) == len(index_train):
    X_train = np.concatenate((X_train, vecs_unlabeled), axis=0)
    y_train = np.concatenate((y_train, np.array([None] * len(vecs_unlabeled))), axis=0)

In [12]:
# Table with the columns 'Часть 1' and 'Часть 2', row-aligned with X_train:
# first the labeled training rows, then the unlabeled pool.
# index_train holds row *positions* (it was produced by splitting
# range(len(comp))), so the rows are selected with .iloc. Label-based .loc
# only gives the same rows when comp happens to have a default 0..n-1 index.
helper_columns = ['Часть 1', 'Часть 2']
X_helper = pd.concat(
    (comp.iloc[index_train][helper_columns], unlabeled_dataset[helper_columns]),
    axis=0, ignore_index=True)

In [13]:
from sklearn.metrics import f1_score, accuracy_score

from libact.query_strategies import UncertaintySampling
from libact.models import LogisticRegression as LibActLogisticRegression
from sklearn.linear_model import LogisticRegression

from actleto import ActiveLearner, make_libact_strategy_ctor, MPErr, ActiveLearnerUiWidget

In [14]:
# Model used by the query strategy to select examples from the unlabeled data.
# It is the LogisticRegression class from libact.models, so it can be passed
# directly to a libact-style strategy.
query_model = LibActLogisticRegression()

# The query strategy is MPErr (imported from actleto), built on the training
# dataset with the query model above; make_libact_strategy_ctor wraps the
# factory. libact's UncertaintySampling is imported but not used here.
active_learn_alg_ctor = make_libact_strategy_ctor(lambda trn_ds: MPErr(trn_ds, model=query_model))

# Model used for evaluation: a separate scikit-learn LogisticRegression with
# default parameters. It is the same kind of model as the query model, but a
# different object from a different library.
evaluation_model = LogisticRegression()


def f1_macro(y_t, y_p):
    """Return the macro-averaged F1 score for true labels y_t and predictions y_p."""
    macro_score = f1_score(y_t, y_p, average='macro')
    return macro_score

# Create the ActiveLearner object that encapsulates the active learning logic.
# It receives the query-strategy constructor, the full training data with
# labels (X_train / y_train, which include the unlabeled pool), the held-out
# test split, the evaluation model and the metrics to report (accuracy and
# macro-averaged F1).
active_learner = ActiveLearner(active_learn_alg_ctor=active_learn_alg_ctor,
                               y_dtype='int',
                               X_full_dataset=X_train, 
                               y_full_dataset=y_train,
                               X_test_dataset=X_test,
                               y_test_dataset=y_test,
                               model_evaluate=evaluation_model,
                               eval_metrics=[accuracy_score, f1_macro],
                               rnd_start_steps=0)

In [23]:
# This try/except is needed when the autosave feature is used: if the cell is
# executed a second or later time, the widget created by the previous run is
# stopped first, which stops its autosave thread. On the very first run the
# name `active_learn_ui` does not exist yet; the resulting NameError is caught
# and ignored.
try:
    if active_learn_ui:
        active_learn_ui.stop()
except NameError:
    pass
    
# Create the active learner widget and configure it with the ActiveLearner
# object, the table shown to the annotator (X_helper, displayed as a feature
# table), the mapping from button captions to class ids, and the autosave
# target 'annotations.npy'.
# NOTE(review): save_time=120 is presumably the autosave interval in seconds -
# confirm in actleto.
active_learn_ui = ActiveLearnerUiWidget(active_learner=active_learner, 
                                        X_helper=X_helper,
                                        display_feature_table=True,
                                        y_labels={'Композ.' : 0, 'Некомпоз.' : 1},
                                        save_path='annotations.npy',
                                        save_time=120)

# Last expression of the cell: renders the widget.
active_learn_ui



ActiveLearnerUiWidget(children=(HBox(children=(Button(description='Next iteration', style=ButtonStyle()), Labe…