# This is when using colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install git+https://github.com/modAL-python/modAL.git

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path('/content/drive/MyDrive/NLP_II_Interactive_Learning/code').resolve()))

# Common Variables and Imports

In [1]:
use_colab = False

In [2]:
import os, time, warnings, pickle
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.exceptions import ConvergenceWarning
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import uncertainty_sampling, entropy_sampling, margin_sampling, classifier_uncertainty, classifier_entropy, classifier_margin
from modAL.disagreement import vote_entropy_sampling
import torch

from utils import load_MNIST, log_metrics, initialize_random_number_generators, create_log_reg_model, save_model_and_metrics, load_file, save_file, train_active_learner, train_committee_learner, random_sampling, ranked_uc_and_dv_query

# Filter FutureWarnings to make outputs look more pleasant and ConvergenceWarnings which are given by sklearn LogisticRegressors when explicitly settings the multi_class to multinomial. Here, this could be omitted but I liked to leave it in for clarity to show that I'm not training 10 binary classifiers but one classifier with 10 outputs, each resembling the probabilities of a digit
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
parameters_path = 'shallow_classifier_parameters.pkl'
results_path = '../results'
exp_save_path = None

# If using google colab:
if use_colab is not None and use_colab:
    parameters_path = '/content/drive/MyDrive/NLP_II_Interactive_Learning/code/shallow_classifier_parameters.pkl'
    results_path = '/content/drive/MyDrive/NLP_II_Interactive_Learning/results'

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"using device; {device}")

# This can be used to specify the experiment, e.g. when trying to test something new
# Previously it was used to test the 3 experimental settings (differing sizes of the initial dataset) with the following dictionary:
# experiment_parameters = {"1": {"n_initial" : 10},
#                          "2": {"n_initial" : int(len(X_train)*0.10)},
#                          "3": {"n_initial" : int(len(X_train)*0.5)}}
experiment = "1"
dataset_name = "MNIST"

RANDOM_SEED = 42
model_parameters=load_file(parameters_path)
initialize_random_number_generators(RANDOM_SEED)

# To use early stopping, do the models have to be trained for every single epoch
max_iterations = model_parameters['max_iterations_per_epoch']
patience = 20 # Controls early stopping iterations without improvement

# Active Learning parameters
n_query_instances = 5 # Amount of instances that are queried at a time
n_initial = 75 # Initial set of labeled datapoints
n_query_epochs = 1000  # How many times the algorithm should sample n_query_instances samples
n_iter = 20 # During active learning, train for 5 epochs before querying a new sample

# For Committee based approaches
n_learners = 5

using device; cpu


In [5]:
exp_save_path = os.path.join(results_path, dataset_name, f"exp{experiment}")

# Load Dataset (MNIST)

This loads the vectorized version of MNIST and normalizes values from $[0,255]$ to the range $[0,1]$

The datasets are saved in the experiment folders for convenience and checking whether the splits, etc. are actually the same.

In [6]:
X_train, y_train, X_test, y_test, X_val, y_val, X_whole, y_whole = load_MNIST(random_seed=RANDOM_SEED, validation_split=0)
save_file(os.path.join(results_path, dataset_name,  f"exp{experiment}", "datasets.pkl"), {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test})

## Check how balanced the dataset is

Check how often each digit appears in MNIST and whether the data for each class is balanced

In [7]:
for i in range(10):
    print(f"Digit {i}: {np.count_nonzero(y_whole == i)} times")

Digit 0: 6903 times
Digit 1: 7877 times
Digit 2: 6990 times
Digit 3: 7141 times
Digit 4: 6824 times
Digit 5: 6313 times
Digit 6: 6876 times
Digit 7: 7293 times
Digit 8: 6825 times
Digit 9: 6958 times


## Creating an initial labelled dataset from random datapoints and the unlabelled pool

If one wants to start with the minimum amount of labeled datapoints: If n_initial is smaller than 10, the model cannot be initialized properly and will throw errors so exactly one sample from each class is picked from a random permutation as the initial training set.
Otherwise, they are just randomly selected

In [8]:
if n_initial == 10:
    initial_idx = []
    for cls in np.arange(10):
        cls_idxs = np.where(y_train == cls)[0]
        initial_idx.append(np.random.choice(cls_idxs))
    # construct the X and y initial with one item from each class from a random permutation. the initial idx should keep the original mnist index.
    # This is done to ensure that the model has an initial train set where it has seen each class
    # Sadly, otherwise it will throw errors
else:
    initial_idx = np.random.choice(range(len(X_train)), size=n_initial, replace=False) # Indices with which the initial train set is created with

X_initial = X_train[initial_idx]
y_initial = y_train[initial_idx]
pool_idx = np.setdiff1d(range(len(X_train)), initial_idx)

In [9]:
datasets = {'dataset_name': dataset_name,
            'X_initial': X_initial,
            'y_initial': y_initial,
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'X_val': X_val,
            'y_val': y_val,
            'pool_idx': pool_idx}

# Train a Logistic Regressor on the whole train dataset
Use multi_class='multinomial' solver: if it were 'ovr', each of the 10 output neurons would treat the corresponding number as a one-vs-rest szenario. So we would construct a Binary Distribution for each of the output neurons. But this doesn't take care of interdependencies between classes.
In General: 'ovr' would train a separate classifier for each number and 'multinomial' does a softmax regression

The train_full_model trains the model in a number of epochs. It was done this way to check if successively training a model for max_iter iterations for a number of epochs would provide the same results as training it once for max_iter*epoch iterations.

In [10]:
def train_full_model():
    best_loss = np.inf
    no_improvement_count = 0

    model_parameters['max_iterations_per_epoch'] = 1
    log_reg_full = create_log_reg_model(model_parameters, random_seed=RANDOM_SEED, device=device)
    model_parameters['max_iterations_per_epoch'] = max_iterations

    metrics = {'train_loss': [], 'test_loss': [], 'test_acc': []}

    start = time.time()
    for it in range(max_iterations):
        log_reg_full.fit(X_train, y_train)
        if it % 10 == 0:
            log_metrics(it, log_reg_full, X_train, y_train, X_test, y_test, metrics, device=device)
        #val_loss = log_loss(y_val, log_reg_full.predict_proba(X_val))
        #if val_loss < best_loss:
        #    best_loss = val_loss
        #    no_improvement_count = 0
        #else:
        #    no_improvement_count += 1

        #if no_improvement_count >= patience:
        #    print("Early stopping triggered.")
        #    break

    log_metrics(max_iterations, log_reg_full, X_train, y_train, X_test, y_test, metrics, device)

    y_hat = log_reg_full.predict(X_test)
    accuracy_whole_dataset = accuracy_score(y_test, y_hat)
    print(f"Test accuracy with whole Test dataset: {accuracy_whole_dataset:.4f}")
    print(f"Training time: {time.time() - start:.2f} seconds")

    return log_reg_full, metrics

log_reg_whole_data, log_reg_whole_data_metrics = train_full_model()
save_model_and_metrics(experiment, dataset_name, "whole_dataset", log_reg_whole_data, log_reg_whole_data_metrics, base_path=exp_save_path)
del log_reg_whole_data, log_reg_whole_data_metrics

After iteration 0: 
  - Train Loss: 0.8040 
  - Test Loss: 0.7760 
  - Test Accuracy: 0.7583
After iteration 10: 
  - Train Loss: 0.2469 
  - Test Loss: 0.2639 
  - Test Accuracy: 0.9258
After iteration 20: 
  - Train Loss: 0.2469 
  - Test Loss: 0.2639 
  - Test Accuracy: 0.9258
After iteration 30: 
  - Train Loss: 0.2469 
  - Test Loss: 0.2639 
  - Test Accuracy: 0.9258
After iteration 40: 
  - Train Loss: 0.2469 
  - Test Loss: 0.2639 
  - Test Accuracy: 0.9258
After iteration 50: 
  - Train Loss: 0.2469 
  - Test Loss: 0.2639 
  - Test Accuracy: 0.9258
Test accuracy with whole Test dataset: 0.9258
Training time: 52.61 seconds


---

# Active Learning

---

### Train a Logistic Regressor on 100 data points without Active Learning

This serves as a baseline to show how much can be learnt from n_initial data points. (This is expected to be low)

In [11]:
def train_initial_model():
    best_loss = np.inf
    no_improvement_count = 0
    initialize_random_number_generators(seed=RANDOM_SEED)

    # Set max_iterations_per epoch to 1 for early stopping
    model_parameters['max_iterations_per_epoch'] = 1
    log_reg_initial = create_log_reg_model(model_parameters, random_seed=RANDOM_SEED, device=device)
    model_parameters['max_iterations_per_epoch'] = max_iterations

    metrics = {'train_loss': [], 'train_loss_current': [], 'test_loss': [], 'test_acc': []}

    for it in range(max_iterations):
        log_reg_initial.fit(X_initial, y_initial)
        if it % 10 == 0:
            log_metrics(it, log_reg_initial, X_train, y_train, X_test, y_test, metrics, device=device)
        #val_loss = log_loss(y_val, log_reg_initial.predict_proba(X_val))
        #if val_loss < best_loss:
        #    best_loss = val_loss
        #    no_improvement_count = 0
        #else:
        #    no_improvement_count += 1
        #
        #if no_improvement_count >= patience:
        #    print("Early stopping triggered.")
        #    break


    log_metrics(max_iterations, log_reg_initial, X_train, y_train, X_test, y_test, metrics, device)

    y_pred = log_reg_initial.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    train_loss_current = log_loss(y_initial, log_reg_initial.predict_proba(X_initial))
    metrics['train_loss_current'].append(train_loss_current)

    print(f"Test accuracy with whole Test dataset: {accuracy:.4f}")

    return log_reg_initial, metrics

log_reg_initial, log_reg_initial_metrics = train_initial_model()
save_model_and_metrics(experiment, dataset_name, "initial_active_model", log_reg_initial, log_reg_initial_metrics, base_path=exp_save_path)
del log_reg_initial, log_reg_initial_metrics

After iteration 0: 
  - Train Loss: 1.1448 
  - Test Loss: 1.1273 
  - Test Accuracy: 0.6435
After iteration 10: 
  - Train Loss: 1.0730 
  - Test Loss: 1.0509 
  - Test Accuracy: 0.7352
After iteration 20: 
  - Train Loss: 1.0730 
  - Test Loss: 1.0509 
  - Test Accuracy: 0.7352
After iteration 30: 
  - Train Loss: 1.0730 
  - Test Loss: 1.0509 
  - Test Accuracy: 0.7352
After iteration 40: 
  - Train Loss: 1.0730 
  - Test Loss: 1.0509 
  - Test Accuracy: 0.7352
After iteration 50: 
  - Train Loss: 1.0730 
  - Test Loss: 1.0509 
  - Test Accuracy: 0.7352
Test accuracy with whole Test dataset: 0.7352


## Train a Classifier with Various Query Strategies

From the documents and maybe worth trying: If you would like to start from scratch, you can use the .fit(X, y) method to make the learner forget everything it has seen and fit the model to the newly provided data.

To train only on the newly acquired data, you should pass only_new=True to the .teach() method.

In [10]:
training_config = {
    "datasets": datasets,
    "random_seed": RANDOM_SEED,
    "n_query_instances": n_query_instances,
    "n_query_epochs": n_query_epochs,
    "create_model": create_log_reg_model,
    "model_params": model_parameters,
    "n_iter": n_iter,
    "patience": patience,
    "device": device
}
training_config_committee = training_config.copy()
training_config_committee['n_learners'] = n_learners

## Random Sampling

In [27]:
m_random_sampling, m_random_sampling_metrics = train_active_learner(query_strat=random_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "random_sampling", m_random_sampling, m_random_sampling_metrics, base_path=exp_save_path)
del m_random_sampling, m_random_sampling_metrics

[8 8 1 7 7]
After iteration 0: 
  - Train Loss: 1.0691 
  - Test Loss: 1.0462 
  - Test Accuracy: 0.7243
  - number of train samples: 80
[7 6 1 7 0]
After iteration 1: 
  - Train Loss: 1.0493 
  - Test Loss: 1.0265 
  - Test Accuracy: 0.7176
  - number of train samples: 85
[5 8 3 6 8]
After iteration 2: 
  - Train Loss: 1.0183 
  - Test Loss: 0.9966 
  - Test Accuracy: 0.7299
  - number of train samples: 90
[1 5 5 5 3]
After iteration 3: 
  - Train Loss: 1.0069 
  - Test Loss: 0.9856 
  - Test Accuracy: 0.7249
  - number of train samples: 95
[6 1 3 8 1]
After iteration 4: 
  - Train Loss: 0.9902 
  - Test Loss: 0.9656 
  - Test Accuracy: 0.7282
  - number of train samples: 100
[5 4 9 5 2]
After iteration 5: 
  - Train Loss: 0.9565 
  - Test Loss: 0.9341 
  - Test Accuracy: 0.7441
  - number of train samples: 105
[8 6 8 9 1]
After iteration 6: 
  - Train Loss: 0.9363 
  - Test Loss: 0.9167 
  - Test Accuracy: 0.7494
  - number of train samples: 110
[5 7 8 5 7]
After iteration 7: 
  - Tr

In [28]:
c_random_sampling, c_random_sampling_metrics = train_committee_learner(query_strat=random_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "random_sampling_committee", c_random_sampling, c_random_sampling_metrics, base_path=exp_save_path)
del c_random_sampling, c_random_sampling_metrics

test
[0 6 7 3 7]
80
After iteration 0: 
  - Train Loss: 1.0604 
  - Test Loss: 1.0398 
  - Test Accuracy: 0.7381
  - number of train samples: 80
[6 8 2 5 4]
85
After iteration 1: 
  - Train Loss: 1.0436 
  - Test Loss: 1.0209 
  - Test Accuracy: 0.7508
  - number of train samples: 85
[2 1 7 7 8]
90
After iteration 2: 
  - Train Loss: 1.0241 
  - Test Loss: 1.0024 
  - Test Accuracy: 0.7508
  - number of train samples: 90
[0 2 1 6 5]
95
After iteration 3: 
  - Train Loss: 0.9830 
  - Test Loss: 0.9635 
  - Test Accuracy: 0.7598
  - number of train samples: 95
[5 7 9 7 1]
100
After iteration 4: 
  - Train Loss: 0.9669 
  - Test Loss: 0.9506 
  - Test Accuracy: 0.7589
  - number of train samples: 100
[3 0 4 3 9]
105
After iteration 5: 
  - Train Loss: 0.9594 
  - Test Loss: 0.9463 
  - Test Accuracy: 0.7532
  - number of train samples: 105
[3 3 0 8 9]
110
After iteration 6: 
  - Train Loss: 0.9476 
  - Test Loss: 0.9298 
  - Test Accuracy: 0.7611
  - number of train samples: 110
[4 7 2 3 

## Uncertainty based strategies

**Uncertainty Sampling**: Samples where classifier is least sure are selected

In [29]:
m_uc_sampling, m_uc_sampling_metrics = train_active_learner(query_strat=uncertainty_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "uncertainty_sampling", m_uc_sampling, m_uc_sampling_metrics, base_path=exp_save_path)
del m_uc_sampling, m_uc_sampling_metrics

[5 4 0 8 0]
After iteration 0: 
  - Train Loss: 1.0684 
  - Test Loss: 1.0457 
  - Test Accuracy: 0.736
  - number of train samples: 80
[3 8 9 3 4]
After iteration 1: 
  - Train Loss: 1.0522 
  - Test Loss: 1.0327 
  - Test Accuracy: 0.7396
  - number of train samples: 85
[5 0 3 3 5]
After iteration 2: 
  - Train Loss: 1.0429 
  - Test Loss: 1.0228 
  - Test Accuracy: 0.7393
  - number of train samples: 90
[6 4 8 3 6]
After iteration 3: 
  - Train Loss: 1.0254 
  - Test Loss: 1.0120 
  - Test Accuracy: 0.7421
  - number of train samples: 95
[3 7 9 3 4]
After iteration 4: 
  - Train Loss: 1.0242 
  - Test Loss: 1.0166 
  - Test Accuracy: 0.7312
  - number of train samples: 100
[4 7 7 2 5]
After iteration 5: 
  - Train Loss: 1.0036 
  - Test Loss: 0.9957 
  - Test Accuracy: 0.742
  - number of train samples: 105
[7 2 1 3 7]
After iteration 6: 
  - Train Loss: 0.9994 
  - Test Loss: 0.9882 
  - Test Accuracy: 0.7384
  - number of train samples: 110
[4 2 4 6 2]
After iteration 7: 
  - Trai

In [30]:
c_uc_sampling, c_uc_sampling_metrics = train_committee_learner(query_strat=uncertainty_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "uncertainty_sampling_committee", c_uc_sampling, c_uc_sampling_metrics, base_path=exp_save_path)
del c_uc_sampling, c_uc_sampling_metrics

test
[5 4 0 8 0]
80
After iteration 0: 
  - Train Loss: 1.0684 
  - Test Loss: 1.0457 
  - Test Accuracy: 0.736
  - number of train samples: 80
[3 8 9 3 4]
85
After iteration 1: 
  - Train Loss: 1.0522 
  - Test Loss: 1.0327 
  - Test Accuracy: 0.7396
  - number of train samples: 85
[5 0 3 3 5]
90
After iteration 2: 
  - Train Loss: 1.0429 
  - Test Loss: 1.0228 
  - Test Accuracy: 0.7393
  - number of train samples: 90
[6 4 8 3 6]
95
After iteration 3: 
  - Train Loss: 1.0254 
  - Test Loss: 1.0120 
  - Test Accuracy: 0.7421
  - number of train samples: 95
[3 7 9 3 4]
100
After iteration 4: 
  - Train Loss: 1.0242 
  - Test Loss: 1.0166 
  - Test Accuracy: 0.7312
  - number of train samples: 100
[4 7 7 2 5]
105
After iteration 5: 
  - Train Loss: 1.0036 
  - Test Loss: 0.9957 
  - Test Accuracy: 0.742
  - number of train samples: 105
[7 2 1 3 7]
110
After iteration 6: 
  - Train Loss: 0.9994 
  - Test Loss: 0.9882 
  - Test Accuracy: 0.7384
  - number of train samples: 110
[4 2 4 6 2]

**Entropy Sampling**: Samples where class probability has the largest Entropy

In [31]:
m_entropy_sampling, m_entropy_sampling_metrics = train_active_learner(query_strat=entropy_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "entropy_sampling", m_entropy_sampling, m_entropy_sampling_metrics, base_path=exp_save_path)
del m_entropy_sampling, m_entropy_sampling_metrics

[5 3 8 5 3]
After iteration 0: 
  - Train Loss: 1.0638 
  - Test Loss: 1.0393 
  - Test Accuracy: 0.7423
  - number of train samples: 80
[5 2 9 8 0]
After iteration 1: 
  - Train Loss: 1.0496 
  - Test Loss: 1.0281 
  - Test Accuracy: 0.7436
  - number of train samples: 85
[2 3 5 8 9]
After iteration 2: 
  - Train Loss: 1.0403 
  - Test Loss: 1.0191 
  - Test Accuracy: 0.7454
  - number of train samples: 90
[8 5 6 5 4]
After iteration 3: 
  - Train Loss: 1.0420 
  - Test Loss: 1.0215 
  - Test Accuracy: 0.732
  - number of train samples: 95
[3 2 4 7 4]
After iteration 4: 
  - Train Loss: 1.0273 
  - Test Loss: 1.0049 
  - Test Accuracy: 0.7451
  - number of train samples: 100
[0 8 0 7 3]
After iteration 5: 
  - Train Loss: 1.0221 
  - Test Loss: 1.0012 
  - Test Accuracy: 0.7471
  - number of train samples: 105
[6 1 3 9 8]
After iteration 6: 
  - Train Loss: 0.9972 
  - Test Loss: 0.9826 
  - Test Accuracy: 0.76
  - number of train samples: 110
[3 4 0 8 5]
After iteration 7: 
  - Train

In [32]:
c_entropy_sampling, c_entropy_sampling_metrics = train_committee_learner(query_strat=entropy_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "entropy_sampling_committee", c_entropy_sampling, c_entropy_sampling_metrics, base_path=exp_save_path)
del c_entropy_sampling, c_entropy_sampling_metrics

test
[5 3 8 5 3]
80
After iteration 0: 
  - Train Loss: 1.0638 
  - Test Loss: 1.0393 
  - Test Accuracy: 0.7423
  - number of train samples: 80
[5 2 9 8 0]
85
After iteration 1: 
  - Train Loss: 1.0496 
  - Test Loss: 1.0281 
  - Test Accuracy: 0.7436
  - number of train samples: 85
[2 3 5 8 9]
90
After iteration 2: 
  - Train Loss: 1.0403 
  - Test Loss: 1.0191 
  - Test Accuracy: 0.7454
  - number of train samples: 90
[8 5 6 5 4]
95
After iteration 3: 
  - Train Loss: 1.0420 
  - Test Loss: 1.0215 
  - Test Accuracy: 0.732
  - number of train samples: 95
[3 2 4 7 4]
100
After iteration 4: 
  - Train Loss: 1.0273 
  - Test Loss: 1.0049 
  - Test Accuracy: 0.7451
  - number of train samples: 100
[0 8 0 7 3]
105
After iteration 5: 
  - Train Loss: 1.0221 
  - Test Loss: 1.0012 
  - Test Accuracy: 0.7471
  - number of train samples: 105
[6 1 3 9 8]
110
After iteration 6: 
  - Train Loss: 0.9972 
  - Test Loss: 0.9826 
  - Test Accuracy: 0.76
  - number of train samples: 110
[3 4 0 8 5]


**Margin Sampling**: Selects instances where difference between first most likely and second most likely classes are the smallest

In [33]:
m_margin_sampling, m_margin_sampling_metrics = train_active_learner(query_strat=margin_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "margin_sampling", m_margin_sampling, m_margin_sampling_metrics, base_path=exp_save_path)
del m_margin_sampling, m_margin_sampling_metrics

[2 4 9 8 2]
After iteration 0: 
  - Train Loss: 1.0380 
  - Test Loss: 1.0192 
  - Test Accuracy: 0.7466
  - number of train samples: 80
[7 6 9 8 8]
After iteration 1: 
  - Train Loss: 0.9976 
  - Test Loss: 0.9703 
  - Test Accuracy: 0.7714
  - number of train samples: 85
[9 6 9 4 2]
After iteration 2: 
  - Train Loss: 0.9732 
  - Test Loss: 0.9508 
  - Test Accuracy: 0.784
  - number of train samples: 90
[3 3 3 2 5]
After iteration 3: 
  - Train Loss: 0.9656 
  - Test Loss: 0.9427 
  - Test Accuracy: 0.7818
  - number of train samples: 95
[7 0 1 5 8]
After iteration 4: 
  - Train Loss: 0.9470 
  - Test Loss: 0.9222 
  - Test Accuracy: 0.7835
  - number of train samples: 100
[5 7 8 0 9]
After iteration 5: 
  - Train Loss: 0.9389 
  - Test Loss: 0.9129 
  - Test Accuracy: 0.7787
  - number of train samples: 105
[9 5 3 5 5]
After iteration 6: 
  - Train Loss: 0.9119 
  - Test Loss: 0.8866 
  - Test Accuracy: 0.7847
  - number of train samples: 110
[0 5 4 4 9]
After iteration 7: 
  - Tra

In [34]:
c_margin_sampling, c_margin_sampling_metrics = train_committee_learner(query_strat=margin_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "margin_sampling_committee", c_margin_sampling, c_margin_sampling_metrics, base_path=exp_save_path)
del c_margin_sampling, c_margin_sampling_metrics

test
[2 4 9 8 2]
80
After iteration 0: 
  - Train Loss: 1.0380 
  - Test Loss: 1.0192 
  - Test Accuracy: 0.7466
  - number of train samples: 80
[7 6 9 8 8]
85
After iteration 1: 
  - Train Loss: 0.9976 
  - Test Loss: 0.9703 
  - Test Accuracy: 0.7714
  - number of train samples: 85
[9 6 9 4 2]
90
After iteration 2: 
  - Train Loss: 0.9732 
  - Test Loss: 0.9508 
  - Test Accuracy: 0.784
  - number of train samples: 90
[3 3 3 2 5]
95
After iteration 3: 
  - Train Loss: 0.9656 
  - Test Loss: 0.9427 
  - Test Accuracy: 0.7818
  - number of train samples: 95
[7 0 1 5 8]
100
After iteration 4: 
  - Train Loss: 0.9470 
  - Test Loss: 0.9222 
  - Test Accuracy: 0.7835
  - number of train samples: 100
[5 7 8 0 9]
105
After iteration 5: 
  - Train Loss: 0.9389 
  - Test Loss: 0.9129 
  - Test Accuracy: 0.7787
  - number of train samples: 105
[9 5 3 5 5]
110
After iteration 6: 
  - Train Loss: 0.9119 
  - Test Loss: 0.8866 
  - Test Accuracy: 0.7847
  - number of train samples: 110
[0 5 4 4 9

## Ranked Batch-Mode Sampling

$$score=\alpha\left(1-\Phi\left(x, X_{\text {labeled }}\right)\right)+(1-\alpha) U(x)$$
where $\alpha=\frac{\left|X_{\text {unlabeled }}\right|}{\left|X_{\text {unlabeled }}\right|+\left|X_{\text {labeled }}\right|}, X_{\text {labeled }}$ is the labeled dataset, $U(x)$ is the uncertainty of predictions for $x$, and $\Phi$ is a so-called similarity function, for instance cosine similarity. This latter function measures how well the feature space is explored near $x$. (The lower the better.)

According to the modAL docs: This strategy differs from uncertainty_sampling() because, although it is supported, traditional active learning query strategies suffer from sub-optimal record selection when passing n_instances > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking – that is, which records among the batch are most important for labeling?

In [35]:
m_ranked_batch_mode_sampling, m_ranked_batch_mode_sampling_metrics = train_active_learner(query_strat=uncertainty_batch_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "ranked_batch_mode", m_ranked_batch_mode_sampling, m_ranked_batch_mode_sampling_metrics, base_path=exp_save_path)
del m_ranked_batch_mode_sampling, m_ranked_batch_mode_sampling_metrics

[0 2 2 9 8]
After iteration 0: 
  - Train Loss: 1.0491 
  - Test Loss: 1.0332 
  - Test Accuracy: 0.7428
  - number of train samples: 80
[8 9 5 2 6]
After iteration 1: 
  - Train Loss: 1.0293 
  - Test Loss: 1.0159 
  - Test Accuracy: 0.7554
  - number of train samples: 85
[6 0 4 2 3]
After iteration 2: 
  - Train Loss: 1.0157 
  - Test Loss: 1.0053 
  - Test Accuracy: 0.7565
  - number of train samples: 90
[2 8 8 6 8]
After iteration 3: 
  - Train Loss: 0.9926 
  - Test Loss: 0.9794 
  - Test Accuracy: 0.7709
  - number of train samples: 95
[2 2 2 3 3]
After iteration 4: 
  - Train Loss: 0.9958 
  - Test Loss: 0.9816 
  - Test Accuracy: 0.7699
  - number of train samples: 100
[7 7 2 4 8]
After iteration 5: 
  - Train Loss: 0.9877 
  - Test Loss: 0.9737 
  - Test Accuracy: 0.7663
  - number of train samples: 105
[2 8 2 9 8]
After iteration 6: 
  - Train Loss: 0.9798 
  - Test Loss: 0.9648 
  - Test Accuracy: 0.7682
  - number of train samples: 110
[8 5 6 5 0]
After iteration 7: 
  - Tr

In [None]:
c_ranked_batch_mode_sampling, c_ranked_batch_mode_sampling_metrics = train_committee_learner(query_strat=uncertainty_batch_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "ranked_batch_mode_committee", c_ranked_batch_mode_sampling, c_ranked_batch_mode_sampling_metrics, base_path=exp_save_path)
del c_ranked_batch_mode_sampling, c_ranked_batch_mode_sampling_metrics

test


# Vote Entropy

In [11]:
c_vote_entropy_sampling, c_vote_entropy_sampling_metrics = train_committee_learner(query_strat=vote_entropy_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "vote_entropy_committee", c_vote_entropy_sampling, c_vote_entropy_sampling_metrics, base_path=exp_save_path)
del c_vote_entropy_sampling, c_vote_entropy_sampling_metrics

test
After iteration 0: 
  - Train Loss: 1.0351 
  - Test Loss: 1.0196 
  - Test Accuracy: 0.752
  - number of train samples: 80
After iteration 1: 
  - Train Loss: 0.9934 
  - Test Loss: 0.9772 
  - Test Accuracy: 0.7637
  - number of train samples: 85
After iteration 2: 
  - Train Loss: 0.9685 
  - Test Loss: 0.9531 
  - Test Accuracy: 0.7743
  - number of train samples: 90
After iteration 3: 
  - Train Loss: 0.9501 
  - Test Loss: 0.9385 
  - Test Accuracy: 0.775
  - number of train samples: 95
After iteration 4: 
  - Train Loss: 0.9305 
  - Test Loss: 0.9211 
  - Test Accuracy: 0.7789
  - number of train samples: 100
After iteration 5: 
  - Train Loss: 0.9220 
  - Test Loss: 0.9148 
  - Test Accuracy: 0.7803
  - number of train samples: 105
After iteration 6: 
  - Train Loss: 0.9135 
  - Test Loss: 0.9078 
  - Test Accuracy: 0.7818
  - number of train samples: 110
After iteration 7: 
  - Train Loss: 0.9069 
  - Test Loss: 0.9005 
  - Test Accuracy: 0.7788
  - number of train sample

NameError: name 'c_ranked_batch_mode_sampling' is not defined

## Custom combination between uncertainty of the data point(s) in X and diversity between X and parts of the train dataset

In [None]:
m_uc_dv, m_uc_dv_metrics = train_active_learner(query_strat=ranked_uc_and_dv_query, **training_config)
save_model_and_metrics(experiment, dataset_name, f"ranked_uc_and_dv_0_5", m_uc_dv, m_uc_dv_metrics, base_path=exp_save_path)
del m_uc_dv, m_uc_dv_metrics

In [None]:
c_uc_dv, c_uc_dv_metrics = train_committee_learner(query_strat=ranked_uc_and_dv_query, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "ranked_uc_and_dv_0_5_committee", c_uc_dv, c_uc_dv_metrics, base_path=exp_save_path)
del c_uc_dv, c_uc_dv_metrics

---

# This is not used in the final report but is here to enable testing stream-based methods in the final experiments


---

# Stream-Based sampling

with Uncertainty Sampling/ Classifier uncertainty as it's query score method

In [None]:
# for uncertainty_threshold, uncertainty_threshold_str in [(0.2, "0_2"), (0.5, "0_5"), (0.8, "0_8")]:
#     learner, metrics = train_active_learner_stream(model_params=model_parameters, query_score_fn=classifier_uncertainty, query_score_threshold=uncertainty_threshold, n_query_instances=n_query_instances, epochs=epochs, random_seed=RANDOM_SEED, X_stream=X_train, y_stream=y_train, X_initial=X_initial, y_initial=y_initial)
#     save_model_and_metrics(experiment, dataset_name, f"stream_classifier_uncertainty_th_{uncertainty_threshold_str}", learner, metrics)

with Classification margin uncertainty as it's query score method

In [None]:
# learner, metrics = train_active_learner_stream(model_params=model_parameters, query_score_fn=classifier_margin, query_score_threshold=0.5, n_query_instances=n_query_instances, epochs=epochs, random_seed=RANDOM_SEED, X_stream=X_train, y_stream=y_train, X_initial=X_initial, y_initial=y_initial)
# save_model_and_metrics(experiment, dataset_name, "stream_classifier_margin", learner, metrics)

with Entropy margin uncertainty as it's query score method

In [None]:
# learner, metrics = train_active_learner_stream(model_params=model_parameters, query_score_fn=classifier_entropy, query_score_threshold=0.5, n_query_instances=n_query_instances, epochs=epochs, random_seed=RANDOM_SEED, X_stream=X_train, y_stream=y_train, X_initial=X_initial, y_initial=y_initial)
# save_model_and_metrics(experiment, dataset_name, "stream_entropy", learner, metrics)

with the custom measurement of uncertainty and diversity of the already seen datapoints

In [None]:
# for uncertainty_threshold, uncertainty_threshold_str in [(0.2, "0_2"), (0.5, "0_5"), (0.8, "0_8")]:
#     for a, a_str in [(0.1, "0_2"), (0.2, "0_2"), (0.5, "0_5"), (0.8, "0_8")]:
#         alpha_uc_dv = a # this is used by the ranked_uc_and_dv_score method.
#         learner, metrics = train_active_learner_stream(
#             model_params=model_parameters,
#             query_score_fn=ranked_uc_and_dv_score,
#             query_score_threshold=uncertainty_threshold,
#             n_query_instances=n_query_instances,
#             epochs=epochs, random_seed=RANDOM_SEED,
#             datasets=datasets, create_model=create_log_reg_model)
#         save_model_and_metrics(experiment, dataset_name,
#                                f"stream_classifier_ranked_uc_and_dv_{a_str}_th_{uncertainty_threshold_str}",
#                                learner, metrics)

## Disagreement Sampling (for classifiers) (uses a committee, so I should theoretically do every train run again for each methodwith a committee!)

---

# Methods that sadly don't work

---

## Expected error reduction (doesn't work on my pc due to time complexity)

In [None]:
# learner, metrics = train_active_learner(model_params=model_parameters, query_strat=expected_error_reduction, epochs=epochs, random_seed=RANDOM_SEED, pool_idx=pool_idx, X_initial=X_initial, y_initial=y_initial)

## Information Density (doesn't work on my pc due to time complexity)

$$I(x)=\frac{1}{\left|X_u\right|} \sum_{x^{\prime} \in X} \operatorname{sim}\left(x, x^{\prime}\right)$$

where $\operatorname{sim}\left(x, x^{\prime}\right)$ is a similarity function such as cosine similarity or Euclidean similarity, which is the reciprocal of Euclidean distance. The higher the information density, the more similar the given instance is to the rest of the data.


According to the modAL docs: When using uncertainty sampling (or other similar strategies), we are unable to take the structure of the data into account which can lead to suboptimal queries.

This could very well be used in combination with another strategy

In [None]:
# def inf_density(classifier, X_pool):
#     return information_density(X_pool, metric='euclidean')
#
# learner, metrics = train_active_learner(model_params=model_parameters, query_strat=inf_density,  epochs=epochs, random_seed=RANDOM_SEED, pool_idx=pool_idx, X_initial=X_initial, y_initial=y_initial)

---

This is for using it with a Committee (for multiple classes) so it's not optimal for comparing the query strategies themselves maybe?

---

Acquisition Functions might not be usable due to the fact that they require a BayesianOptimizer, not an ActiveLearner

## Acquisition Functions

**Probability of improvement**:
$$PI(x)=\psi\left(\frac{\mu(x) - f\left(x^+\right) - \xi}{\sigma(x)}\right)$$
where $\mu(x)$ and $\sigma(x)$ are mean and variance of regressor at $x$, $f$ is the model to be optimized with estimated maximum at $x^+$. $\xi$ is a parameter controlling the degree of exploration and $\psi(x)$ denotes cumulative distribution function of a standard Gaussian Distribution

[Example from the ModAL Docs](https://modal-python.readthedocs.io/en/latest/_images/bo-PI.png)


In [None]:
# learner, metrics = train_active_learner(model_params=model_parameters, query_strat=max_PI, epochs=epochs, random_seed=RANDOM_SEED, pool_idx=pool_idx, X_initial=X_initial, y_initial=y_initial)

**[Expected Improvement (from the ModAL Docs)](https://modal-python.readthedocs.io/en/latest/content/query_strategies/Acquisition-functions.html#expected-improvement)**:
$$
EI(x) =
\left( \mu(x) - f(x^+) - \xi \right) \cdot \psi \left( \frac{\mu(x) - f(x^+) - \xi}{\sigma(x)} \right)
+ \sigma(x) \phi \left( \frac{\mu(x) - f(x^+) - \xi}{\sigma(x)} \right),
$$

where $\mu(x)$ and $\sigma(x)$ are the mean and variance of the regressor at $x$, $f$ is the function to be optimized with estimated maximum at $x$, $\xi$ is a parameter controlling the degree of exploration, and $\psi(z), \phi(z)$ denote the cumulative distribution function and density function of a standard Gaussian distribution.

**Upper Confidence Bound**:
$$ UCB(x) = \mu(x) + \beta \sigma(x)$$
where $\mu(x)$ and $\sigma(x)$ are mean and variance of the regressor and $\beta$ is a parameter controlling the degree of exploration

[Example from the ModAL Docs](https://modal-python.readthedocs.io/en/latest/_images/bo-UCB.png)