# Common Variables and Imports

In [1]:
import os, time, warnings, pickle
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, pairwise_distances
from sklearn.exceptions import ConvergenceWarning
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, entropy_sampling, margin_sampling, classifier_uncertainty, classifier_entropy, classifier_margin
from modAL.batch import uncertainty_batch_sampling
#from modAL.expected_error import expected_error_reduction
from modAL.density import information_density
from torch import nn
import torch
import tensorflow as tf

from utils import train_active_learner, load_CIFAR, log_metrics, initialize_random_number_generators, create_cnn_model, save_model_and_metrics, load_model_and_metrics, save_file, load_file, compute_loss, train_committee_learner

# Silence FutureWarning and sklearn's ConvergenceWarning so that the cell outputs stay readable.
# NOTE(review): the ConvergenceWarning filter was originally motivated by sklearn logistic
# regressors that are explicitly configured as one multinomial classifier with 10 outputs
# (instead of 10 binary classifiers). No logistic regressor is visible in this notebook, so
# the filter is presumably a harmless leftover -- confirm before removing it.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
# Prefer the GPU when torch reports a usable CUDA device, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"using device; {device}")

# Fixed seed that is handed to the project's RNG initialiser and to the data/model helpers below.
RANDOM_SEED = 42
initialize_random_number_generators(RANDOM_SEED)

# This can be used to specify the experiment, e.g. when trying to test something new.
# It is also part of the results path (../results/<dataset_name>/exp<experiment>/...).
# Previously it was used to test the 3 experimental settings (differing sizes of the initial dataset) with the following dictionary:
# experiment_parameters = {"1": {"n_initial" : 10},
#                          "2": {"n_initial" : 2500},
#                          "3": {"n_initial" : 10000}}
experiment = "1" # "1" or "2" or "3"
dataset_name = "CIFAR"

# Hyperparameters of the CNN, loaded through the project's load_file helper.
# NOTE(review): the .pkl extension suggests a pickle file -- only load files from a trusted source.
model_parameters=load_file("deep_classifier_parameters.pkl") # TODO (left unspecified by the author)
print(model_parameters)

# Total epoch budget. train_full_model fits one epoch per loop iteration (instead of one long fit)
# so that the validation loss / early stopping can be checked after every single epoch.
max_iterations = model_parameters['max_epochs']
patience = 50 # Early stopping: number of consecutive epochs without improvement that are tolerated

# Active Learning parameters
n_query_instances = 250 # Number of instances that are queried at a time
n_initial = 5000 # Size of the initial set of labeled datapoints (differs from the values in the old table above)
n_query_epochs = 50  # How many times the algorithm should sample n_query_instances samples
n_iter_active_learning = 20 # During active learning, train for n_iter_active_learning epochs before querying new samples. This controls overfitting

# For Committee based approaches: number of learners in the committee
n_learners=2

using device; cpu
{'input_size': 32, 'num_channels': 3, 'l1_channels': 64, 'l1_kernel_size': 5, 'l1_padding': 1, 'l1_stride': 1, 'l2_channels': 64, 'l2_kernel_size': 5, 'l2_max_pool_kernel_size': 5, 'l2_padding': 1, 'l2_stride': 1, 'l2_dropout': 0.2, 'l3_dropout': 0.3, 'l4_input': 2048, 'l4_dropout': 0.3, 'l5_input': 1024, 'output_size': 10, 'lr': 0.0005, 'weight_decay': 0.0001, 'max_epochs': 175, 'batch_size': 256}


# Load Dataset (CIFAR)

The datasets are saved in the experiment folders for convenience and checking whether the splits, etc. are actually the same.

In [3]:
# The loader's result is unpacked into train, test and validation splits plus the complete
# data set (X_whole / y_whole), which is used for the class-balance check below.
X_train, y_train, X_test, y_test, X_val, y_val, X_whole, y_whole = load_CIFAR(random_seed=RANDOM_SEED)
# Human-readable name per class id (list index == class id).
class_names = ['Airplane', 'Car', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog', 'Horse', 'Ship', 'Truck']
# Store the train and test splits at ../results/<dataset_name>/exp<experiment>/datasets.pkl.
# Note: the validation split is not part of the saved dictionary.
save_file(os.path.join("../results", dataset_name,  f"exp{experiment}", "datasets.pkl"), {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test})

Check how often each class appears in CIFAR and whether the data for each class is balanced

In [4]:
# Report how many samples of the complete data set carry each class label.
for class_id, class_name in enumerate(class_names):
    n_samples_in_class = np.count_nonzero(y_whole == class_id)
    print(f"Class {class_id} ({class_name}): {n_samples_in_class} times")

Class 0 (Airplane): 6000 times
Class 1 (Car): 6000 times
Class 2 (Bird): 6000 times
Class 3 (Cat): 6000 times
Class 4 (Deer): 6000 times
Class 5 (Dog): 6000 times
Class 6 (Frog): 6000 times
Class 7 (Horse): 6000 times
Class 8 (Ship): 6000 times
Class 9 (Truck): 6000 times


## Creating an initial labelled dataset from random datapoints and the unlabelled pool

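A purely random initial sample is not guaranteed to contain every class, and the model cannot be initialized properly (it throws errors) when it has not seen each class. Therefore, if n_initial is exactly 10, one randomly chosen sample from each of the 10 classes is picked as the initial training set; for any other value, n_initial samples are drawn at random without replacement.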

In [5]:
# Build the initial labelled set and the unlabelled pool as indices into X_train / y_train.
if n_initial == 10:
    # One random training index per class: this guarantees that the initial train set
    # contains every class, because the model throws errors otherwise.
    # The indices refer to positions in the original training split.
    initial_idx = [np.random.choice(np.where(y_train == cls)[0]) for cls in np.arange(10)]
else:
    # n_initial distinct training indices, drawn uniformly at random.
    initial_idx = np.random.choice(range(len(X_train)), size=n_initial, replace=False)

X_initial, y_initial = X_train[initial_idx], y_train[initial_idx]
# Every training index that was not drawn above belongs to the unlabelled pool.
pool_idx = np.setdiff1d(range(len(X_train)), initial_idx)

In [6]:
# Bundle all splits and the pool indices into one dictionary; it is passed on to the
# training helpers through the "datasets" entry of training_config.
datasets = dict(
    dataset_name=dataset_name,
    X_initial=X_initial,
    y_initial=y_initial,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
    pool_idx=pool_idx,
)

# Train a CNN on the whole train dataset

In [None]:
def train_full_model(early_stopping=False):
    """Train a CNN on the complete training split and evaluate it on the test split.

    The model is created with max_epochs=1 and fit() is called once per loop
    iteration (at most max_iterations times), so that metrics can be logged and
    the validation loss can be checked between epochs.

    Args:
        early_stopping: If True, stop as soon as the validation loss has not
            improved for `patience` consecutive epochs. Defaults to False,
            which always trains for max_iterations epochs (the behaviour of the
            earlier version of this cell, where early stopping was commented out).

    Returns:
        tuple: The trained model and the metrics dictionary (keys 'train_loss',
        'test_loss', 'test_acc') that is handed to log_metrics every 10th epoch.
    """
    # Use a copy with max_epochs=1 instead of temporarily overwriting the shared
    # model_parameters dict: if model creation raised, the shared dict would otherwise
    # be left with max_epochs=1 for every later cell.
    single_epoch_parameters = {**model_parameters, 'max_epochs': 1}
    cnn_full = create_cnn_model(single_epoch_parameters, random_seed=RANDOM_SEED)

    metrics = {'train_loss': [], 'test_loss': [], 'test_acc': []}

    best_loss = np.inf
    no_improvement_count = 0

    start = time.time()
    for epoch in range(max_iterations):
        # NOTE(review): this relies on fit() continuing from the current weights. If the
        # model is a skorch net without warm_start=True, fit() re-initialises the module on
        # every call and partial_fit() would be required -- confirm in create_cnn_model.
        cnn_full.fit(X_train, y_train)
        if epoch % 10 == 0:
            log_metrics(epoch, cnn_full, X_train, y_train, X_test, y_test, metrics, is_cnn=True, device=device)

        # Computed after every epoch exactly as before, so the sequence of model calls
        # (and any RNG consumption they cause) matches earlier runs.
        # NOTE(review): may be moved under the early_stopping flag to save time if
        # bit-for-bit reproducibility with earlier results is not required.
        val_loss = compute_loss(y_hat=cnn_full.predict_proba(X_val), y_data=y_val)

        if not early_stopping:
            continue

        # Assumes compute_loss returns a scalar where lower is better -- TODO confirm.
        if val_loss < best_loss:
            best_loss = val_loss
            no_improvement_count = 0
        else:
            no_improvement_count += 1

        print(f"  - Validation loss: {val_loss}")

        if no_improvement_count >= patience:
            print("Early stopping triggered.")
            break

    y_hat = cnn_full.predict(X_test)
    accuracy_whole_dataset = accuracy_score(y_test, y_hat)
    print(f"Test accuracy with whole Test dataset: {accuracy_whole_dataset:.4f}")
    # The reported time covers training, metric logging and the final test prediction.
    print(f"Training time: {time.time() - start:.2f} seconds")

    return cnn_full, metrics

# Train on the whole training split and persist the model and its metrics as "whole_dataset".
cnn_whole_data, cnn_whole_data_metrics = train_full_model()
save_model_and_metrics(experiment, dataset_name, "whole_dataset", cnn_whole_data, cnn_whole_data_metrics)

---

# Active Learning

---

### Train a CNN on the n_initial data points without Active Learning

This serves as a baseline to show how much can be learnt from the n_initial randomly chosen data points alone. (This is expected to be low.)

In [8]:
def train_initial_model():
    """Train the baseline CNN on the initial labelled set only (no active learning).

    The random number generators are re-seeded first, then a fresh model is
    created from model_parameters and fitted on X_initial / y_initial.

    Returns:
        tuple: The fitted model and the metrics dictionary (keys 'train_loss',
        'test_loss', 'test_acc') that is handed to log_metrics once after training.
    """
    initialize_random_number_generators(seed=RANDOM_SEED)

    cnn_initial = create_cnn_model(model_parameters, random_seed=RANDOM_SEED)
    metrics = {'train_loss': [], 'test_loss': [], 'test_acc': []}

    start = time.time()
    cnn_initial.fit(X_initial, y_initial)
    print(f"Train time: {time.time() - start:.2f} seconds")

    accuracy = accuracy_score(y_test, cnn_initial.predict(X_test))

    # Same call convention as in train_full_model: the model comes from the same factory,
    # so it is logged with is_cnn=True and the active device as well.
    # The metrics are logged once, tagged with the configured number of epochs.
    log_metrics(model_parameters["max_epochs"], cnn_initial, X_train, y_train, X_test, y_test, metrics, is_cnn=True, device=device)

    print(f"Test accuracy with whole Test dataset: {accuracy:.4f}")

    return cnn_initial, metrics

# The global names keep their historical "log_reg" prefix (the model is a CNN) so that
# any other cell referring to them keeps working.
log_reg_initial, log_reg_initial_metrics = train_initial_model()
save_model_and_metrics(experiment, dataset_name, "initial_active_model", log_reg_initial, log_reg_initial_metrics)

  epoch    train_loss      dur
-------  ------------  -------
      1        [36m3.5469[0m  19.8929
      2        [36m1.8523[0m  20.0257
      3        [36m1.6955[0m  19.5617
      4        [36m1.5947[0m  19.2454
      5        [36m1.5082[0m  19.5146
      6        [36m1.4655[0m  19.5757
      7        [36m1.3806[0m  19.8260
      8        [36m1.3042[0m  19.9215
      9        [36m1.2727[0m  19.9210
     10        [36m1.2193[0m  19.7156
     11        [36m1.1593[0m  19.6916
     12        [36m1.0614[0m  19.7436
     13        [36m1.0153[0m  19.8118
     14        [36m0.9205[0m  20.0606
     15        [36m0.8629[0m  20.4369
     16        [36m0.8274[0m  19.8797
     17        [36m0.7219[0m  19.9239
     18        [36m0.6826[0m  19.6705
     19        [36m0.6284[0m  20.1231
     20        [36m0.5475[0m  20.1871
     21        [36m0.5085[0m  20.0481
     22        [36m0.4517[0m  20.4860
     23        [36m0.3982[0m  19.8593
     24        [3

## Train a Classifier with Various Query Strategies

From the modAL documentation (maybe worth trying): if you would like to start from scratch, you can use the `.fit(X, y)` method to make the learner forget everything it has seen and fit the model to the newly provided data.

To train only on the newly acquired data, you should pass only_new=True to the .teach() method. 

In [9]:
# Keyword arguments shared by every train_active_learner call in this notebook.
training_config = dict(
    datasets=datasets,
    random_seed=RANDOM_SEED,
    n_query_instances=n_query_instances,
    n_query_epochs=n_query_epochs,
    create_model=create_cnn_model,
    model_params=model_parameters,
    n_iter_active_learning=n_iter_active_learning,
    patience=patience,
)
# The committee runs receive the same arguments plus the committee size (shallow copy).
training_config_committee = {**training_config, 'n_learners': n_learners}

## Random Sampling

In [10]:
def random_sampling(classifier, X_pool, n_instances=1):
    """Query strategy that picks pool samples uniformly at random, without replacement.

    Args:
        classifier: Unused; only present so the function matches the
            (learner, pool, n_instances) signature of the other query strategies.
        X_pool: Indexable pool of unlabelled samples (must support len() and
            indexing with an integer array).
        n_instances: Number of samples to draw. Defaults to 1, like
            ranked_uc_and_dv_query. If the pool holds fewer samples, the whole
            pool is returned instead of raising a ValueError.

    Returns:
        tuple: (indices into X_pool, the selected samples X_pool[indices]).
    """
    n_samples = len(X_pool)
    # Never ask for more distinct samples than the pool contains.
    n_draw = min(n_instances, n_samples)
    # Passing the integer population size draws from arange(n_samples) and yields the same
    # random stream as passing range(n_samples), without materialising the range first.
    query_idx = np.random.choice(n_samples, size=n_draw, replace=False)
    return query_idx, X_pool[query_idx]

In [None]:
# Baseline active-learning run: the queried samples are chosen by the random_sampling strategy.
# The returned learner and its metrics are stored under the run name "random_sampling".
learner, metrics = train_active_learner(query_strat=random_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "random_sampling", learner, metrics)

  epoch    train_loss      dur
-------  ------------  -------
      1        [36m3.5469[0m  20.4214
      2        [36m1.8523[0m  19.5690
      3        [36m1.6955[0m  19.3404
      4        [36m1.5947[0m  19.7510
      5        [36m1.5082[0m  19.7948
      6        [36m1.4655[0m  19.8474
      7        [36m1.3806[0m  19.9067
      8        [36m1.3042[0m  19.4518
      9        [36m1.2727[0m  19.6197
     10        [36m1.2193[0m  19.5436
     11        [36m1.1593[0m  19.4814
     12        [36m1.0614[0m  19.8223
     13        [36m1.0153[0m  19.8884
     14        [36m0.9205[0m  19.8874
     15        [36m0.8629[0m  19.8291
     16        [36m0.8274[0m  19.6121
     17        [36m0.7219[0m  19.5727
     18        [36m0.6826[0m  19.7373
     19        [36m0.6284[0m  19.8752
     20        [36m0.5475[0m  19.9443
     21        [36m0.5085[0m  19.9470
     22        [36m0.4517[0m  19.7629
     23        [36m0.3982[0m  20.2205
     24        [3

In [None]:
# Same random-sampling baseline, trained through the committee helper with n_learners learners.
# The result is stored under the run name "random_sampling_committee".
c_random_sampling, c_random_sampling_metrics = train_committee_learner(query_strat=random_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "random_sampling_committee", c_random_sampling, c_random_sampling_metrics)

## Uncertainty sampling strategies

**Uncertainty Sampling**: Samples where classifier is least sure are selected

In [None]:
# Active-learning run with modAL's uncertainty_sampling as query strategy (single learner).
# The result is stored under the run name "uncertainty_sampling".
learner, metrics = train_active_learner(query_strat=uncertainty_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "uncertainty_sampling", learner, metrics)

In [None]:
# Committee variant of the uncertainty-sampling run (training_config plus n_learners).
# The result is stored under the run name "uncertainty_sampling_committee".
c_uc_sampling, c_uc_sampling_metrics = train_committee_learner(query_strat=uncertainty_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "uncertainty_sampling_committee", c_uc_sampling, c_uc_sampling_metrics)

**Entropy Sampling**: Samples where class probability has the largest Entropy

In [None]:
# Active-learning run with modAL's entropy_sampling as query strategy (single learner).
# The result is stored under the run name "entropy_sampling".
learner, metrics = train_active_learner(query_strat=entropy_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "entropy_sampling", learner, metrics)

In [None]:
# Committee variant of the entropy-sampling run (training_config plus n_learners).
# The result is stored under the run name "entropy_sampling_committee".
c_entropy_sampling, c_entropy_sampling_metrics = train_committee_learner(query_strat=entropy_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "entropy_sampling_committee", c_entropy_sampling, c_entropy_sampling_metrics)

**Margin Sampling**: Selects instances where difference between first most likely and second most likely classes are the smallest

In [None]:
# Active-learning run with modAL's margin_sampling as query strategy (single learner).
# The result is stored under the run name "margin_sampling".
learner, metrics = train_active_learner(query_strat=margin_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "margin_sampling", learner, metrics)

In [None]:
# Committee variant of the margin-sampling run (training_config plus n_learners).
# The result is stored under the run name "margin_sampling_committee".
c_margin_sampling, c_margin_sampling_metrics = train_committee_learner(query_strat=margin_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "margin_sampling_committee", c_margin_sampling, c_margin_sampling_metrics)

## Ranked Batch-Mode Sampling

$$score=\alpha\left(1-\Phi\left(x, X_{\text {labeled }}\right)\right)+(1-\alpha) U(x)$$
where $\alpha=\frac{\left|X_{\text {unlabeled }}\right|}{\left|X_{\text {unlabeled }}\right|+\left|X_{\text {labeled }}\right|}, X_{\text {labeled }}$ is the labeled dataset, $U(x)$ is the uncertainty of predictions for $x$, and $\Phi$ is a so-called similarity function, for instance cosine similarity. This latter function measures how well the feature space is explored near $x$. (The lower the better.)

According to the modAL docs: This strategy differs from uncertainty_sampling() because, although it is supported, traditional active learning query strategies suffer from sub-optimal record selection when passing n_instances > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking – that is, which records among the batch are most important for labeling?

In [None]:
# Active-learning run with modAL's uncertainty_batch_sampling (ranked batch-mode) as query strategy.
# The result is stored under the run name "ranked_batch_mode".
learner, metrics = train_active_learner(query_strat=uncertainty_batch_sampling, **training_config)
save_model_and_metrics(experiment, dataset_name, "ranked_batch_mode", learner, metrics)

In [None]:
# Committee variant of the ranked batch-mode run (training_config plus n_learners).
# The result is stored under the run name "ranked_batch_mode_committee".
c_ranked_batch_mode_sampling, c_ranked_batch_mode_sampling_metrics = train_committee_learner(query_strat=uncertainty_batch_sampling, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "ranked_batch_mode_committee", c_ranked_batch_mode_sampling, c_ranked_batch_mode_sampling_metrics)

## Custom combination between uncertainty of the data point(s) in X and diversity between X and parts of the train dataset

In [None]:
# Weight of the uncertainty term in the combined score; (1 - alpha_uc_dv) weights the
# diversity term. Read at call time, so reassigning it changes subsequent queries.
alpha_uc_dv = 0.5


def _flatten_samples(X):
    """Return X unchanged if it is at most 2-D, else collapse all non-sample axes into one."""
    if np.ndim(X) <= 2:
        return X
    return np.reshape(X, (len(X), -1))


def ranked_uc_and_dv_score(learner, X):
    """Score each candidate by a weighted sum of model uncertainty and diversity.

    Diversity of a candidate is its distance to the closest sample in
    learner.X_training, using the default metric of pairwise_distances.

    Args:
        learner: Learner that is accepted by classifier_uncertainty and exposes
            the already labelled samples as `X_training`.
        X: Candidate samples; the first axis indexes the samples. Arrays with
            more than two dimensions (e.g. images) are flattened per sample for
            the distance computation only.

    Returns:
        One score per candidate: alpha_uc_dv * uncertainty + (1 - alpha_uc_dv) * diversity.
    """
    uncertainty = classifier_uncertainty(learner, X)
    # pairwise_distances only accepts 2-D input, so image-shaped arrays are flattened first.
    distances = pairwise_distances(_flatten_samples(X), _flatten_samples(learner.X_training))
    diversity = np.min(distances, axis=1)
    # NOTE(review): the two terms are not normalised to a common scale; depending on the
    # feature scale the distance term may dominate the uncertainty term -- confirm intent.
    combined_scores = alpha_uc_dv * uncertainty + (1 - alpha_uc_dv) * diversity
    return combined_scores


def ranked_uc_and_dv_query(learner, X, n_instances=1):
    """Query strategy: select the n_instances candidates with the highest combined score.

    Args:
        learner: Passed through to ranked_uc_and_dv_score.
        X: Pool of candidate samples.
        n_instances: Number of candidates to select.

    Returns:
        tuple: (indices into X ordered by descending score, the selected samples).
    """
    uc_dv_scores = ranked_uc_and_dv_score(learner, X)
    # argsort is ascending, so reverse it to rank the highest scores first.
    ranked_indices = np.argsort(uc_dv_scores)[::-1]
    selected_indices = ranked_indices[:n_instances]
    selected_instances = X[selected_indices]

    return selected_indices, selected_instances

In [None]:
# Active-learning run with the custom uncertainty + diversity query strategy.
# The run name carries the alpha value (0.5 -> "0_5") that is set in the cell above;
# it is a plain string literal because it contains no placeholders.
learner, metrics = train_active_learner(query_strat=ranked_uc_and_dv_query, **training_config)
save_model_and_metrics(experiment, dataset_name, "ranked_uc_and_dv_0_5", learner, metrics)

In [None]:
# Committee variant of the custom uncertainty + diversity run (training_config plus n_learners).
# The result is stored under the run name "ranked_uc_and_dv_0_5_committee".
c_uc_dv, c_uc_dv_metrics = train_committee_learner(query_strat=ranked_uc_and_dv_query, **training_config_committee)
save_model_and_metrics(experiment, dataset_name, "ranked_uc_and_dv_0_5_committee", c_uc_dv, c_uc_dv_metrics)

---

# This is not used in the final report but is here to enable testing stream-based methods in the final experiments


---

# Stream-Based sampling

with uncertainty sampling / classifier uncertainty as its query score method

In [None]:
# for uncertainty_threshold, uncertainty_threshold_str in [(0.2, "0_2"), (0.5, "0_5"), (0.8, "0_8")]:
#     learner, metrics = train_active_learner_stream(model_params=model_parameters, query_score_fn=classifier_uncertainty, query_score_threshold=uncertainty_threshold, n_query_instances=n_query_instances, epochs=epochs, random_seed=RANDOM_SEED, X_stream=X_train, y_stream=y_train, X_initial=X_initial, y_initial=y_initial)
#     save_model_and_metrics(experiment, dataset_name, f"stream_classifier_uncertainty_th_{uncertainty_threshold_str}", learner, metrics)

with classification margin uncertainty as its query score method

In [None]:
# learner, metrics = train_active_learner_stream(model_params=model_parameters, query_score_fn=classifier_margin, query_score_threshold=0.5, n_query_instances=n_query_instances, epochs=epochs, random_seed=RANDOM_SEED, X_stream=X_train, y_stream=y_train, X_initial=X_initial, y_initial=y_initial)
# save_model_and_metrics(experiment, dataset_name, "stream_classifier_margin", learner, metrics)

with classification entropy as its query score method

In [None]:
# learner, metrics = train_active_learner_stream(model_params=model_parameters, query_score_fn=classifier_entropy, query_score_threshold=0.5, n_query_instances=n_query_instances, epochs=epochs, random_seed=RANDOM_SEED, X_stream=X_train, y_stream=y_train, X_initial=X_initial, y_initial=y_initial)
# save_model_and_metrics(experiment, dataset_name, "stream_entropy", learner, metrics)

with the custom measurement of uncertainty and diversity of the already seen datapoints

In [None]:
# for uncertainty_threshold, uncertainty_threshold_str in [(0.2, "0_2"), (0.5, "0_5"), (0.8, "0_8")]:
#     for a, a_str in [(0.2, "0_2"), (0.5, "0_5"), (0.8, "0_8")]:
#         alpha_uc_dv = a
#         learner, metrics = train_active_learner_stream(
#             model_params=model_parameters, 
#             query_score_fn=ranked_uc_and_dv_score, 
#             query_score_threshold=uncertainty_threshold, 
#             n_query_instances=n_query_instances, 
#             epochs=epochs, random_seed=RANDOM_SEED, 
#             X_stream=X_train, 
#             y_stream=y_train, 
#             X_initial=X_initial, 
#             y_initial=y_initial)
#         save_model_and_metrics(experiment, dataset_name, 
#                                f"stream_classifier_ranked_uc_and_dv_{a_str}_th_{uncertainty_threshold_str}", 
#                                learner, metrics)

## Disagreement Sampling (for classifiers) (uses a committee, so I should theoretically do every train run again for each method with a committee!)

---

# Methods that sadly don't work 

---

## Expected error reduction (doesn't work on my pc due to time complexity)

In [None]:
# learner, metrics = train_active_learner(model_params=model_parameters, query_strat=expected_error_reduction, epochs=epochs, random_seed=RANDOM_SEED, pool_idx=pool_idx, X_initial=X_initial, y_initial=y_initial)

## Information Density (doesn't work on my pc due to time complexity)

$$I(x)=\frac{1}{\left|X_u\right|} \sum_{x^{\prime} \in X} \operatorname{sim}\left(x, x^{\prime}\right)$$

where $\operatorname{sim}\left(x, x^{\prime}\right)$ is a similarity function such as cosine similarity or Euclidean similarity, which is the reciprocal of Euclidean distance. The higher the information density, the more similar the given instance is to the rest of the data.


According to the modAL docs: When using uncertainty sampling (or other similar strategies), we are unable to take the structure of the data into account which can lead to suboptimal queries.

This could very well be used in combination with another strategy

In [None]:
# def inf_density(classifier, X_pool):
#     return information_density(X_pool, metric='euclidean')
# 
# learner, metrics = train_active_learner(model_params=model_parameters, query_strat=inf_density,  epochs=epochs, random_seed=RANDOM_SEED, pool_idx=pool_idx, X_initial=X_initial, y_initial=y_initial)

---

This is for using it with a Committee (for multiple classes) so it's not optimal for comparing the query strategies themselves maybe?

---

Acquisition Functions might not be usable due to the fact that they require a BayesianOptimizer, not an ActiveLearner

## Acquisition Functions

**Probability of improvement**: 
$$PI(x)=\psi\left(\frac{\mu(x) - f\left(x^+\right) - \xi}{\sigma(x)}\right)$$
where $\mu(x)$ and $\sigma(x)$ are mean and variance of regressor at $x$, $f$ is the model to be optimized with estimated maximum at $x^+$. $\xi$ is a parameter controlling the degree of exploration and $\psi(x)$ denotes cumulative distribution function of a standard Gaussian Distribution

[Example from the ModAL Docs](https://modal-python.readthedocs.io/en/latest/_images/bo-PI.png)


In [None]:
# learner, metrics = train_active_learner(model_params=model_parameters, query_strat=max_PI, epochs=epochs, random_seed=RANDOM_SEED, pool_idx=pool_idx, X_initial=X_initial, y_initial=y_initial)

**[Expected Improvement (from the ModAL Docs)](https://modal-python.readthedocs.io/en/latest/content/query_strategies/Acquisition-functions.html#expected-improvement)**: 
$$
EI(x) = 
\left( \mu(x) - f(x^+) - \xi \right) \cdot \psi \left( \frac{\mu(x) - f(x^+) - \xi}{\sigma(x)} \right)
+ \sigma(x) \phi \left( \frac{\mu(x) - f(x^+) - \xi}{\sigma(x)} \right),
$$

where $\mu(x)$ and $\sigma(x)$ are the mean and variance of the regressor at $x$, $f$ is the function to be optimized with estimated maximum at $x^+$, $\xi$ is a parameter controlling the degree of exploration, and $\psi(z), \phi(z)$ denote the cumulative distribution function and density function of a standard Gaussian distribution.

**Upper Confidence Bound**:
$$ UCB(x) = \mu(x) + \beta \sigma(x)$$
where $\mu(x)$ and $\sigma(x)$ are mean and variance of the regressor and $\beta$ is a parameter controlling the degree of exploration

[Example from the ModAL Docs](https://modal-python.readthedocs.io/en/latest/_images/bo-UCB.png)