<a href="https://colab.research.google.com/github/MoritzLaurer/ActiveLLM/blob/main/analysis_transformers_run_al_generative-test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Active learning loop


## Install and load relevant packages

In [None]:
import sys
import os
#%%capture
# if in colab
if sys.argv[0] == '/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py':
    #%pip install "argilla[server, listeners]==1.1.1"
    %pip install "transformers[sentencepiece]~=4.27"
    %pip install "datasets~=2.8"
    %pip install accelerate==0.18
    #%pip install "small-text[transformers]~=1.1.1"
    #%pip install "colab-xterm~=0.1.2"


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# info on the GPU you are using
import sys
print(sys.argv)
# if in colab
if sys.argv[0] == '/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py':
    !nvidia-smi
    # info on available ram
    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print('\n\nYour runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

['/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py', '-f', '/root/.local/share/jupyter/runtime/kernel-a5327a9e-1dce-4d50-a81e-ca040a415489.json']
Thu Apr 20 16:21:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
          

In [None]:
## connect to your google drive in case you want to load and save your own data
# Only on Google Colab. Detect the runtime through the loaded `google.colab`
# module rather than a python3.9-specific launcher path, which no longer matches
# once the runtime uses a different Python version.
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    #drive.flush_and_unmount()

    # Work from the project folder on Drive so that relative paths and the local
    # module imported in the next cell resolve.
    # NOTE(review): user-specific path - adjust to your own Drive layout.
    print(os.getcwd())
    os.chdir("/content/drive/My Drive/PhD/generative-models")
    print(os.getcwd())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/My Drive/PhD/generative-models


In [None]:
## load local modules
#sys.path.append("/content/drive/My Drive/PhD/validity")
# Put the current working directory first on the import path so that
# active_learning_v3.py is resolved from it.
sys.path.insert(0, os.getcwd())
import active_learning_v3
import importlib  # in case of manual updates in .py file
# Re-execute the module so that edits made to the .py file after the first import take effect.
importlib.reload(active_learning_v3)

# Bind the class only after the reload so the name refers to the refreshed definition.
from active_learning_v3 import ActiveLearner

In [None]:
## import non-local packages
import numpy as np
import pandas as pd
import torch
import datasets
import copy

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    AutoConfig, AutoModelForNextSentencePrediction, T5ForConditionalGeneration,
    TrainingArguments, Trainer
)

from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report
from sklearn.model_selection import train_test_split



## Set main arguments

In [None]:
## what to run
DATASET = "squad_v2"  # other datasets handled by the loading cell: "pimpo", "uk-leftright-econ"
TASK = "squad_v2"  #"pimpo"
METHOD = "generative"   #args.method
# other checkpoints tried: "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary",
# "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
MODEL_NAME = "google/flan-t5-small"
VECTORIZER = "en"  #args.vectorizer
DATE = 20230207
TRAINING_DIRECTORY = f"results/{DATASET}"

## sample sizes
MAX_SAMPLE_MAJORITY = 10_000  # cap applied to the "no_topic" class when sampling pimpo
MAX_SAMPLE = 2_000  # corpus size after sampling; the test set is cut to half of this
N_SAMPLE_AL = 50  # number of texts requested from the sampling strategy per iteration
N_ITER_MAX = 5

# set global seed for reproducibility and against seed hacking
#N_ITER = 0  #args.n_iteration - 1
#N_ITER_MAX = 1  #args.n_iterations_max
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)

# special variables for pimpo
#SAMPLE_NO_TOPIC = 10
#TRAIN_NOTOPIC_PROPORTION = 0.4

# Disabled: randomly assign different seeds for each run.
# Kept as comments rather than a bare string literal, which was evaluated
# (and discarded) every time the cell ran.
#seed_runs_all = np.random.choice(range(1000), size=N_ITER_MAX)
#SEED_RUN = seed_runs_all[N_ITER]
#print("Iteration number: ", N_ITER)
#print("All random seeds: ", seed_runs_all)
#print("Random seed for this run: ", SEED_RUN)

# not sure if I should keep TASK variable
# Guard against running a task on the wrong dataset; kept last so every constant
# above is defined even when the check fails.
assert DATASET in TASK, f"Mismatch between dataset {DATASET} and task {TASK}"

In [None]:
MODEL_MAX_LENGTH = 512

# Training hyperparameters, passed to learner.set_train_args(hyperparams_dic=...)
# further down. Key order is the same as before.
HYPER_PARAMS_DIC = dict(
    lr_scheduler_type="linear",
    learning_rate=5e-4,
    num_train_epochs=5,
    seed=SEED_GLOBAL,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=48,  # 6x the training batch size
    warmup_ratio=0.30,
    weight_decay=0.01,
    include_inputs_for_metrics=True,
    # ! need to set this to true, otherwise seq2seq-trainer is not used https://github.com/huggingface/transformers/blob/v4.28.1/src/transformers/trainer_seq2seq.py#L246
    predict_with_generate=True,
    gradient_checkpointing=True,
    #gradient_accumulation_steps=2,
)  # "do_eval": False

# NOTE(review): this overrides the f"results/{DATASET}" value from the main-arguments
# cell with an empty string - confirm that this is intended.
TRAINING_DIRECTORY = ""

#FP16_BOOL = True if torch.cuda.is_available() else False # FP16 if cuda and if not mDeBERTa
#if "mDeBERTa".lower() in MODEL_NAME.lower(): FP16_BOOL = False  # mDeBERTa does not support FP16 yet

# try bf16?
#BF16_BOOL = False
#if BF16_BOOL == True: FP16_BOOL = False
#FP16_BOOL = False

In [None]:
# ## Load helper functions
# Disabled: the helpers module is not loaded in this run. Kept as comments instead of
# a bare string literal, which - as the cell's last expression - was echoed back as
# the cell output.
#import sys
#sys.path.insert(0, os.getcwd())
#import helpers
#import importlib  # in case of manual updates in .py file
#importlib.reload(helpers)
#from helpers import compute_metrics_standard, clean_memory, compute_metrics_nli_binary
##from helpers import load_model_tokenizer, tokenize_datasets, set_train_args, create_trainer, format_nli_trainset, format_nli_testset

'import sys\nsys.path.insert(0, os.getcwd())\nimport helpers\nimport importlib  # in case of manual updates in .py file\nimportlib.reload(helpers)\nfrom helpers import compute_metrics_standard, clean_memory, compute_metrics_nli_binary\n#from helpers import load_model_tokenizer, tokenize_datasets, set_train_args, create_trainer, format_nli_trainset, format_nli_testset\n'

## Load and clean data

In [None]:
## load data
def _first_answer_or_unanswerable(answers):
    """Return the first gold answer text, or "unanswerable" when the question has none.

    `answers` is one entry of the squad_v2 "answers" column; its "text" field holds
    the answer strings and is empty for unanswerable questions.
    """
    answer_texts = answers["text"]
    return answer_texts[0] if len(answer_texts) > 0 else "unanswerable"


if DATASET == "uk-leftright-econ":
    df = pd.read_csv("/content/drive/My Drive/PhD/validity/data-clean/benoit_leftright_sentences.zip", engine='python')
    df_cl = df.copy(deep=True)
elif "pimpo" in DATASET:
    # df = pd.read_csv(f"/Users/moritzlaurer/Dropbox/PhD/Papers/multilingual/multilingual-repo/data-clean/df_pimpo_samp_trans_m2m_100_1.2B_embed_tfidf.zip", engine='python')
    df = pd.read_csv("/content/drive/My Drive/PhD/validity/data-clean/df_pimpo_samp_trans_lemmatized_stopwords.zip", engine="python")
    df_cl = df.copy(deep=True)
elif "squad_v2" in DATASET:
    from datasets import load_dataset
    dataset_squad_v2 = load_dataset("squad_v2")
    df_train = dataset_squad_v2["train"].to_pandas()
    df_test = dataset_squad_v2["validation"].to_pandas()
    # write correct response to column (same rule for both splits)
    df_train["label_text"] = df_train["answers"].apply(_first_answer_or_unanswerable)
    df_test["label_text"] = df_test["answers"].apply(_first_answer_or_unanswerable)
else:
    # ValueError is a subclass of Exception, so existing handlers keep working
    raise ValueError(f"Dataset name not found: {DATASET}")



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

## uk-leftright
if "uk-leftright-econ" in DATASET:
    # Keep only crowd annotations on the economic scale. The explicit .copy() makes
    # the result an independent frame, so the column assignments below do not
    # trigger SettingWithCopyWarning.
    df_cl = df_cl[(df_cl.source == "Crowd") & (df_cl.scale == "Economic")].copy()
    # transform continuous float data to categorical classes
    df_cl["label_scale"] = df_cl.score_sent_mean.fillna(0).round().astype(int)
    print(df_cl["label_scale"].value_counts())
    # prepare input data: preceding sentence + target sentence + following sentence
    df_cl["text_prepared"] = df_cl["text_preceding"].fillna('') + " " + df_cl["text_original"] + " " + df_cl["text_following"].fillna('')
elif "uk-leftright-soc" in DATASET:
    raise NotImplementedError

if "uk-leftright" in DATASET:
    ## simplify scale to three classes for label text
    task_label_text_map = {0: "neutral", 1: "right", 2: "right", -1: "left", -2: "left"}
    # could also test scale as 5 classes
    #task_label_text_map = {0: "neutral", 1: "right", 2: "very_right", -1: "left", -2: "very_left"}
    df_cl["label_text"] = df_cl.label_scale.map(task_label_text_map)
    print(df_cl["label_text"].value_counts())
    ## adapt numeric label (one entry per class; the literal used to repeat keys)
    task_label_text_map_factorized = {"neutral": 1, "right": 2, "left": 0}
    #task_label_text_map_factorized = {"neutral": 2, "right": 3, "very_right": 4, "left": 1, "very_left": 0}
    df_cl["label"] = df_cl["label_text"].map(task_label_text_map_factorized)
    print(df_cl["label"].value_counts())


if "pimpo" in DATASET:
    # disabled: remove x% no_topic for faster testing
    #df_cl = df_cl.groupby(by="label_text", as_index=False, group_keys=False).apply(lambda x: x.sample(n=min(SAMPLE_NO_TOPIC, len(x)), random_state=SEED_GLOBAL) if x.label_text.iloc[0] == "no_topic" else x)
    #print(df_cl["label_text"].value_counts())
    # wrap the translated target sentence in quote markers between its translated context sentences
    df_cl["text_prepared"] = df_cl.text_preceding_trans.fillna("") + '  || The quote: "' + df_cl.text_original_trans.fillna("") + '" End of the quote ||  ' + df_cl.text_following_trans.fillna("")

if "squad_v2" in DATASET:
    # squad_v2 comes pre-split; the training split doubles as the cleaned frame
    df_cl = df_train

# sorted unique target strings, passed to learner.load_model_tokenizer further down
label_text_alphabetical = np.sort(df_cl.label_text.unique())


In [None]:
# sample training data
if "uk-leftright" in DATASET:
    df_train = df_cl.sample(n=MAX_SAMPLE, random_state=SEED_GLOBAL)
    # this branch used to leave df_corpus undefined, so the print below raised NameError
    df_corpus = df_train
elif "pimpo" in DATASET:
    # disabled alternative: sample x% of training data for no topic, then share the
    # remainder equally across classes
    #n_sample_notopic = int(MAX_SAMPLE * TRAIN_NOTOPIC_PROPORTION)
    #n_sample_perclass = int((MAX_SAMPLE - n_sample_notopic) / (len(df_cl.label_text.unique()) - 1))
    #df_train_samp1 = df_cl.groupby("label_text", as_index=False, group_keys=False).apply(
    #    lambda x: x.sample(min(n_sample_perclass, len(x)), random_state=SEED_GLOBAL) if x.label_text.unique()[0] != "no_topic" else None)
    #df_train_samp2 = df_cl[df_cl.label_text == "no_topic"].sample(n_sample_notopic, random_state=SEED_GLOBAL)
    #df_train = pd.concat([df_train_samp1, df_train_samp2])

    # cap the majority class "no_topic" at MAX_SAMPLE_MAJORITY rows, keep all other classes
    df_corpus = df_cl.groupby(by="label_text", as_index=False, group_keys=False).apply(lambda x: x.sample(n=min(MAX_SAMPLE_MAJORITY, len(x)), random_state=SEED_GLOBAL) if x.label_text.iloc[0] == "no_topic" else x)
    #df_corpus = df_cl.sample(n=MAX_SAMPLE, random_state=SEED_GLOBAL).copy(deep=True)
elif "squad_v2" in DATASET:
    df_corpus = df_train
else:
    raise ValueError(f"No sampling rule for dataset: {DATASET}")

print(df_corpus.label_text.value_counts())

# make the corpus smaller for quick testing; never request more rows than exist
if MAX_SAMPLE:
    df_corpus = df_corpus.sample(n=min(MAX_SAMPLE, len(df_corpus)), random_state=SEED_GLOBAL)

# create df test
# just to get accuracy figure as benchmark, less relevant for substantive use-case
# squad_v2 brings its own validation split (df_test from the loading cell). The other
# datasets have none, so every row of df_cl that did not end up in the corpus is held out.
# NOTE(review): this hold-out is the raw cleaned frame - confirm that the downstream
# formatting cells treat it the same way as the corpus.
if "squad_v2" not in DATASET:
    df_test = df_cl[~df_cl.index.isin(df_corpus.index)]

# make df_test smaller for quick testing:
if MAX_SAMPLE:
    df_test = df_test.sample(n=min(int(MAX_SAMPLE/2), len(df_test)), random_state=SEED_GLOBAL)

print(df_corpus.label_text.value_counts())


unanswerable                                                     43498
three                                                              231
two                                                                206
four                                                               171
five                                                               133
                                                                 ...  
Three years                                                          1
Conflict with Arius and Arianism                                     1
Jesus of Nazareth, is of a distinct substance from the Father        1
Athanasius Against the World                                         1
Kathmandu Metropolitan City                                          1
Name: label_text, Length: 64764, dtype: int64
unanswerable             654
four                       5
three                      4
two                        4
Paris                      4
                        ... 

In [None]:
## data checks
# verify that numeric label is in alphabetical order of label_text (can avoid issues for NLI)
#labels_num_via_numeric = df_cl[~df_cl.label_text.duplicated(keep="first")].sort_values("label_text").label.tolist()  # label num via labels: get labels from data when ordering label text alphabetically
#labels_num_via_text = pd.factorize(np.sort(df_cl.label_text.unique()))[0]  # label num via label_text: create label numeric via label text
#assert all(labels_num_via_numeric == labels_num_via_text)

In [None]:
## nli hypotheses / generative instructions
# Class definitions for the pimpo task, written once and reused for both the NLI
# hypotheses and the generative instruction (they used to be two copies of the same texts).
_PIMPO_LABEL_DEFINITIONS = {
    "immigration_neutral": "The quote describes immigration neutrally without implied value judgement or describes the status quo of immigration, for example only stating facts or using technocratic language about immigration",
    "immigration_sceptical": "The quote describes immigration sceptically / disapprovingly. For example, the quote could mention the costs of immigration, be against migrant workers, state that foreign labour decreases natives' wages, that there are already enough refugees, refugees are actually economic migrants, be in favour of stricter immigration controls, exceptions to the freedom of movement in the EU.",
    "immigration_supportive": "The quote describes immigration favourably / supportively. For example, the quote could mention the benefits of immigration, the need for migrant workers, international obligations to take in refugees, protection of human rights, in favour of family reunification or freedom of movement in the EU.",
    "integration_neutral": "The quote describes immigrant integration neutrally without implied value judgement or describes the status quo of immigrant integration, for example only stating facts or using technocratic language about immigrant integration",
    "integration_sceptical": "The quote describes immigrant integration sceptically / disapprovingly. For example, the quote could mention negative references to multiculturalism and diversity, underline the importance of ethnic homogeneity and national culture, call for immigrants to give up their culture of origin, warn of islamization, mention duties in order to stay in the country, demand integration tests, associate immigrant communities with problems or crimes, demand an oath of allegiance of immigrants, or underline ethnic criteria for receiving citizenship.",
    "integration_supportive": "The quote describes immigrant integration favourably / supportively. For example, the quote could mention positive references to multiculturalism and diversity, underline cosmopolitan values towards immigrants, demand inclusion of immigrants, demand anti-discrimination policies based on ethnicity and origin, demand policies against racism, demand more rights for immigrants, or underline civic values instead of ethnic values for being able to receive citizenship.",
    "no_topic": "The quote is neither about immigration nor about immigrant integration.",
}


def _squad_prompt(df_squad):
    """Build the extractive-QA prompt from the `context` and `question` columns of a frame."""
    return "Context: " + df_squad["context"] + "\nQuestion: " + df_squad["question"] + "\nExtract the answer from the context. Extracted answer: "


if METHOD == "nli":
    HYPO_LABEL_DIC = dict(_PIMPO_LABEL_DEFINITIONS)
# `elif`, not `if`: with a second independent `if`, METHOD == "nli" always fell
# through to the final `else` and raised NotImplementedError.
elif METHOD == "generative":
    # substring checks, consistent with how the loading and sampling cells select a dataset
    if "pimpo" in DATASET:
        # create instruction
        label_instruction_map_short = {
            "immigration_neutral": "immigration neutral",
            "immigration_sceptical": "immigration sceptical",
            "immigration_supportive": "immigration supportive",
            "integration_neutral": "integration neutral",
            "integration_sceptical": "integration sceptical",
            "integration_supportive": "integration supportive",
            "no_topic": "other topic"
        }
        # same definitions, keyed by the short label strings the model should generate
        label_instruction_map_definition = {
            label_instruction_map_short[label]: definition
            for label, definition in _PIMPO_LABEL_DEFINITIONS.items()
        }
        # short=True lists only the label names; short=False adds the full definitions
        short = False
        if short:
            label_instruction_string = ', '.join(label_instruction_map_short.values())
        else:
            label_instruction_string = '\n'.join([f'"{key}": {value}' for key, value in label_instruction_map_definition.items()])

        instruction = f"""Classify the quote below in one of the following categories:\n{label_instruction_string}.\nWhich category applies best to the quote below?"""
        print("Instruction: ", instruction)

        # NOTE(review): not idempotent - re-running the cell prepends the instruction
        # again and maps the already mapped labels to NaN.
        df_corpus["text_prepared"] = instruction + "\n\n" + df_corpus["text_prepared"]

        # update label strings in dataset accordingly
        df_corpus["label_text"] = df_corpus["label_text"].map(label_instruction_map_short)
        print(df_corpus["label_text"].value_counts())

    elif "squad_v2" in DATASET:
        # identical prompt template for corpus and test set
        df_corpus["text_prepared"] = _squad_prompt(df_corpus)
        df_test["text_prepared"] = _squad_prompt(df_test)

else:
    raise NotImplementedError


In [None]:
# label_map necessary downstream
#label_map = {label_text: label for label, label_text in zip(np.unique(df_cl["label_text"].factorize(sort=False)[0]), df_cl["label_text"].factorize(sort=False)[1])}

#df_cl = df_cl[["label", "label_text", "text"]]  # "label_text", 
#df_cl = df_cl.reset_index(drop=True)

In [None]:
# Preview the prepared inputs next to their target strings. Only the last expression
# of a cell is rendered, so the preview is displayed explicitly (and limited to a few
# rows) instead of being computed and thrown away.
display(df_corpus[["text_prepared", "label_text"]].head())
# number of characters in the longest target string
df_corpus["label_text"].str.len().max()

202

## Initialise the active learner

In [None]:
# ?! does it make sense to split this, or should I just always run on all and then remove new sampled texts from df_test and add to corpus/train?
# ! should work for both non-annotation loop and in annotation loop
# could remove the double inference step ?
#df_corpus, df_test = train_test_split(df_cl, train_size=0.5, random_state=SEED_GLOBAL, stratify=df_cl["label_text"])
#df_corpus = df_train
#df_test = df_test

In [None]:
# ActiveLearner comes from the local active_learning_v3 module imported above.
learner = ActiveLearner()

# for generative
# Register the corpus and the separate test frame; "text_prepared" holds the model
# input and "label_text" the target string.
learner.load_pd_dataset(df_corpus=df_corpus, df_test=df_test, text_column="text_prepared", label_column="label_text", separate_testset=True)  
#learner.format_pd_dataset_for_generative(instruction=instruction, text_column="text_prepared")

# for nli
#learner.load_pd_dataset(df_corpus=df_corpus, text_column="text_prepared", separate_testset=False)  # df_test=df_test
#learner.format_pd_dataset_for_nli(hypo_label_dic=HYPO_LABEL_DIC)

# attributes that can be inspected after loading (uncomment one to display it)
#learner.df_test_format
#learner.df_test_original
#learner.df_corpus_format.tolist()
#learner.df_corpus_original
#learner.df_corpus_original_update

In [None]:
from transformers import GenerationConfig
# docs https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/text_generation#transformers.GenerationMixin.generate
# Generation settings, passed to learner.load_model_tokenizer(config_params=...) below.
config_params = {
    # ! trainer seems to fully ignore this in eval-loop outputs. 
    # probably because of: https://github.com/huggingface/transformers/blob/v4.28.1/src/transformers/trainer_seq2seq.py#L273
    "max_new_tokens": 64,
    "num_beams": 4,
    #"generation_num_beams": 5,  # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/training_args_seq2seq.py#L42
    "num_return_sequences": 1,
    # NOTE(review): temperature and top_k are sampling parameters; presumably they have
    # no effect unless do_sample=True is set - confirm against the generate() docs.
    "temperature": 0,  # default: 1.0
    "top_k": 500,  # default: 50
    "return_dict_in_generate": True,
    "output_scores": True,
    #"predict_with_generate": False,
    #"include_inputs_for_metrics": True
    # Must be the boolean True: the option is documented as a bool, and the string
    # "True" is not the boolean True, so the flag may be treated as unset.
    "renormalize_logits": True,
}
# Model-loading settings, passed to learner.load_model_tokenizer(model_params=...) below.
model_params = {
    #"torch_dtype": torch.float16,  #torch.bfloat16, torch.float16
    #load_in_8bit=True,
    "device_map": "auto",
    "offload_folder": "offload",  
    "offload_state_dict": True
}
#generation_config = GenerationConfig.from_pretrained(MODEL_NAME, **params)
#generation_config
#learner.model.generation_config = generation_config
#learner.model.generation_config._from_model_config

In [None]:
# the function that determines the input for compute_metrics: 
# https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/trainer.py#L280
# compute metrics function: 
# https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/trainer.py#L3214
# 

In [None]:
# only for first run
# Presumably loads the checkpoint named by MODEL_NAME together with its tokenizer
# (project helper - confirm in active_learning_v3); the generation and model-loading
# settings defined above are handed over as config_params / model_params.
learner.load_model_tokenizer(model_name=MODEL_NAME, method=METHOD, label_text_alphabetical=label_text_alphabetical, model_max_length=MODEL_MAX_LENGTH, 
                             config_params=config_params, model_params=model_params)

Device: cuda


In [None]:
# sanity check: display the model-loading settings stored on the learner
learner.model_params

{'device_map': 'auto', 'offload_folder': 'offload', 'offload_state_dict': True}

In [None]:
# Tokenize the datasets held by the learner (project helper from active_learning_v3).
# NOTE(review): the trailing note suggests label_col / text_col can be passed explicitly - confirm the signature.
learner.tokenize_hf_dataset()  # label_col="label_text", text_col="text_prepared"

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# might need to use this here: Seq2SeqTrainingArguments
# Hand the hyperparameters to the learner, with evaluation_strategy="no" and progress bars enabled.
# NOTE(review): TRAINING_DIRECTORY is "" at this point because the hyperparameter cell
# resets it - confirm that an empty training directory is intended.
learner.set_train_args(hyperparams_dic=HYPER_PARAMS_DIC, training_directory=TRAINING_DIRECTORY, disable_tqdm=False, evaluation_strategy="no")

In [None]:
# sanity check: display the generation config attached to the loaded model
learner.model.generation_config

GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "max_new_tokens": 64,
  "num_beams": 4,
  "output_scores": true,
  "pad_token_id": 0,
  "renormalize_logits": "True",
  "return_dict_in_generate": true,
  "temperature": 0,
  "top_k": 500,
  "transformers_version": "4.28.1"
}

In [None]:
# sanity check: display the checkpoint name stored on the learner
learner.model_name

'google/flan-t5-small'

In [None]:
# Disabled experiment: manual batched generation over the corpus with sequence scores
# reconstructed from the transition scores. Kept as comments instead of a bare string
# literal, which - as the cell's last expression - was echoed back as the cell output.
#
# from torch.utils.data import Dataset, DataLoader
# import tqdm
#
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# inputs = {key: torch.tensor(value, dtype=torch.long).to(device) for key, value in learner.dataset["corpus"].remove_columns("idx").to_dict().items()}
#
# class TokenizedTextDataset(Dataset):
#     def __init__(self, tokenized_inputs):
#         self.tokenized_inputs = tokenized_inputs
#
#     def __len__(self):
#         return len(self.tokenized_inputs["input_ids"])
#
#     def __getitem__(self, idx):
#         item = {key: value[idx] for key, value in self.tokenized_inputs.items()}
#         return item
#
# dataset_inputs = TokenizedTextDataset(inputs)
# batch_size_inference = 8
# dataloader = DataLoader(dataset_inputs, batch_size=batch_size_inference, shuffle=False)
#
#
# reconstructed_scores = []
# labels_pred = []
#
# #with torch.no_grad():
# for batch in tqdm.tqdm(dataloader, desc="Inference"):
#     inputs_batched = {k: v.to(learner.model.device) for k, v in batch.items()}
#     #generated = model.generate(**inputs)
#     outputs = learner.model.generate(
#         **inputs_batched,
#         **{key: value for key, value in config_params.items() if key != "generation_num_beams"},
#     )
#
#     if learner.config_params["num_beams"] == 1:
#         transition_scores = learner.model.compute_transition_scores(
#             outputs.sequences, outputs.scores, normalize_logits=False, #outputs.beam_indices
#         )
#     else:
#         transition_scores = learner.model.compute_transition_scores(
#             outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
#         )
#     #transition_scores = self.model.compute_transition_scores(
#     #    outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
#     #)
#
#     ## get scores for entire sequence
#     # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
#     # Tip: set `normalize_logits=True` to recompute the scores from the normalized logits.
#     output_length = inputs["input_ids"].shape[1] + np.sum(transition_scores.to(torch.float32).cpu().numpy() < 0, axis=1)
#     length_penalty = learner.model.generation_config.length_penalty
#     reconstructed_scores_batch = transition_scores.to(torch.float32).cpu().sum(axis=1) / (output_length**length_penalty)
#     reconstructed_scores.append(reconstructed_scores_batch.tolist())
#
#     # get predicted label strings
#     labels_pred_batch = learner.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
#     labels_pred.append(labels_pred_batch)
#
# reconstructed_scores = [item for sublist in reconstructed_scores for item in sublist]
# labels_pred = [item for sublist in labels_pred for item in sublist]

'from torch.utils.data import Dataset, DataLoader\nimport tqdm\n\ndevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")\ninputs = {key: torch.tensor(value, dtype=torch.long).to(device) for key, value in learner.dataset["corpus"].remove_columns("idx").to_dict().items()}\n\nclass TokenizedTextDataset(Dataset):\n    def __init__(self, tokenized_inputs):\n        self.tokenized_inputs = tokenized_inputs\n\n    def __len__(self):\n        return len(self.tokenized_inputs["input_ids"])\n\n    def __getitem__(self, idx):\n        item = {key: value[idx] for key, value in self.tokenized_inputs.items()}\n        return item\n\ndataset_inputs = TokenizedTextDataset(inputs)\nbatch_size_inference = 8\ndataloader = DataLoader(dataset_inputs, batch_size=batch_size_inference, shuffle=False)\n\n\nreconstructed_scores = []\nlabels_pred = []\n\n#with torch.no_grad():\nfor batch in tqdm.tqdm(dataloader, desc="Inference"):\n    inputs_batched = {k: v.to(learner.model.device) for k, v in ba

In [None]:
# ! memory inefficient
# Disabled experiment: generate for the whole corpus in a single call. Kept as comments
# instead of a bare string literal, which - as the cell's last expression - was echoed
# back as the cell output.
#
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# inputs = {key: torch.tensor(value, dtype=torch.long).to(device) for key, value in learner.dataset["corpus"].remove_columns("idx").to_dict().items()}
#
# outputs = learner.model.generate(
#             **inputs,
#             **{key: value for key, value in config_params.items() if key != "generation_num_beams"},
#         )

'device = torch.device("cuda" if torch.cuda.is_available() else "cpu")\ninputs = {key: torch.tensor(value, dtype=torch.long).to(device) for key, value in learner.dataset["corpus"].remove_columns("idx").to_dict().items()}\n\noutputs = learner.model.generate(\n            **inputs,\n            **{key: value for key, value in config_params.items() if key != "generation_num_beams"},\n        )\n'

In [None]:
### first zero-shot sampling & testing run, no training
# TODO: this currently does not support a separate test set
learner.train_test_infer()
# last expression of the cell: displays the learner's iteration counter
learner.n_iteration
#learner.results_test
#learner.results_corpus

Inference: 100%|██████████| 42/42 [02:22<00:00,  3.40s/it]


  Aggregate metrics:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}


1

In [None]:
# Disabled debugging aid. Kept as comments instead of a bare string literal, which -
# as the cell's last expression - was echoed back as the cell output.
#
# ## inspect input received by compute_metrics
# if HYPER_PARAMS_DIC["predict_with_generate"] == False:
#     print("predict_with_generate: ", HYPER_PARAMS_DIC["predict_with_generate"])
#     # prediction
#     print(learner.eval_pred[0][0].shape)
#     # embeddings for input tokens
#     print(learner.eval_pred[0][1].shape)
#     # label
#     print(learner.eval_pred[1].shape)
#     # input
#     print(learner.eval_pred[2].shape)
# elif HYPER_PARAMS_DIC["predict_with_generate"] == True:
#     print("predict_with_generate: ", HYPER_PARAMS_DIC["predict_with_generate"])
#     #prediction
#     print(learner.eval_pred[0].shape)
#     #print(learner.eval_pred[0][1].shape)  # embeddings for input tokens
#     # label
#     print(learner.eval_pred[1].shape)
#     # input
#     print(learner.eval_pred[2].shape)

  and should_run_async(code)


'## inspect input received by compute_metrics\nif HYPER_PARAMS_DIC["predict_with_generate"] == False:\n    print("predict_with_generate: ", HYPER_PARAMS_DIC["predict_with_generate"])\n    # prediction\n    print(learner.eval_pred[0][0].shape)\n    # embeddings for input tokens\n    print(learner.eval_pred[0][1].shape)  \n    # label\n    print(learner.eval_pred[1].shape)\n    # input\n    print(learner.eval_pred[2].shape)\nelif HYPER_PARAMS_DIC["predict_with_generate"] == True:\n    print("predict_with_generate: ", HYPER_PARAMS_DIC["predict_with_generate"])\n    #prediction\n    print(learner.eval_pred[0].shape)\n    #print(learner.eval_pred[0][1].shape)  # embeddings for input tokens\n    # label\n    print(learner.eval_pred[1].shape)\n    # input\n    print(learner.eval_pred[2].shape)\n'

In [None]:
# Disabled debugging aid: decode the arg-max token per output position from raw logits.
# Kept as comments instead of a bare string literal, which - as the cell's last
# expression - was echoed back as the cell output.
#
# # Given PyTorch tensor of shape (20, 6, 32128)
# tensor = learner.eval_pred[0][0]  #torch.randn(20, 6, 32128)
# # Get the index of the maximum logit along the last dimension
# max_indices = np.argmax(tensor, axis=-1)
# # Calculate the softmax probabilities along the last dimension
# exp_arr = np.exp(tensor)
# softmax_probs = exp_arr / np.sum(exp_arr, axis=-1, keepdims=True)
# # Get the largest logit along the last dimension
# max_logits = np.max(tensor, axis=-1)
# print(np.sum(max_logits, axis=1))
#
# # Convert the indices to tokens
# # Assuming the tokens are represented as integers from 0 to 32127
# tokens = max_indices.tolist()
# # Print the chosen tokens for each input text and output position
# #for i in range(20):  # Loop over 20 input texts
# #    for j in range(6):  # Loop over 6 output positions
# #        print(f"Input text {i+1}, Output position {j+1}: Token {tokens[i][j]}")
#
#
# learner.tokenizer.batch_decode(max_indices, skip_special_tokens=False)

'# Given PyTorch tensor of shape (20, 6, 32128)\ntensor = learner.eval_pred[0][0]  #torch.randn(20, 6, 32128)\n# Get the index of the maximum logit along the last dimension\nmax_indices = np.argmax(tensor, axis=-1)\n# Calculate the softmax probabilities along the last dimension\nexp_arr = np.exp(tensor)\nsoftmax_probs = exp_arr / np.sum(exp_arr, axis=-1, keepdims=True)\n# Get the largest logit along the last dimension\nmax_logits = np.max(tensor, axis=-1)\nprint(np.sum(max_logits, axis=1))\n\n# Convert the indices to tokens\n# Assuming the tokens are represented as integers from 0 to 32127\ntokens = max_indices.tolist()\n# Print the chosen tokens for each input text and output position\n#for i in range(20):  # Loop over 20 input texts\n#    for j in range(6):  # Loop over 6 output positions\n#        print(f"Input text {i+1}, Output position {j+1}: Token {tokens[i][j]}")\n\n\nlearner.tokenizer.batch_decode(max_indices, skip_special_tokens=False)'

In [None]:
#predictions_token_id = np.argmax(learner.eval_pred[0][0], axis=-1)
# Disabled snippet (string literal, not executed): it would replace the -100
# entries of learner.eval_pred[0] with the tokenizer's pad token id, decode the
# ids to text, and lower-case the decoded strings.
# NOTE(review): the eval_pred inspection snippet earlier in this notebook
# labels eval_pred[0] as the prediction and eval_pred[1] as the label, so
# `labels_gold` here may actually hold decoded predictions -- confirm before
# re-enabling.
"""token_id_cl = np.where(learner.eval_pred[0] != -100, learner.eval_pred[0], learner.tokenizer.pad_token_id)
labels_gold = learner.tokenizer.batch_decode(token_id_cl, skip_special_tokens=True)
labels_gold = [label.lower() for label in labels_gold]
labels_gold"""

'token_id_cl = np.where(learner.eval_pred[0] != -100, learner.eval_pred[0], learner.tokenizer.pad_token_id)\nlabels_gold = learner.tokenizer.batch_decode(token_id_cl, skip_special_tokens=True)\nlabels_gold = [label.lower() for label in labels_gold]\nlabels_gold'

In [None]:
#learner.iteration_probabilities

In [None]:
## apply sampling strategy
# Select the next batch to annotate with the min-certainty strategy, passing
# N_SAMPLE_AL as the batch size argument. Breaking-ties sampling is the
# disabled alternative on the next line.
#learner.sample_breaking_ties(n_sample_al=N_SAMPLE_AL)
learner.min_certainty(n_sample_al=N_SAMPLE_AL)

# Print the indices chosen for this round, then learner.index_train_all
# (presumably the indices already moved into the training set -- confirm; it is
# an empty list in the recorded output of this cell).
print(learner.index_al_sample)
print(learner.index_train_all)
#print(learner.df_al_sample_per_iter)
#learner.df_corpus_al_sample[["label", "label_text", "text_prepared", "label_pred_probs"]]
# Last expression of the cell, so it is rendered as the cell output: the gold
# label text, the prepared input text and the stored label_pred_probs score.
learner.df_corpus_with_probs[["label_text", "text_prepared", "label_pred_probs"]]

[110338, 115255, 109753, 25590, 92070, 116031, 74413, 33347, 106726, 44641, 7395, 61484, 98840, 54008, 52914, 106766, 115693, 108636, 4293, 41338, 77268, 94066, 107074, 88115, 124518, 2605, 11404, 121762, 77405, 36703, 79216, 31824, 48858, 124103, 103988, 27028, 123698, 30877, 112480, 38350, 14077, 81851, 3833, 27407, 61765, 53534, 90338, 21054, 119018, 81066]
[]


Unnamed: 0_level_0,label_text,text_prepared,label_pred_probs
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
125137,2012,Context: It threatened the collapse of large f...,-0.000872
30275,Amnesia,Context: But house was also being developed on...,-0.001081
39176,unanswerable,Context: Although Calvin and Huldrych Zwingli ...,-0.000628
32129,varies from hot and subhumid tropical,Context: Due to extreme variation in elevation...,-0.006482
44136,unanswerable,Context: The Queen addressed the United Nation...,-0.018094
...,...,...,...
16512,cane,"Context: Greek kanon / Ancient Greek: κανών, A...",-0.011007
90967,Taxa,Context: Another defense that often uses color...,-0.002771
115121,antibiotic resistance,Context: Antibiotics revolutionized medicine i...,-0.014073
82065,unanswerable,Context: After the Dalai Lama's government fle...,-0.001445


In [None]:
# Disabled snippet (string literal, not executed): it would pick the 50
# smallest values of learner.iteration_probabilities via np.argpartition.
# NOTE(review): the 50 is hard-coded; presumably it should follow N_SAMPLE_AL
# -- confirm. np.argpartition does not sort the selected entries, and kth=50
# requires the array to hold more than 50 values.
"""probs_arr = np.array(learner.iteration_probabilities)

# Get indices of n smallest numbers
probs_min_indices = np.argpartition(probs_arr, 50)[:50]
# Extract the n smallest numbers from the array
probs_min = probs_arr[probs_min_indices]"""

  and should_run_async(code)


'probs_arr = np.array(learner.iteration_probabilities)\n\n# Get indices of n smallest numbers\nprobs_min_indices = np.argpartition(probs_arr, 50)[:50]\n# Extract the n smallest numbers from the array\nprobs_min = probs_arr[probs_min_indices]'

In [None]:
# Are the datasets on the wrong device / in the wrong data format?
# Disabled inspection snippet (string literal, not executed): it would show
# learner.dataset and the "input_ids" column of its "corpus" split.
"""learner.dataset
learner.dataset["corpus"]["input_ids"]"""

'learner.dataset\nlearner.dataset["corpus"]["input_ids"]'

## Active learning loop from existing annotations

In [None]:
# Print learner.model_params (presumably the keyword arguments used when the
# model was loaded -- confirm). The recorded output shows device_map and
# offload settings.
print(learner.model_params)

{'device_map': 'auto', 'offload_folder': 'offload', 'offload_state_dict': True}


In [None]:
# Disabled snippet (string literal, not executed): it would report, for every
# named parameter of learner.model, whether it is a CUDA half-precision tensor.
# NOTE(review): comparing param.type() with 'torch.cuda.HalfTensor' only
# matches fp16 parameters that live on a CUDA device; bfloat16 parameters and
# CPU/offloaded fp16 parameters fall into the "not in half-precision" branch.
# Checking param.dtype would be more general if this is re-enabled.
"""for name, param in learner.model.named_parameters():
    if param.type() == 'torch.cuda.HalfTensor':
        print(f"Parameter '{name}' is in half-precision.")
    else:
        print(f"Parameter '{name}' is not in half-precision.")"""

'for name, param in learner.model.named_parameters():\n    if param.type() == \'torch.cuda.HalfTensor\':\n        print(f"Parameter \'{name}\' is in half-precision.")\n    else:\n        print(f"Parameter \'{name}\' is not in half-precision.")'

In [None]:
# Active-learning driver loop. Each pass records label statistics for the
# current sample, folds that sample into the training data, retrains and
# evaluates, and finally draws the sample used by the following pass.
n_iter = 0
sample_label_distribution = []  # one value_counts Series per pass, named after the pass number

while n_iter < N_ITER_MAX:
    # Annotation hook: manual annotations (or oracle labels) for the sampled
    # rows would be produced here and ingested by update_dataset below.
    # TODO: needs to work both with and without existing gold labels.
    #label_annotation = learner.df_corpus_al_sample["label"]

    # Store the label balance of this sample so the impact of the sampling
    # strategy on class balance can be examined afterwards.
    sample_label_distribution_iter = (
        learner.df_corpus_al_sample["label_text"].value_counts().rename(n_iter)
    )
    sample_label_distribution.append(sample_label_distribution_iter)

    # Frequency table with predicted and gold label strings side by side.
    iteration_label_predicted = (
        pd.Series(learner.iteration_label_predicted).value_counts().rename("pred")
    )
    iteration_label_gold = (
        pd.Series(learner.iteration_label_gold).value_counts().rename("gold")
    )
    print(pd.concat([iteration_label_predicted, iteration_label_gold], axis=1))

    # Per the original notes: the first update adds dataset_train, and every
    # update refreshes both dataset_train and dataset_corpus.
    # ! note: test rows are multiplied by n_label, train rows are n_sample_al * 2
    learner.update_dataset()  #label_annotation=label_annotation

    # Training run followed by test/inference.
    learner.train_test_infer()

    # Report the metrics of every recorded iteration, leaving out the raw
    # label lists so the printout stays short.
    for key_iter, value_metrics_dic in learner.metrics.items():
        print(
            f"Aggregate metrics for {key_iter}: ",
            {
                key: value
                for key, value in value_metrics_dic.items()
                if key not in ["label_gold_raw", "label_predicted_raw"]
            },
        )

    # Draw a fresh sample (new index_al_sample) before the next dataset update.
    # Alternative strategy: learner.sample_breaking_ties(n_sample_al=N_SAMPLE_AL)
    learner.min_certainty(n_sample_al=N_SAMPLE_AL)

    print(f"\n\n    Iteration {n_iter} finished.\n\n")
    n_iter += 1






                                pred   gold
unanswerable                     8.0  654.0
2009                             4.0    1.0
2007                             4.0    2.0
two                              4.0    4.0
Paris                            4.0    4.0
...                              ...    ...
Lúcio Costa and Oscar Niemeyer   NaN    1.0
40.4                             NaN    1.0
$25 million                      NaN    1.0
147,405                          NaN    1.0
antibiotic resistance            NaN    1.0

[2626 rows x 2 columns]
Examples in previous corpus iteration:  2000


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Number of new training data:  50


Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Examples in new corpus data without newly sampled training data:  1950


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Device: cuda


Step,Training Loss
7,30.3961
14,16.9556
21,7.6393
28,5.2555
35,4.0888


Inference: 100%|██████████| 41/41 [05:11<00:00,  7.59s/it]


  Aggregate metrics:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}
Aggregate metrics for iter_0:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}
Aggregate metrics for iter_1:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}


    Iteration 0 finished.


                                                     pred   gold
unanswerable                                        824.0  637.0
unanswerable specimen specimen specimen specime...   42.0    NaN
unanswerable. Unanswerable. Unanswerable. Unans...   10.0    NaN
unanswerable; unanswerable; unanswerable; unans...    3.0    NaN
fibreglass fibreglass fibreglass fibreglass fib...    3.0    NaN
...              

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Number of new training data:  100


Filter:   0%|          | 0/1950 [00:00<?, ? examples/s]

Examples in new corpus data without newly sampled training data:  1900


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Device: cuda


Step,Training Loss
13,26.248
26,10.2168
39,3.657
52,1.9758
65,1.2465


Inference: 100%|██████████| 40/40 [04:56<00:00,  7.42s/it]


  Aggregate metrics:  {'f1_macro': 0.091, 'f1_micro': 0.236, 'accuracy_balanced': 0.171, 'precision_macro': 0.093, 'recall_macro': 0.091, 'precision_micro': 0.236, 'recall_micro': 0.236}
Aggregate metrics for iter_0:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}
Aggregate metrics for iter_1:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}
Aggregate metrics for iter_2:  {'f1_macro': 0.091, 'f1_micro': 0.236, 'accuracy_balanced': 0.171, 'precision_macro': 0.093, 'recall_macro': 0.091, 'precision_micro': 0.236, 'recall_micro': 0.236}


    Iteration 1 finished.


                                  pred   gold
unanswerable                     581.0  625.0
Paris                              2.0    4.0
capital                            2.0    1.0
tuberculosis       

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Number of new training data:  150


Filter:   0%|          | 0/1900 [00:00<?, ? examples/s]

Examples in new corpus data without newly sampled training data:  1850


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Device: cuda


Step,Training Loss
19,24.2592
38,6.3904
57,1.5966
76,0.382
95,0.2074


Inference: 100%|██████████| 39/39 [04:50<00:00,  7.45s/it]


  Aggregate metrics:  {'f1_macro': 0.114, 'f1_micro': 0.344, 'accuracy_balanced': 0.16, 'precision_macro': 0.116, 'recall_macro': 0.114, 'precision_micro': 0.344, 'recall_micro': 0.344}
Aggregate metrics for iter_0:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}
Aggregate metrics for iter_1:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}
Aggregate metrics for iter_2:  {'f1_macro': 0.091, 'f1_micro': 0.236, 'accuracy_balanced': 0.171, 'precision_macro': 0.093, 'recall_macro': 0.091, 'precision_micro': 0.236, 'recall_micro': 0.236}
Aggregate metrics for iter_3:  {'f1_macro': 0.114, 'f1_micro': 0.344, 'accuracy_balanced': 0.16, 'precision_macro': 0.116, 'recall_macro': 0.114, 'precision_micro': 0.344, 'recall_micro': 0.344}


    Iteration 2 finished.


         

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Number of new training data:  200


Filter:   0%|          | 0/1850 [00:00<?, ? examples/s]

Examples in new corpus data without newly sampled training data:  1800


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Device: cuda


Step,Training Loss
25,22.4572
50,4.0813
75,0.5124
100,0.1729
125,0.0887


Inference: 100%|██████████| 38/38 [04:35<00:00,  7.25s/it]


  Aggregate metrics:  {'f1_macro': 0.152, 'f1_micro': 0.385, 'accuracy_balanced': 0.199, 'precision_macro': 0.154, 'recall_macro': 0.151, 'precision_micro': 0.385, 'recall_micro': 0.385}
Aggregate metrics for iter_0:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}
Aggregate metrics for iter_1:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}
Aggregate metrics for iter_2:  {'f1_macro': 0.091, 'f1_micro': 0.236, 'accuracy_balanced': 0.171, 'precision_macro': 0.093, 'recall_macro': 0.091, 'precision_micro': 0.236, 'recall_micro': 0.236}
Aggregate metrics for iter_3:  {'f1_macro': 0.114, 'f1_micro': 0.344, 'accuracy_balanced': 0.16, 'precision_macro': 0.116, 'recall_macro': 0.114, 'precision_micro': 0.344, 'recall_micro': 0.344}
Aggregate metrics for iter_4:  {'f1_ma

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Number of new training data:  250


Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Examples in new corpus data without newly sampled training data:  1750


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Device: cuda


Step,Training Loss
32,20.674
64,2.7822
96,0.2442
128,0.1216
160,0.0742


Inference: 100%|██████████| 37/37 [04:27<00:00,  7.24s/it]


  Aggregate metrics:  {'f1_macro': 0.18, 'f1_micro': 0.376, 'accuracy_balanced': 0.261, 'precision_macro': 0.182, 'recall_macro': 0.18, 'precision_micro': 0.376, 'recall_micro': 0.376}
Aggregate metrics for iter_0:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}
Aggregate metrics for iter_1:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}
Aggregate metrics for iter_2:  {'f1_macro': 0.091, 'f1_micro': 0.236, 'accuracy_balanced': 0.171, 'precision_macro': 0.093, 'recall_macro': 0.091, 'precision_micro': 0.236, 'recall_micro': 0.236}
Aggregate metrics for iter_3:  {'f1_macro': 0.114, 'f1_micro': 0.344, 'accuracy_balanced': 0.16, 'precision_macro': 0.116, 'recall_macro': 0.114, 'precision_micro': 0.344, 'recall_micro': 0.344}
Aggregate metrics for iter_4:  {'f1_macr

In [None]:
## squad_v2 metrics
"""
# flan-t5-small
MAX_SAMPLE = 2_000
N_SAMPLE_AL = 50
N_ITER_MAX = 5
# "NA" instead of "unanswerable"
Aggregate metrics for iter_0:  {'f1_macro': 0.252, 'f1_micro': 0.338, 'accuracy_balanced': 0.497, 'precision_macro': 0.253, 'recall_macro': 0.254, 'precision_micro': 0.338, 'recall_micro': 0.338}
Aggregate metrics for iter_1:  {'f1_macro': 0.003, 'f1_micro': 0.005, 'accuracy_balanced': 0.007, 'precision_macro': 0.003, 'recall_macro': 0.003, 'precision_micro': 0.005, 'recall_micro': 0.005}
Aggregate metrics for iter_2:  {'f1_macro': 0.051, 'f1_micro': 0.085, 'accuracy_balanced': 0.122, 'precision_macro': 0.052, 'recall_macro': 0.051, 'precision_micro': 0.085, 'recall_micro': 0.085}
Aggregate metrics for iter_3:  {'f1_macro': 0.117, 'f1_micro': 0.225, 'accuracy_balanced': 0.238, 'precision_macro': 0.118, 'recall_macro': 0.117, 'precision_micro': 0.225, 'recall_micro': 0.225}
Aggregate metrics for iter_4:  {'f1_macro': 0.12, 'f1_micro': 0.278, 'accuracy_balanced': 0.211, 'precision_macro': 0.121, 'recall_macro': 0.12, 'precision_micro': 0.278, 'recall_micro': 0.278}
# "unanswerable" and self.model = trainer.model
Aggregate metrics for iter_0:  {'f1_macro': 0.252, 'f1_micro': 0.351, 'accuracy_balanced': 0.497, 'precision_macro': 0.253, 'recall_macro': 0.254, 'precision_micro': 0.351, 'recall_micro': 0.351}
Aggregate metrics for iter_1:  {'f1_macro': 0.013, 'f1_micro': 0.156, 'accuracy_balanced': 0.025, 'precision_macro': 0.013, 'recall_macro': 0.013, 'precision_micro': 0.156, 'recall_micro': 0.156}
Aggregate metrics for iter_2:  {'f1_macro': 0.066, 'f1_micro': 0.274, 'accuracy_balanced': 0.103, 'precision_macro': 0.066, 'recall_macro': 0.066, 'precision_micro': 0.274, 'recall_micro': 0.274}
Aggregate metrics for iter_3:  {'f1_macro': 0.117, 'f1_micro': 0.332, 'accuracy_balanced': 0.169, 'precision_macro': 0.119, 'recall_macro': 0.117, 'precision_micro': 0.332, 'recall_micro': 0.332}
Aggregate metrics for iter_4:  {'f1_macro': 0.134, 'f1_micro': 0.341, 'accuracy_balanced': 0.199, 'precision_macro': 0.136, 'recall_macro': 0.134, 'precision_micro': 0.341, 'recall_micro': 0.341}
Aggregate metrics for iter_5:  {'f1_macro': 0.152, 'f1_micro': 0.37, 'accuracy_balanced': 0.215, 'precision_macro': 0.152, 'recall_macro': 0.152, 'precision_micro': 0.37, 'recall_micro': 0.37}
# same as above, only refined prompt with "extracted answer"
Aggregate metrics for iter_0:  {'f1_macro': 0.225, 'f1_micro': 0.315, 'accuracy_balanced': 0.457, 'precision_macro': 0.226, 'recall_macro': 0.227, 'precision_micro': 0.315, 'recall_micro': 0.315}
Aggregate metrics for iter_1:  {'f1_macro': 0.0, 'f1_micro': 0.144, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.144, 'recall_micro': 0.144}
Aggregate metrics for iter_2:  {'f1_macro': 0.091, 'f1_micro': 0.236, 'accuracy_balanced': 0.171, 'precision_macro': 0.093, 'recall_macro': 0.091, 'precision_micro': 0.236, 'recall_micro': 0.236}
Aggregate metrics for iter_3:  {'f1_macro': 0.114, 'f1_micro': 0.344, 'accuracy_balanced': 0.16, 'precision_macro': 0.116, 'recall_macro': 0.114, 'precision_micro': 0.344, 'recall_micro': 0.344}
Aggregate metrics for iter_4:  {'f1_macro': 0.152, 'f1_micro': 0.385, 'accuracy_balanced': 0.199, 'precision_macro': 0.154, 'recall_macro': 0.151, 'precision_micro': 0.385, 'recall_micro': 0.385}
Aggregate metrics for iter_5:  {'f1_macro': 0.18, 'f1_micro': 0.376, 'accuracy_balanced': 0.261, 'precision_macro': 0.182, 'recall_macro': 0.18, 'precision_micro': 0.376, 'recall_micro': 0.376}


"""

  and should_run_async(code)


'\n# flan-t5-small\nMAX_SAMPLE = 2_000\nN_SAMPLE_AL = 50\nN_ITER_MAX = 5\n# "NA" instead of "unanswerable"\nAggregate metrics for iter_0:  {\'f1_macro\': 0.252, \'f1_micro\': 0.338, \'accuracy_balanced\': 0.497, \'precision_macro\': 0.253, \'recall_macro\': 0.254, \'precision_micro\': 0.338, \'recall_micro\': 0.338}\nAggregate metrics for iter_1:  {\'f1_macro\': 0.003, \'f1_micro\': 0.005, \'accuracy_balanced\': 0.007, \'precision_macro\': 0.003, \'recall_macro\': 0.003, \'precision_micro\': 0.005, \'recall_micro\': 0.005}\nAggregate metrics for iter_2:  {\'f1_macro\': 0.051, \'f1_micro\': 0.085, \'accuracy_balanced\': 0.122, \'precision_macro\': 0.052, \'recall_macro\': 0.051, \'precision_micro\': 0.085, \'recall_micro\': 0.085}\nAggregate metrics for iter_3:  {\'f1_macro\': 0.117, \'f1_micro\': 0.225, \'accuracy_balanced\': 0.238, \'precision_macro\': 0.118, \'recall_macro\': 0.117, \'precision_micro\': 0.225, \'recall_micro\': 0.225}\nAggregate metrics for iter_4:  {\'f1_macro\': 0.

In [None]:
# generative models test
""" 
### pimpo metrics
# flan-t5-small, 50 per iter, forgot full dataset
    Aggregate metrics:  {'f1_macro': 0.19, 'f1_micro': 0.619, 'accuracy_balanced': 0.301, 'precision_macro': 0.189, 'recall_macro': 0.192, 'precision_micro': 0.619, 'recall_micro': 0.619}
Aggregate metrics for iter_0:  {'f1_macro': 0.0, 'f1_micro': 0.0, 'accuracy_balanced': 0.0, 'precision_macro': 0.0, 'recall_macro': 0.0, 'precision_micro': 0.0, 'recall_micro': 0.0}
Aggregate metrics for iter_1:  {'f1_macro': 0.018, 'f1_micro': 0.189, 'accuracy_balanced': 0.147, 'precision_macro': 0.027, 'recall_macro': 0.027, 'precision_micro': 0.189, 'recall_micro': 0.189}
Aggregate metrics for iter_2:  {'f1_macro': 0.062, 'f1_micro': 0.396, 'accuracy_balanced': 0.184, 'precision_macro': 0.065, 'recall_macro': 0.061, 'precision_micro': 0.396, 'recall_micro': 0.396}
Aggregate metrics for iter_3:  {'f1_macro': 0.106, 'f1_micro': 0.45, 'accuracy_balanced': 0.212, 'precision_macro': 0.129, 'recall_macro': 0.106, 'precision_micro': 0.45, 'recall_micro': 0.45}
Aggregate metrics for iter_4:  {'f1_macro': 0.069, 'f1_micro': 0.459, 'accuracy_balanced': 0.228, 'precision_macro': 0.069, 'recall_macro': 0.073, 'precision_micro': 0.459, 'recall_micro': 0.459}
Aggregate metrics for iter_5:  {'f1_macro': 0.123, 'f1_micro': 0.529, 'accuracy_balanced': 0.249, 'precision_macro': 0.123, 'recall_macro': 0.125, 'precision_micro': 0.529, 'recall_micro': 0.529}
Aggregate metrics for iter_6:  {'f1_macro': 0.209, 'f1_micro': 0.534, 'accuracy_balanced': 0.268, 'precision_macro': 0.228, 'recall_macro': 0.208, 'precision_micro': 0.534, 'recall_micro': 0.534}
Aggregate metrics for iter_7:  {'f1_macro': 0.209, 'f1_micro': 0.593, 'accuracy_balanced': 0.301, 'precision_macro': 0.22, 'recall_macro': 0.211, 'precision_micro': 0.593, 'recall_micro': 0.593}
Aggregate metrics for iter_8:  {'f1_macro': 0.136, 'f1_micro': 0.584, 'accuracy_balanced': 0.283, 'precision_macro': 0.133, 'recall_macro': 0.141, 'precision_micro': 0.584, 'recall_micro': 0.584}
Aggregate metrics for iter_9:  {'f1_macro': 0.183, 'f1_micro': 0.61, 'accuracy_balanced': 0.291, 'precision_macro': 0.183, 'recall_macro': 0.185, 'precision_micro': 0.61, 'recall_micro': 0.61}
Aggregate metrics for iter_10:  {'f1_macro': 0.19, 'f1_micro': 0.619, 'accuracy_balanced': 0.301, 'precision_macro': 0.189, 'recall_macro': 0.192, 'precision_micro': 0.619, 'recall_micro': 0.619}

# flan-T5-base
MAX_SAMPLE_MAJORITY = 10_000
MAX_SAMPLE = 5_000
N_SAMPLE_AL = 50
N_ITER_MAX = 10
DATE = 20230207
TASK = "pimpo"
Aggregate metrics for iter_0:  {'f1_macro': 0.034, 'f1_micro': 0.091, 'accuracy_balanced': 0.187, 'precision_macro': 0.113, 'recall_macro': 0.082, 'precision_micro': 0.091, 'recall_micro': 0.091}
Aggregate metrics for iter_1:  {'f1_macro': 0.073, 'f1_micro': 0.199, 'accuracy_balanced': 0.156, 'precision_macro': 0.111, 'recall_macro': 0.109, 'precision_micro': 0.199, 'recall_micro': 0.199}
Aggregate metrics for iter_2:  {'f1_macro': 0.131, 'f1_micro': 0.398, 'accuracy_balanced': 0.174, 'precision_macro': 0.144, 'recall_macro': 0.136, 'precision_micro': 0.398, 'recall_micro': 0.398}
Aggregate metrics for iter_3:  {'f1_macro': 0.095, 'f1_micro': 0.481, 'accuracy_balanced': 0.256, 'precision_macro': 0.125, 'recall_macro': 0.128, 'precision_micro': 0.481, 'recall_micro': 0.481}
Aggregate metrics for iter_4:  {'f1_macro': 0.175, 'f1_micro': 0.685, 'accuracy_balanced': 0.302, 'precision_macro': 0.199, 'recall_macro': 0.192, 'precision_micro': 0.685, 'recall_micro': 0.685}
Aggregate metrics for iter_5:  {'f1_macro': 0.314, 'f1_micro': 0.652, 'accuracy_balanced': 0.354, 'precision_macro': 0.334, 'recall_macro': 0.354, 'precision_micro': 0.652, 'recall_micro': 0.652}
Aggregate metrics for iter_6:  {'f1_macro': 0.315, 'f1_micro': 0.699, 'accuracy_balanced': 0.374, 'precision_macro': 0.31, 'recall_macro': 0.328, 'precision_micro': 0.699, 'recall_micro': 0.699}
Aggregate metrics for iter_7:  {'f1_macro': 0.296, 'f1_micro': 0.711, 'accuracy_balanced': 0.395, 'precision_macro': 0.324, 'recall_macro': 0.307, 'precision_micro': 0.711, 'recall_micro': 0.711}
Aggregate metrics for iter_8:  {'f1_macro': 0.352, 'f1_micro': 0.722, 'accuracy_balanced': 0.427, 'precision_macro': 0.338, 'recall_macro': 0.374, 'precision_micro': 0.722, 'recall_micro': 0.722}
Aggregate metrics for iter_9:  {'f1_macro': 0.383, 'f1_micro': 0.728, 'accuracy_balanced': 0.398, 'precision_macro': 0.377, 'recall_macro': 0.398, 'precision_micro': 0.728, 'recall_micro': 0.728}
Aggregate metrics for iter_10:  {'f1_macro': 0.272, 'f1_micro': 0.711, 'accuracy_balanced': 0.41, 'precision_macro': 0.267, 'recall_macro': 0.287, 'precision_micro': 0.711, 'recall_micro': 0.711}

# same as above, only flan-T5-small and detailed instructions
Aggregate metrics for iter_0:  {'f1_macro': 0.099, 'f1_micro': 0.527, 'accuracy_balanced': 0.143, 'precision_macro': 0.075, 'recall_macro': 0.143, 'precision_micro': 0.527, 'recall_micro': 0.527}
Aggregate metrics for iter_1:  {'f1_macro': 0.056, 'f1_micro': 0.337, 'accuracy_balanced': 0.139, 'precision_macro': 0.059, 'recall_macro': 0.065, 'precision_micro': 0.337, 'recall_micro': 0.337}
Aggregate metrics for iter_2:  {'f1_macro': 0.087, 'f1_micro': 0.367, 'accuracy_balanced': 0.144, 'precision_macro': 0.09, 'recall_macro': 0.101, 'precision_micro': 0.367, 'recall_micro': 0.367}
Aggregate metrics for iter_3:  {'f1_macro': 0.12, 'f1_micro': 0.431, 'accuracy_balanced': 0.139, 'precision_macro': 0.123, 'recall_macro': 0.139, 'precision_micro': 0.431, 'recall_micro': 0.431}
Aggregate metrics for iter_4:  {'f1_macro': 0.096, 'f1_micro': 0.505, 'accuracy_balanced': 0.143, 'precision_macro': 0.12, 'recall_macro': 0.125, 'precision_micro': 0.505, 'recall_micro': 0.505}
Aggregate metrics for iter_5:  {'f1_macro': 0.134, 'f1_micro': 0.436, 'accuracy_balanced': 0.149, 'precision_macro': 0.142, 'recall_macro': 0.149, 'precision_micro': 0.436, 'recall_micro': 0.436}
Aggregate metrics for iter_6:  {'f1_macro': 0.119, 'f1_micro': 0.455, 'accuracy_balanced': 0.142, 'precision_macro': 0.131, 'recall_macro': 0.142, 'precision_micro': 0.455, 'recall_micro': 0.455}
Aggregate metrics for iter_7:  {'f1_macro': 0.111, 'f1_micro': 0.512, 'accuracy_balanced': 0.145, 'precision_macro': 0.139, 'recall_macro': 0.145, 'precision_micro': 0.512, 'recall_micro': 0.512}

# same as above, only flan-T5-base
Aggregate metrics for iter_0:  {'f1_macro': 0.008, 'f1_micro': 0.03, 'accuracy_balanced': 0.143, 'precision_macro': 0.004, 'recall_macro': 0.143, 'precision_micro': 0.03, 'recall_micro': 0.03}
Aggregate metrics for iter_1:  {'f1_macro': 0.062, 'f1_micro': 0.272, 'accuracy_balanced': 0.134, 'precision_macro': 0.071, 'recall_macro': 0.078, 'precision_micro': 0.272, 'recall_micro': 0.272}
Aggregate metrics for iter_2:  {'f1_macro': 0.1, 'f1_micro': 0.298, 'accuracy_balanced': 0.139, 'precision_macro': 0.106, 'recall_macro': 0.108, 'precision_micro': 0.298, 'recall_micro': 0.298}
Aggregate metrics for iter_3:  {'f1_macro': 0.11, 'f1_micro': 0.455, 'accuracy_balanced': 0.144, 'precision_macro': 0.091, 'recall_macro': 0.144, 'precision_micro': 0.455, 'recall_micro': 0.455}



"""


' \n### pimpo metrics\n# flan-t5-small, 50 per iter, forgot full dataset\n    Aggregate metrics:  {\'f1_macro\': 0.19, \'f1_micro\': 0.619, \'accuracy_balanced\': 0.301, \'precision_macro\': 0.189, \'recall_macro\': 0.192, \'precision_micro\': 0.619, \'recall_micro\': 0.619}\nAggregate metrics for iter_0:  {\'f1_macro\': 0.0, \'f1_micro\': 0.0, \'accuracy_balanced\': 0.0, \'precision_macro\': 0.0, \'recall_macro\': 0.0, \'precision_micro\': 0.0, \'recall_micro\': 0.0}\nAggregate metrics for iter_1:  {\'f1_macro\': 0.018, \'f1_micro\': 0.189, \'accuracy_balanced\': 0.147, \'precision_macro\': 0.027, \'recall_macro\': 0.027, \'precision_micro\': 0.189, \'recall_micro\': 0.189}\nAggregate metrics for iter_2:  {\'f1_macro\': 0.062, \'f1_micro\': 0.396, \'accuracy_balanced\': 0.184, \'precision_macro\': 0.065, \'recall_macro\': 0.061, \'precision_micro\': 0.396, \'recall_micro\': 0.396}\nAggregate metrics for iter_3:  {\'f1_macro\': 0.106, \'f1_micro\': 0.45, \'accuracy_balanced\': 0.212, \

In [None]:
"""
# deberta-v3-xsmall, pimpo 7-class,  20 samp/iter
Aggregate metrics for iter_0:  {'eval_loss': 0.5628172755241394, 'eval_f1_macro': 0.10033008782443473, 'eval_f1_micro': 0.18049525101763908, 'eval_accuracy_balanced': 0.20495167826276692, 'eval_accuracy_not_b': 0.18049525101763908, 'eval_precision_macro': 0.20943928165314035, 'eval_recall_macro': 0.20495167826276692, 'eval_precision_micro': 0.18049525101763908, 'eval_recall_micro': 0.18049525101763908, 'eval_runtime': 342.0504, 'eval_samples_per_second': 1206.606, 'eval_steps_per_second': 1.006}
Aggregate metrics for iter_1:  {'eval_loss': 0.6045360565185547, 'eval_f1_macro': 0.15260009714764486, 'eval_f1_micro': 0.8471666101119782, 'eval_accuracy_balanced': 0.16439810608981156, 'eval_accuracy_not_b': 0.8471666101119782, 'eval_precision_macro': 0.33555706572966665, 'eval_recall_macro': 0.16439810608981156, 'eval_precision_micro': 0.8471666101119782, 'eval_recall_micro': 0.8471666101119782, 'eval_runtime': 345.7595, 'eval_samples_per_second': 1193.257, 'eval_steps_per_second': 0.995, 'epoch': 7.0}
Aggregate metrics for iter_2:  {'eval_loss': 0.4772863984107971, 'eval_f1_macro': 0.26077623269021505, 'eval_f1_micro': 0.8516972165648337, 'eval_accuracy_balanced': 0.29263319995117204, 'eval_accuracy_not_b': 0.8516972165648337, 'eval_precision_macro': 0.36578347167342795, 'eval_recall_macro': 0.29263319995117204, 'eval_precision_micro': 0.8516972165648337, 'eval_recall_micro': 0.8516972165648337, 'eval_runtime': 345.1595, 'eval_samples_per_second': 1194.926, 'eval_steps_per_second': 0.997, 'epoch': 7.0}
Aggregate metrics for iter_3:  {'eval_loss': 0.47742778062820435, 'eval_f1_macro': 0.26043270223998805, 'eval_f1_micro': 0.8612733446519525, 'eval_accuracy_balanced': 0.28025374662633135, 'eval_accuracy_not_b': 0.8612733446519525, 'eval_precision_macro': 0.36634156674514073, 'eval_recall_macro': 0.28025374662633135, 'eval_precision_micro': 0.8612733446519525, 'eval_recall_micro': 0.8612733446519525, 'eval_runtime': 347.2688, 'eval_samples_per_second': 1187.265, 'eval_steps_per_second': 0.991, 'epoch': 7.0}
Aggregate metrics for iter_4:  {'eval_loss': 0.4174456000328064, 'eval_f1_macro': 0.31376364102688165, 'eval_f1_micro': 0.8698539402173914, 'eval_accuracy_balanced': 0.31283989290145314, 'eval_accuracy_not_b': 0.8698539402173913, 'eval_precision_macro': 0.384340706788142, 'eval_recall_macro': 0.31283989290145314, 'eval_precision_micro': 0.8698539402173913, 'eval_recall_micro': 0.8698539402173913, 'eval_runtime': 346.3045, 'eval_samples_per_second': 1190.166, 'eval_steps_per_second': 0.993, 'epoch': 7.0}
Aggregate metrics for iter_5:  {'eval_loss': 0.4523952007293701, 'eval_f1_macro': 0.28008776426269627, 'eval_f1_micro': 0.8661909616038056, 'eval_accuracy_balanced': 0.3104114533290034, 'eval_accuracy_not_b': 0.8661909616038056, 'eval_precision_macro': 0.4061062016477311, 'eval_recall_macro': 0.3104114533290034, 'eval_precision_micro': 0.8661909616038056, 'eval_recall_micro': 0.8661909616038056, 'eval_runtime': 347.1449, 'eval_samples_per_second': 1186.882, 'eval_steps_per_second': 0.991, 'epoch': 7.0}
Aggregate metrics for iter_6:  {'eval_loss': 0.5697242021560669, 'eval_f1_macro': 0.2961691315816643, 'eval_f1_micro': 0.865210740992522, 'eval_accuracy_balanced': 0.30603519038165455, 'eval_accuracy_not_b': 0.865210740992522, 'eval_precision_macro': 0.39979096609180625, 'eval_recall_macro': 0.30603519038165455, 'eval_precision_micro': 0.865210740992522, 'eval_recall_micro': 0.865210740992522, 'eval_runtime': 346.7695, 'eval_samples_per_second': 1187.763, 'eval_steps_per_second': 0.992, 'epoch': 7.0}
Aggregate metrics for iter_7:  {'eval_loss': 0.32757243514060974, 'eval_f1_macro': 0.31114812126500613, 'eval_f1_micro': 0.8739034342060523, 'eval_accuracy_balanced': 0.30836998783057973, 'eval_accuracy_not_b': 0.8739034342060523, 'eval_precision_macro': 0.4103498778685187, 'eval_recall_macro': 0.30836998783057973, 'eval_precision_micro': 0.8739034342060523, 'eval_recall_micro': 0.8739034342060523, 'eval_runtime': 344.2441, 'eval_samples_per_second': 1196.07, 'eval_steps_per_second': 0.999, 'epoch': 7.0}
Aggregate metrics for iter_8:  {'eval_loss': 0.5603343844413757, 'eval_f1_macro': 0.31475080038285846, 'eval_f1_micro': 0.8693197278911564, 'eval_accuracy_balanced': 0.3187929055460235, 'eval_accuracy_not_b': 0.8693197278911564, 'eval_precision_macro': 0.3898263536409363, 'eval_recall_macro': 0.3187929055460235, 'eval_precision_micro': 0.8693197278911564, 'eval_recall_micro': 0.8693197278911564, 'eval_runtime': 345.6567, 'eval_samples_per_second': 1190.777, 'eval_steps_per_second': 0.992, 'epoch': 7.0}
Aggregate metrics for iter_9:  {'eval_loss': 0.47514623403549194, 'eval_f1_macro': 0.30322578859648897, 'eval_f1_micro': 0.8719462402177611, 'eval_accuracy_balanced': 0.31412035033954816, 'eval_accuracy_not_b': 0.8719462402177611, 'eval_precision_macro': 0.39481936275701085, 'eval_recall_macro': 0.31412035033954816, 'eval_precision_micro': 0.8719462402177611, 'eval_recall_micro': 0.8719462402177611, 'eval_runtime': 344.5713, 'eval_samples_per_second': 1194.121, 'eval_steps_per_second': 0.995, 'epoch': 7.0}

# deberta-v3-base, pimpo 7-class,  50 samp/iter
Aggregate metrics for iter_0:  {'eval_loss': 1.0465834140777588, 'eval_f1_macro': 0.18708357519323013, 'eval_f1_micro': 0.44886363636363635, 'eval_accuracy_balanced': 0.2837292310996416, 'eval_accuracy_not_b': 0.44886363636363635, 'eval_precision_macro': 0.20706836504511425, 'eval_recall_macro': 0.2837292310996416, 'eval_precision_micro': 0.44886363636363635, 'eval_recall_micro': 0.44886363636363635, 'eval_runtime': 587.6281, 'eval_samples_per_second': 702.349, 'eval_steps_per_second': 1.098}
Aggregate metrics for iter_1:  {'eval_loss': 0.7379368543624878, 'eval_f1_macro': 0.3540535852129493, 'eval_f1_micro': 0.8555932778815142, 'eval_accuracy_balanced': 0.3962828367929724, 'eval_accuracy_not_b': 0.8555932778815142, 'eval_precision_macro': 0.36594427731868545, 'eval_recall_macro': 0.3962828367929724, 'eval_precision_micro': 0.8555932778815142, 'eval_recall_micro': 0.8555932778815142, 'eval_runtime': 592.2202, 'eval_samples_per_second': 696.312, 'eval_steps_per_second': 1.089, 'epoch': 7.0}
Aggregate metrics for iter_2:  {'eval_loss': 0.23365318775177002, 'eval_f1_macro': 0.3853095478008094, 'eval_f1_micro': 0.882330954808019, 'eval_accuracy_balanced': 0.39244891225949236, 'eval_accuracy_not_b': 0.882330954808019, 'eval_precision_macro': 0.4130412842144277, 'eval_recall_macro': 0.39244891225949236, 'eval_precision_micro': 0.882330954808019, 'eval_recall_micro': 0.882330954808019, 'eval_runtime': 590.1724, 'eval_samples_per_second': 698.135, 'eval_steps_per_second': 1.091, 'epoch': 7.0}
Aggregate metrics for iter_3:  {'eval_loss': 0.31954094767570496, 'eval_f1_macro': 0.3910498559135234, 'eval_f1_micro': 0.8786260839993199, 'eval_accuracy_balanced': 0.38795022533818585, 'eval_accuracy_not_b': 0.8786260839993199, 'eval_precision_macro': 0.45431584606544356, 'eval_recall_macro': 0.38795022533818585, 'eval_precision_micro': 0.8786260839993199, 'eval_recall_micro': 0.8786260839993199, 'eval_runtime': 592.409, 'eval_samples_per_second': 694.908, 'eval_steps_per_second': 1.087, 'epoch': 7.0}
Aggregate metrics for iter_4:  {'eval_loss': 0.14558205008506775, 'eval_f1_macro': 0.42822905247342585, 'eval_f1_micro': 0.8938053097345132, 'eval_accuracy_balanced': 0.4294680989704006, 'eval_accuracy_not_b': 0.8938053097345132, 'eval_precision_macro': 0.4412513971659594, 'eval_recall_macro': 0.4294680989704006, 'eval_precision_micro': 0.8938053097345132, 'eval_recall_micro': 0.8938053097345132, 'eval_runtime': 589.325, 'eval_samples_per_second': 697.951, 'eval_steps_per_second': 1.091, 'epoch': 7.0}
Aggregate metrics for iter_5:  {'eval_loss': 0.2067262977361679, 'eval_f1_macro': 0.4034053528084353, 'eval_f1_micro': 0.8973769374893544, 'eval_accuracy_balanced': 0.4072557747705853, 'eval_accuracy_not_b': 0.8973769374893544, 'eval_precision_macro': 0.44854908851359454, 'eval_recall_macro': 0.4072557747705853, 'eval_precision_micro': 0.8973769374893544, 'eval_recall_micro': 0.8973769374893544, 'eval_runtime': 591.3407, 'eval_samples_per_second': 694.98, 'eval_steps_per_second': 1.087, 'epoch': 7.0}
Aggregate metrics for iter_6:  {'eval_loss': 0.2272566556930542, 'eval_f1_macro': 0.4472755733571036, 'eval_f1_micro': 0.904398227071258, 'eval_accuracy_balanced': 0.43738736733482897, 'eval_accuracy_not_b': 0.904398227071258, 'eval_precision_macro': 0.48463336923636635, 'eval_recall_macro': 0.43738736733482897, 'eval_precision_micro': 0.904398227071258, 'eval_recall_micro': 0.904398227071258, 'eval_runtime': 588.6974, 'eval_samples_per_second': 697.506, 'eval_steps_per_second': 1.091, 'epoch': 7.0}
Aggregate metrics for iter_7:  {'eval_loss': 0.290232390165329, 'eval_f1_macro': 0.42661537998829313, 'eval_f1_micro': 0.8987203548882443, 'eval_accuracy_balanced': 0.42904588924320325, 'eval_accuracy_not_b': 0.8987203548882443, 'eval_precision_macro': 0.4540518816208738, 'eval_recall_macro': 0.42904588924320325, 'eval_precision_micro': 0.8987203548882443, 'eval_recall_micro': 0.8987203548882443, 'eval_runtime': 589.9751, 'eval_samples_per_second': 695.402, 'eval_steps_per_second': 1.088, 'epoch': 7.0}
Aggregate metrics for iter_8:  {'eval_loss': 0.32584547996520996, 'eval_f1_macro': 0.4615640923438204, 'eval_f1_micro': 0.9022711748633878, 'eval_accuracy_balanced': 0.4472825467518498, 'eval_accuracy_not_b': 0.9022711748633879, 'eval_precision_macro': 0.48341487497111035, 'eval_recall_macro': 0.4472825467518498, 'eval_precision_micro': 0.9022711748633879, 'eval_recall_micro': 0.9022711748633879, 'eval_runtime': 588.7656, 'eval_samples_per_second': 696.236, 'eval_steps_per_second': 1.089, 'epoch': 7.0}
Aggregate metrics for iter_9:  {'eval_loss': 0.21070201694965363, 'eval_f1_macro': 0.43570455533758573, 'eval_f1_micro': 0.9026320287130406, 'eval_accuracy_balanced': 0.4140247304519257, 'eval_accuracy_not_b': 0.9026320287130405, 'eval_precision_macro': 0.4980765590865742, 'eval_recall_macro': 0.4140247304519257, 'eval_precision_micro': 0.9026320287130405, 'eval_recall_micro': 0.9026320287130405, 'eval_runtime': 589.1166, 'eval_samples_per_second': 695.227, 'eval_steps_per_second': 1.086, 'epoch': 7.0}
Aggregate metrics for iter_10:  {'eval_loss': 0.15217804908752441, 'eval_f1_macro': 0.45882320773920243, 'eval_f1_micro': 0.9075265138556278, 'eval_accuracy_balanced': 0.45064523080760244, 'eval_accuracy_not_b': 0.9075265138556278, 'eval_precision_macro': 0.4860711524841334, 'eval_recall_macro': 0.45064523080760244, 'eval_precision_micro': 0.9075265138556278, 'eval_recall_micro': 0.9075265138556278, 'eval_runtime': 588.5002, 'eval_samples_per_second': 695.361, 'eval_steps_per_second': 1.088, 'epoch': 7.0}

"""

"\n# deberta-v3-xsmall, pimpo 7-class,  20 samp/iter\nAggregate metrics for iter_0:  {'eval_loss': 0.5628172755241394, 'eval_f1_macro': 0.10033008782443473, 'eval_f1_micro': 0.18049525101763908, 'eval_accuracy_balanced': 0.20495167826276692, 'eval_accuracy_not_b': 0.18049525101763908, 'eval_precision_macro': 0.20943928165314035, 'eval_recall_macro': 0.20495167826276692, 'eval_precision_micro': 0.18049525101763908, 'eval_recall_micro': 0.18049525101763908, 'eval_runtime': 342.0504, 'eval_samples_per_second': 1206.606, 'eval_steps_per_second': 1.006}\nAggregate metrics for iter_1:  {'eval_loss': 0.6045360565185547, 'eval_f1_macro': 0.15260009714764486, 'eval_f1_micro': 0.8471666101119782, 'eval_accuracy_balanced': 0.16439810608981156, 'eval_accuracy_not_b': 0.8471666101119782, 'eval_precision_macro': 0.33555706572966665, 'eval_recall_macro': 0.16439810608981156, 'eval_precision_micro': 0.8471666101119782, 'eval_recall_micro': 0.8471666101119782, 'eval_runtime': 345.7595, 'eval_samples_pe

In [None]:
### extract prediction results for downstream validity analyses
# The bare attribute accesses below only evaluate the attributes; they list
# what the learner exposes for the downstream analysis.
# metrics results
learner.n_iteration
learner.metrics
# labels and predictions for latest test iteration on corpus
learner.iteration_label_gold
learner.iteration_label_predicted
learner.iteration_probabilities

## merge predictions from learner with df_corpus and all meta-data
# get idx for last tested corpus for getting full df_corpus with all columns
#print(len(list(set(learner.dataset["corpus"]["idx"]))))
# Build the membership mask once and reuse it for both the test and train split.
corpus_idx = set(learner.dataset["corpus"]["idx"])
is_in_corpus = df_corpus.index.isin(corpus_idx)

# Take an explicit copy: assigning new columns to a boolean-mask slice of
# df_corpus is what triggered pandas' SettingWithCopyWarning.
df_corpus_test = df_corpus[is_in_corpus].copy()
# NOTE(review): the predictions are assigned positionally. df_corpus_test keeps
# the row order of df_corpus, so this assumes the learner's predictions are in
# that same order and that the idx values are unique - confirm against the learner.
df_corpus_test["label_pred"] = learner.iteration_label_predicted
# add probabilities, e.g. for data cleaning and interpretation downstream
df_corpus_test["label_probabilities"] = learner.iteration_probabilities
# all rows whose index is not in learner.dataset["corpus"]["idx"]
# (presumably the texts already sampled for training - confirm)
df_corpus_train = df_corpus[~is_in_corpus]

# rows from df_corpus_train have no prediction columns, so they end up as NaN there
df_corpus_concat = pd.concat([df_corpus_train, df_corpus_test])

# add label text for predictions
# (disabled: the block below is a bare string literal and is never executed)
"""label_text_map = {}
for i, row in df_corpus_concat[~df_corpus_concat.label_text.duplicated(keep='first')].iterrows():
    label_text_map.update({row["label"]: row["label_text"]})
df_corpus_concat["label_text_pred"] = df_corpus_concat["label_pred"].map(label_text_map)"""

## translate label pred back to -2 to +2 labels to enable mean calculation for correlation
# NOTE(review): both branches read the "label_text_pred" column, which in this
# cell is only created by the disabled code above. Unless df_corpus already
# carries that column, these branches will fail for the uk-leftright tasks.
if TASK == "uk-leftright":
    task_label_text_map_reversed = {value: key for key, value in task_label_text_map.items()}
    df_corpus_concat["label_scale_pred"] = df_corpus_concat.label_text_pred.map(task_label_text_map_reversed)
# in case of simplified -1 to +1 scale
elif TASK == "uk-leftright-simple":
    task_label_text_map_reversed = {value: key for key, value in task_label_text_map.items() if key not in [-2, 2]}
    df_corpus_concat["label_scale_pred"] = df_corpus_concat.label_text_pred.map(task_label_text_map_reversed)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corpus_test["label_pred"] = learner.iteration_label_predicted
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corpus_test["label_probabilities"] = learner.iteration_probabilities


In [None]:
# Show reference label text, model prediction, prediction probability and the
# prepared input text side by side ('label_text_pred' is currently left out).
cols_inspect = ['label_text', 'label_pred', 'label_probabilities', "text_prepared"]
df_corpus_concat[cols_inspect]

Unnamed: 0_level_0,label_text,label_pred,label_probabilities,text_prepared
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
32129,varies from hot and subhumid tropical,,,Context: Due to extreme variation in elevation...
87476,unanswerable,,,"Context: Throughout the war, the King and Quee..."
6579,pups,,,"Context: In breeding circles, a male canine is..."
118125,unanswerable,,,Context: About half of the population depends ...
82239,team selection and tactics,,,Context: International friendlies give team ma...
...,...,...,...,...
17983,unanswerable,France,-0.000603,"Context: In the early 1950s, Universal set up ..."
16512,cane,unanswerable,-0.000214,"Context: Greek kanon / Ancient Greek: κανών, A..."
90967,Taxa,Taxa from the toxic genus Heliconius,-0.002850,Context: Another defense that often uses color...
82065,unanswerable,Dharamsala,-0.000455,Context: After the Dalai Lama's government fle...


  and should_run_async(code)


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


  and should_run_async(code)


In [None]:
## save all data with outputs to disk
# n_iter * N_SAMPLE_AL (presumably the total number of actively sampled texts),
# left-padded with zeros to at least 4 characters for use in file names,
# e.g. 50 -> "0050". str.zfill always yields a str; the earlier manual padding
# loop left an int behind whenever the count already had 4 or more digits.
n_sample_str = str(n_iter * N_SAMPLE_AL).zfill(4)

#df_corpus_concat.to_csv(f"./outputs/df_{DATASET}_{n_sample_str}_{MODEL_NAME.split('/')[-1][:15]}_{DATE}_{n_iter}.zip",
#                    compression={"method": "zip", "archive_name": f"df_{DATASET}_{n_sample_str}_{MODEL_NAME.split('/')[-1][:15]}_{DATE}_{n_iter}.csv"}, index=False)

In [None]:
## also save learner to disk to inspect iteration progress
# All save/load statements in this cell are commented out, so running it only
# imports pickle and gzip; nothing is written to or read from disk.
import pickle
import gzip

# compression
#with gzip.open(f"./outputs/{DATASET}_{n_sample_str}_{MODEL_NAME.split('/')[-1][:15]}_{DATE}_{n_iter}_learner.pickle.gz", "wb", compresslevel=9) as f:
#    pickle.dump(learner, f)
# without compression
#with open(f"./outputs/{DATASET}_{n_sample_str}_{MODEL_NAME.split('/')[-1][:15]}_{DATE}_{n_iter}_learner.pkl", 'wb') as f:
#    pickle.dump(learner, f)

## open
# NOTE(review): the load path below points to ./data-classified/{DATASET}/ while
# the save paths above point to ./outputs/ - align them before re-enabling.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
#with gzip.open(f"./data-classified/{DATASET}/{DATASET}_{n_sample_str}_{MODEL_NAME.split('/')[-1][:15]}_{DATE}_{n_iter}_learner.pickle.gz", 'rb') as f:
#    learner_loaded = pickle.load(f)



## inspect results

In [None]:
## plot metrics over iterations
# Build one row per entry in learner.metrics: the running sample count first,
# followed by every metric except the raw gold/predicted label lists.
excluded_keys = ("eval_label_gold_raw", "eval_label_predicted_raw")
metrics_dic_lst = []
n_sample = 0
for value_metrics_dic in learner.metrics.values():
    metrics_row = {"iter_samp_total": n_sample}
    metrics_row.update(
        (name, value) for name, value in value_metrics_dic.items() if name not in excluded_keys
    )
    metrics_dic_lst.append(metrics_row)
    # each entry is assumed to add N_SAMPLE_AL samples to the running total
    n_sample += N_SAMPLE_AL

df_metrics = pd.DataFrame(metrics_dic_lst)
#df_metrics.plot(x="iter", subplots=[("iter", "eval_f1_macro"), ("iter", "eval_f1_micro"), ("iter", "eval_accuracy_balanced")])

# NOTE(review): the plotted columns are "f1_macro" / "f1_micro", whereas the
# excluded keys and the commented-out plot calls use an "eval_" prefix -
# confirm which key names learner.metrics actually contains for this run.
shared_plot_args = dict(x="iter_samp_total", grid=True)
df_metrics.plot(y="f1_macro", **shared_plot_args)
df_metrics.plot(y="f1_micro", **shared_plot_args)
#df_metrics.plot(x="iter_samp_total", y="eval_accuracy_balanced")



In [None]:
## plot label distribution sampled over iterations
# sample_label_distribution is defined earlier in the notebook
# (presumably label counts per sampling iteration - confirm)
df_label_dist = pd.DataFrame(sample_label_distribution)

# stacked bar chart; the legend is anchored to the right of the axes
ax_label_dist = df_label_dist.plot(kind="bar", stacked=True, legend=True)
ax_label_dist.legend(loc='center left', bbox_to_anchor=(1.0, 0.8))

# final label distribution in training data
label_counts_train = df_corpus_train.label_text.value_counts()
print(label_counts_train)

In [None]:
# real label distribution for this run:
# frequency of each value in the "label_text" column of learner.df_corpus_format
label_text_reference = learner.df_corpus_format["label_text"]
label_text_reference.value_counts()


In [None]:
# actual predictions by the model:
# frequency of each predicted label (value_counts skips rows without a prediction by default)
label_pred_series = df_corpus_concat["label_pred"]
label_pred_series.value_counts()

## test other query strategies

In [None]:
"""### breaking ties balanced
n_sample_al = N_SAMPLE_AL

#def sample_breaking_ties_balanced(self, n_sample_al=5):  # results_corpus=None, df_corpus=None, label_text_alphabetical=None,

hypo_prob_entail = learner.results_corpus["eval_hypo_probabilities_entail"]
# mapping entail probabilities to labels
hypo_prob_entail = [{label_text: round(entail_score, 4) for entail_score, label_text in zip(prob_entail, learner.label_text_alphabetical)} for prob_entail in hypo_prob_entail]

### splitting the data to introduce some balance
### Options:
## 1: ties per class: take K=N/n_label most unsure texts where class c is part of the tie. repeat for each class. must sum to N
## 2: ties + certainty per class: take 2/n most unsure texts overall + 2/n_label/n most sure texts for each class. must sum to N
# could lead to issues with severe minority classes when high-certainty examples are exhausted
# but can be good to see high certainty examples to as expert
## 3. ! could also later trie uncertainty splits by other meta-data rather than classes

## ties per class
df_label_balance = pd.DataFrame(hypo_prob_entail)
# add columns with entailment difference of highest probability tie and with the two labels of the tie
df_label_balance["tie_difference"] = df_label_balance.apply(lambda row: row.nlargest(n=2).diff().dropna().values[0], axis=1)
df_label_balance["tie_labels_text"] = df_label_balance.apply(lambda row: "++".join(row.nlargest(n=2).index.tolist()), axis=1)

test = {}
for label in label_text_alphabetical:
    df_tie_label = df_label_balance[df_label_balance["tie_labels_text"].str.contains(label)]
    print("Number of ties with the label: ", label, len(df_tie_label))
    test.update({label: df_tie_label})


# inefficient
entail_distance = [{"entail_distance": pd.Series(prob_entail.values()).nlargest(n=2).max() - pd.Series(prob_entail.values()).nlargest(n=2).min(), "labels_in_tie": pd.Series(prob_entail.keys())[pd.Series(prob_entail.values()).nlargest(n=2).index.tolist()].tolist()} for prob_entail in hypo_prob_entail]


# select N hardest ties for active learning
entail_distance_min = pd.Series(entail_distance).nsmallest(n=n_sample_al)

# model prediction
entail_max = [max(prob_entail, key=prob_entail.get) for prob_entail in hypo_prob_entail]

# write to clean df
df_corpus_ties = learner.df_corpus_original_update.copy(deep=True)
df_corpus_ties["probs_entail"] = hypo_prob_entail
df_corpus_ties["label_text_pred"] = entail_max

#df_corpus_ties_sample = df_corpus_ties[df_corpus_ties.index.isin(entail_distance_min.index)]
df_corpus_ties_sample = df_corpus_ties.iloc[entail_distance_min.index]
# shuffle for random training sequence instead of ordered by difficulty/ties/uncertainty
df_corpus_ties_sample = df_corpus_ties_sample.sample(frac=1, random_state=learner.seed)

# scale probabilities to sum to 1 to enable argilla to ingest it
df_corpus_ties_sample["probs_entail_scaled"] = [dict(zip(list(probs_entail_dic.keys()), np.array(list(probs_entail_dic.values())) / np.array(list(probs_entail_dic.values())).sum())) for probs_entail_dic in df_corpus_ties_sample.probs_entail]

index_al_sample = df_corpus_ties_sample.index.tolist()

df_corpus_ties_sample

#learner.index_al_sample = index_al_sample
#learner.df_corpus_al_sample = df_corpus_ties_sample"""