In [6]:
import abc
from dataclasses import dataclass, field
import json
from pathlib import Path
from typing import *
import sys

from beartype import beartype
import datasets
import numpy as np
import more_itertools
import queue
import rich
import torch
import torch.nn as nn
import transformers
import wandb

TokenizerType = transformers.tokenization_utils_fast.PreTrainedTokenizerFast

class BaseEpsilonScheduler(abc.ABC):
    @abc.abstractmethod
    def __call__(self):
        pass

class LinearEpsilonScheduler(BaseEpsilonScheduler):
    def __init__(self, epsilon, num_steps):
        self.epsilon = epsilon
        self.num_steps = num_steps
        self.epoch = 0

    def __call__(self):
        self.epoch += 1
        epsilon = min(self.epsilon * (1 - self.epoch / self.num_epochs), 1)
        wandb.log({"epsilon": epsilon})
        wandb.log({"epsilon_num_steps": self.num_steps})
        return epsilon

class ConstantEpsilonScheduler(BaseEpsilonScheduler):
    def __init__(self, epsilon):
        self.epsilon = epsilon

    def __call__(self):
        epsilon = self.epsilon
        wandb.log({"epsilon": epsilon})
        return epsilon

In [7]:
class BaseRetriever(abc.ABC):
    @abc.abstractmethod
    def retrieve(self, query_ids, query_index):
        pass


class StupidRetriever(BaseRetriever): 
    @beartype
    def __init__(
        self, *, model, tokenizer: TokenizerType, device: Union[int, str], 
        train_vectors: torch.Tensor, train_samples_dict: Dict[str, Any],
    ):
    
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.train_vectors = train_vectors
        self.train_samples_dict = train_samples_dict

    def retrieve(self, query_index):
        # Get the representation
        representation = self.train_vectors[query_index]
        with torch.inference_mode():
            # Compute the inner products
            scores = torch.matmul(representation, self.train_vectors.t())
            # Get the top 2 results, to potentially exclude the sample itself.
            topk = torch.topk(scores, k=2, dim=-1)
        topk = topk.indices.cpu().numpy()
        
        for retrieved_idx in topk:
            if retrieved_idx != query_index:
                return {k: v[retrieved_idx] for k, v in self.train_samples_dict.items()} | {"index": retrieved_idx}
        
# build train vectors
@beartype
def make_retrival_model_and_vectors(
    retriever_name: str, path_to_vectors: Union[str, Path], device: Union[int, str], dataset_type: str,
) -> BaseRetriever:
    """We expect the dir to have the following structure:
    - config.json
    - train_samples.json 
    - train_vectors.npy
    """    
    # Make some checks
    retriever_model = transformers.AutoModel.from_pretrained(retriever_name)
    retriever_tokenizer = transformers.AutoTokenizer.from_pretrained(retriever_name)

    with open(path_to_vectors / "train_samples.json") as f:
        train_samples_dict = json.load(f)
        

    vectors = torch.tensor(np.load(path_to_vectors / "train_vectors.npy")).to(device)
    retriever = StupidRetriever(
        model=retriever_model, 
        tokenizer=retriever_tokenizer, 
        device=device, 
        train_vectors=vectors, 
        train_samples_dict=train_samples_dict,
    )
    
    return retriever


@dataclass(order=True)
class PrioritizedItem:
    priority: int
    item: Any=field(compare=False)


class BoostingIterator(torch.utils.data.IterableDataset):
    @beartype
    def __init__(
        self, 
        *, 
        dataset, 
        retriever_client: BaseRetriever, 
        classifier: nn.Module, seed: int, 
        classification_device: Union[int, str], 
        classification_tokenizer: TokenizerType, 
        retriever_device: Union[int, str],
        epsilon_scheduler: BaseEpsilonScheduler, 
        loss_ema_alpha: float, 
        score_mode: str,
        fixed_loss_warmup_steps: Optional[int]=None,
        config: Dict[str, Any],
    ):
        super().__init__()
        self.dataset = dataset.map(
            lambda example, idx:{"index": idx}, with_indices=True, 
        ).shuffle(seed=seed)
        self.dataset = self.dataset.remove_columns(["idx"])
        self.priority_queue = queue.PriorityQueue()
        self.retriever_client = retriever_client
        self.epsilon_scheduler = epsilon_scheduler
        self.randomizer = np.random.RandomState(seed)
        self.seed = seed
        self.dataset_iter = None
        self.classifier = classifier
        self.classification_tokenizer = classification_tokenizer
        self.classification_device = classification_device
        self.retriever_device = retriever_device
        self.variance_rolling_average = None
        self.loss_ema_alpha = loss_ema_alpha
        self.dataset_type = config["dataset_type"]
        self.seen_samples = 0
        self.epoch_length = len(self.dataset)
        self.total_num_steps = 0
        self.score_mode = score_mode
        self.fixed_loss_warmup_steps = fixed_loss_warmup_steps
        self.loss_moving_average = None

        if self.score_mode == "fixed_loss" or self.score_mode == "step_sensitive_fixed_loss":
            assert self.fixed_loss is not None


        if self.dataset_type == "dual_entry_classification":
            self.field_a_name = config["field_a_name"]
            self.field_b_name = config["field_b_name"]

        assert "idx" not in self.dataset

        # assert mode in ["epsilon_priority_no_reset", "pure_sampled", "epsilon_sampled"], mode

    def push_score(self, inputs, loss):
        loss: Final = loss
        inputs: Final = inputs

        with torch.inference_mode():
            ################################################################################
            # Moving average of the loss.
            ################################################################################    
            uniform_samples_loss: Final = loss[torch.logical_not(inputs["is_retrieved"])].detach()

            # Only use the uniform random samples to evaluate the batch's average loss.
            # Protection against the edge case where everything is retrieved.
            if len(uniform_samples_loss) != 0:
                average_loss: Final = torch.mean(uniform_samples_loss).detach().cpu().numpy()
                if self.loss_moving_average is None:
                    self.loss_moving_average = average_loss
                else:
                    self.loss_moving_average = (
                        self.loss_ema_alpha * self.loss_moving_average + (1 - self.loss_ema_alpha) * average_loss
                    )
                wandb.log({"loss_moving_average": self.loss_moving_average})

            ################################################################################
            # Scores the inputs and pushes them to the priority queue.
            ################################################################################

            for i, (input_, mask, loss_, index) in (
                enumerate(more_itertools.zip_equal(inputs["input_ids"], inputs["attention_mask"], loss, inputs["index"]))
            ):
                
                loss_ = loss_.detach().cpu().numpy()
                relative_loss = loss_ / self.loss_moving_average
                

                if inputs["has_previous_loss"][i]:
                    previous_loss = inputs["previous_loss"][i]

                    wandb.log({"previous_relative_loss": previous_loss})
                    wandb.log({"current_relative_loss": relative_loss})
                    wandb.log({"ratio_relative_losses": relative_loss / previous_loss})
                    wandb.log({"average_relative_losses_check": (relative_loss + previous_loss) / 2})

                assert loss_.shape == torch.Size([]), loss_.shape

                if self.score_mode == "relative_loss":
                    score = - relative_loss
                elif self.score_mode == "step_sensitive_relative_loss":
                    score = - relative_loss * self.total_num_steps
                elif self.score_mode == "fixed_loss":
                    score = np.abs(loss_ - self.fixed_loss)
                elif self.score_mode == "step_sensitive_fixed_loss":
                    score = np.abs(loss_ - self.fixed_loss) * self.total_num_steps 
                else:
                    raise ValueError(f"Unknown score mode: {self.score_mode}")

                self.priority_queue.put(
                    PrioritizedItem(
                            priority=score, 
                            item=dict(input_ids=input_, attention_mask=mask, index=index, previous_loss=relative_loss)
                        )
                    )

    def __len__(self):
        return len(self.dataset)

    def __iter__(self):
        if self.dataset_iter is None:
            self.dataset_iter = iter(self.dataset)
        self.seen_samples = 0
        return self
    
    def __next__(self):
        if self.seen_samples == self.epoch_length:
            raise StopIteration

        # This next is only called by the training dataset.
        self.total_num_steps += 1

        # Test if we have a sample and if we pass the epsilon threshold
        empty = self.priority_queue.empty()
        rand = self.randomizer.rand()
        if not empty and rand < self.epsilon_scheduler():
            ################################################################################
            # Retrieved sample
            ################################################################################
            # pull a sample from the priority queue
            sample = self.priority_queue.get().item
            
            # We retrieve the next sample.
            next_sample = self.retriever_client.retrieve(sample["index"])
            
            
            next_sample["is_retrieved"] = True
            next_sample["previous_loss"] = sample["previous_loss"]
            next_sample["has_previous_loss"] = True
        else:
            ################################################################################
            # Uniform random sample
            ################################################################################
            try:
                next_sample = next(self.dataset_iter)  
            except StopIteration:
                self.dataset = self.dataset.shuffle(seed=self.seed)
                self.dataset_iter = iter(self.dataset)
                next_sample = next(self.dataset_iter)  
            next_sample["is_retrieved"] = False
            next_sample["previous_loss"] = float('nan')
            next_sample["has_previous_loss"] = False

        ################################################################################
        # Per dataset type preparation
        ################################################################################
        if self.dataset_type == "single_entry_classification":
            tokenized = self.classification_tokenizer.encode_plus(
                next_sample["inputs"], 
                truncation=True, 
                padding=True,    
            )
            del next_sample["inputs"]
        elif self.dataset_type == "dual_entry_classification":
            tokenized = self.classification_tokenizer.encode_plus(
                next_sample[self.field_a_name], 
                next_sample[self.field_b_name], 
                truncation=True, 
                padding=True,
            )
            del next_sample[self.field_a_name]
            del next_sample[self.field_b_name]
            
        else:
            raise ValueError(f"Unknown dataset type: {self.dataset_type}")
        # import html
        # if next_sample["is_retrieved"]:
        #     wandb.log(
        #         {
        #         "thing_pair":[
        #             wandb.Html(f"<b>previous:</b>      {sample['index']}      {html.escape(self.classification_tokenizer.decode(sample['input_ids']   ))})"),
        #             wandb.Html(f"<b>current:&nbsp;</b> {next_sample['index']} {html.escape(self.classification_tokenizer.decode(tokenized['input_ids']))})")
        #         ]
        #         }
        #        
        #     )
        # rich.print(
        #     f"[bold green]previous:[/]  {sample['index']} {self.classification_tokenizer.decode(sample['input_ids'])}\n", 
        #     f"[bold green]current:[/]   {next_sample['index']} {self.classification_tokenizer.decode(tokenized['input_ids'])}"
        # )
        # print("#" * 80)

        assert len(tokenized.keys() & next_sample.keys()) == 0, (tokenized.keys(), next_sample.keys()) 
        retval = dict(**tokenized, **next_sample)
        assert "previous_loss" in retval, retval.keys()
        return retval


class BoostingTrainer(transformers.Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        assert "labels" in inputs, inputs.keys()
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None

        assert labels is None


        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            assert False
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.
        Subclass and override to inject custom behavior.
        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.
                The dictionary will be unpacked before being fed to the model. 
                Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        
        ################################################################################
        # Remove the parts of the inputs that model.forward does not need.
        ################################################################################
        inputs = self._prepare_inputs(inputs)
        index = inputs["index"]
        is_retrieved = inputs["is_retrieved"]
        previous_loss = inputs["previous_loss"]
        has_previous_loss = inputs["has_previous_loss"]
        del inputs["previous_loss"]
        del inputs["is_retrieved"]
        del inputs["index"]
        del inputs["has_previous_loss"]

        with self.autocast_smart_context_manager():
            # Get the loss
            loss, outputs = self.compute_loss(model, inputs, return_outputs=True)

        if self.args.n_gpu > 1:
            # Mean over per gpu averages
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        # This is ignored in the priority queue computation
        if self.args.gradient_accumulation_steps > 1 and not self.deepspeed:
            assert False
            # Deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
            loss = loss / self.args.gradient_accumulation_steps
        
        if self.do_grad_scaling:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            assert False
            with torch.cuda.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            assert False
            # loss gets scaled under gradient_accumulation_steps in deepspeed
            loss = self.deepspeed.backward(loss)
        else:
            loss.backward()

        loss = loss.detach()

        # Addition for RetroBoost
        # Make sure the losses are similar, then push them to the priority queue
        # Put index back in

        inputs["index"] = index
        inputs["is_retrieved"] = is_retrieved
        inputs["previous_loss"] = previous_loss
        inputs["has_previous_loss"] = has_previous_loss

        with torch.inference_mode():
            loss_per_sample = torch.nn.functional.cross_entropy(outputs.logits.detach(), inputs["labels"].detach(), reduction="none")
            assert loss_per_sample.ndim == 1, loss_per_sample.ndim
            loss_per_gpu = torch.mean(loss_per_sample, dim=0)
            computed_loss = torch.mean(loss_per_gpu)
            # rich.print("[red bold]logits[/]", outputs.logits.detach().cpu().numpy())
            # rich.print("[red bold]logits[/]", outputs.logits.detach().cpu().numpy().shape)
            # rich.print("[red bold]LOSS[/]", loss.detach().cpu().numpy(), " [red bold]computed_loss[/]", computed_loss)
            # assert torch.allclose(loss, computed_loss)

            self.get_train_dataloader().dataset.push_score(inputs, loss_per_sample)

        return loss


In [8]:

DATASET_TUPLE = ("super_glue", "rte")
CLASSIFIER_NAME = "roberta-base"
CLASSIFIER_BATCH_SIZE = 20
EPSILON_SCHEDULER_TYPE = "constant"
EPSILON_SCHEDULER_CONFIG = dict(
    epsilon=.5,
)
LOSS_EMA_ALPHA = 0.995
REGULAR_TRAINER = False
WEIGHT_DECAY = 0.01
LEARNING_RATE = 1e-5
ENABLE_FP16 = True

SCORE_MODE = "step_sensitive_relative_loss"
FIXED_LOSS_WARMUP_STEPS = 10


# Things that don't change
RETRIEVER_NAME = "facebook/contriever"
PATH_TO_VECTORS = Path(f"./vectors_{'_'.join(DATASET_TUPLE)}_{RETRIEVER_NAME.split('/')[-1]}/")
CLASSIFIER_EVAL_BATCH_SIZE_MULTIPLIER = 1.5
CLASSIFIER_DEVICE = "cuda"
RETRIEVER_DEVICE = "cuda"
SEED = 0
SPLIT_RATIO = 0.85
NUM_EPOCHS_TO_TRAIN_ON = 60



RUN_NAME = f"{'fp16' if ENABLE_FP16 else 'fp32'}_{SCORE_MODE}_{LOSS_EMA_ALPHA=}_{EPSILON_SCHEDULER_TYPE}_epsilon"


###############################################################################
# Fast setup 
###############################################################################
config: Final = json.loads((PATH_TO_VECTORS / "config.json").read_text())
assert config["retriever_name"] == RETRIEVER_NAME, f"{config['retriever_name']} != {RETRIEVER_NAME}"

wandb_config = dict(
        classifier_batch_size=CLASSIFIER_BATCH_SIZE,
        classifier_name=CLASSIFIER_NAME,
        dataset_tuple=DATASET_TUPLE,
        epsilon=dict(
            scheduler_type=EPSILON_SCHEDULER_TYPE,
            scheduler_config=EPSILON_SCHEDULER_CONFIG,
        ),
        loss_ema_alpha=LOSS_EMA_ALPHA,
        random_seed=SEED,
        regular_trainer=REGULAR_TRAINER,
        retriever_name=RETRIEVER_NAME,
        split_ratio=SPLIT_RATIO,
        weight_decay=WEIGHT_DECAY,
        learning_rate=LEARNING_RATE,
        dataset_type=config["dataset_type"],
        enable_fp16=ENABLE_FP16,
    )

wandb.init(
    config=wandb_config,
    project="RetroBoost", 
    entity="retroboost",
    name=RUN_NAME,
)

EPSILON_SCHEDULER_TYPE_MAP = dict(
    constant=ConstantEpsilonScheduler,
)

# Random seeds. 
np.random.seed(0)
torch.manual_seed(0)

classifier_tokenizer: Final = transformers.AutoTokenizer.from_pretrained(CLASSIFIER_NAME)

# Load the config

# Load the datasets
dataset_train: Final = datasets.load_dataset(*DATASET_TUPLE, split=f"train[:{SPLIT_RATIO:.0%}]")
dataset_validation: Final = datasets.load_dataset(*DATASET_TUPLE, split=f"train[{SPLIT_RATIO:.0%}:]")

ALL_LABELS = set(dataset_train["label"])
NUM_LABELS = len(ALL_LABELS)
assert ALL_LABELS == set(range(NUM_LABELS))

# Delete the extra fields
if config["dataset_type"] == "dual_entry_classification":
    fields = dataset_train[0].keys()
    dataset_train.remove_columns(fields - {config["field_a_name"], config["field_b_name"], "label"} )

def preprocess_function(examples, tokenizer, config):
    if config["dataset_type"] == "single_entry_classification":
        return tokenizer(examples["text"], truncation=True, padding=True)
    elif config["dataset_type"] == "dual_entry_classification":
        return tokenizer(
            examples[config["field_a_name"]], 
            examples[config["field_b_name"]], 
            truncation=True, 
            padding=True,
        )

    raise ValueError(f"Unknown dataset type {config['dataset_type']}")

tokenized_training: Final = dataset_train.map(
    lambda examples: preprocess_function(examples, classifier_tokenizer, config), 
    batched=True
).shuffle(seed=SEED)

tokenized_validation: Final = dataset_validation.map(
    lambda examples: preprocess_function(examples, classifier_tokenizer, config), 
    batched=True
).shuffle(seed=SEED)

training_args: Final = transformers.TrainingArguments(
    evaluation_strategy="steps",
    eval_steps=10,
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=CLASSIFIER_BATCH_SIZE,
    per_device_eval_batch_size=int(CLASSIFIER_BATCH_SIZE * CLASSIFIER_EVAL_BATCH_SIZE_MULTIPLIER),
    num_train_epochs=NUM_EPOCHS_TO_TRAIN_ON,
    report_to="wandb",
    weight_decay=WEIGHT_DECAY,
    fp16=ENABLE_FP16,
)

retriever: Final = make_retrival_model_and_vectors(
    retriever_name=RETRIEVER_NAME, 
    path_to_vectors=PATH_TO_VECTORS, 
    device=RETRIEVER_DEVICE, 
    dataset_type=config["dataset_type"],
)
retriever_client: Final = retriever

classifier: Final = transformers.AutoModelForSequenceClassification.from_pretrained(
    CLASSIFIER_NAME, num_labels=NUM_LABELS
)

classifier.config.problem_type = "single_label_classification"

VBox(children=(Label(value=' 3.95MB of 3.95MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
average_relative_losses_check,▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▂▂▂▁▂▃▁▂▃▁▇▂▃▂▂▁▄▃▃▂▃█▅▁
current_relative_loss,▃▃▃▃▃▃▃▃▃▃▃▄▃▄▃▃▄▃▃▂▃▃▁▄▅▂▅▂▁▂▂▂▁▄▁▂▁█▁▁
epsilon,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/accuracy,▂▂▁▁▁▁▁▁▄▂▃▃▂▄▅▃▆▅▆▅▅▆▆▅▅▇▅▇▆▇▆▇▆█▇▇▇▇██
eval/loss,█████████████▇▇█▇▇▅▆▆▅▅▅▇▄▆▃▄▂▅▂▄▂▂▃▃▃▁▂
eval/runtime,█▂▂▁▂▂▁▂▁▂▂▂▁▂▂▂▂▁▂▂▂▃▁▂▁▂▁▁▁▂▂▁▂▁▂▂▁▁▂▃
eval/samples_per_second,▁▇▇█▇▇▇▇█▇▇▇█▇▇▇▇█▇▇▇▆▇▇█▇▇██▇▇▇▇█▇▆█▇▇▆
eval/steps_per_second,▁▇▇█▇▇▇▇█▇▇▇█▇▇▇▇█▇▇▇▆▇▇█▇▇██▇▇▇▇█▇▆█▇▇▆
loss_moving_average,██████████████████▇▇▇▇▆▆▆▆▆▆▅▅▅▄▄▄▃▃▂▂▂▁
previous_relative_loss,▂▁▁▁▂▂▁▁▂▂▁▂▁▃▂▂▁▂▂▂▂▃▁▁▁▁█▂▅▂▂▁▆▁▄▃▅▇█▂

0,1
average_relative_losses_check,0.35335
current_relative_loss,0.15095
epsilon,0.5
eval/accuracy,0.85561
eval/loss,0.43105
eval/runtime,0.7268
eval/samples_per_second,514.55
eval/steps_per_second,17.885
loss_moving_average,0.49331
previous_relative_loss,0.55576


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/mila/g/gagnonju/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size"

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
loading configuration file https://huggingface.co/facebook/contriever/resolve/main/config.json from cache at /home/mila/g/gagnonju/.cache/huggingface/transformers/52956acf642ee38953ffe7ea253f16896223c887093b1cbbdcec52c82ed6ea6c.b19e5a240d78d865c8542d06f7a66ec533465c843c39fa863e76ebb42cd7a581
Model config BertConfig {
  "_name_or_path": "facebook/contriever",
  "architectures": [
    "Contriever"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return datasets.load_metric("accuracy").compute(predictions=predictions, references=labels)

if REGULAR_TRAINER:
    TrainerClass = transformers.Trainer    
    ds_train = tokenized_training
else:
    TrainerClass = BoostingTrainer
    
    ds_train = BoostingIterator(
        dataset=dataset_train, 
        retriever_client=retriever_client, 
        classifier=classifier, 
        epsilon_scheduler=EPSILON_SCHEDULER_TYPE_MAP[EPSILON_SCHEDULER_TYPE](**EPSILON_SCHEDULER_CONFIG), 
        seed=SEED,
        retriever_device=RETRIEVER_DEVICE, 
        classification_device=CLASSIFIER_DEVICE,
        classification_tokenizer=classifier_tokenizer,
        loss_ema_alpha=LOSS_EMA_ALPHA,
        config=config,
        score_mode=SCORE_MODE,
        fixed_loss_warmup_steps=FIXED_LOSS_WARMUP_STEPS,   
    )


@dataclass
class DataCollatorWithPadding:
    tokenizer: transformers.data.data_collator.PreTrainedTokenizerBase
    padding: Union[bool, str, transformers.data.data_collator.PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # check that they all have the same keys
        all_keys = set()
        for feature in features:
            all_keys |= feature.keys()
        
        for feature in features:
            assert all_keys == feature.keys(), all_keys - feature.keys()

        first = features[0]
        


        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

        if "label" in batch:
            batch["labels"] = batch["label"]
            del batch["label"]
        
        if "label_ids" in batch:
            batch["labels"] = batch["label_ids"]
            del batch["label_ids"]
        
        return batch


trainer = TrainerClass(
        model=classifier,
        args=training_args, 
        tokenizer=classifier_tokenizer, 
        train_dataset=ds_train, 
        eval_dataset=tokenized_validation,
        data_collator=transformers.data.data_collator.DataCollatorWithPadding(
            tokenizer=classifier_tokenizer
        ),
        compute_metrics=compute_metrics,
    )





Using amp half precision backend


In [10]:
output = trainer.train()
print(output)
print(trainer.evaluate())

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, premise, hypothesis.
***** Running training *****
  Num examples = 2116
  Num Epochs = 60
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 6360
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy
10,No log,0.693,0.505348
20,No log,0.692936,0.540107
30,No log,0.692846,0.5
40,No log,0.692905,0.505348
50,No log,0.692771,0.505348
60,No log,0.69267,0.505348
70,No log,0.692516,0.505348
80,No log,0.692666,0.505348
90,No log,0.693361,0.505348
100,No log,0.693566,0.505348


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, premise, hypothesis.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 30
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, premise, hypothesis.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 30
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, premise, hypothesis.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 30
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, premise, hypothesis.
***** Running Evaluation *****
  Num examples = 374
  Batch size = 30
The following co

KeyboardInterrupt: 