In [89]:
import abc
from dataclasses import dataclass, field
import json
from pathlib import Path
from typing import *
import sys

from beartype import beartype
import datasets
import numpy as np
import more_itertools
import queue
import rich
import torch
import torch.nn as nn
import transformers
import wandb

TokenizerType = transformers.tokenization_utils_fast.PreTrainedTokenizerFast

class BaseEpsilonScheduler(abc.ABC):
    @abc.abstractmethod
    def __call__(self):
        pass

class LinearEpsilonScheduler(BaseEpsilonScheduler):
    def __init__(self, epsilon, num_steps):
        self.epsilon = epsilon
        self.num_steps = num_steps
        self.epoch = 0

    def __call__(self):
        self.epoch += 1
        epsilon = min(self.epsilon * (1 - self.epoch / self.num_epochs), 1)
        wandb.log({"epsilon": epsilon})
        wandb.log({"epsilon_num_steps": self.num_steps})
        return epsilon

class ConstantEpsilonScheduler(BaseEpsilonScheduler):
    def __init__(self, epsilon):
        self.epsilon = epsilon

    def __call__(self):
        epsilon = self.epsilon
        wandb.log({"epsilon": epsilon})
        return epsilon

In [90]:
class BaseRetriever(abc.ABC):
    @abc.abstractmethod
    def retrieve(self, query_ids, query_index):
        pass


class StupidRetriever(BaseRetriever): 
    @beartype
    def __init__(
        self, *, model, tokenizer: TokenizerType, device: Union[int, str], 
        train_vectors: torch.Tensor, train_samples_dict: Dict[str, Any],
    ):
    
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.train_vectors = train_vectors
        self.train_samples_dict = train_samples_dict

    def retrieve(self, query_index):
        # Get the representation
        representation = self.train_vectors[query_index]
        with torch.inference_mode():
            # Compute the inner products
            scores = torch.matmul(representation, self.train_vectors.t())
            # Get the top 2 results, to potentially exclude the sample itself.
            topk = torch.topk(scores, k=2, dim=-1)
        topk = topk.indices.cpu().numpy()
        
        for retrieved_idx in topk:
            if retrieved_idx != query_index:
                return {k: v[retrieved_idx] for k, v in self.train_samples_dict.items()} | {"index": retrieved_idx}
        
# build train vectors
@beartype
def make_retrival_model_and_vectors(
    retriever_name: str, path_to_vectors: Union[str, Path], device: int, dataset_type: str,
):
    """We expect the dir to have the following structure:
    - config.json
    - train_samples.json 
    - train_vectors.npy
    """    
    # Make some checks
    retriever_model = transformers.AutoModel.from_pretrained(retriever_name)
    retriever_tokenizer = transformers.AutoTokenizer.from_pretrained(retriever_name)

    with open(path_to_vectors / "train_samples.json") as f:
        train_samples_dict = json.load(f)
        

    vectors = torch.tensor(np.load(path_to_vectors / "train_vectors.npy")).to(device)
    retriever = StupidRetriever(
        model=retriever_model, 
        tokenizer=retriever_tokenizer, 
        device=device, 
        train_vectors=vectors, 
        train_samples_dict=train_samples_dict,
    )
    
    return retriever


@dataclass(order=True)
class PrioritizedItem:
    priority: int
    item: Any=field(compare=False)


class BoostingIterator(torch.utils.data.IterableDataset):
    @beartype
    def __init__(
        self, 
        *, 
        dataset, 
        retriever_client: BaseRetriever, 
        classifier: nn.Module, seed: int, 
        classification_device: Union[int, str], 
        classification_tokenizer: TokenizerType, 
        retriever_device: Union[int, str],
        epsilon_scheduler: BaseEpsilonScheduler, 
        loss_ema_alpha: float, 
        config: Dict[str, Any],
    ):
        super().__init__()
        self.dataset = dataset.map(
            lambda example, idx:{"index": idx}, with_indices=True, 
        ).shuffle(seed=seed)
        self.dataset = self.dataset.remove_columns(["idx"])
        self.priority_queue = queue.PriorityQueue()
        self.retriever_client = retriever_client
        self.epsilon_scheduler = epsilon_scheduler
        self.randomizer = np.random.RandomState(seed)
        self.seed = seed
        self.dataset_iter = None
        self.classifier = classifier
        self.classification_tokenizer = classification_tokenizer
        self.classification_device = classification_device
        self.retriever_device = retriever_device
        self.loss_moving_average = None
        self.loss_ema_alpha = loss_ema_alpha
        self.dataset_type = config["dataset_type"]
        if self.dataset_type == "dual_entry_classification":
            self.field_a_name = config["field_a_name"]
            self.field_b_name = config["field_b_name"]

        assert "idx" not in self.dataset

        # assert mode in ["epsilon_priority_no_reset", "pure_sampled", "epsilon_sampled"], mode

    def push_score(self, inputs, loss):
        average_loss = loss.mean()
        if self.loss_moving_average is None:
            self.loss_moving_average = average_loss
        else:
            self.loss_moving_average = (
                self.loss_ema_alpha * self.loss_moving_average + (1 - self.loss_ema_alpha) * average_loss
            )

        for input_, mask, loss_, index in (
            more_itertools.zip_equal(inputs["input_ids"], inputs["attention_mask"], loss, inputs["index"])
        ):
            assert loss_.shape == torch.Size([]), loss_.shape
            self.priority_queue.put(
                PrioritizedItem(
                    priority= -loss_.detach().cpu().numpy() / self.loss_moving_average, 
                    item=dict(input_ids=input_, attention_mask=mask, index=index)
                    )
                )

    def __len__(self):
        return len(self.dataset)

    def __iter__(self):
        rich.print("[bold green]ITER[/]")
        self.dataset = self.dataset.shuffle(seed=self.seed)
        self.dataset_iter = iter(self.dataset)
        return self
    
    def __next__(self):
        """ This is where the sampling happens.
        """

        # Test if we have a sample and if we pass the epsilon threshold
        empty = self.priority_queue.empty()
        rand = self.randomizer.rand()
        if not empty and rand < self.epsilon_scheduler():
            # pull a sample from the priority queue
            sample = self.priority_queue.get().item

            # We retrieve the next sample.
            next_sample = self.retriever_client.retrieve(sample["index"])
        else:
            next_sample = next(self.dataset_iter)  # We raise here if we have no more samples in the dataset


        if self.dataset_type == "single_entry_classification":
            tokenized = self.classification_tokenizer.encode_plus(
                next_sample["inputs"], 
                truncation=True, 
                padding=True,    
            )
            del next_sample["inputs"]
        elif self.dataset_type == "dual_entry_classification":
            tokenized = self.classification_tokenizer.encode_plus(
                next_sample[self.field_a_name], 
                next_sample[self.field_b_name], 
                truncation=True, 
                padding=True,
            )
            del next_sample[self.field_a_name]
            del next_sample[self.field_b_name]
            # if "idx" in next_sample:
            #     del next_sample["idx"]
            
        else:
            raise ValueError(f"Unknown dataset type: {self.dataset_type}")

        # text is not needed anymore
        
        assert len(tokenized.keys() & next_sample.keys()) == 0, (tokenized.keys(), next_sample.keys()) 
        return dict(**tokenized, **next_sample)


class BoostingTrainer(transformers.Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        assert "labels" in inputs, inputs.keys()
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None

        assert labels is None


        outputs = model(**inputs)
        rich.print(f"[green bold]{outputs}")
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            assert False
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            rich.print(f"[yellow bold]{outputs[1].shape}")

        return (loss, outputs) if return_outputs else loss

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.
        Subclass and override to inject custom behavior.
        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.
                The dictionary will be unpacked before being fed to the model. 
                Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)
        index = inputs["index"]
        # Compute loss doesn't work with extra arguments.
        del inputs["index"]

        with self.autocast_smart_context_manager():
            # Get the loss
            loss, outputs = self.compute_loss(model, inputs, return_outputs=True)

        if self.args.n_gpu > 1:
            # Mean over per gpu averages
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        # This is ignored in the priority queue computation
        if self.args.gradient_accumulation_steps > 1 and not self.deepspeed:
            assert False
            # Deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
            loss = loss / self.args.gradient_accumulation_steps
        
        if self.do_grad_scaling:
            assert False
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            assert False
            with torch.cuda.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            assert False
            # loss gets scaled under gradient_accumulation_steps in deepspeed
            loss = self.deepspeed.backward(loss)
        else:
            loss.backward()

        loss = loss.detach()

        # Addition for RetroBoost
        # Make sure the losses are similar, then push them to the priority queue
        # Put index back in

        inputs["index"] = index

        with torch.inference_mode():
            loss_per_sample = torch.nn.functional.cross_entropy(outputs.logits.detach(), inputs["labels"].detach(), reduction="none")
            assert loss_per_sample.ndim == 1, loss_per_sample.ndim
            loss_per_gpu = torch.mean(loss_per_sample, dim=0)
            computed_loss = torch.mean(loss_per_gpu)
            rich.print("[red bold]logits[/]", outputs.logits.detach().cpu().numpy())
            rich.print("[red bold]logits[/]", outputs.logits.detach().cpu().numpy().shape)
            rich.print("[red bold]LOSS[/]", loss.detach().cpu().numpy(), " [red bold]computed_loss[/]", computed_loss)
            assert torch.allclose(loss, computed_loss, atol=0.01)

            self.get_train_dataloader().dataset.push_score(inputs, loss_per_sample)

        return loss




In [91]:
RETRIEVER_NAME = "facebook/contriever"
DATASET_TUPLE = ("super_glue", "rte")
TASK_TYPE = "dual_entry_classification"
PATH_TO_VECTORS = Path(f"./vectors_{'_'.join(DATASET_TUPLE)}_{RETRIEVER_NAME.split('/')[-1]}/")
CLASSIFIER_NAME = "roberta-base"
CLASSIFIER_BATCH_SIZE = 20
EPSILON_SCHEDULER_TYPE = "constant"
EPSILON_SCHEDULER_CONFIG = dict(
    epsilon=0.5,
)
LOSS_EMA_ALPHA = 0.5
REGULAR_TRAINER = False
WEIGHT_DECAY = 0.05
LEARNING_RATE = 1e-5

CLASSIFIER_EVAL_BATCH_SIZE_MULTIPLIER = 1.5
CLASSIFIER_DEVICE = 1
RETRIEVER_DEVICE = 2
SEED = 0
SPLIT_RATIO = 0.85
NUM_EPOCHS_TO_TRAIN_ON = 30


###############################################################################
# Fast setup 
###############################################################################
wandb_config = dict(
        classifier_batch_size=CLASSIFIER_BATCH_SIZE,
        classifier_name=CLASSIFIER_NAME,
        dataset_tuple=DATASET_TUPLE,
        epsilon=dict(
            scheduler_type=EPSILON_SCHEDULER_TYPE,
            scheduler_config=EPSILON_SCHEDULER_CONFIG,
        ),
        loss_ema_alpha=LOSS_EMA_ALPHA,
        random_seed=SEED,
        regular_trainer=REGULAR_TRAINER,
        retriever_name=RETRIEVER_NAME,
        split_ratio=SPLIT_RATIO,
        weight_decay=WEIGHT_DECAY,
        learning_rate=LEARNING_RATE,
    )

wandb.init(
    config=wandb_config,
    project="RetroBoost", 
    entity="julesgm",
    name="baseline-trainer",
)

EPSILON_SCHEDULER_TYPE_MAP = dict(
    constant=ConstantEpsilonScheduler,
)

# Random seeds. 
np.random.seed(0)
torch.manual_seed(0)

classifier_tokenizer: Final = transformers.AutoTokenizer.from_pretrained(CLASSIFIER_NAME)

# Load the config
config: Final = json.loads((PATH_TO_VECTORS / "config.json").read_text())
assert config["retriever_name"] == RETRIEVER_NAME, f"{config['retriever_name']} != {RETRIEVER_NAME}"

# Load the datasets
dataset_train: Final = datasets.load_dataset(*DATASET_TUPLE, split=f"train[:{SPLIT_RATIO:.0%}]")
dataset_validation: Final = datasets.load_dataset(*DATASET_TUPLE, split=f"train[{SPLIT_RATIO:.0%}:]")

ALL_LABELS = set(dataset_train["label"])
NUM_LABELS = len(ALL_LABELS)
assert ALL_LABELS == set(range(NUM_LABELS))

# Delete the extra fields
if config["dataset_type"] == "dual_entry_classification":
    fields = dataset_train[0].keys()
    dataset_train.remove_columns(fields - {config["field_a_name"], config["field_b_name"], "label"} )

def preprocess_function(examples, tokenizer, config):
    if config["dataset_type"] == "single_entry_classification":
        return tokenizer(examples["text"], truncation=True, padding=True)
    elif config["dataset_type"] == "dual_entry_classification":
        return tokenizer(
            examples[config["field_a_name"]], 
            examples[config["field_b_name"]], 
            truncation=True, 
            padding=True,
        )

    raise ValueError(f"Unknown dataset type {config['dataset_type']}")

tokenized_training: Final = dataset_train.map(
    lambda examples: preprocess_function(examples, classifier_tokenizer, config), 
    batched=True
).shuffle(seed=SEED)

tokenized_validation: Final = dataset_validation.map(
    lambda examples: preprocess_function(examples, classifier_tokenizer, config), 
    batched=True
).shuffle(seed=SEED)

training_args: Final = transformers.TrainingArguments(
    evaluation_strategy="steps",
    eval_steps=10,
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=CLASSIFIER_BATCH_SIZE,
    per_device_eval_batch_size=int(CLASSIFIER_BATCH_SIZE * CLASSIFIER_EVAL_BATCH_SIZE_MULTIPLIER),
    num_train_epochs=NUM_EPOCHS_TO_TRAIN_ON,
    report_to="wandb",
    weight_decay=WEIGHT_DECAY,
)

retriever: Final = make_retrival_model_and_vectors(
    retriever_name=RETRIEVER_NAME, 
    path_to_vectors=PATH_TO_VECTORS, 
    device=RETRIEVER_DEVICE, 
    dataset_type=config["dataset_type"],
)
retriever_client: Final = retriever

classifier: Final = transformers.AutoModelForSequenceClassification.from_pretrained(
    CLASSIFIER_NAME, num_labels=NUM_LABELS
)

classifier.config.problem_type = "single_label_classification"

VBox(children=(Label(value=' 0.14MB of 0.14MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/mila/g/gagnonju/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size"

In [92]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return datasets.load_metric("accuracy").compute(predictions=predictions, references=labels)

if REGULAR_TRAINER:
    TrainerClass = transformers.Trainer    
    ds_train = tokenized_training
else:
    TrainerClass = BoostingTrainer
    
    ds_train = BoostingIterator(
        dataset=dataset_train, 
        retriever_client=retriever_client, 
        classifier=classifier, 
        epsilon_scheduler=EPSILON_SCHEDULER_TYPE_MAP[EPSILON_SCHEDULER_TYPE](**EPSILON_SCHEDULER_CONFIG), 
        seed=SEED,
        retriever_device=RETRIEVER_DEVICE, 
        classification_device=CLASSIFIER_DEVICE,
        classification_tokenizer=classifier_tokenizer,
        loss_ema_alpha=LOSS_EMA_ALPHA,
        config=config,
    )


trainer = TrainerClass(
        model=classifier
        args=training_args, 
        tokenizer=classifier_tokenizer, 
        train_dataset=ds_train, 
        eval_dataset=tokenized_validation,
        data_collator=transformers.DataCollatorWithPadding(
            tokenizer=classifier_tokenizer
        ),
        compute_metrics=compute_metrics,
    )





SyntaxError: invalid syntax (480604505.py, line 28)

In [None]:
output = trainer.train()
print(output)
print(trainer.evaluate())

***** Running training *****
  Num examples = 2116
  Num Epochs = 30
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 60
  Gradient Accumulation steps = 1
  Total optimization steps = 3180
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Loading cached shuffled indices for dataset at /home/mila/g/gagnonju/.cache/huggingface/datasets/super_glue/rte/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7/cache-60360dcf2c50180c.arrow


AttributeError: 'DataParallel' object has no attribute 'model'

In [None]:
wandb.finish()


In [None]:
classifier

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [94]:
classifier.config.problem_type is None

True