In [1]:
import json
import pathlib
import queue

import datasets
import matplotlib.pyplot as plt
import numpy as np
import rich
import torch
import tqdm
import transformers

In [None]:
class StupidRetriever:
    """Brute-force nearest-neighbour retriever over pre-computed train vectors.

    A query text is encoded with ``model``, scored against every vector in
    ``train_vectors`` with a dot product, and the best-scoring train sample
    that is not the query itself is returned.

    Args:
        model: Encoder; called as ``model(input_ids)`` and expected to return a
            sequence whose first item holds the per-token embeddings.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        device: Device the query tensor is moved to before encoding. The model
            and ``train_vectors`` are used as given; the caller must place them
            on the same device.
        train_vectors: 2-D tensor with one embedding per train sample. Both
            ``(num_samples, dim)`` and ``(dim, num_samples)`` are accepted.
        train_samples: Sequence of train texts, aligned with ``train_vectors``.
    """

    def __init__(self, model, tokenizer, device, train_vectors, train_samples):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.train_vectors = train_vectors
        self.train_samples = train_samples
        # Text -> position in ``train_samples``. For duplicated texts the last
        # occurrence wins.
        self.inverted_index = {sample: i for i, sample in enumerate(train_samples)}

    def _max_query_length(self):
        """Return the model's maximum input length, or None if it is unknown.

        GPT-style configs call it ``n_ctx``; BERT-style configs call it
        ``max_position_embeddings``.
        """
        config = self.model.config
        for attribute in ("n_ctx", "max_position_embeddings"):
            limit = getattr(config, attribute, None)
            if limit:
                return limit
        return None

    def retrieve(self, query):
        """Return the train sample most similar to ``query``, excluding ``query``.

        Args:
            query: Text to search for. It may or may not be a train sample.

        Returns:
            The best-scoring entry of ``train_samples`` whose index differs from
            the index of ``query`` (when ``query`` is itself a train sample).

        Raises:
            ValueError: If ``query`` tokenizes to nothing, or if every top
                candidate is the query itself.
        """
        tokens = self.tokenizer.tokenize(query)
        # Slicing with ``None`` keeps everything, so an unknown limit is fine.
        tokens = tokens[: self._max_query_length()]
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if not token_ids:
            raise ValueError("Cannot retrieve with an empty query")
        # NOTE(review): ``tokenize`` does not add special tokens; confirm that
        # the train vectors were produced with the same tokenization.
        input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
        input_ids = input_ids.to(self.device)

        with torch.inference_mode():
            token_embeddings = self.model(input_ids)[0]  # (1, seq, dim)
            # Collapse the sequence axis so the query is a single vector.
            representation = token_embeddings.mean(dim=1)  # (1, dim)
            vectors = self.train_vectors
            # The matmul needs (dim, num_samples); transpose the usual
            # (num_samples, dim) layout when that is what we were given.
            if vectors.shape[0] != representation.shape[-1]:
                vectors = vectors.T
            scores = torch.matmul(representation, vectors)  # (1, num_samples)

        # Two candidates are enough: at most one of them is the query itself.
        k = min(2, scores.shape[1])
        top_indices = torch.topk(scores, k=k, dim=1)[1][0].tolist()
        # ``None`` for unseen queries, which never equals a real index.
        query_index = self.inverted_index.get(query)
        for index in top_indices:
            if index != query_index:
                return self.train_samples[index]

        raise ValueError("Two samples were equal to the query sample (!?)")


In [2]:
# build train vectors
def make_retrival_model_and_vectors(retriever_name, path_to_vectors, device, dataset_name):
    """Load the retriever model plus its pre-computed vectors into a StupidRetriever.

    We expect the dir to have the following structure:
    - config.json
    - train_samples.txt
    - train_vectors.npy

    Args:
        retriever_name: Hugging Face model name; must match the name recorded
            in ``config.json``.
        path_to_vectors: Directory holding the three files above (``str`` or
            ``pathlib.Path``).
        device: Device the model and the train vectors are moved to.
        dataset_name: Dataset name; must match ``config.json``.

    Returns:
        A ``StupidRetriever`` over the loaded samples and vectors.

    Raises:
        AssertionError: If ``config.json`` disagrees with the requested dataset
            or retriever name.
    """
    # Accept plain strings as well as Path objects; ``str / str`` is a TypeError.
    path_to_vectors = pathlib.Path(path_to_vectors)

    # Make some checks
    config = json.loads((path_to_vectors / "config.json").read_text().strip())
    assert dataset_name == config["dataset_name"], (dataset_name, config["dataset_name"])
    # The key was historically misspelled "retriver_name"; accept both spellings
    # so existing vector directories keep working.
    config_retriever_name = config.get("retriever_name", config.get("retriver_name"))
    assert retriever_name == config_retriever_name, (retriever_name, config_retriever_name)

    # The model must be on the same device as the query tensors and the vectors.
    retriever_model = transformers.AutoModel.from_pretrained(retriever_name).to(device)
    retriever_tokenizer = transformers.AutoTokenizer.from_pretrained(retriever_name)

    with open(path_to_vectors / "train_samples.txt") as f:
        train_samples = [line.strip() for line in tqdm.tqdm(f.readlines())]
    vectors = torch.tensor(np.load(path_to_vectors / "train_vectors.npy")).to(device)

    return StupidRetriever(
        model=retriever_model,
        tokenizer=retriever_tokenizer,
        device=device,
        train_vectors=vectors,
        train_samples=train_samples,
    )


In [3]:
class BoostingIterator:
    """Iterator that mixes fresh dataset samples with retrieved neighbours.

    On every step, with probability ``epsilon`` (and only if something has been
    queued) a previously seen sample is taken off the priority queue and the
    retriever's answer for it is yielded; otherwise the next dataset sample is
    yielded and queued for later.

    All arguments are keyword-only.

    Args:
        dataset: Iterable of samples; each sample is indexed with ``"text"``.
        retriever_client: Object with a ``retrieve(query)`` method.
        classifier: Stored on the instance; not used by this class yet.
        epsilon: Probability of yielding a retrieved sample instead of a fresh one.
        seed: Seed for the private ``numpy`` random state.
        classification_device: Stored on the instance; not used by this class yet.
        retriever_device: Stored on the instance; not used by this class yet.
    """

    def __init__(self, *, dataset, retriever_client, classifier, epsilon, seed, classification_device, retriever_device):
        self.dataset = dataset
        # Entries are ``(priority, insertion_number, sample)`` tuples.
        self.priority_queue = queue.PriorityQueue()
        self.retriever_client = retriever_client
        self.epsilon = epsilon
        self.randomizer = np.random.RandomState(seed)
        self.text_to_repr = {}
        self.dataset_iter = None
        self.classifier = classifier
        self.classification_device = classification_device
        self.retriever_device = retriever_device
        # Tie-breaker so two entries never fall back to comparing the samples
        # themselves (dict samples are not orderable).
        self._num_enqueued = 0

    def __iter__(self):
        rich.print("[red bold]NEW EPOCH[/]")
        self.dataset_iter = iter(self.dataset)
        return self

    def _priority(self, sample):
        """Return the queue priority of ``sample``; the lowest value is served first.

        NOTE(review): no scoring rule is defined anywhere in this notebook, so
        every sample gets the same priority and the queue is first-in,
        first-out. Presumably this is where a classifier-based score belongs.
        """
        return 0.0

    def _retrieval_query(self, sample):
        """Return what to hand to ``retriever_client.retrieve`` for ``sample``."""
        text = sample["text"]
        # Clients exposing a ``text_to_repr`` mapping are queried with the
        # stored representation; all others receive the raw text.
        lookup = getattr(self.retriever_client, "text_to_repr", None)
        if lookup is None:
            return text
        return lookup[text]

    def __next__(self):
        """Yield either a retrieved neighbour or the next dataset sample.

        Raises:
            StopIteration: When the underlying dataset iterator is exhausted.
        """
        if self.dataset_iter is None:
            # Allow ``next()`` before an explicit ``iter()``.
            self.dataset_iter = iter(self.dataset)

        if not self.priority_queue.empty() and self.randomizer.rand() < self.epsilon:
            _, _, sample = self.priority_queue.get_nowait()
            # NOTE(review): the value comes straight from the retriever and may
            # have a different type than a dataset sample.
            next_sample = self.retriever_client.retrieve(self._retrieval_query(sample))
        else:
            next_sample = next(self.dataset_iter)
            self.priority_queue.put(
                (self._priority(next_sample), self._num_enqueued, next_sample)
            )
            self._num_enqueued += 1
        return next_sample


def build_trainer(rank, train_dataset_name, classifier_name, retriever_client, classification_device=None, retriever_device=None, seed=0):
    """Build a ``transformers.Trainer`` that trains on a ``BoostingIterator``.

    Args:
        rank: Device the classifier is moved to.
        train_dataset_name: Name passed to ``datasets.load_dataset``; the
            ``"train"`` and ``"test"`` splits are used.
        classifier_name: Hugging Face name of the sequence-classification model.
        retriever_client: Retriever handed to the ``BoostingIterator``.
        classification_device: Forwarded to the ``BoostingIterator``; defaults
            to ``rank``.
        retriever_device: Forwarded to the ``BoostingIterator``; defaults to
            ``retriever_client.device`` when the client has that attribute.
        seed: Seed for the ``BoostingIterator``'s random state.

    Returns:
        The configured ``transformers.Trainer``.

    Relies on the module-level ``NUM_LABELS`` being defined before the call.
    """
    if classification_device is None:
        classification_device = rank
    if retriever_device is None:
        retriever_device = getattr(retriever_client, "device", None)

    classifier = transformers.AutoModelForSequenceClassification.from_pretrained(
        classifier_name, num_labels=NUM_LABELS
    )
    classifier_tokenizer = transformers.AutoTokenizer.from_pretrained(classifier_name)

    def preprocess_function(examples):
        return classifier_tokenizer(examples["text"], truncation=True, padding=True)

    dataset = datasets.load_dataset(train_dataset_name)

    # NOTE(review): the training stream is not tokenized here, unlike the
    # validation split below. Confirm that the Trainer can consume it as-is.
    tokenized_training = BoostingIterator(
        dataset=dataset["train"],
        retriever_client=retriever_client,
        classifier=classifier,
        epsilon=0.5,
        seed=seed,
        retriever_device=retriever_device,
        classification_device=classification_device,
    )
    tokenized_validation = dataset["test"].map(preprocess_function, batched=True)

    training_args = transformers.TrainingArguments(
        output_dir="./results",
        learning_rate=1e-5,
        per_device_train_batch_size=80,
        per_device_eval_batch_size=100,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    trainer = transformers.Trainer(
        model=classifier.to(rank),
        args=training_args,
        tokenizer=classifier_tokenizer,
        train_dataset=tokenized_training,
        eval_dataset=tokenized_validation,
        data_collator=transformers.DataCollatorWithPadding(
            tokenizer=classifier_tokenizer
        ),
    )

    return trainer

In [3]:
# Configuration: everything a reader might want to tune lives here.
RETRIEVER_NAME = "facebook/contriever"
# A Path (not a str) because the loader joins file names onto it with ``/``.
PATH_TO_VECTORS = pathlib.Path("./vectors_imdb_contriever/")
DATASET_NAME = "imdb"
CLASSIFIER_NAME = "roberta-base"
RETRIEVER_DEVICE = 0
CLASSIFICATION_DEVICE = 1

# Load the dataset here so the label statistics do not depend on a variable
# left over from an earlier kernel session.
dataset = datasets.load_dataset(DATASET_NAME)
ALL_LABELS = set(dataset["train"]["label"])
NUM_LABELS = len(ALL_LABELS)
# Labels must be exactly 0 .. NUM_LABELS - 1.
assert ALL_LABELS == set(range(NUM_LABELS))

retriever = make_retrival_model_and_vectors(
    RETRIEVER_NAME, PATH_TO_VECTORS, RETRIEVER_DEVICE, DATASET_NAME
)
trainer = build_trainer(
    CLASSIFICATION_DEVICE,
    DATASET_NAME,
    CLASSIFIER_NAME,
    retriever,
    classification_device=CLASSIFICATION_DEVICE,
    retriever_device=RETRIEVER_DEVICE,
)

Reusing dataset imdb (/home/mila/g/gagnonju/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

  0%|          | 0/25 [00:00<?, ?ba/s]

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction


  0%|          | 0/25 [00:00<?, ?ba/s]

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
huggingface/tokenizers: T

In [4]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 25000
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 41670
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjulesgm[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss
500,0.5138
1000,0.3766
1500,0.3932
2000,0.3753
2500,0.4163
3000,0.3373
3500,0.3873
4000,0.3552
4500,0.3319
5000,0.3553


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 47552 vs 47456