In [1]:
!nvidia-smi

Tue Aug  8 21:01:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.191.01   Driver Version: 450.191.01   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   28C    P0    58W / 300W |  31022MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   47C    P0   259W / 300W |   7984MiB / 32510MiB |     98%      Default |
|       

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
from datasets import Dataset, DatasetDict

tokenized_wow = DatasetDict.load_from_disk("wow_rank_kel")

In [4]:
tokenized_wow.set_format(type="torch", columns=["input_ids", "attention_mask", "labels", "pass_label"])

In [5]:
from transformers import T5ForConditionalGeneration, T5Config
import torch
from torch import nn

class RankT5GPE(T5ForConditionalGeneration):
    def __init__(self, config: T5Config):
        config.rank_score_index = 32019
        config.n_pass = 7
        config.output_hidden_states = True
        super().__init__(config)
        self.rank_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.rank_id = config.rank_score_index
        self.n_pass = config.n_pass

    def forward(self, input_ids=None, attention_mask=None, decoder_input_ids=None, labels=None, pass_label=None, **kwargs):

        batch_size_n, seq_len = input_ids.size()
        batch_size = int(batch_size_n/self.n_pass)

        # input_ids = input_ids.view(batch_size*n_pass, -1)
        # attention_mask = attention_mask.view(batch_size*n_pass, -1)
        
        
        
        if labels != None and decoder_input_ids == None:
#             batch_size, decoder_seq_len = labels.size()
#             labels = labels.view(batch_size, 1, decoder_seq_len).contiguous()
#             labels = labels.expand(batch_size, n_pass, decoder_seq_len).contiguous()

#             labels = labels.view(batch_size*n_pass, -1)
            decoder_input_ids = self._shift_right(labels)
#             print(decoder_input_ids.size())


        out = super().forward(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, **kwargs)
        rank_score = self.rank_head(out.decoder_hidden_states[-1][:, 0, :])
        out.rank_score = rank_score[:, self.rank_id].view(-1, self.n_pass)


        if labels != None:
            logits = out.logits
            batch_size_n, sequence_length, vocab_size = logits.size()

            logits_flat = logits.view(batch_size_n * sequence_length, vocab_size)
            labels_flat = labels.view(-1)
            mask = (labels_flat != -100)
            arry = torch.arange(batch_size_n * sequence_length).to(logits_flat.device)

            selected_logits = logits_flat[arry[mask], labels_flat[mask]]
            output_logits = torch.full((batch_size_n * sequence_length,), 0, dtype=logits.dtype, device=logits.device)
            output_logits[mask] = selected_logits

            output_logits = output_logits.view(batch_size_n, -1).sum(-1)
            out.gpe_score = output_logits.view(int(batch_size_n/self.n_pass), self.n_pass)

        else:
            out.gpe_score = None


        if pass_label != None:
            print(pass_label)
#             assert 21
            pass_label = pass_label[::self.n_pass]
            rank_score = out.rank_score
            gen_score = out.gpe_score

            loss_fct1 = nn.CrossEntropyLoss()
            loss_fct2 = nn.CrossEntropyLoss()

            rank_loss = loss_fct1(rank_score, pass_label.view(-1))
            gen_loss = loss_fct2(gen_score, pass_label.view(-1))

            loss = rank_loss + gen_loss
            out.loss = loss

        return out

In [6]:
from transformers import T5Tokenizer

mod_ckp = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(mod_ckp)
config = T5Config.from_pretrained(mod_ckp)
model = RankT5GPE(config).from_pretrained(mod_ckp)
model.config.output_hidden_states = True
model.config.rank_score_index = tokenizer.convert_tokens_to_ids("<extra_id_80>")
model.config.n_pass = 7

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Some weights of RankT5GPE were not initialized from the model checkpoint at t5-small and are newly initialized: ['rank_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
tokenizer.convert_tokens_to_ids("<extra_id_80>")

32019

In [8]:
model.config.n_pass

7

In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

batch_size = 8*model.config.n_pass 
model_dir = f"wow_rank_{mod_ckp}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=10,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="tensorboard",
    gradient_accumulation_steps=4
)

In [10]:
from torch.utils.data import DataLoader, Dataset
import datasets
from typing import Optional
from transformers.trainer import is_datasets_available, seed_worker
data_collator = DataCollatorForSeq2Seq(tokenizer)


class CustomTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False):
#         print(inputs.get("input_ids").size())
        outputs = model(**inputs)

        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss
    
    def get_train_dataloader(self) -> DataLoader:
        """
        Returns the training [`~torch.utils.data.DataLoader`].

        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
        training if necessary) otherwise.

        Subclass and override this method if you want to inject some custom behavior.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator
        if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
            train_dataset = self._remove_unused_columns(train_dataset, description="training")
        else:
            data_collator = self._get_collator_with_removed_columns(data_collator, description="training")

        dataloader_params = {
            "batch_size": self._train_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }

        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
#             dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker

        return self.accelerator.prepare(DataLoader(train_dataset, shuffle=False, **dataloader_params))
    
    def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
        """
        Returns the evaluation [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            eval_dataset (`torch.utils.data.Dataset`, *optional*):
                If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted
                by the `model.forward()` method are automatically removed. It must implement `__len__`.
        """
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        data_collator = self.data_collator

        if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
            eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation")
        else:
            data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation")

        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }

        if not isinstance(eval_dataset, torch.utils.data.IterableDataset):
#             dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset)
            dataloader_params["drop_last"] = self.args.dataloader_drop_last

        return self.accelerator.prepare(DataLoader(eval_dataset, shuffle=False, **dataloader_params))

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_wow["train"],
    eval_dataset=tokenized_wow["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [11]:
# tokenizer

In [12]:
trainer.train()



tensor([0, 3, 0, 4, 2, 5, 0, 0, 3, 5, 3, 5, 2, 2, 0, 2, 0, 1, 4, 3, 0, 6, 0, 4,
        1, 6, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 6, 4, 2,
        0, 0, 1, 0, 4, 0, 1, 0], device='cuda:0')
tensor([0, 2, 0, 1, 6, 0, 1, 3, 4, 0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 1,
        2, 6, 4, 0, 0, 0, 0, 0, 6, 3, 3, 0, 0, 3, 0, 0, 5, 0, 0, 6, 6, 0, 1, 1,
        0, 1, 1, 0, 0, 0, 1, 1], device='cuda:0')
tensor([0, 0, 2, 0, 0, 6, 0, 3, 3, 0, 3, 0, 0, 6, 1, 6, 0, 0, 1, 2, 5, 3, 1, 1,
        5, 2, 0, 0, 1, 3, 4, 0, 2, 4, 0, 0, 4, 0, 3, 0, 0, 4, 0, 2, 6, 1, 4, 0,
        6, 0, 0, 4, 1, 6, 0, 0], device='cuda:0')
tensor([3, 2, 1, 3, 2, 1, 1, 2, 0, 6, 5, 1, 6, 5, 0, 6, 2, 0, 0, 2, 4, 0, 0, 5,
        4, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 1, 0, 0, 1, 3, 0, 1, 0, 0, 1, 3, 3, 4,
        3, 0, 3, 2, 1, 1, 4, 4], device='cuda:0')


Step,Training Loss,Validation Loss


tensor([1, 0, 6, 0, 0, 0, 0, 5, 0, 0, 0, 2, 2, 1, 0, 0, 3, 0, 0, 0, 0, 0, 2, 3,
        3, 0, 2, 1, 0, 3, 4, 6, 0, 4, 0, 4, 2, 4, 1, 0, 1, 1, 1, 3, 0, 0, 3, 0,
        5, 0, 1, 4, 0, 0, 1, 1], device='cuda:0')
tensor([6, 6, 0, 2, 0, 0, 5, 0, 5, 0, 0, 0, 2, 0, 1, 1, 3, 6, 0, 0, 1, 0, 5, 1,
        0, 5, 0, 1, 2, 0, 3, 3, 1, 0, 2, 2, 1, 1, 0, 2, 0, 4, 1, 0, 2, 6, 5, 0,
        3, 0, 4, 6, 3, 0, 4, 5], device='cuda:0')
tensor([4, 1, 0, 1, 4, 1, 1, 2, 6, 0, 1, 3, 4, 2, 6, 0, 0, 1, 0, 6, 2, 1, 0, 1,
        3, 0, 0, 3, 2, 0, 1, 6, 0, 0, 6, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 6, 5, 1,
        0, 3, 0, 0, 4, 1, 1, 0], device='cuda:0')
tensor([0, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 3, 4, 2, 0, 5, 3, 0, 3, 1, 5, 0, 3,
        5, 0, 6, 3, 0, 0, 6, 5, 0, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 3, 4, 2, 1, 3,
        0, 4, 0, 0, 2, 2, 2, 2], device='cuda:0')
tensor([0, 3, 0, 1, 0, 3, 4, 0, 0, 0, 5, 4, 2, 0, 5, 0, 2, 0, 0, 2, 1, 0, 0, 0,
        1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 1, 6, 0, 5, 1, 1, 0, 0, 1, 1, 6, 0,


KeyboardInterrupt: 

In [15]:
!ls wow_rank_t5-small/runs/Aug08_12-42-17_dgx1-2

events.out.tfevents.1691478743.dgx1-2


In [18]:
trainer.save_model("wow_kle_t5_base")

In [20]:
!ls wow_kle_t5_base/

config.json		special_tokens_map.json  training_args.bin
generation_config.json	spiece.model
pytorch_model.bin	tokenizer_config.json
