In [1]:
!nvidia-smi

Wed Oct 11 10:17:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.191.01   Driver Version: 450.191.01   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   33C    P0    60W / 300W |  29704MiB / 32510MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   50C    P0    78W / 300W |  19316MiB / 32510MiB |     79%      Default |
|       

In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [5]:
from datasets import Dataset, DatasetDict
from transformers import XLMRobertaForSequenceClassification, XLMRobertaConfig, XLMRobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification
from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaClassificationHead
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import datasets
from typing import Optional
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.trainer import is_datasets_available, seed_worker
from datasets import load_dataset

In [6]:
from datasets import load_dataset
tokenized_sur = load_dataset("carnival13/test_DA_tokenized2")

In [7]:
model_ckpt = "massive_da_eng4/final"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [8]:
class ReRanker(XLMRobertaForSequenceClassification):
    def __init__(self, config: XLMRobertaConfig):
        # config.rank_score_index = 32019
        config.n_pass = 10
        # config.output_hidden_states = True
        super().__init__(config)
#         self.rank_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # self.rank_id = config.rank_score_index
        self.n_pass = config.n_pass


    def forward(self, input_ids=None, attention_mask=None, labels=None, pass_label=None, **kwargs):

        batch_size_n, seq_len = input_ids.size()
        batch_size = int(batch_size_n/self.n_pass)
        labels = None


        out = super().forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
#         rank_score = self.rank_head(out.decoder_hidden_states[-1][:, 0, :])
        rank_score = out.logits.view(batch_size, -1)
        loss = None


        if pass_label != None:
            pass_label = pass_label[::self.n_pass]
            rank_score = rank_score
#             gen_score = out.gpe_score

            loss_fct1 = nn.CrossEntropyLoss()
#             loss_fct2 = nn.CrossEntropyLoss()

            rank_loss = loss_fct1(rank_score, pass_label.view(-1))
#             gen_loss = loss_fct2(gen_score, pass_label.view(-1))

#             loss = rank_loss + gen_loss

            loss = rank_loss
            wandb.log({"loss": loss})

        ret =  SequenceClassifierOutput(
            loss=loss,
            logits=out.logits,
            hidden_states=out.hidden_states,
            attentions=out.attentions
        )
        ret.rank_score = rank_score
        return ret

In [9]:
model = ReRanker.from_pretrained(model_ckpt)

In [10]:
# model

In [11]:
# from huggingface_hub import notebook_login
# notebook_login()

In [12]:
# model.push_to_hub(model_ckpt)

In [13]:
device = "cuda:5" if torch.cuda.is_available() else "cpu"

In [14]:
device

'cuda:5'

In [15]:
tokenized_sur.set_format(type="torch", columns=["input_ids", "attention_mask", "pass_label"])

In [18]:
help(tokenizer.decode)

Help on method decode in module transformers.tokenization_utils_base:

decode(token_ids: Union[int, List[int], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = None, **kwargs) -> str method of transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast instance
    Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
    tokens and clean up tokenization spaces.
    
    Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
    
    Args:
        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
            List of tokenized input ids. Can be obtained using the `__call__` method.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spa

In [19]:
tokenizer.decode(tokenized_sur["train"][3]["input_ids"], skip_special_tokens=True)

"query: Can I make a reservation at Buffalo Wild Wings? intent: apr examples: what is the apr for my capital one card? <sep> How good is the APR on my credit card? <sep> How does the APR on my Wells Fargo card compare to other cards in the market? <sep> Can you please provide me with my credit card's apr details? <sep> What's the APR of my Discover card? <sep> I'm not sure if I understand the difference between apr and interest rate. Can you explain it to me in simple terms? <sep> what is the apr on my wells fargo credit card? <sep> Could you please tell me the apr of my credit card? I want to stay on top of my finances and avoid any surprise charges. <sep> Kindly provide me with my credit card's apr details. <sep> how do I calculate my apr on my discovery card? <sep> Is it possible to lower my credit card's APR <sep> could you please provide me with the apr details of my credit card? <sep> Can you tell me the current APR for my US Bank Cash+ Visa Card? <sep> I would love to learn abou

In [15]:
data_collator = DataCollatorWithPadding(tokenizer)
dl = DataLoader(tokenized_sur["train"], collate_fn=data_collator, batch_size=60, shuffle=False)

In [16]:
from tqdm.notebook import tqdm
model = model.to(device)
batch_len = []
labels = []
scores = []

model.eval()
with torch.no_grad():
    for b in tqdm(dl):
        batch_len.append(int(b["input_ids"].size()[0]))
        input_ids = b["input_ids"].to(device)
        attention_mask = b["attention_mask"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        rank_score = list(outputs.logits)
        labels += list(b["pass_label"])
        scores += rank_score
        
#         print(pass_label.size())
#         mrr.append(cal_mrr(rank_score, pass_label))
#         if pred == None:
#             pred = rank_score
#             lab_p = pass_label
#         else:
#             pred = torch.cat((pred, rank_score), 0)
#             lab_p = torch.cat((lab_p, pass_label), 0)

#         for i in r_k.keys():
#             r_k[i].append(r_at_k(rank_score, pass_label, i))

  0%|          | 0/5598 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [17]:
scores = torch.tensor(scores)
labels = torch.tensor(labels[::150])

In [18]:
scores = scores.view(-1, 150)

In [19]:
scores.shape

torch.Size([2239, 150])

In [20]:
prediction = torch.argmax(scores, dim=1)

In [21]:
dct = {"id": [], "intent": []}
for i, j in enumerate(list(prediction)):
    dct["id"].append(i)
    dct["intent"].append(j)


In [22]:
def dict_to_dataset(dict):
    dataset = datasets.Dataset.from_dict(dict)
    return dataset

pred = dict_to_dataset(dct)
pred.save_to_disk(f"pred_xlmr")

Saving the dataset (0/1 shards):   0%|          | 0/2239 [00:00<?, ? examples/s]

In [23]:
acc_lst = labels == prediction

In [24]:
print(f"Accuracy = {acc_lst.sum()/2239}")

Accuracy = 0.8311746120452881


In [43]:
labels.view(-1, 1) in torch.argsort(scores, dim=1)[:, :10]

True

In [46]:
prediction

tensor([101, 109,   3,  ...,  47,   6,   3])