# Adversarial attacks against Legal-BERT Model (BertForSequenceClassification)

In [1]:
# Global variables

BATCH_SIZE = 32
MODEL_NAME = 'nlpaueb/legal-bert-small-uncased'#'bert-base-uncased'
EPOCHS = 3
EMBEDDING_SIZE = 512
NUM_CLASSES = 2
VOCABULARY_SIZE = 30522
NUM_TOKENS = 3


### Installation of packages

In [2]:
!pip install transformers
!pip install torch-lr-finder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Imports

In [3]:
import torch
import os
from transformers import BertTokenizer
from google.colab import drive
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
import gc
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from copy import deepcopy

### Device

In [4]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


### Reading dataset

In [5]:
# Mount drive to have access to your files
drive.mount('/content/drive')
%cd /content/drive/MyDrive/"Colab Notebooks"/DefenseAdvAttacks

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/DefenseAdvAttacks


In [6]:
# Funtion to read all sentences
def get_sentences(path):
    sentences= []
    for filename in os.listdir(path):
        with open(path+filename, 'r') as f:
            for sentence in f :
                sentences.append(sentence)
    return sentences

In [7]:
# Function to read get all labels
def get_labels(path):
    all_labels = []
    for filename in os.listdir(path):
        file_labels = []
        with open(path+filename, 'r') as f:
            for label in f :
                all_labels.append(int(label))
    return all_labels

In [8]:
# Reading sentences and labels
all_sentences = get_sentences("ToS/Sentences/")
all_labels = get_labels("ToS/Labels/")

In [9]:
# Since unfair sentences are marked as "-1", we change them to "0" for simplicity. Zero means fair, One means unfair
all_labels =  [0 if label ==-1 else label for label in all_labels]

### Bert Tokenizer

In [10]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True) # the model 'bert-base-uncased' only contains lower case sentences

Loading BERT tokenizer...


In [11]:
# ==> Example of first sentence

# Print the original sentence.
print(' Original: ', all_sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(all_sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(all_sentences[0])))

 Original:  * accepting the terms of service 

Tokenized:  ['*', 'accept', '##ing', 'the', 'terms', 'of', 'service']
Token IDs:  [113, 1599, 235, 207, 333, 210, 446]


In [12]:
"""
# ==> Get the max length of a sentence

max_len = 0

# For every sentence...
for sent in all_sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)
# Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
# Max sentence length:  598
"""

"\n# ==> Get the max length of a sentence\n\nmax_len = 0\n\n# For every sentence...\nfor sent in all_sentences:\n\n    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n    input_ids = tokenizer.encode(sent, add_special_tokens=True)\n\n    # Update the maximum sentence length.\n    max_len = max(max_len, len(input_ids))\n\nprint('Max sentence length: ', max_len)\n# Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors\n# Max sentence length:  598\n"

### Model BertForSequenceClassification (Load model)

In [13]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = NUM_CLASSES, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at nlpaueb/legal-bert-small-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, element

In [14]:
# Load the model and dictionary
model.load_state_dict(torch.load('Bert4SeqClassif_202207072015.pt'))#, map_location=torch.device('cpu') or cuda. Both work


<All keys matched successfully>

### Trigger generation

##### General functions

In [15]:
# hook used in add_hooks()
extracted_grads = []
def extract_grad_hook(module, grad_in, grad_out):
    extracted_grads.append(grad_out[0])

In [16]:
# returns the wordpiece embedding weight matrix
def get_embedding_weight(language_model):
    for module in language_model.modules():
        if isinstance(module, torch.nn.Embedding):
            if module.weight.shape[0] == 30522: # only add a hook to wordpiece embeddings, not position embeddings 
                ##50257 is the size of the vocabulary of GPT
                return module.weight.detach()

In [17]:
# add hooks for embeddings
def add_hooks(language_model):
    for module in language_model.modules():
        if isinstance(module, torch.nn.Embedding):
            if module.weight.shape[0] == 30522: # only add a hook to wordpiece embeddings, not position
                ##50257 is the size of the vocabulary of GPT
                module.weight.requires_grad = True
                #module.register_backward_hook(extract_grad_hook)
                module.register_full_backward_hook(extract_grad_hook)

In [18]:
# Gets the loss of the target_tokens using the triggers as the context
def get_loss(language_model, batch_size, trigger, target, device='cuda'):
    # context is trigger repeated batch size
    print(f'Arrive to get_loss\n\t batch_size {batch_size}\n\t trigger {trigger.shape}\n\t target {target.shape}')
    print(f'LANGUAGE_MODEL {language_model}')
    tensor_trigger = torch.tensor(trigger, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
    print(f'tensor_trigger {tensor_trigger}')
    mask_out = -1 * torch.ones_like(tensor_trigger) # we zero out the loss for the trigger tokens
    print(f'mask_out {mask_out}')
    lm_input = torch.cat((tensor_trigger, target), dim=1) # we feed the model the trigger + target texts
    print(f'lm_input {lm_input.shape} == {lm_input}')
    print(f'lm_input[0] {lm_input[0]}')
    mask_and_target = torch.cat((mask_out, target), dim=1) # has -1's + target texts for loss computation
    print(f'mask_and_target {mask_and_target.shape} == {mask_and_target}')
    lm_input[lm_input == -1] = 1   # put random token of 1 at end of context (its masked out)
    print(f'lm_input {lm_input.shape} == {lm_input}')
    loss = language_model(lm_input, labels=mask_and_target)#[0]
    print(f'loss {loss}')
    return loss

In [19]:
# creates the batch of target texts with -1 placed at the end of the sequences for padding (for masking out the loss).
def make_target_batch(tokenizer, device, target_texts):
    # encode items and get the max length
    encoded_texts = []
    max_len = 0
    for target_text in target_texts:
        encoded_target_text = tokenizer.encode_plus(
            target_text,
            add_special_tokens = True,
            max_length = EMBEDDING_SIZE - NUM_TOKENS,
            pad_to_max_length = True,
            return_attention_mask = True
        )
        #print(f'ENCODED_TARGET_TEXT {type(input_ids)} == {encoded_target_text.keys()}') # ENCODED_TARGET_TEXT <class 'list'> == dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
        """
        print(f'ENCODED_TARGET_TEXT {type(encoded_target_text)} == {encoded_target_text}')
        ENCODED_TARGET_TEXT <class 'list'> == [101, 218, 4527, 237, 366, 212, 1260, 207, 446, 115, 799, 2277, 212, 781, 216, 119, 102]
        ENCODED_TARGET_TEXT <class 'list'> == [101, 799, 356, 432, 145, 782, 438, 225, 3457, 13409, 115, 102]
        """
        encoded_texts.append(encoded_target_text.input_ids)
        if len(encoded_target_text.input_ids) > max_len:
            max_len = len(encoded_target_text)

    # pad tokens, i.e., append -1 to the end of the non-longest ones
    for indx, encoded_text in enumerate(encoded_texts):
        if len(encoded_text) < max_len:
            encoded_texts[indx].extend([-1] * (max_len - len(encoded_text)))

    # convert to tensors and batch them up
    target_tokens_batch = None
    for encoded_text in encoded_texts:
        target_tokens = torch.tensor(encoded_text, device=device, dtype=torch.long).unsqueeze(0)
        if target_tokens_batch is None:
            target_tokens_batch = target_tokens
        else:
            target_tokens_batch = torch.cat((target_tokens, target_tokens_batch), dim=0)
    return target_tokens_batch

In [20]:
# Got from https://github.com/Eric-Wallace/universal-triggers/blob/master/attacks.py

def hotflip_attack(averaged_grad, embedding_matrix, trigger_token_ids,
                   increase_loss=False, num_candidates=1):
    """
    The "Hotflip" attack described in Equation (2) of the paper. This code is heavily inspired by
    the nice code of Paul Michel here https://github.com/pmichel31415/translate/blob/paul/
    pytorch_translate/research/adversarial/adversaries/brute_force_adversary.py
    This function takes in the model's average_grad over a batch of examples, the model's
    token embedding matrix, and the current trigger token IDs. It returns the top token
    candidates for each position.
    If increase_loss=True, then the attack reverses the sign of the gradient and tries to increase
    the loss (decrease the model's probability of the true class). For targeted attacks, you want
    to decrease the loss of the target class (increase_loss=False).
    """
    averaged_grad = averaged_grad.cpu()
    embedding_matrix = embedding_matrix.cpu()
    trigger_token_embeds = torch.nn.functional.embedding(torch.LongTensor(trigger_token_ids),
                                                         embedding_matrix).detach().unsqueeze(0)
    averaged_grad = averaged_grad.unsqueeze(0)
    gradient_dot_embedding_matrix = torch.einsum("bij,kj->bik",
                                                 (averaged_grad, embedding_matrix))        
    if not increase_loss:
        gradient_dot_embedding_matrix *= -1    # lower versus increase the class probability.
    if num_candidates > 1: # get top k options
        _, best_k_ids = torch.topk(gradient_dot_embedding_matrix, num_candidates, dim=2)
        return best_k_ids.detach().cpu().numpy()[0]
    _, best_at_each_step = gradient_dot_embedding_matrix.max(2)
    return best_at_each_step[0].detach().cpu().numpy()

In [21]:
def get_input_masks_and_labels_with_tokens(sentences, labels, tokens):
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in sentences:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        sent_with_tokens = " ".join(tokens) + " " + sent

        encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,          # Pad & truncate all sentences.
                        pad_to_max_length = True, #is deprecated
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
        
        # Add the encoded sentence to the list.    
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

In [22]:
def get_loss_and_metrics(model, dataloader, device):
    # get initial loss for the trigger
    model.zero_grad()

    test_preds = []
    test_targets = []

    # Tracking variables 
    total_test_accuracy = 0
    total_test_loss = 0
    io_total_test_acc = 0
    io_total_test_prec = 0
    io_total_test_recall = 0
    io_total_test_f1 = 0

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        result = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels,
                    return_dict=True)

        # Get the loss and "logits" output by the model. The "logits" are the 
        # output values prior to applying an activation function like the 
        # softmax.
        loss = result.loss
        logits = result.logits

        test_preds.extend(logits.argmax(dim=1).cpu().numpy())
        test_targets.extend(batch[2].numpy())

        # Accumulate the validation loss.
        total_test_loss += loss.item()

        test_preds.extend(logits.argmax(dim=1).cpu().numpy())
        test_targets.extend(batch[2].numpy())

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        loss.backward()        

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.        
        test_acc = accuracy_score(test_targets, test_preds)
        test_precision = precision_score(test_targets, test_preds)
        test_recall = recall_score(test_targets, test_preds)
        test_f1 = f1_score(test_targets, test_preds)

        io_total_test_acc += test_acc
        io_total_test_prec += test_precision
        io_total_test_recall += test_recall
        io_total_test_f1 += test_f1

    io_avg_test_loss = total_test_loss/len(dataloader)
    io_avg_test_acc = io_total_test_acc / len(dataloader)
    io_avg_test_prec = io_total_test_prec / len(dataloader)
    io_avg_test_recall = io_total_test_recall / len(dataloader)
    io_avg_test_f1 = io_total_test_f1 / len(dataloader)
    print(
            f'Loss {io_avg_test_loss} : \t\
            Valid_acc : {io_avg_test_acc}\t\
            Valid_F1 : {io_avg_test_f1}\t\
            Valid_precision : {io_avg_test_prec}\t\
            Valid_recall : {io_avg_test_recall}'
          )

    #print(f"total_test_loss {total_test_loss/len(dataloader)}")

    return io_avg_test_loss, io_avg_test_acc, io_avg_test_prec, io_avg_test_recall, io_avg_test_f1

In [23]:
def change_input_ids_with_candidate_token(input_ids, position, candidate):
    for elem in range(input_ids.shape[0]):
        input_ids[elem][position] = candidate

    return input_ids

In [24]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [25]:
### Get positions of unfair sentences

positions_unfair = np.where(np.array(all_labels) == 1)[0]
print(f'First 32 positions: {positions_unfair[0:32]} with total of unfair sentences {len(positions_unfair)}')

target_unfair_sentences = []
labels_unfair_sentences = []
for index in range(len(positions_unfair)):
    target_unfair_sentences.append(all_sentences[positions_unfair[index]])
    labels_unfair_sentences.append(all_labels[positions_unfair[index]])


First 32 positions: [  4   9  10  11  12  13  24  25  43  45  61  62  78  79  87  89  91  92
 100 104 109 111 143 151 154 157 169 195 206 258 260 266] with total of unfair sentences 1032


In [26]:
model.eval()
model.to(device)

add_hooks(model) # add gradient hooks to embeddings
embedding_weight = get_embedding_weight(model) # save the word embedding matrix

In [27]:
print(f'embedding_weight {embedding_weight} \n\nwith shape {embedding_weight.shape}')

embedding_weight tensor([[ 0.0641, -0.0185, -0.0232,  ..., -0.0211,  0.0466, -0.0678],
        [-0.0175,  0.0522, -0.1289,  ..., -0.0658,  0.0291, -0.1561],
        [ 0.0128, -0.0119, -0.0850,  ..., -0.0592,  0.0799, -0.1387],
        ...,
        [ 0.0040,  0.0531, -0.0814,  ...,  0.0393,  0.0525, -0.0063],
        [-0.0143,  0.0036, -0.0973,  ..., -0.0562,  0.0196, -0.1135],
        [-0.0785, -0.0090, -0.1799,  ...,  0.0115,  0.0191, -0.0859]],
       device='cuda:0') 

with shape torch.Size([30522, 512])


In [28]:
#target_tokens = make_target_batch(tokenizer, device, target_sentences)

#target_tokens.shape

# sample random initial trigger
trigger_tokens = np.array([621, 19353, 7063])#np.array([598, 275, 3523])#np.random.randint(VOCABULARY_SIZE, size=NUM_TOKENS)
print(tokenizer.decode(trigger_tokens))

unless impulse author


In [29]:
#trigger_tokens #.shape (3,) => array([ 8972, 27350, 25382])
#target_tokens #.shape => torch.Size([32, 163])
"""
tensor([[  101, 12017,   179,  ...,    -1,    -1,    -1],
        [  101,   233,   223,  ...,    -1,    -1,    -1],
        [  101, 12017,   179,  ...,    -1,    -1,    -1],
        ...,
        [  101,   206,  4313,  ...,    -1,    -1,    -1],
        [  101,   206,  4313,  ...,    -1,    -1,    -1],
        [  101,   218,  1260,  ...,    -1,    -1,    -1]], device='cuda:0')
"""

"\ntensor([[  101, 12017,   179,  ...,    -1,    -1,    -1],\n        [  101,   233,   223,  ...,    -1,    -1,    -1],\n        [  101, 12017,   179,  ...,    -1,    -1,    -1],\n        ...,\n        [  101,   206,  4313,  ...,    -1,    -1,    -1],\n        [  101,   206,  4313,  ...,    -1,    -1,    -1],\n        [  101,   218,  1260,  ...,    -1,    -1,    -1]], device='cuda:0')\n"

In [30]:
input_ids, attention_masks, labels = get_input_masks_and_labels_with_tokens(target_unfair_sentences, labels_unfair_sentences, tokenizer.decode(trigger_tokens))

dataset = TensorDataset(input_ids, attention_masks, labels)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [31]:

extracted_grads = []

#loss_obtained = get_loss_and_metrics(model, dataloader, device)
loss_obtained, acc_obtained, prec_obtained, recall_obtained, f1_obtained = get_loss_and_metrics(model, dataloader, device)
#print(f'loss_obtained {loss_obtained}')
print(f'f1_obtained {f1_obtained}')

#best_loss = loss_obtained
candidates_selented = [0,0,0]
# try all the candidates and pick the best
curr_best_loss = f1_obtained
curr_best_trigger_tokens = None

for id_token_to_flip in range(0, NUM_TOKENS):
    # Get average gradient w.r.t. the triggers
    #extracted_grads = []
    #loss_obtained.backward()

    averaged_grad = torch.sum(extracted_grads[0], dim=0)
    averaged_grad = averaged_grad[id_token_to_flip].unsqueeze(0)

    # Use hotflip (linear approximation) attack to get the top num_candidates
    candidates = hotflip_attack(averaged_grad, embedding_weight,
                                        [trigger_tokens[id_token_to_flip]], 
                                        increase_loss=False, num_candidates=100)[0]
    print(f'candidates {candidates}')
    
    for index, cand in enumerate(candidates):
        # replace one token with new candidate
        extracted_grads = []

        input_ids_with_candidate_trigger = change_input_ids_with_candidate_token(deepcopy(input_ids), id_token_to_flip+1, cand)
        dataset_with_candidate_trigger = TensorDataset(input_ids_with_candidate_trigger, attention_masks, labels)
        dataloader_with_candidate_trigger = torch.utils.data.DataLoader(dataset_with_candidate_trigger, batch_size=BATCH_SIZE)

        #current_loss = get_loss_and_metrics(model, dataloader_with_candidate_trigger, device)
        current_loss, current_acc, current_prec, current_recall, current_f1 = get_loss_and_metrics(model, dataloader_with_candidate_trigger, device)

        if curr_best_loss > current_f1:
            curr_best_loss = current_f1
            candidates_selented[id_token_to_flip] = cand

        del input_ids_with_candidate_trigger
        del dataset_with_candidate_trigger
        del dataloader_with_candidate_trigger

        gc.collect()
        torch.cuda.empty_cache()

        #print(f'[{id_token_to_flip}][{index}] loss[{index}] {current_loss} ({curr_best_loss})')
        print(f'[{id_token_to_flip}][{index}] f1[{index}] {current_f1} ({curr_best_loss})')
    input_ids = change_input_ids_with_candidate_token(deepcopy(input_ids), id_token_to_flip+1, candidates_selented[id_token_to_flip])
    print(f'Worst f1 {curr_best_loss} with candidates {candidates_selented}')

#Best loss 0.5344366431236267 with candidates [598, 275, 3523]
#Best loss 0.9147895276546478 with candidates [621, 19353, 7063]

Loss 0.3013410617907842 : 	            Valid_acc : 0.919868349164074	            Valid_F1 : 0.95823260150304	            Valid_precision : 1.0	            Valid_recall : 0.919868349164074
f1_obtained 0.95823260150304
candidates [  457 12986  5102   660  7742   232  2176  2705 10902   587   591  3522
  2940  1607  1914   454 20458  6966  1750   378  1890 23836   382  1635
  3477   936  1382   679   266  4004   531   572  1672  3626  1401  1753
  1731  2126  1363   410  4203  1427  1281  1297   799 18253   318   358
 29799  1050   599  1089  1688   635   782 23272   338  8498   621  1415
   226  6665  8628  1807  3774  1375  2127  7455   271  2872   547  2586
   644  7784  7110  1327 13056 23357  1924   598  1626  2527  2318  1280
 12491 24438  2090 11472  1073  2764  1735   606  1492   390  1414   435
   775  6767  8707   419]
Loss 0.3754258600599838 : 	            Valid_acc : 0.8974287983898493	            Valid_F1 : 0.9458666137005209	            Valid_precision : 1.0	            Vali

In [39]:
#print(tokenizer.decode([621, 4865, 21241]))
#print(tokenizer.decode([621, 13890, 21241]))# Loss => unless communist normativ
#print(tokenizer.decode([621, 13890, 13064]))# Accuracy => unless communist tolerate
print(tokenizer.decode([621, 13890, 13064]))# F1 => unless communist tolerate

unless communist tolerate


In [33]:
"""
extracted_grads = []
model.eval()
model.to(device)
loss_obtained = get_loss_and_metrics(model, dataloader, device)
print(f'loss_obtained {loss_obtained}')
"""

"\nextracted_grads = []\nmodel.eval()\nmodel.to(device)\nloss_obtained = get_loss_and_metrics(model, dataloader, device)\nprint(f'loss_obtained {loss_obtained}')\n"

In [34]:
"""
extracted_grads = []

# get initial loss for the trigger
model.zero_grad()
loss = get_loss(model, BATCH_SIZE, trigger_tokens, target_tokens, device)
best_loss = loss
counter = 0
end_iter = False
"""

'\nextracted_grads = []\n\n# get initial loss for the trigger\nmodel.zero_grad()\nloss = get_loss(model, BATCH_SIZE, trigger_tokens, target_tokens, device)\nbest_loss = loss\ncounter = 0\nend_iter = False\n'

In [35]:
"""
del extracted_grads
del dataloader
del model
del input_ids
del attention_masks
del labels
del loss_obtained
del dataset
"""

'\ndel extracted_grads\ndel dataloader\ndel model\ndel input_ids\ndel attention_masks\ndel labels\ndel loss_obtained\ndel dataset\n'

In [36]:
"""import torch, gc
gc.collect()
torch.cuda.empty_cache()
"""

'import torch, gc\ngc.collect()\ntorch.cuda.empty_cache()\n'

In [37]:
len(extracted_grads)

33

In [38]:
input_ids[0][1]

tensor(621)