# Import necessary libraries

In [1]:
import torch

In [2]:
from datasets import load_dataset

In [3]:
from transformers import AutoModel, AutoTokenizer, AdamW, AutoConfig

# Download datasets

In [4]:
mnli_path = None

In [5]:
if mnli_path == None:
    mnli = load_dataset("multi_nli")
    mnli_path = "./datasets/mnli"
    mnli.save_to_disk(mnli_path)

Using custom data configuration default
Found cached dataset multi_nli (/home/yz709/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
qnli_path = None

In [7]:
if qnli_path == None:
    qnli = load_dataset("SetFit/qnli")
    qnli_path = "./datasets/qnli"
    qnli.save_to_disk(qnli_path)

Using custom data configuration SetFit--qnli-324fd6914ad1beff
Found cached dataset json (/home/yz709/.cache/huggingface/datasets/SetFit___json/SetFit--qnli-324fd6914ad1beff/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

# Inspect datasets

## For qnli dataset

In [8]:
qnli

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 104743
    })
    test: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 5463
    })
    validation: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 5463
    })
})

In [9]:
qnli_train = qnli['train']
qnli_test = qnli['test']
qnli_val = qnli['validation']

In [10]:
# label 0 - entailment, label 1 - not entailment
qnli_train[-2:]

{'text1': ['What individual was responsible for law and maintaining order in the county?',
  'How much of the gross domestic product was spent on public health in 2004?'],
 'text2': ['He was the top civil and military leader of the commandery and handled defense, lawsuits, seasonal instructions to farmers and recommendations of nominees for office sent annually to the capital in a quota system first established by Emperor Wu.',
  'Public expenditure health was at 8.9% of the GDP in 2004, whereas private expenditure was at 1.3%.'],
 'label': [1, 0],
 'idx': [104741, 104742],
 'label_text': ['not entailment', 'entailment']}

In [11]:
qnli_train['text1'][-2:]

['What individual was responsible for law and maintaining order in the county?',
 'How much of the gross domestic product was spent on public health in 2004?']

## For mnli dataset

In [12]:
mnli

DatasetDict({
    train: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9832
    })
})

In [13]:
mnli_train = mnli['train']
mnli_val_match = mnli['validation_matched']
mnli_val_mismatch = mnli['validation_mismatched']

In [14]:
mnli_train

Dataset({
    features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
    num_rows: 392702
})

In [15]:
mnli_train[0]

{'promptID': 31193,
 'pairID': '31193n',
 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'premise_binary_parse': '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )',
 'premise_parse': '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'hypothesis_binary_parse': '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )',
 'hypothesis_parse': '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))',
 'genre': 'government',
 'label': 1}

In [16]:
mnli_train[3]

{'promptID': 37397,
 'pairID': '37397e',
 'premise': 'How do you know? All this is their information again.',
 'premise_binary_parse': '( ( How ( ( ( do you ) know ) ? ) ) ( ( All this ) ( ( ( is ( their information ) ) again ) . ) ) )',
 'premise_parse': '(ROOT (S (SBARQ (WHADVP (WRB How)) (SQ (VBP do) (NP (PRP you)) (VP (VB know))) (. ?)) (NP (PDT All) (DT this)) (VP (VBZ is) (NP (PRP$ their) (NN information)) (ADVP (RB again))) (. .)))',
 'hypothesis': 'This information belongs to them.',
 'hypothesis_binary_parse': '( ( This information ) ( ( belongs ( to them ) ) . ) )',
 'hypothesis_parse': '(ROOT (S (NP (DT This) (NN information)) (VP (VBZ belongs) (PP (TO to) (NP (PRP them)))) (. .)))',
 'genre': 'fiction',
 'label': 0}

# Preprocess dataset QNLI

In [17]:
# roberta-base or roberta-large
PLM = "roberta-base"

In [18]:
# load model
model = AutoModel.from_pretrained(PLM)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# load tokeniser for the specific model
tokeniser = AutoTokenizer.from_pretrained(PLM)

In [20]:
# test the tokeniser
test_input = "Simple text to test -- tokenizer [roberta-base]"
tokenised_test_input = tokeniser.encode(test_input, return_tensors="pt")
print(tokenised_test_input)

tensor([[    0, 45093,  2788,     7,  1296,   480, 19233,  6315,   646,  1001,
          6747,   102,    12, 11070,   742,     2]])


In [21]:
test_token_id_to_text = tokeniser.convert_ids_to_tokens(tokeniser.encode(test_input))
print(test_token_id_to_text)

['<s>', 'Simple', 'Ġtext', 'Ġto', 'Ġtest', 'Ġ--', 'Ġtoken', 'izer', 'Ġ[', 'ro', 'bert', 'a', '-', 'base', ']', '</s>']


In [22]:
# test the model
test_output = model(tokenised_test_input)

In [23]:
"""
First output:
# (1,16,768): batch_size * #tokens * embedding_size_defined_by_model
# our input is one sentence with 16 tokens
Second output - pooler output, the embedding result of the first token of the sequence <s>
"""
test_output[0].size(), test_output[1].size()

(torch.Size([1, 16, 768]), torch.Size([1, 768]))

In [24]:
def manual_prompt(questions, answers, template):
    prompt = questions + template + answers
    prompt_token_id = tokeniser(prompt, return_tensors="pt").input_ids
    prompt_id_to_text = tokeniser.convert_ids_to_tokens(prompt_token_id[0])
    mask_token_pos = prompt_id_to_text.index(tokeniser.mask_token)
    return prompt_token_id, mask_token_pos

In [25]:
manual_template = " <mask>. "
prompt_token_id, mask_token_pos = manual_prompt(qnli_train[0]['text1'], qnli_train[0]['text2'], manual_template)

In [26]:
model.eval()
with torch.no_grad():
    predictions = model(prompt_token_id)[0]
print(predictions.size())

torch.Size([1, 62, 768])


In [27]:
values, indices = torch.sort(predictions[0, mask_token_pos], descending=True)

In [28]:
result = list(zip(tokeniser.convert_ids_to_tokens(indices), values))

In [29]:
result[0]

('Ġreal', tensor(10.4360))

In [30]:
qnli_train[0]['label']

1

In [31]:
# check whether dataset is balanced
def is_balanced(dataset, threshold=0.1):
    all_labels = torch.tensor(dataset)
    ratio = all_labels.sum() / len(all_labels)
    return abs(2 * ratio - 1) < 0.1

In [32]:
is_balanced(qnli_train['label']), is_balanced(qnli_test['label']), is_balanced(qnli_val['label'])

(tensor(True), tensor(False), tensor(True))

In [33]:
qnli_train[0]['label'], qnli_train[0]['label_text']

(1, 'not entailment')

In [34]:
# check the max length of question and answer
max([len(i) for i in qnli_train['text1']]), max([len(i) for i in qnli_train['text2']])

(270, 2458)

## Train

In [37]:
import pytorch_lightning as pl
PARAMS = {
    "batch_size": 24,
    "lr": 1e-3,
    "max_epochs": 1,
    "model_name": "roberta-base",
    "max_token_count": 2048,
    "random_seed": 42
}
pl.seed_everything(PARAMS['random_seed'])

Global seed set to 42


42

In [38]:
from torch.utils.data import Dataset

class QNLIDataset(Dataset):
    def __init__(self, data, tokenizer, max_token_len, template):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        self.template = template
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        question = row.text1
        answer = row.text2
        label = row.label
        label_text = row.label_text
        
        # prompt = <question><SEP><template><answer><SEP> 
        prompt = question + " " + template + ", " + answer
        end_prompt = self.tokenizer.encode_plus(
            prompt,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        return dict(
            prompt=prompt,
            input_ids=end_prompt["input_ids"].flatten(),
            attention_mask=end_prompt["attention_mask"].flatten(),
            label=torch.FloatTensor(label),
            label_text=label_text
        )
        

In [39]:
from torch.utils.data import DataLoader
class QNLIDataModule(pl.LightningDataModule):
    def __init__(self, train_data, val_data, test_data, tokenizer, template, batch_size, max_token_len):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        
    def setup(self):
        self.train_dataset = QNLIDataset(
            self.train_data,
            self.tokenizer,
            self.max_token_len,
            self.template
        )
        
        self.val_dataset = QNLIDataset(
            self.val_data,
            self.tokenizer,
            self.max_token_len,
            self.template
        )
        
        self.test_dataset = QNLIDataset(
            self.test_data,
            self.tokenizer,
            self.max_token_len,
            self.template
        )
        
    def train_dataloader(self):
        return Dataloader(
            self.train_dataset,
            batch_size = self.batch_size,
            shuffle = True,
            num_workers = 2
        )
    
    def val_dataloader(self):
        return Dataloader(
            self.val_dataset,
            batch_size = self.batch_size,
            shuffle = True,
            num_workers = 2
        )
    
    def test_dataloader(self):
        return Dataloader(
            self.test_dataset,
            batch_size = self.batch_size,
            shuffle = True,
            num_workers = 2
        )

In [40]:
from transformers import AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup
class QNLIClassifier(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = AutoModel.from_pretrained(PARAMS['model_name'], return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()
    
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        
        class_roc_auc = auroc(predictions[:, 0], labels[:, 0])
        self.logger.experiment.add_scalar(f"{label}_roc_auc/Train", class_roc_auc, self.current_epoch)
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=PARAMS['lr'])
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        ) # learning rate scheduler
        
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )

In [43]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import torch.nn as nn
def prepare_and_train(train_df, val_df, test_df, template):
    """ Training best practices
        - Checkpointing that saves the best model based on validation loss
        - Logging the progress in TensorBoard
        - Early stopping that terminates the training when the loss has not improved for the last 2 epochs
    """
    logger = TensorBoardLogger('/local/scratch-3/yz709/nlp-prompt-attack/tb_logs', name='discrete-prompt-roberta-base')
    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='discrete-prompt-roberta-base-{epoch:02d}-{val_loss:.2f}',
        verbose=True,
        monitor='val_loss',
        mode='min'
    )
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)
    
    # preprocess data
    tokenizer = AutoTokenizer.from_pretrained(PARAMS['model_name'])
    train_dataset = QNLIDataset(train_df, tokenizer, PARAMS['max_token_count'], template)
    data_module = QNLIDataModule(
        train_df,
        val_df,
        test_df,
        tokenizer,
        template,
        batch_size=PARAMS['batch_size'],
        max_token_len=PARAMS['max_token_count']
    )

    # model
    steps_per_epoch = len(train_df) // PARAMS['batch_size']
    total_training_steps = steps_per_epoch * PARAMS['max_epochs']
    warmup_steps = total_training_steps // 5
    model = QNLIClassifier(
        n_classes=2,
        n_warmup_steps=warmup_steps,
        n_training_steps=total_training_steps
    )
    
    # train
    trainer = pl.Trainer(
        logger = logger,
        callbacks=[early_stopping_callback,checkpoint_callback],
        max_epochs=PARAMS['max_epochs'],
        accelerator="gpu", 
        devices=[1],
    )
    trainer.fit(model, data_module)

In [44]:
prepare_and_train(qnli_train, qnli_val, qnli_test, template = " <mask> ")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


MisconfigurationException: `QNLIDataModule.setup` does not have a `stage` argument.