In [1]:
! pip install transformers[torch] datasets evaluate

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


## Define the model we fine-tune

In [2]:
model_checkpoint = "FacebookAI/roberta-large"
batch_size = 16

# Load Dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("tau/commonsense_qa")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [5]:
dataset["train"][0]

{'id': '075e483d21c29a511267ef62bedc0461',
 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'question_concept': 'punishing',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']},
 'answerKey': 'A'}

In [6]:
dataset["validation"][0]

{'id': '1afa02df02c908a558b4036e80242fac',
 'question': 'A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
 'question_concept': 'revolving door',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['bank', 'library', 'department store', 'mall', 'new york']},
 'answerKey': 'A'}

Notice, all the answerKey in test dataset are ""

In [7]:
dataset["test"][0]

{'id': '90b30172e645ff91f7171a048582eb8b',
 'question': 'The townhouse was a hard sell for the realtor, it was right next to a high rise what?',
 'question_concept': 'townhouse',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['suburban development',
   'apartment building',
   'bus stop',
   'michigan',
   'suburbs']},
 'answerKey': ''}

#### The following function will show some examples picked randomly in the dataset to show what the data looks like

In [8]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [9]:
show_random_elements(dataset["train"])

Unnamed: 0,id,question,question_concept,choices,answerKey
0,244b9eb979ac7b292fd437d717d0e2a6,What do people usually do when listening to music?,listening to music,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['dance', 'learning', 'enjoyment', 'shout', 'keep time']}",A
1,4f23188516403427fbe4ee7d63b39fda,Where is a good place to buy a fishing rod?,rod,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['catch fish', 'fishing camp', 'dolphins', 'sporting goods store', 'engine']}",D
2,46e2bed1736cdb103318245fe46e1462,What do you hold the handle of after going to Starbucks?,handle,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['gripping', 'briefcase', 'carry object', 'frying pan', 'coffee cup']}",E
3,80c99f8ede4e2297cac15f099270997b,"The hobbit was timid in front of the dragon, but what did he have to be to get the gold?",timid,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['mean', 'aggressive', 'bellicose', 'reckless', 'dauntless']}",E
4,b101fb5d095d6c28caae45a7779338b0,How might someone relieve stress with friends?,stress,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['hang out at bar', 'running', 'drink alcohol', 'clean house', 'dream']}",A
5,20cea584258e4de641f4331ccf815abd,"All people need to move, otherwise what will atrophy?",all people,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['feelings', 'shrink', 'free will', 'muscles', 'parents']}",D
6,666fc2a353ed388ac74fe8e6fd0fba8c,What do you have when you are learning?,learning,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['attention', 'attention', 'study', 'thought', 'exposure']}",D
7,9299a6bedf08ee7bbc3a79b8390a4018,"If I was getting drunk and lost control of my inhibitions, what might happen to me?",getting drunk,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['pregnancy', 'forgetfulness', 'pass out', 'death', 'slurred speech']}",C
8,6d4d55ea0479e63d6a0d37b19722b054,Where is the sky fake?,sky,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['shed', 'atmosphere', 'photo', 'outdoors', 'planetarium']}",E
9,46e9f10bc0eb4a8e04f3eaa39783c243,"If you have some excess corn, where would you put it?",corn,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['silo', 'storage building', 'restaurant', 'pennsylvania', 'supermarket']}",A


#### Define a function to check the ground truth of a specific question

In [10]:
def show_one(example):
    print(f"Question: {example['question']}")
    print(f"  {example['choices']['label'][0]}:  {example['choices']['text'][0]}")
    print(f"  {example['choices']['label'][1]}:  {example['choices']['text'][1]}")
    print(f"  {example['choices']['label'][2]}:  {example['choices']['text'][2]}")
    print(f"  {example['choices']['label'][3]}:  {example['choices']['text'][3]}")
    print(f"  {example['choices']['label'][4]}:  {example['choices']['text'][4]}")
    print(f"\nGround truth: option {example['answerKey']}")

In [11]:
show_one(dataset["train"][0])

Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?
  A:  ignore
  B:  enforce
  C:  authoritarian
  D:  yell at
  E:  avoid

Ground truth: option A


In [12]:
show_one(dataset["train"][12])

Question: Johnny sat on a bench and relaxed after doing a lot of work on his hobby.  Where is he?
  A:  state park
  B:  bus depot
  C:  garden
  D:  gym
  E:  rest area

Ground truth: option C


# Data Processing

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Test the pretrained tokenizer

In [14]:
tokenizer("Hello, this is a sentence!", "This is another sentence.")

{'input_ids': [0, 31414, 6, 42, 16, 10, 3645, 328, 2, 2, 713, 16, 277, 3645, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

#### Define the function for batch encoding, which mainly connect the choices with its question sentance, then encode them.

This function works with one or a batch of examples. In the case of a batch of examples, the tokenizer will return a list of lists of lists for each key, which is a list of all examples (here 6), then a list of all choices (5) and a list of input IDs (length varying here since we did not apply any padding)

In [15]:
def preprocess_function(examples):
    # Extract the question stem
    first_sentences = examples["question"]  # List of question stems

    # Extract all the answer texts (choices) from the 'choices' field
    second_sentences = [choice_dict["text"] for choice_dict in examples["choices"]]  # List of lists

    # Flatten the lists for tokenization
    first_sentences = [stem for stem in first_sentences for _ in range(5)]  # Repeat each question 5 times
    second_sentences = [choice for choices in second_sentences for choice in choices]  # Flatten choices

    # Tokenize the question and choices
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Group tokenized inputs by example (5 choices per question)
    grouped_inputs = {
        k: [v[i:i + 5] for i in range(0, len(v), 5)]  # Group every 5 entries
        for k, v in tokenized_examples.items()
    }

    return grouped_inputs


Try to work on only 6 data examples to see if it can work corretly

In [16]:
examples = dataset["train"][:6]
features = preprocess_function(examples)
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

6 5 [29, 30, 30, 31, 29]


To make sure we didn't do anything wrong when grouping all possibilites and unflattening. We have a look at the decoded inputs for a given example. We will decode the encoded examples to see the sentences.

In [17]:
idx = 3
[tokenizer.decode(features["input_ids"][idx][i]) for i in range(5)]

['<s>Google Maps and other highway and street GPS services have replaced what?</s></s>united states</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>mexico</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>countryside</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>atlas</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>oceans</s>']

Then, we compare it with the ground truth from the original dataset

In [18]:
show_one(dataset["train"][3])

Question: Google Maps and other highway and street GPS services have replaced what?
  A:  united states
  B:  mexico
  C:  countryside
  D:  atlas
  E:  oceans

Ground truth: option D


#### They look correct. Then we can go to encode the entire dataset, including our training, validation and testing data.

In [19]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

### Important! some postprocessing to our encoded_dataset
Before using the Trainer API or defining the dataloaders for training loops,
we have to apply a bit of postprocessing to our encoded_dataset, to take care of some things that the Trainer did for us automatically. Specifically, we need to:
1. Remove the columns corresponding to values the model does not expect (like the question, choices and question_concept columns).
2. Rename the column 'answerKey' to 'labels' (because the model expects the argument to be named 'labels').
3. Set the format of the datasets so they return PyTorch tensors instead of lists.

Rename the column 'answerKey' to 'labels'

In [20]:
encoded_dataset = encoded_dataset.rename_column("answerKey", "labels")

In [21]:
encoded_dataset["train"]

Dataset({
    features: ['id', 'question', 'question_concept', 'choices', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 9741
})

Set the format of the datasets so they return PyTorch tensors instead of lists

In [22]:
encoded_dataset.set_format("torch")

Remove the columns corresponding to values the model does not expect

In [23]:
encoded_dataset["train"].column_names

['id',
 'question',
 'question_concept',
 'choices',
 'labels',
 'input_ids',
 'attention_mask']

In [24]:
encoded_dataset = encoded_dataset.remove_columns([
    'id',
     'question',
     'question_concept',
     'choices',
])

In [25]:
encoded_dataset["train"].column_names

['labels', 'input_ids', 'attention_mask']

### Define DataCollatorForMultipleChoice for batch padding
We need to add batch padding to the tokenized data using data collator.

Hugging Face transformers doesn't have a data collator for multiple choice, so we need to adapt the DataCollatorWithPadding to create a batch of examples. It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

DataCollatorForMultipleChoice flattens all the model inputs, applies padding, and then unflattens the results

In [26]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        
        labels = [feature.pop("labels") for feature in features]

        # Map answerKey (e.g., "A", "B", ...) to numerical indices
        labels = torch.tensor(
            [["A", "B", "C", "D", "E"].index(label) for label in labels],
            dtype=torch.int64
        )

        # Determine batch size and number of choices
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten features for tokenization
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])  # Flatten the list of lists

        # Apply padding to the flattened features
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten to restore batch structure (batch_size, num_choices, sequence_length)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}

        # Add back the labels as a tensor
        # batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        batch["labels"] = labels

        return batch


When called on a list of examples, it will flatten all the inputs/attentions masks etc. in big lists that it will pass to the tokenizer.pad method. This will return a dictionary with big tensors (of shape (batch_size * 5) x seq_length) that we then unflatten.

We can check this data collator works on a list of features, we just have to make sure to remove all features that are not inputs accepted by our model (something the Trainer will do automatically for us after)

In [27]:
accepted_keys = ["input_ids", "attention_mask", "labels"]
# pick out only 10 data examples
features = [{k: v for k, v in encoded_dataset["train"][i].items() if k in accepted_keys} for i in range(10)]
collator = DataCollatorForMultipleChoice(tokenizer)
batch = collator(features)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Check the data collator works on a question

In [28]:
# The example 7, and its 5 combinations
[tokenizer.decode(batch["input_ids"][7][i].tolist()) for i in range(5)]

['<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>carpet</s><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>refrigerator</s><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>breadbox</s><pad><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>fridge</s><pad><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>coach</s><pad><pad><pad>']

Compare it with the ground truth

In [29]:
show_one(dataset["train"][7])

Question: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?
  A:  carpet
  B:  refrigerator
  C:  breadbox
  D:  fridge
  E:  coach

Ground truth: option B


### Define the dataloaders
We need to define the dataloaders that we will use to iterate over batches. Before that, we need to get a instance of the DataCollatorForMultipleChoice we defined earlier, which will be used to defined the dataloaders.

In [30]:
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [31]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    encoded_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    encoded_dataset["validation"], batch_size=8, collate_fn=data_collator
)

To quickly check there is no mistake in the data processing, we can inspect a batch like this.

The shapes will probably be slightly different after each time running the code, since we set shuffle=True for the training dataloader and we are padding to the maximum length inside the batch.

In [32]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 5, 29]),
 'attention_mask': torch.Size([8, 5, 29]),
 'labels': torch.Size([8])}

# Fine-tune the RoBERTa-Large model - with self-defined training loops
To utilize the multiple GPU, we need to rather putting them into a train.py script and run the command `accelerate config` and `accelerate launch train.py`, or using the `notebook_launcher` if we are using multiple GPU inside the notebook

In [33]:
def training_function():
    # load the model
    from transformers import AutoModelForMultipleChoice
    model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

    # initialize optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-05)

    # initialize accelerator
    from accelerate import Accelerator
    accelerator = Accelerator()

    # prepare dataloaders, model, and optimizer with the accelerator
    train_dl, eval_dl, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    # learning rate scheduler
    from transformers import get_scheduler
    num_epochs = 3
    # important! we should use train_dl here rather than train_dataloader, thus ensuring the compatibility with distributed training.
    # because, train_dl is split across the number of devices (e.g., GPUs, TPUs) used for distributed training.
    num_training_steps = num_epochs * len(train_dl)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # initialize the progress bar
    from tqdm.auto import tqdm
    # to avoid showing progress bar for each GPU, we only show it for the main process
    progress_bar = tqdm(range(num_training_steps), disable=not accelerator.is_main_process)

    # load evaluation metric
    import evaluate
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    
    # initialize variables to track best accuracy and save best model
    best_accuracy = 0.0
    best_model_state = None

    # ======================================== training loop ========================================
    model.train()
    for epoch in range(num_epochs):
        # training for this epoch
        for batch in train_dl:
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss

            # Backward pass and optimization
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Update progress bar
            progress_bar.update(1)

        # evaluation loop for this epoch
        model.eval()
        all_predictions = []
        all_labels = []

        for batch in eval_dl:
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            # gather predictions and labels across all devices
            all_predictions.append(accelerator.gather(predictions))
            all_labels.append(accelerator.gather(batch["labels"]))

        # concatenate all predictions and labels
        all_predictions = torch.cat(all_predictions)
        all_labels = torch.cat(all_labels)

        # compute metric
        accuracy = accuracy_metric.compute(predictions=all_predictions, references=all_labels)
        f1 = f1_metric.compute(predictions=all_predictions, references=all_labels, average="weighted")
        precision = precision_metric.compute(predictions=all_predictions, references=all_labels, average="weighted")
        recall = recall_metric.compute(predictions=all_predictions, references=all_labels, average="weighted")

        # combine all metrics into a single dictionary
        eval_metrics = {
            "accuracy": accuracy["accuracy"],
            "f1": f1["f1"],
            "precision": precision["precision"],
            "recall": recall["recall"],
        }

        # use accelerator.print to print only on the main process
        accelerator.print("Evaluation results:", eval_metrics)
        
        # check if this is the best model so far
        if accuracy["accuracy"] > best_accuracy:
            best_accuracy = accuracy["accuracy"]
            # save model state
            best_model_state = accelerator.unwrap_model(model).state_dict()

        # set the model back to training mode for next epoch
        model.train()
    # ====================================================================================================
        
    # Load the best model at the end of training
    if best_model_state is not None:
        accelerator.unwrap_model(model).load_state_dict(best_model_state)

    # Print the best accuracy
    accelerator.print(f"Best Accuracy: {best_accuracy}")

In [34]:
from accelerate import notebook_launcher

notebook_launcher(training_function, num_processes=2) # num_processes is the number of GPUs we would like to use

Launching training on 2 GPUs.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForMultipleChoice were not initialized from the model ch

  0%|          | 0/1827 [00:00<?, ?it/s]

Evaluation results: {'accuracy': 0.20535714285714285, 'f1': 0.2055868750547292, 'precision': 0.20589373480688242, 'recall': 0.20535714285714285}
Evaluation results: {'accuracy': 0.19724025974025974, 'f1': 0.19687786081756548, 'precision': 0.19689130489402723, 'recall': 0.19724025974025974}
Evaluation results: {'accuracy': 0.21753246753246752, 'f1': 0.21801789374033176, 'precision': 0.2191892324118451, 'recall': 0.21753246753246752}
Best Accuracy: 0.21753246753246752


W1129 20:48:48.297000 139665202651776 torch/distributed/elastic/multiprocessing/api.py:727] Closing process 890287 via signal SIGTERM


### Notice:
This is not the optimized accuracy ressult. We have done grid search in another notebook, which reaches a highest accuracy of 0.764. Please, refer to the file `RoBERTa_finetune_CSQA_optimized.ipynb` for details.
