In [1]:
! pip install transformers[torch] datasets evaluate

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import transformers

print(transformers.__version__)

4.46.2


## Define the model we fine-tune

In [3]:
model_checkpoint = "roberta-large"
# batch_size = 8

# Load Dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("tau/commonsense_qa")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [6]:
dataset["train"][0]

{'id': '075e483d21c29a511267ef62bedc0461',
 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'question_concept': 'punishing',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']},
 'answerKey': 'A'}

In [7]:
dataset["validation"][0]

{'id': '1afa02df02c908a558b4036e80242fac',
 'question': 'A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
 'question_concept': 'revolving door',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['bank', 'library', 'department store', 'mall', 'new york']},
 'answerKey': 'A'}

Notice, all the answerKey in test dataset are ""

In [8]:
dataset["test"][0]

{'id': '90b30172e645ff91f7171a048582eb8b',
 'question': 'The townhouse was a hard sell for the realtor, it was right next to a high rise what?',
 'question_concept': 'townhouse',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['suburban development',
   'apartment building',
   'bus stop',
   'michigan',
   'suburbs']},
 'answerKey': ''}

#### The following function will show some examples picked randomly in the dataset to show what the data looks like

In [9]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [10]:
show_random_elements(dataset["train"])

Unnamed: 0,id,question,question_concept,choices,answerKey
0,fc3e1a04c5924a8216d37f2ac5c54afa,If more than one person is talking to another what are they having?,talking,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['sore throat', 'gossip', 'conversation', 'communication', 'dry mouth']}",C
1,9caab008d1833e6f7876dbfc512110d8,"She left the states and traveled across the pond, her next art gallery was in what city?",gallery,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['soho', 'london', 'art museum', 'new york city', 'on a mural']}",B
2,353bf00e1541b4ab68982c654bbd4072_1,"There was a bacteria scare, so bleach was added to the well what?",bacteria,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['water', 'septic tank', 'petri dish', 'leg', 'ground']}",A
3,3c67815352fec6a0fdf7656ccb5b5fd5,"The bald eagle has been a protected species, this is in part due to a dwindling what?",bald eagle,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['nest', 'everglades', 'high places', 'in washington', 'natural habitat']}",E
4,503847f8410ce5dfa13eab81bbfe7bcc,Where would you find a monitor wall that is used for watching moving lines?,monitor wall,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['studios', 'security office', 'hospital', 'department store', 'guard station']}",C
5,24b6686f5f165c242aac53edbefe6157,Sarah did all of it. She was the best saleswoman in the city. But she was never acknowledge for her what?,ali,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['work hard', 'sell products', 'field question', 'slow down', 'productivity']}",A
6,36cad84e95bee315c05af70994e0aff8,Frankie found a bone in her good. It probably belonged to what?,bone,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['father', 'arm', 'cow', 'museum', 'human body']}",C
7,f61d08b807fe47d926e0c3f0c8355bc0,What is the likelihood of drowning when getting wet?,getting wet,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['shrinking', 'feeling cold', 'become cold', 'cool off', 'could']}",E
8,fc0e9a19e07526173bdfc3d384364695,"If you're a small dog owned by your father's mother, where would you live?",small dog,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['the enterprise', 'baby carriage', 'basket', 'grandma's house', 'barbeque']}",D
9,f7726ea3db26d5a0a0b42fb5ce204cb5,"The guests were soon to arrive, she carefully arranged the cookies and set them out on a what?",cookies,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['jar', 'house', 'plate', 'mouth', 'hubcap']}",C


#### Define a function to check the ground truth of a specific question

In [11]:
def show_one(example):
    print(f"Question: {example['question']}")
    print(f"  {example['choices']['label'][0]}:  {example['choices']['text'][0]}")
    print(f"  {example['choices']['label'][1]}:  {example['choices']['text'][1]}")
    print(f"  {example['choices']['label'][2]}:  {example['choices']['text'][2]}")
    print(f"  {example['choices']['label'][3]}:  {example['choices']['text'][3]}")
    print(f"  {example['choices']['label'][4]}:  {example['choices']['text'][4]}")
    print(f"\nGround truth: option {example['answerKey']}")

In [12]:
show_one(dataset["train"][0])

Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?
  A:  ignore
  B:  enforce
  C:  authoritarian
  D:  yell at
  E:  avoid

Ground truth: option A


In [13]:
show_one(dataset["train"][12])

Question: Johnny sat on a bench and relaxed after doing a lot of work on his hobby.  Where is he?
  A:  state park
  B:  bus depot
  C:  garden
  D:  gym
  E:  rest area

Ground truth: option C


# Data Processing

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Test the pretrained tokenizer

In [15]:
tokenizer("Hello, this is a sentence!", "This is another sentence.")

{'input_ids': [0, 31414, 6, 42, 16, 10, 3645, 328, 2, 2, 713, 16, 277, 3645, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

#### Define the function for batch encoding, which mainly connect the choices with its question sentance, then encode them.

This function works with one or a batch of examples. In the case of a batch of examples, the tokenizer will return a list of lists of lists for each key, which is a list of all examples (here 6), then a list of all choices (5) and a list of input IDs (length varying here since we did not apply any padding)

In [16]:
def preprocess_function(examples):
    # Extract the question stem
    first_sentences = examples["question"]  # List of question stems

    # Extract all the answer texts (choices) from the 'choices' field
    second_sentences = [choice_dict["text"] for choice_dict in examples["choices"]]  # List of lists

    # Flatten the lists for tokenization
    first_sentences = [stem for stem in first_sentences for _ in range(5)]  # Repeat each question 5 times
    second_sentences = [choice for choices in second_sentences for choice in choices]  # Flatten choices

    # Tokenize the question and choices
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Group tokenized inputs by example (5 choices per question)
    grouped_inputs = {
        k: [v[i:i + 5] for i in range(0, len(v), 5)]  # Group every 5 entries
        for k, v in tokenized_examples.items()
    }

    return grouped_inputs


Try to work on only 6 data examples to see if it can work corretly

In [17]:
examples = dataset["train"][:6]
features = preprocess_function(examples)
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

6 5 [29, 30, 30, 31, 29]


To make sure we didn't do anything wrong when grouping all possibilites and unflattening. We have a look at the decoded inputs for a given example. We will decode the encoded examples to see the sentences.

In [18]:
idx = 3
[tokenizer.decode(features["input_ids"][idx][i]) for i in range(5)]

['<s>Google Maps and other highway and street GPS services have replaced what?</s></s>united states</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>mexico</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>countryside</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>atlas</s>',
 '<s>Google Maps and other highway and street GPS services have replaced what?</s></s>oceans</s>']

Then, we compare it with the ground truth from the original dataset

In [19]:
show_one(dataset["train"][3])

Question: Google Maps and other highway and street GPS services have replaced what?
  A:  united states
  B:  mexico
  C:  countryside
  D:  atlas
  E:  oceans

Ground truth: option D


#### They look correct. Then we can go to encode the entire dataset, including our training, validation and testing data.

In [20]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

### Important! some postprocessing to our encoded_dataset
Before using the Trainer API or defining the dataloaders for training loops,
we have to apply a bit of postprocessing to our encoded_dataset, to take care of some things that the Trainer did for us automatically. Specifically, we need to:
1. Remove the columns corresponding to values the model does not expect (like the question, choices and question_concept columns).
2. Rename the column 'answerKey' to 'labels' (because the model expects the argument to be named 'labels').
3. Set the format of the datasets so they return PyTorch tensors instead of lists.

Rename the column 'answerKey' to 'labels'

In [21]:
encoded_dataset = encoded_dataset.rename_column("answerKey", "labels")

In [22]:
encoded_dataset["train"]

Dataset({
    features: ['id', 'question', 'question_concept', 'choices', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 9741
})

Set the format of the datasets so they return PyTorch tensors instead of lists

In [23]:
encoded_dataset.set_format("torch")

Remove the columns corresponding to values the model does not expect

In [24]:
encoded_dataset["train"].column_names

['id',
 'question',
 'question_concept',
 'choices',
 'labels',
 'input_ids',
 'attention_mask']

In [25]:
encoded_dataset = encoded_dataset.remove_columns([
    'id',
     'question',
     'question_concept',
     'choices',
])

In [26]:
encoded_dataset["train"].column_names

['labels', 'input_ids', 'attention_mask']

### Define DataCollatorForMultipleChoice for batch padding
We need to add batch padding to the tokenized data using data collator.

Hugging Face transformers doesn't have a data collator for multiple choice, so we need to adapt the DataCollatorWithPadding to create a batch of examples. It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

DataCollatorForMultipleChoice flattens all the model inputs, applies padding, and then unflattens the results

In [27]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        
        labels = [feature.pop("labels") for feature in features]

        # Map answerKey (e.g., "A", "B", ...) to numerical indices
        labels = torch.tensor(
            [["A", "B", "C", "D", "E"].index(label) for label in labels],
            dtype=torch.int64
        )

        # Determine batch size and number of choices
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten features for tokenization
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])  # Flatten the list of lists

        # Apply padding to the flattened features
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten to restore batch structure (batch_size, num_choices, sequence_length)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}

        # Add back the labels as a tensor
        # batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        batch["labels"] = labels

        return batch


When called on a list of examples, it will flatten all the inputs/attentions masks etc. in big lists that it will pass to the tokenizer.pad method. This will return a dictionary with big tensors (of shape (batch_size * 5) x seq_length) that we then unflatten.

We can check this data collator works on a list of features, we just have to make sure to remove all features that are not inputs accepted by our model (something the Trainer will do automatically for us after)

In [28]:
accepted_keys = ["input_ids", "attention_mask", "labels"]
# pick out only 10 data examples
features = [{k: v for k, v in encoded_dataset["train"][i].items() if k in accepted_keys} for i in range(10)]
collator = DataCollatorForMultipleChoice(tokenizer)
batch = collator(features)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Check the data collator works on a question

In [29]:
# The example 7, and its 5 combinations
[tokenizer.decode(batch["input_ids"][7][i].tolist()) for i in range(5)]

['<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>carpet</s><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>refrigerator</s><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>breadbox</s><pad><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>fridge</s><pad><pad><pad>',
 '<s>The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>coach</s><pad><pad><pad>']

Compare it with the ground truth

In [30]:
show_one(dataset["train"][7])

Question: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?
  A:  carpet
  B:  refrigerator
  C:  breadbox
  D:  fridge
  E:  coach

Ground truth: option B


# Fine-tune the RoBERTa-large model - with Trainer API
Then we should download the pretrained model and fine-tune it on our commonsense QA dataset. Since all our task is about mutliple choice, we use the AutoModelForMultipleChoice class. Like with the tokenizer, the from_pretrained method will download and cache the model for us.

In [31]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluation function for training

In [32]:
import evaluate

# Load the accuracy metric
accu_metric = evaluate.load("accuracy")

In [33]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpack predictions and labels
    predictions, labels = eval_pred

    # Get the index of the highest logit for each example
    predictions = np.argmax(predictions, axis=1)

    # Compute accuracy
    return accu_metric.compute(predictions=predictions, references=labels)

### Fine-tune using Hugging Face Trainer API
Here we define TrainingArguments for the Trainer, which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional. Here we set the evaluation to be done at the end of each epoch, and adjust the learning rate, using the batch_size defined at the top of the notebook and customize the number of epochs for training, as well as the weight decay.

In [34]:
import itertools

# Define the hyperparameter grid
learning_rates = [1e-5, 2e-5, 3e-5]
batch_sizes = [8, 16]
num_epochs = [2, 3, 4]

# Track the best configuration
best_config = None
best_accuracy = 0.0

model_name = model_checkpoint.split("/")[-1]

# Loop through all combinations of hyperparameters
for lr, batch_size, epochs in itertools.product(learning_rates, batch_sizes, num_epochs):
    print(f"Testing configuration: LR={lr}, Batch Size={batch_size}, Epochs={epochs}")
    
    # Reinitialize the model for each combination
    model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-csQA",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True
    )
    
    # Initialize Trainer
    trainer = Trainer(
        model=model,  # Preloaded model
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    
    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate()

    # Track the best configuration
    accuracy = eval_results["eval_accuracy"]
    print(f"Accuracy for this configuration: {accuracy}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_config = {"learning_rate": lr, "batch_size": batch_size, "num_epochs": epochs}

# Print the best configuration
print("Best Configuration:")
print(best_config)
print(f"Best Accuracy: {best_accuracy}")


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing configuration: LR=1e-05, Batch Size=8, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6178,1.392691,0.46683
2,1.3455,0.929361,0.652744




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.6527436527436528
Testing configuration: LR=1e-05, Batch Size=8, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3252,0.739212,0.709255
2,0.82,0.708592,0.735463
3,0.6319,0.728488,0.739558




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7354627354627354
Testing configuration: LR=1e-05, Batch Size=8, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3632,0.816831,0.68878
2,0.8958,0.720332,0.728092
3,0.6923,0.755936,0.728092
4,0.5195,0.803881,0.73792




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7280917280917281
Testing configuration: LR=1e-05, Batch Size=16, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.95612,0.62326
2,1.247700,0.780437,0.704341




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7043407043407044
Testing configuration: LR=1e-05, Batch Size=16, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.125929,0.579034
2,1.352200,0.774437,0.700246
3,1.352200,0.744992,0.710074




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7100737100737101
Testing configuration: LR=1e-05, Batch Size=16, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.792426,0.683047
2,1.133000,0.712443,0.718264
3,1.133000,0.725256,0.724816
4,0.672000,0.757302,0.724816




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.7182637182637183
Testing configuration: LR=2e-05, Batch Size=8, Epochs=2




Epoch,Training Loss,Validation Loss,Accuracy
1,1.6226,1.609438,0.214578
2,1.6194,1.609438,0.218673




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.21457821457821458
Testing configuration: LR=2e-05, Batch Size=8, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6255,1.609438,0.194922
2,1.616,1.609438,0.203112
3,1.6161,1.609438,0.176085




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.19492219492219492
Testing configuration: LR=2e-05, Batch Size=8, Epochs=4




Epoch,Training Loss,Validation Loss,Accuracy
1,1.626,1.609443,0.194103
2,1.6172,1.609438,0.211302
3,1.6157,1.609438,0.179361
4,1.6133,1.609438,0.20475




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.2113022113022113
Testing configuration: LR=2e-05, Batch Size=16, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.753752,0.716626
2,1.010900,0.7063,0.72154




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.7215397215397216
Testing configuration: LR=2e-05, Batch Size=16, Epochs=3




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609439,0.185913
2,1.615500,1.609439,0.182637
3,1.615500,1.609438,0.183456




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.18345618345618345
Testing configuration: LR=2e-05, Batch Size=16, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609438,0.19656
2,1.617800,1.609438,0.202293
3,1.617800,1.609438,0.20475
4,1.615000,1.609438,0.19656




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.20475020475020475
Testing configuration: LR=3e-05, Batch Size=8, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6181,1.609438,0.18018
2,1.614,1.609438,0.185913




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.18018018018018017
Testing configuration: LR=3e-05, Batch Size=8, Epochs=3




Epoch,Training Loss,Validation Loss,Accuracy
1,1.6233,1.609438,0.180999
2,1.6152,1.609438,0.179361
3,1.6132,1.609438,0.199017




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.180999180999181
Testing configuration: LR=3e-05, Batch Size=8, Epochs=4




Epoch,Training Loss,Validation Loss,Accuracy
1,1.6229,1.609438,0.173628
2,1.6176,1.609438,0.200655
3,1.6135,1.609438,0.193284
4,1.6156,1.609438,0.19656




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.19328419328419327
Testing configuration: LR=3e-05, Batch Size=16, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.604555,0.298116
2,1.527200,0.923412,0.647011




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.647010647010647
Testing configuration: LR=3e-05, Batch Size=16, Epochs=3




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609438,0.179361
2,1.616300,1.609438,0.199017
3,1.616300,1.609438,0.187551




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.19901719901719903
Testing configuration: LR=3e-05, Batch Size=16, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609434,0.241605
2,1.618300,1.609425,0.310401
3,1.618300,1.609345,0.424242
4,1.615000,1.600943,0.369369




Accuracy for this configuration: 0.36936936936936937
Best Configuration:
{'learning_rate': 1e-05, 'batch_size': 8, 'num_epochs': 3}
Best Accuracy: 0.7354627354627354


## Try to optimize with adding prefix Q and A at the beginning
refer to the paper github repo:

We also found it helpful to prepend a prefix of Q: to the question and A: to the answer. The complete input format is:

\<s> Q: Where would I not want a fox? \</s> A: hen house \</s>

In [35]:
dataset = load_dataset("tau/commonsense_qa")

In [36]:
def preprocess_function_qa(examples):
    # Add "Q:" prefix to all questions
    first_sentences = ["Q: " + question for question in examples["question"]]  # List of question stems with prefix

    # Extract all the answer texts and add "A:" prefix to each answer choice
    second_sentences = [["A: " + choice for choice in choices["text"]] for choices in examples["choices"]]  # List of lists

    # Flatten the lists for tokenization
    first_sentences = [stem for stem in first_sentences for _ in range(5)]  # Repeat each question 5 times
    second_sentences = [choice for choices in second_sentences for choice in choices]  # Flatten choices

    # Tokenize the question and choices
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=128)

    # Group tokenized inputs by example (5 choices per question)
    grouped_inputs = {
        k: [v[i:i + 5] for i in range(0, len(v), 5)]  # Group every 5 entries
        for k, v in tokenized_examples.items()
    }

    return grouped_inputs

In [37]:
examples = dataset["train"][:6]
features = preprocess_function_qa(examples)
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

6 5 [33, 33, 33, 34, 33]


In [38]:
idx = 3
[tokenizer.decode(features["input_ids"][idx][i]) for i in range(5)]

['<s>Q: Google Maps and other highway and street GPS services have replaced what?</s></s>A: united states</s>',
 '<s>Q: Google Maps and other highway and street GPS services have replaced what?</s></s>A: mexico</s>',
 '<s>Q: Google Maps and other highway and street GPS services have replaced what?</s></s>A: countryside</s>',
 '<s>Q: Google Maps and other highway and street GPS services have replaced what?</s></s>A: atlas</s>',
 '<s>Q: Google Maps and other highway and street GPS services have replaced what?</s></s>A: oceans</s>']

In [39]:
show_one(dataset["train"][3])

Question: Google Maps and other highway and street GPS services have replaced what?
  A:  united states
  B:  mexico
  C:  countryside
  D:  atlas
  E:  oceans

Ground truth: option D


In [40]:
encoded_dataset = dataset.map(preprocess_function_qa, batched=True)

Map:   0%|          | 0/9741 [00:00<?, ? examples/s]

Map:   0%|          | 0/1221 [00:00<?, ? examples/s]

Map:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [41]:
encoded_dataset = encoded_dataset.rename_column("answerKey", "labels")
encoded_dataset.set_format("torch")
encoded_dataset = encoded_dataset.remove_columns([
    'id',
     'question',
     'question_concept',
     'choices',
])
encoded_dataset["train"].column_names

['labels', 'input_ids', 'attention_mask']

In [42]:
accepted_keys = ["input_ids", "attention_mask", "labels"]
# pick out only 10 data examples
features = [{k: v for k, v in encoded_dataset["train"][i].items() if k in accepted_keys} for i in range(10)]
collator = DataCollatorForMultipleChoice(tokenizer)
batch = collator(features)

In [43]:
# The example 7, and its 5 combinations
[tokenizer.decode(batch["input_ids"][7][i].tolist()) for i in range(5)]

['<s>Q: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>A: carpet</s><pad><pad><pad>',
 '<s>Q: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>A: refrigerator</s><pad><pad><pad>',
 '<s>Q: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>A: breadbox</s><pad><pad>',
 '<s>Q: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>A: fridge</s><pad><pad><pad>',
 '<s>Q: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?</s></s>A: coach</s><pad><pad><pad>']

In [44]:
show_one(dataset["train"][7])

Question: The forgotten leftovers had gotten quite old, he found it covered in mold in the back of his what?
  A:  carpet
  B:  refrigerator
  C:  breadbox
  D:  fridge
  E:  coach

Ground truth: option B


In [45]:
import itertools

# Define the hyperparameter grid
learning_rates = [1e-5, 2e-5, 3e-5]
batch_sizes = [8, 16]
num_epochs = [2, 3, 4]

# Track the best configuration
best_config = None
best_accuracy = 0.0

model_name = model_checkpoint.split("/")[-1]

# Loop through all combinations of hyperparameters
for lr, batch_size, epochs in itertools.product(learning_rates, batch_sizes, num_epochs):
    print(f"Testing configuration: LR={lr}, Batch Size={batch_size}, Epochs={epochs}")
    
    # Reinitialize the model for each combination
    model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-csQA",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True
    )
    
    # Initialize Trainer
    trainer = Trainer(
        model=model,  # Preloaded model
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    
    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate()

    # Track the best configuration
    accuracy = eval_results["eval_accuracy"]
    print(f"Accuracy for this configuration: {accuracy}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_config = {"learning_rate": lr, "batch_size": batch_size, "num_epochs": epochs}

# Print the best configuration
print("Best Configuration:")
print(best_config)
print(f"Best Accuracy: {best_accuracy}")


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Testing configuration: LR=1e-05, Batch Size=8, Epochs=2




Epoch,Training Loss,Validation Loss,Accuracy
1,1.6228,1.608983,0.228501
2,1.6178,1.609396,0.209664




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.2285012285012285
Testing configuration: LR=1e-05, Batch Size=8, Epochs=3




Epoch,Training Loss,Validation Loss,Accuracy
1,1.1823,0.685313,0.746929
2,0.7712,0.612872,0.76249
3,0.5996,0.640147,0.764128




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7624897624897625
Testing configuration: LR=1e-05, Batch Size=8, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2428,0.754395,0.715807
2,0.7979,0.643932,0.748567
3,0.6066,0.688192,0.750205
4,0.4691,0.749663,0.751843




Accuracy for this configuration: 0.7485667485667485
Testing configuration: LR=1e-05, Batch Size=16, Epochs=2


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.379032,0.407043
2,1.502600,1.056256,0.592138




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.5921375921375921
Testing configuration: LR=1e-05, Batch Size=16, Epochs=3




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.732668,0.719902
2,1.092800,0.645822,0.752662
3,1.092800,0.646029,0.758395




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.7526617526617526
Testing configuration: LR=1e-05, Batch Size=16, Epochs=4




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.922585,0.669124
2,1.210600,0.666077,0.740377
3,1.210600,0.647536,0.755938
4,0.719100,0.663766,0.750205




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7559377559377559
Testing configuration: LR=2e-05, Batch Size=8, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6173,1.609435,0.281736
2,1.6133,1.609438,0.203931




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.28173628173628174
Testing configuration: LR=2e-05, Batch Size=8, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1342,0.694932,0.741196
2,0.6875,0.710082,0.743653
3,0.4628,0.8226,0.74611




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.7411957411957412
Testing configuration: LR=2e-05, Batch Size=8, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6246,1.609424,0.186732
2,1.6172,1.609438,0.202293
3,1.6157,1.609438,0.195741
4,1.6156,1.609438,0.182637




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.18673218673218672
Testing configuration: LR=2e-05, Batch Size=16, Epochs=2


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609435,0.208026
2,1.616700,1.609437,0.220311




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.20802620802620803
Testing configuration: LR=2e-05, Batch Size=16, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609438,0.200655
2,1.613000,1.609438,0.200655
3,1.613000,1.609438,0.18837




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.20065520065520065
Testing configuration: LR=2e-05, Batch Size=16, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609438,0.201474
2,1.615500,1.609438,0.183456
3,1.615500,1.609439,0.182637
4,1.613600,1.609438,0.191646




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.19164619164619165
Testing configuration: LR=3e-05, Batch Size=8, Epochs=2




Epoch,Training Loss,Validation Loss,Accuracy
1,1.6211,1.609438,0.208026
2,1.6195,1.609437,0.211302




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.2113022113022113
Testing configuration: LR=3e-05, Batch Size=8, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6222,1.609438,0.190827
2,1.6158,1.609438,0.199017
3,1.6137,1.609438,0.206388




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.19082719082719082
Testing configuration: LR=3e-05, Batch Size=8, Epochs=4


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6234,1.609439,0.194103
2,1.6198,1.609438,0.205569
3,1.6149,1.609438,0.215397
4,1.6151,1.609437,0.207207




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.2072072072072072
Testing configuration: LR=3e-05, Batch Size=16, Epochs=2




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609437,0.194922
2,1.616400,1.609437,0.211302




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy for this configuration: 0.2113022113022113
Testing configuration: LR=3e-05, Batch Size=16, Epochs=3


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609437,0.201474
2,1.613600,1.609438,0.175266
3,1.613600,1.609438,0.18837




Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Accuracy for this configuration: 0.20147420147420148
Testing configuration: LR=3e-05, Batch Size=16, Epochs=4




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.609438,0.206388
2,1.617200,1.609438,0.208026
3,1.617200,1.609438,0.18837
4,1.613800,1.609438,0.179361




Accuracy for this configuration: 0.20638820638820637
Best Configuration:
{'learning_rate': 1e-05, 'batch_size': 8, 'num_epochs': 3}
Best Accuracy: 0.7624897624897625
