In [None]:
!pip install datasets==2.14.6
!pip install transformers
!pip install --no-cache-dir transformers sentencepiece
!pip install accelerate -U
!pip install evaluate

In [2]:
import torch
import os
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from datasets import concatenate_datasets
import evaluate
import accelerate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# model_name = "DeepPavlov/roberta-large-winogrande"
# model_name = "JazibEijaz/bert-base-uncased-finetuned-semeval2020-task4b-append-e3-b32-l4e5"
# model_name = "FacebookAI/roberta-large"
# model_name = "bert-large-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
train_data = np.load('/content/drive/MyDrive/data/SP-train.npy', allow_pickle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [5]:
df = pd.DataFrame(train_data.tolist())
col = ['id','distractor1','distractor2','distractor(unsure)']
for c in col:
  df[c] = df[c].astype(str)
df['label'] = df['label'].astype(int)
train_dataset = Dataset.from_pandas(df, split = "train")

def preprocess_function(sample):
    questions = [[ques] * 4 for ques in sample["question"]]
    options = [option for option in sample["choice_list"]]
    questions = sum(questions, [])
    options = sum(options, [])
    tokenizer_output = tokenizer(questions, options, truncation = True)
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenizer_output.items()}

tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
od = tokenized_train.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
sd = tokenized_train.filter(lambda data: "_SR" in data["id"])
cd = tokenized_train.filter(lambda data: "_CR" in data["id"])

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

In [7]:
def split(dataset1, dataset2, dataset3):
    df1 = dataset1.to_pandas()
    df2 = dataset2.to_pandas()
    df3 = dataset3.to_pandas()

    train_df1, temp_df1 = train_test_split(df1, test_size = 0.2, shuffle = False)
    val_df1, test_df1 = train_test_split(temp_df1, test_size = 0.5, shuffle = False)

    train_df2, temp_df2 = train_test_split(df2, test_size = 0.2, shuffle = False)
    val_df2, test_df2 = train_test_split(temp_df2, test_size = 0.5, shuffle = False)

    train_df3, temp_df3 = train_test_split(df3, test_size = 0.2, shuffle = False)
    val_df3, test_df3 = train_test_split(temp_df3, test_size = 0.5, shuffle = False)

    train_dataset1 = Dataset.from_pandas(train_df1)
    val_dataset1 = Dataset.from_pandas(val_df1)
    test_dataset1 = Dataset.from_pandas(test_df1)

    train_dataset2 = Dataset.from_pandas(train_df2)
    val_dataset2 = Dataset.from_pandas(val_df2)
    test_dataset2 = Dataset.from_pandas(test_df2)

    train_dataset3 = Dataset.from_pandas(train_df3)
    val_dataset3 = Dataset.from_pandas(val_df3)
    test_dataset3 = Dataset.from_pandas(test_df3)

    train_dataset = (concatenate_datasets([train_dataset1, train_dataset2, train_dataset3])).shuffle(seed = 42)
    val_dataset = (concatenate_datasets([val_dataset1, val_dataset2, val_dataset3])).shuffle(seed = 42)
    test_dataset = (concatenate_datasets([test_dataset1, test_dataset2, test_dataset3])).shuffle(seed = 42)

    return test_dataset1, test_dataset2, test_dataset3, DatasetDict({"train": train_dataset,"val": val_dataset,"test": test_dataset})


ori_test, sem_test, cont_test, my_dataset = split(od,sd,cd)

print("Training dataset size:", len(my_dataset['train']))
print("Validation dataset size:", len(my_dataset['val']))
print("Testing dataset size:", len(my_dataset['test']))

Training dataset size: 405
Validation dataset size: 51
Testing dataset size: 51


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [8]:
lst = ['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'choice_list', 'choice_order']
final_dataset = (my_dataset.rename_column('label','labels')).remove_columns(lst)
final_dataset.set_format("torch")
os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = AutoModelForMultipleChoice.from_pretrained(model_name, ignore_mismatched_sizes = True).to(device)

bs = 2
lr = 3e-5
epochs = 5
num_samples = len(final_dataset["train"])
optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
lr_scheduler = get_scheduler(name = "linear", optimizer = optimizer, num_warmup_steps = 0, num_training_steps = (num_samples // bs) * epochs)

config.json:   0%|          | 0.00/18.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at sileod/deberta-v3-large-tasksource-nli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([1, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [10]:
@dataclass
class DataCollatorForMultipleChoice:

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [11]:
training_arguments = TrainingArguments(
    output_dir = "/content/drive/MyDrive/last_try_output",
    overwrite_output_dir = True,
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    learning_rate = lr,
    num_train_epochs = epochs,
    per_device_train_batch_size = bs,
    per_device_eval_batch_size = bs,
    save_strategy = "epoch"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
data_collator = DataCollatorForMultipleChoice(tokenizer = tokenizer)
trainer = Trainer(
    model = model,
    args = training_arguments,
    data_collator = data_collator,
    train_dataset = final_dataset["train"],
    eval_dataset = final_dataset["val"],
    optimizers = (optimizer, lr_scheduler),
    compute_metrics = compute_metrics
)

In [13]:
train_result = trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3787,0.615816,0.803922
2,0.0257,0.798732,0.764706
3,0.0,0.861898,0.784314
4,0.0,0.880991,0.803922
5,0.0,0.881758,0.803922


In [14]:
def get_predictions(dataset, model, tokenizer):

    model.eval()
    predictions = []
    targets = []

    for sample in dataset:
        ques = sample['question'].strip()
        choices = sample['choice_list']
        true_label = sample['label']
        inputs = tokenizer([[ques, choices[0].strip()], [ques, choices[1].strip()], [ques, choices[2].strip()], [ques, choices[3].strip()]], return_tensors = "pt", padding = True).to(device)

        with torch.no_grad():
            outputs = model(**{key: value.unsqueeze(0) for key, value in inputs.items()})
        logits = outputs.logits
        predicted_class = logits.argmax().item()
        predictions.append(predicted_class)
        targets.append(true_label)

    return predictions, targets

In [27]:
checkpoint_path = "/content/drive/MyDrive/last_try_output/checkpoint-1015"
model = AutoModelForMultipleChoice.from_pretrained(checkpoint_path, ignore_mismatched_sizes = True).to(device)

In [28]:
pred_o, tar_o = get_predictions(ori_test, model, tokenizer)
pred_s, tar_s = get_predictions(sem_test, model, tokenizer)
pred_c, tar_c = get_predictions(cont_test, model, tokenizer)
pred_f, tar_f = get_predictions(my_dataset['test'], model, tokenizer)

original_acc = accuracy_score(tar_o, pred_o)
semantic_acc = accuracy_score(tar_s, pred_s)
context_acc = accuracy_score(tar_c, pred_c)
overall_acc = accuracy_score(tar_f, pred_f)

In [17]:
print("203")
print("Accuracy on Original Dataset:", original_acc)
print("Accuracy on Semantic Dataset:", semantic_acc)
print("Accuracy on Context Dataset:", context_acc)
print("Overall Accuracy:", overall_acc)

203
Accuracy on Original Dataset: 0.8235294117647058
Accuracy on Semantic Dataset: 0.8235294117647058
Accuracy on Context Dataset: 0.8235294117647058
Overall Accuracy: 0.8235294117647058
