In [1]:
# Folder holding the training arrays; the trailing slash lets file names be appended directly.
data_path = "./data/"
# Locations of the SP and WP training arrays inside that folder.
sp_data_path = f"{data_path}SP-train.npy"
wp_data_path = f"{data_path}WP-train.npy"

In [2]:
import numpy as np
import torch
import transformers
import datasets
from datasets import load_dataset, load_metric
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from transformers import AutoTokenizer

# # do this once
# np_load_old = np.load
# np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# The arrays store Python dicts (object dtype), which NumPy only loads when
# allow_pickle=True is passed explicitly; without it a fresh kernel raises ValueError.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
sp_train = np.load(sp_data_path, allow_pickle=True)
wp_train = np.load(wp_data_path, allow_pickle=True)

In [4]:
# Show the number of examples in each array, one per line (SP first, then WP).
print(len(sp_train), len(wp_train), sep="\n")

507
396


In [5]:
# Per-device batch size shared by the training and evaluation arguments below.
batch_size = 16
# Identifier of the pretrained checkpoint used for both the tokenizer and the model.
model_checkpoint = "bert-base-uncased"

In [6]:
random.seed(0)
# Hold out 10% of each array for validation; the fixed random_state keeps the
# split reproducible and identical in spirit for both arrays.
(train_data_sp, val_data_sp), (train_data_wp, val_data_wp) = (
    train_test_split(puzzles, test_size=0.1, random_state=17)
    for puzzles in (sp_train, wp_train)
)

In [7]:
def combine_sp_wp(sp, wp):
    """Return a new list containing every item of ``sp`` followed by every item of ``wp``.

    Neither input is modified; both only need to be iterable.
    """
    merged = list(sp)
    merged.extend(wp)
    return merged

In [8]:
# Merge the SP and WP portions of each split into one list (SP examples first).
train_data, val_data = (
    combine_sp_wp(sp_part, wp_part)
    for sp_part, wp_part in (
        (train_data_sp, train_data_wp),
        (val_data_sp, val_data_wp),
    )
)

In [9]:
# Load the tokenizer that belongs to `model_checkpoint`; use_fast=True asks for
# the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [23]:
# Show the size of the combined training set, then its first and last examples.
print(len(train_data))
for position in (0, -1):
    print(train_data[position])

812
{'id': 'SP-196_SR', 'question': 'A tank contains ten fish. Two fish drown. Four fish are swimming away.Three fish perish.\xa0How many fish are there left in the tank?', 'answer': 'None of above.', 'distractor1': 'Seven.', 'distractor2': 'Twenty.', 'distractor(unsure)': 'Five.', 'label': 3, 'choice_list': ['Seven.', 'Twenty.', 'Five.', 'None of above.'], 'choice_order': [1, 2, 3, 0]}
{'id': 'WP-46', 'question': 'What 4 days in the week start with T?', 'answer': 'Tuesday, Thursday, Today and Tomorrow.', 'distractor1': 'Monday, Tuesday, Wednesday and Thursday.', 'distractor2': 'Tuesday, Wednesday,Thursday and Friday.', 'distractor(unsure)': 'None of above.', 'label': 1, 'choice_list': ['Monday, Tuesday, Wednesday and Thursday.', 'Tuesday, Thursday, Today and Tomorrow.', 'Tuesday, Wednesday,Thursday and Friday.', 'None of above.'], 'choice_order': [1, 0, 2, 3]}


In [45]:
def preprocess_function(data):
    """Tokenize every (question, choice) pair and regroup the encodings per example.

    Args:
        data: Iterable of dicts, each with a "question" string and a "choice_list"
            sequence of answer strings.

    Returns:
        A dict mapping each tokenizer output key (e.g. "input_ids") to a list with
        one entry per example; each entry is a list with one encoding per choice.

    The number of choices is read from each example's "choice_list" instead of
    being hard-coded to four, so questions and choices always stay aligned.
    Uses the notebook-level ``tokenizer``.
    """
    questions, choices, group_sizes = [], [], []
    for example in data:
        options = list(example["choice_list"])
        # Repeat each question once per choice so the two flat lists line up.
        questions.extend([example["question"]] * len(options))
        choices.extend(options)
        group_sizes.append(len(options))

    # Tokenize all pairs in one call; padding=True pads to the longest pair in `data`.
    tokenized_qa = tokenizer(questions, choices, truncation=True, padding=True)

    # Un-flatten: cut each flat list back into per-example groups of choices.
    grouped = {}
    for key, values in tokenized_qa.items():
        rows, start = [], 0
        for size in group_sizes:
            rows.append(values[start:start + size])
            start += size
        grouped[key] = rows
    return grouped

In [19]:
# Sanity check: encode the first few training examples and decode one of them
# back to text to verify that question and choices are paired as expected.
idx = 3
examples = train_data[:5]
# Each value in `features` is nested as [example][choice][token].
features = preprocess_function(examples)
[tokenizer.decode(features["input_ids"][idx][choice]) for choice in range(4)]

[['A tank contains ten fish. Two fish drown. Four fish are swimming away.Three fish perish.\xa0How many fish are there left in the tank?', 'A tank contains ten fish. Two fish drown. Four fish are swimming away.Three fish perish.\xa0How many fish are there left in the tank?', 'A tank contains ten fish. Two fish drown. Four fish are swimming away.Three fish perish.\xa0How many fish are there left in the tank?', 'A tank contains ten fish. Two fish drown. Four fish are swimming away.Three fish perish.\xa0How many fish are there left in the tank?'], ['I break down walls and devour towers. I consume iron and corrode steel, but still, I am essential to all. Sometimes, people desire me, yet they fear my uncontrolled presence. What am I?', 'I break down walls and devour towers. I consume iron and corrode steel, but still, I am essential to all. Sometimes, people desire me, yet they fear my uncontrolled presence. What am I?', 'I break down walls and devour towers. I consume iron and corrode stee

["[CLS] jimy went in the middle of the warzone and bombarded a lot of territories that were in the possession of the enemy. when he got back, he didn't get any medals or any praises. why? [SEP] jimy was the name of the fighter - jet that was flying over enemy's territory. [SEP]",
 "[CLS] jimy went in the middle of the warzone and bombarded a lot of territories that were in the possession of the enemy. when he got back, he didn't get any medals or any praises. why? [SEP] jimy was really sweaty and smells. [SEP]",
 "[CLS] jimy went in the middle of the warzone and bombarded a lot of territories that were in the possession of the enemy. when he got back, he didn't get any medals or any praises. why? [SEP] jimy is could not lift the trophy because it was too heavy. [SEP]",
 "[CLS] jimy went in the middle of the warzone and bombarded a lot of territories that were in the possession of the enemy. when he got back, he didn't get any medals or any praises. why? [SEP] none of above. [SEP]"]

In [12]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
# Load the pretrained checkpoint with a multiple-choice head. As the warning
# printed below states, the classifier weights are newly initialized, so the
# model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [95]:
# Use the last path component of the checkpoint id to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",  # evaluate on the validation set once per epoch
)

In [14]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each feature is a dict holding a label under "label" or "labels" plus values
    that are indexable per answer choice (e.g. "input_ids" with one token list
    per choice). All choices of all examples are flattened, padded together with
    ``tokenizer.pad``, reshaped to (batch, num_choices, -1) and returned with the
    labels stored under "labels".

    The incoming feature dicts are not modified, so the same features can be
    collated more than once.
    """

    # Tokenizer whose ``pad`` method pads the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad`` as ``padding``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to ``tokenizer.pad`` as ``max_length``.
    max_length: Optional[int] = None
    # Forwarded unchanged to ``tokenizer.pad`` as ``pad_to_multiple_of``.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one batch of tensors.

        Args:
            features: Non-empty list of feature dicts as described on the class.

        Returns:
            Dict of tensors shaped (batch, num_choices, -1) plus an int64
            "labels" tensor with one entry per example.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        num_examples = len(features)
        num_choices = len(features[0]["input_ids"])
        # One flat entry per (example, choice) pair; the label is left out because
        # it is not indexable per choice and is added back after padding.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Un-flatten
        batch = {k: v.view(num_examples, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [92]:
# Tokenizer outputs that are copied onto every example dict.
# NOTE(review): "token_type_ids" is not in this list, so the segment ids produced
# by the tokenizer are not attached to the examples -- confirm this is intended.
accepted_keys = ["input_ids", "attention_mask"]

train_features = preprocess_function(train_data)
val_features = preprocess_function(val_data)

# Store each example's per-choice encodings on the example dict itself (in place).
for examples, encoded in ((train_data, train_features), (val_data, val_features)):
    for position, example in enumerate(examples):
        for key in accepted_keys:
            example[key] = encoded[key][position]

In [90]:
# Build a demo batch from the first ten training examples, keeping only the
# fields the collator works with. The collated tensors are laid out as
# (#examples, #choices, #tokens).
accepted_keys = ["input_ids", "attention_mask", "label"]
train_features = [
    {key: value for key, value in train_data[row].items() if key in accepted_keys}
    for row in range(10)
]
batch = DataCollatorForMultipleChoice(tokenizer)(train_features)

In [91]:
# Decode the four choices of the last example in the demo batch back to text;
# the [PAD] tokens in the output show the padding added by the collator.
[tokenizer.decode(batch["input_ids"][-1][i].tolist()) for i in range(4)]

["[CLS] mark gave james the video tape of a tv advertisement, and mark told james that the advertisement was really good. mark already knew that james didn't see the tape. how did mark knew about that? [SEP] because the tape doesn't smell like the usual perfume james uses. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",
 "[CLS] mark gav

In [88]:
import numpy as np

def compute_metrics(eval_predictions):
    """Compute accuracy from a ``(predictions, label_ids)`` pair.

    The predicted class of each row is the index of its largest score along
    axis 1. Returns ``{"accuracy": <float>}``, the fraction of rows whose
    predicted class equals the label.
    """
    scores, gold = eval_predictions
    is_correct = np.argmax(scores, axis=1) == gold
    accuracy = np.mean(is_correct.astype(np.float32))
    return {"accuracy": accuracy.item()}

In [96]:
# Wire the model, training arguments, datasets, collator and metric together.
# The collator pads every batch of multiple-choice examples and builds the label tensor.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [97]:
# Run fine-tuning; the per-epoch table and the returned TrainOutput are shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.018414,0.615385
2,No log,0.825593,0.681319
3,No log,0.753733,0.703297


TrainOutput(global_step=39, training_loss=0.9365406525440705, metrics={'train_runtime': 1060.1331, 'train_samples_per_second': 2.298, 'train_steps_per_second': 0.037, 'total_flos': 876275279863200.0, 'train_loss': 0.9365406525440705, 'epoch': 3.0})

In [99]:
# Save the fine-tuned model to a local directory so it can be reloaded later.
trainer.save_model("./bert-base-uncased-finetuned/")

In [101]:
# Arguments for inference only: no training, predict in batches of `batch_size`,
# and keep the last (possibly smaller) batch so every example gets a prediction.
test_args = TrainingArguments(
    output_dir = "sample-test",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = batch_size,
    dataloader_drop_last = False
)
# init trainer
# Use the same multiple-choice collator as the training Trainer. Without it the
# default collator is used, which only works while every example has already
# been padded to one common length.
trainer = Trainer(
              model = model,
              args = test_args,
              data_collator = DataCollatorForMultipleChoice(tokenizer),
              compute_metrics = compute_metrics)
# NOTE(review): predictions are made on `val_data`, the same split used for
# per-epoch evaluation during training -- confirm no separate test split is intended.
test_results = trainer.predict(val_data)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
