In [1]:
!pip install datasets
!pip install accelerate -U transformers[torch]

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

In [2]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from transformers import DistilBertTokenizerFast, DistilBertForMultipleChoice, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-cased").to(device)

dataset = load_dataset("race", "all")

#mapping from answer letters to integers
answer_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}



Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForMultipleChoice were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

In [3]:

class RACEDataset(Dataset):
    def __init__(self, tokenizer, examples):
        self.tokenizer = tokenizer
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        context = example['article']
        question = example['question']
        options = example['options']
        label = answer_mapping[example['answer']]  # convert label from letter to integer

        context_question = context + " " + question
        tokenized_context_question = self.tokenizer(
            context_question,
            truncation=True,
            max_length=460, 
            padding="max_length",
            return_tensors="pt"
        )

        #tokenize each option separately and concatenate with tokenized context + question
        input_ids = []
        attention_mask = []
        for option in options:
            tokenized_option = self.tokenizer(
                option,
                truncation=True,
                max_length=52,  
                padding="max_length",
                return_tensors="pt"
            )

            #concatenate context+question and options 
            concatenated_ids = torch.cat((tokenized_context_question['input_ids'], tokenized_option['input_ids']), dim=1)
            concatenated_mask = torch.cat((tokenized_context_question['attention_mask'], tokenized_option['attention_mask']), dim=1)

            input_ids.append(concatenated_ids.squeeze()[:512])
            attention_mask.append(concatenated_mask.squeeze()[:512])

        input_ids = torch.stack(input_ids)
        attention_mask = torch.stack(attention_mask)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_data = dataset["train"].select(range(7000))  
train_dataset = RACEDataset(tokenizer, train_data)

In [4]:


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=4,
    learning_rate=3e-05,
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=1,
    lr_scheduler_type="linear",
    save_strategy="epoch"
)

optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    betas=(0.9, 0.999),
    eps=1e-08
)

total_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

trainer.train()

model.save_pretrained("./distilbert-race")
tokenizer.save_pretrained("./distilbert-race")

model.eval()
#problem with training, training loss is not decreasing



Step,Training Loss
500,1.3893
1000,1.3893
1500,1.3916
2000,1.3888
2500,1.3902
3000,1.3895
3500,1.388
4000,1.392
4500,1.388
5000,1.389


DistilBertForMultipleChoice(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
         

In [8]:

def generate_answer(context, question, options):
    inputs = tokenizer(
        [context] * len(options),
        options,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length"
    ).to(device)

    input_ids = inputs['input_ids'].view(1, len(options), -1)
    attention_mask = inputs['attention_mask'].view(1, len(options), -1)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    answer = options[predicted_class]

    return answer

validation_data = dataset["validation"]

for example in validation_data.select(range(7)):  
    context = example['article']
    question = example['question']
    options = example['options']
    generated_answer = generate_answer(context, question, options)
    ground_truth = options[answer_mapping[example['answer']]]  #convert label from letter to integer

    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Options: {options}")
    print(f"Generated Answer: {generated_answer}")
    print(f"True Answer: {ground_truth}\n")


Context: I am a psychologist. I first met Timothy, a quiet, overweight eleven-year-old boy, when his mother brought him to me to discuss his declining grades. A few minutes with Timothy were enough to confirm that his self-esteem  and general happiness were falling right along with _ . I asked about Timothy's typical day. He awoke every morning at six thirty so he could reach his school by eight and arrived home around four thirty each afternoon. He then had a quick snack, followed by either a piano lesson or a lesson with his math tutor. He finished dinner at 7 pm, and then he sat down to do homework for two to three hours. Quickly doing the math in my head, I found that Timothy spent an average of thirteen hours a day at a writing desk.
What if Timothy spent thirteen hours a day at a sewing machine instead of a desk? We would immediately be shocked, because that would be called children being horribly mistreated. Timothy was far from being mistreated, but the mountain of homework he 

In [6]:

correct_answer_count = 0

with open('distilbert-RACE.txt', 'a') as file:
    for i, example in enumerate(validation_data.select(range(100))):
        context = example['article']
        question = example['question']
        options = example['options']
        true_answer = options[answer_mapping[example['answer']]]
        generated_answer = generate_answer(context, question, options)

        if generated_answer == true_answer:
            correct_answer_count += 1

        file.write(f"Question {i}: {question}\n")
        file.write(f"Generated Answer: {generated_answer}\n")
        file.write(f"Options: {options}\n")
        file.write(f"True Answer: {true_answer}\n\n")

correct_answer_score = correct_answer_count / 100
print(f"Exact Match Score: {correct_answer_score * 100:.2f}%")

with open('distilbert-RACE.txt', 'a') as file:
    file.write(f"Exact Match Score: {correct_answer_score * 100:.2f}%\n")

Exact Match Score: 29.00%
