In [1]:
!pip install datasets
!pip install accelerate -U transformers[torch]

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

In [2]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import Dataset

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

# The tokenizer and the two-label classifier are loaded from the same checkpoint.
checkpoint = "distilbert-base-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = model.to(device)

# BoolQ as published on the Hugging Face Hub.
dataset = load_dataset("boolq")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [3]:
class BoolQDataset(Dataset):
    """Map-style dataset that tokenizes BoolQ (question, passage) pairs on access.

    Each item is a dict holding the tokenizer's outputs as 1-D tensors plus a
    scalar ``labels`` tensor (1 when the example's ``answer`` is truthy, else 0).
    Items are deliberately left unpadded: a padding collator such as
    ``DataCollatorWithPadding`` can then pad each batch to its own longest
    member instead of every example being stretched to ``max_length``.

    Args:
        tokenizer: Callable that accepts ``(question, passage, **kwargs)`` and
            returns a mapping of tensors with a leading batch axis of size 1.
        examples: Indexable collection whose items expose the keys
            ``'passage'``, ``'question'`` and ``'answer'``.
        max_length: Maximum number of tokens per encoded pair; only the
            passage is truncated to fit.
    """

    def __init__(self, tokenizer, examples, max_length=512):
        self.tokenizer = tokenizer
        self.examples = examples
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples available."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with the label."""
        example = self.examples[idx]

        encoding = self.tokenizer(
            example['question'],
            example['passage'],
            truncation="only_second",  # shorten the passage, never the question
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields tensors with a leading batch axis of size 1;
        # drop exactly that axis so no other size-1 dimension can be collapsed.
        item = {key: value.squeeze(0) for key, value in encoding.items()}

        # bool -> int: 0 for False, 1 for True
        item["labels"] = torch.tensor(int(example['answer']))
        return item

# Only the first 7000 rows of the train split are used for fine-tuning.
num_train_examples = 7000
train_data = dataset["train"].select(range(num_train_examples))
train_dataset = BoolQDataset(tokenizer=tokenizer, examples=train_data)

# Collator handed to the Trainer; it assembles batches with this tokenizer's padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:

# Every option passed to TrainingArguments, grouped by purpose.
training_options = dict(
    # optimisation
    learning_rate=3e-05,
    lr_scheduler_type="linear",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    # logging
    logging_dir='./logs',
    logging_steps=500,
    # checkpointing
    output_dir='./results',
    save_strategy="epoch",
    save_total_limit=1,
)
training_args = TrainingArguments(**training_options)

# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# weight_decay is pinned to 0.0 because that was the default of the transformers
# implementation, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.0,
)

# Number of optimizer updates the schedule must span.
# - train_batch_size is the per-device batch size multiplied by the device count.
# - Ceil division counts the final partial batch of an epoch (assumes
#   dataloader_drop_last is left at its default of False).
# - Gradient accumulation merges several batches into a single update.
effective_batch_size = training_args.train_batch_size
batches_per_epoch = (len(train_dataset) + effective_batch_size - 1) // effective_batch_size
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
total_steps = int(updates_per_epoch * training_args.num_train_epochs)

# Linear decay from the initial learning rate to zero, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Hand the explicitly built optimizer/scheduler pair to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
)
trainer.train()

# Write the fine-tuned model and its tokenizer into the same directory.
export_dir = "./distilbert-boolq"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Switch to evaluation mode for the inference cells that follow; as the cell's
# last expression, the returned module is also displayed.
model.eval()




Step,Training Loss
500,0.6477
1000,0.5813
1500,0.647
2000,0.5829
2500,0.5457
3000,0.5246
3500,0.5627
4000,0.4085
4500,0.4456
5000,0.4319


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:

def generate_answer(context, question):
    """Classify one (question, passage) pair and return "Yes" or "No".

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        context: Passage text the question is asked about.
        question: Yes/no question text.

    Returns:
        "Yes" when the highest-scoring logit is class 1, otherwise "No".
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=512,
        # Same truncation rule as at training time: shorten the passage only.
        truncation="only_second",
    ).to(device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return "Yes" if predicted_class == 1 else "No"

validation_data = dataset["validation"]

# Spot-check the model on the first ten validation examples.
for example in validation_data.select(range(10)):
    context, question = example['passage'], example['question']
    generated_answer = generate_answer(context, question)
    if example['answer']:
        ground_truth = "Yes"
    else:
        ground_truth = "No"

    # One call emits the same four lines, followed by the blank separator line.
    print(
        f"Context: {context}",
        f"Question: {question}",
        f"Generated Answer: {generated_answer}",
        f"Ground Truth: {ground_truth}\n",
        sep="\n",
    )

Context: All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropic

In [7]:

# NOTE(review): the file name mentions gpt2-medium although this notebook evaluates
# a DistilBERT classifier, and append mode means every re-run adds another copy of
# the results. Both are kept unchanged here to preserve the cell's side effects;
# confirm whether a rename and/or write mode is wanted.
results_path = 'gpt2-medium-BOOLQ.txt'

# Evaluate on at most the first 100 validation examples; the denominator below is
# derived from this selection, so the two can never drift apart.
eval_examples = validation_data.select(range(min(100, len(validation_data))))
num_evaluated = len(eval_examples)

exact_match_count = 0

# A single handle is used for both the per-example records and the final score line.
with open(results_path, 'a', encoding='utf-8') as file:
    for example in eval_examples:
        passage = example['passage']
        question = example['question']
        generated_answer = generate_answer(passage, question)
        true_answer = "yes" if example['answer'] else "no"

        # Case-insensitive comparison: the model helper answers "Yes"/"No".
        if true_answer == generated_answer.lower():
            exact_match_count += 1

        file.write(f"Question: {question}\n")
        file.write(f"Generated Answer: {generated_answer}\n")
        file.write(f"True Answer: {true_answer}\n\n")

    # Guard against an empty selection instead of dividing by zero.
    exact_match_score = exact_match_count / num_evaluated if num_evaluated else 0.0
    file.write(f"Exact Match Score: {exact_match_score * 100:.2f}%\n")

print(f"Exact Match Score: {exact_match_score * 100:.2f}%")

Exact Match Score: 73.00%
