In [1]:
!pip install accelerate -U transformers[torch]
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m163.8/309.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset
import pandas as pd

#load RACE dataset
dataset = load_dataset("race", "all")
print(dataset)

def check_for_missing_data(dataset):
    for example in dataset:
        passage = example['article']
        question = example['question']
        answer = example['answer']

        if not passage or not question or answer not in ("A", "B", "C", "D"):
            print("Found missing data")
            continue

check_for_missing_data(dataset['train'])
check_for_missing_data(dataset['validation'])

pd.DataFrame(dataset['train']).head(2)


Downloading readme:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4934
    })
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 87866
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4887
    })
})


Unnamed: 0,example_id,article,answer,question,options
0,high19088.txt,Last week I talked with some of my students ab...,C,We can know from the passage that the author w...,"[doctor, model, teacher, reporter]"
1,high19088.txt,Last week I talked with some of my students ab...,C,Many graduates today turn to cosmetic surgery ...,"[marry a better man/woman, become a model, get..."


In [4]:

class RACEDataset(Dataset):
    def __init__(self, tokenizer, examples, block_size, max_context_length):
        self.tokenizer = tokenizer
        self.examples = examples
        self.block_size = block_size
        self.max_context_length=max_context_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        context = " ".join(example['article'].split()[:self.max_context_length])
        question = example['question']
        answer = example['answer']
        options = example['options']

        #combine context, question, options, and answer
        options_text = "".join([f"{chr(ord('A') + i)}: {opt} " for i, opt in enumerate(options)])
        question_text = f"Context: {context}\nQuestion: {question}\nOptions: {options_text}\n"

        #concatenate selected option with answer
        selected_option_index = ord(answer) - ord('A')  #convert answer to index (0 for A, 1 for B, ...)
        selected_option_text = options[selected_option_index]
        answer_with_option = f"Answer: The correct answer is {answer}. Full answer: {selected_option_text}"

        text = f"{question_text}{answer_with_option}"
        tokenized_inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.block_size,
            return_tensors="pt"
        )
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"]
        return {k: v.squeeze() for k, v in tokenized_inputs.items()}


block_size = 457
max_context_length = 300
train_data = dataset["train"].select(range(7000))

train_dataset = RACEDataset(tokenizer, train_data, block_size, max_context_length)


In [5]:
from transformers import AdamW, get_linear_schedule_with_warmup

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=3e-05,
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=1,
    lr_scheduler_type="linear",
    save_strategy="epoch"
)

optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    betas=(0.9, 0.999),
    eps=1e-08
)

total_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

trainer.train()

#save the fine-tuned model
model.save_pretrained("./gpt2-medium-race")
tokenizer.save_pretrained("./gpt2-medium-race")

model.eval()




Step,Training Loss
500,2.4786
1000,2.3216
1500,2.2832
2000,2.181
2500,2.1476
3000,2.107
3500,2.0346
4000,1.8339
4500,1.8277
5000,1.761


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [6]:

def generate_answer(context, question, options):
    capitalized_options = [opt.capitalize() for opt in options]
    options_text = "".join([f"{chr(ord('A') + i)}: {opt} " for i, opt in enumerate(capitalized_options)])
    input_text = f"Context: {context}\nQuestion: {question}\nOptions: {options_text}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    
    output = model.generate(input_ids, max_new_tokens=150, num_return_sequences=1, no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    #extract the answer part
    answer_start = generated_text.find("Answer:") + len("Answer:")
    answer = generated_text[answer_start:].strip()
    return answer


validation_data = dataset["validation"]

for i, example in enumerate(validation_data.select(range(7))):
    context = example['article']
    question = example['question']
    options = example['options']
    true_answer = example['answer']
    generated_answer = generate_answer(context, question, options)

    print(f"Question {i+1}: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Options: {options}")
    print(f"Generated Answer: {generated_answer}\n")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question 1: What did the writer think of Timothy after learning about his typical day?
True Answer: C
Options: ['Timothy was very hardworking.', 'Timothy was being mistreated.', 'Timothy had a heavy burden.', 'Timothy was enjoying his childhood.']
Generated Answer: The correct answer is C. Full answer: Timothy had an heavy burdens.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question 2: Which of the following statements best describes the writer's opinion?
True Answer: A
Options: ['Children should be allowed enough time to play.', 'Playing board games works better than playing video games.', 'The more they play, the more creative children will become.', 'The depression caused by homework makes children unwilling to play.']
Generated Answer: The correct answer is C. Full answer: the better they do,
   _ .



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question 3: According to the passage, how long should a thirdgrader spend a day doing homework?
True Answer: C
Options: ['About ten minutes.', 'No more than twenty minutes.', 'No more than thirty minutes.', 'About fifty minutes.']
Generated Answer: The correct answer is B. Full answer: no longer than 20 minutes



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question 4: According to the report,   _  .
True Answer: A
Options: ["people won't necessarily lose jobs", 'big companies will face fewer challenges', 'small companies will win against big companies', 'most people will become interested in technology']
Generated Answer: The correct answer is C. Full answer: Little companies won win over big ones



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question 5: We can infer from the text that in the future   _  .
True Answer: B
Options: ['people will face many difficulties', 'people will take up more creative jobs', "artificial intelligence will threaten people's lives", 'most jobs will be done in traditional workplaces']
Generated Answer: The correct answer is B. Full answer: Men will do more inventive jobs



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question 6: What is the attitude of most experts in artificial intelligence and robotics to the future?
True Answer: C
Options: ['Mixed.', 'Worried.', 'Hopeful.', 'Doubtful.']
Generated Answer: The correct answer is B. Full answer: Concerned.

Question 7: What would be the best title for this passage?
True Answer: A
Options: ['History of rings', 'Interesting facts about rings', 'Different beliefs about rings', 'The meaning of the rings.']
Generated Answer: The correct answer is B. Full answer: Interesting facts About rings



In [9]:

correct_answer_count = 0

#open file to save results
with open('gpt2-medium-RACE.txt', 'a') as file:
    for i, example in enumerate(validation_data.select(range(100))):
        context = example['article']
        question = example['question']
        options = example['options']
        true_answer = example['answer']
        generated_answer = generate_answer(context, question, options)

        generated_option_start = generated_answer.find("The correct answer is") + len("The correct answer is")
        generated_option = generated_answer[generated_option_start:generated_option_start + 2].strip().upper()

        if generated_option == true_answer:
            correct_answer_count += 1

        file.write(f"Question {i}: {question}\n")
        file.write(f"Generated Answer: {generated_answer}\n")
        file.write(f"Options: {options}\n")
        file.write(f"True Answer: {true_answer}\n\n")

correct_answer_score = correct_answer_count / 100
print(f"Exact Match Score: {correct_answer_score * 100:.2f}%")

with open('gpt2-medium-RACE.txt', 'a') as file:
    file.write(f"Exact Match Score: {correct_answer_score * 100:.2f}%\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Exact Match Score: 23.00%
