In [1]:
pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [39]:
from datasets import load_dataset

dataset = load_dataset("squad")


In [40]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [31]:
reduced_train_dataset = dataset["train"].select(range(100))
reduced_validation_dataset = dataset["train"].select(range(40))




In [38]:
dataset["train"] = reduced_train_dataset
dataset["validation"] = reduced_validation_dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 37
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 15
    })
})

In [41]:
def filter_one_word_answers(example):
    return all(len(answer.split()) == 1 for answer in example["answers"]["text"])

# Apply the filter to the train and validation datasets
reduced_train_dataset = dataset["train"].filter(filter_one_word_answers)
reduced_validation_dataset = dataset["validation"].filter(filter_one_word_answers)

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [42]:
dataset["train"] = reduced_train_dataset
dataset["validation"] = reduced_validation_dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 30267
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 2584
    })
})

In [43]:
dataset["train"][3]

{'id': '5733bed24776f41900661188',
 'title': 'University_of_Notre_Dame',
 'context': 'The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.',
 'question': 'Where is the headquarters of the Congregation of the Holy Cross?',
 'answers': {'text': ['Rome'], 'answer_start': [119]}}

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments


tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [44]:
def preprocess_function(examples):
    # Inputs: Combine the passage and answer for the model to generate the question
    inputs = [
        f" Context: {context} Answer: {answer['text'][0]}"
        for context, answer in zip(examples["context"], examples["answers"]) ]

    targets = examples['question']

    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/30267 [00:00<?, ? examples/s]

Map:   0%|          | 0/2584 [00:00<?, ? examples/s]

In [45]:
tokenized_dataset['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 30267
})

In [None]:
import os
os.environ["WANDB_DISABLED"] = "false"

In [46]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2475,0.211099
2,0.2338,0.203629
3,0.2149,0.202037


TrainOutput(global_step=11352, training_loss=0.278860696043407, metrics={'train_runtime': 4293.8703, 'train_samples_per_second': 21.147, 'train_steps_per_second': 2.644, 'total_flos': 1.2289170915459072e+16, 'train_loss': 0.278860696043407, 'epoch': 3.0})

In [52]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./results/checkpoint-1000")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Prepare input for question generation
input_text = (
    "context: The Lobund Institute grew out of pioneering research in germ-free-life "
    "which began in 1928. This area of research originated in a question posed by Pasteur as to whether "
    "animal life was possible without bacteria. Though others had taken up this idea, their research was short-lived "
    "and inconclusive. Lobund was the first research organization to answer definitively, that such life is possible "
    "and that it can be prolonged through generations. But the objective was not merely to answer Pasteur's question "
    "but also to produce the germ-free animal as a new tool for biological and medical research. answer: 1928"
)

# Tokenize the input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate the question
output_ids = model.generate(
    input_ids,
    max_length=128,  # Adjust as needed
    num_beams=4,     # Beam search for better results
    early_stopping=True
)

# Decode the generated tokens to get the predicted question
predicted_question = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the predicted question
print(f"Predicted Question: {predicted_question}")


Predicted Question: What year did The Lobund Institute begin research in germ-free-life?


In [55]:
!zip -r /content/mcqgen2.zip ./results/checkpoint-11000

  adding: results/checkpoint-11000/ (stored 0%)
  adding: results/checkpoint-11000/generation_config.json (deflated 29%)
  adding: results/checkpoint-11000/training_args.bin (deflated 51%)
  adding: results/checkpoint-11000/trainer_state.json (deflated 82%)
  adding: results/checkpoint-11000/rng_state.pth (deflated 25%)
  adding: results/checkpoint-11000/config.json (deflated 62%)
  adding: results/checkpoint-11000/scheduler.pt (deflated 56%)
  adding: results/checkpoint-11000/optimizer.pt (deflated 7%)
  adding: results/checkpoint-11000/model.safetensors (deflated 9%)
