In [None]:
!pip install transformers datasets sentencepiece accelerate



In [None]:
from datasets import load_dataset
from collections import defaultdict
import pandas as pd

In [None]:
def preprocess_squad_dataset(max_qa_per_context=5):
    """Build multi-question generation examples from the SQuAD training split.

    Question/answer pairs are grouped by their whitespace-stripped context.
    Contexts that collected fewer than two pairs are dropped; for the rest, the
    first ``max_qa_per_context`` pairs (in dataset order) form one example.

    Args:
        max_qa_per_context: Maximum number of Q/A pairs kept per context.

    Returns:
        A tuple ``(inputs, targets)`` of equally long lists of strings. Each
        input is the context prefixed with the task instruction; each target
        is the numbered ``Qn:``/``An:`` lines joined by newlines.
    """
    squad_train = load_dataset("squad", split="train")

    # Collect (question, answer) pairs under the paragraph they refer to.
    pairs_by_context = defaultdict(list)
    for record in squad_train:
        paragraph = record["context"].strip()
        question = record["question"].strip()
        answer_texts = record["answers"]["text"]
        # Only the first listed answer is used; an empty list falls back to "".
        first_answer = answer_texts[0].strip() if answer_texts else ""
        pairs_by_context[paragraph].append((question, first_answer))

    inputs = []
    targets = []
    for paragraph, pairs in pairs_by_context.items():
        # A single pair is not a multi-question example, so keep only 2+.
        if len(pairs) >= 2:
            kept = pairs[:max_qa_per_context]
            numbered = (
                f"Q{n}: {question}\nA{n}: {answer}"
                for n, (question, answer) in enumerate(kept, start=1)
            )
            inputs.append(f"generate questions and answers: {paragraph}")
            targets.append("\n".join(numbered))

    return inputs, targets

In [None]:
def preprocess_sciq_dataset(max_qa_per_context=3):
    """Build multi-question generation examples from the SciQ training split.

    Question/answer pairs are grouped by their whitespace-stripped ``support``
    paragraph. Rows whose support is empty are skipped: otherwise they would
    all be grouped under the empty string and produce a context-free example
    made of unrelated questions. Contexts with fewer than two pairs are
    dropped, and at most ``max_qa_per_context`` pairs are kept per context.

    Args:
        max_qa_per_context: Maximum number of Q/A pairs kept per context.

    Returns:
        A tuple ``(inputs, targets)`` of equally long lists of strings. Each
        input is the context prefixed with the task instruction; each target
        is the numbered ``Qn:``/``An:`` lines joined by newlines.
    """
    dataset = load_dataset("sciq", split="train")

    context_dict = defaultdict(list)

    for item in dataset:
        context = item["support"].strip()
        if not context:
            # No supporting paragraph: there is nothing to generate questions from.
            continue
        question = item["question"].strip()
        answer = item["correct_answer"].strip()
        context_dict[context].append((question, answer))

    inputs, targets = [], []

    for context, qa_list in context_dict.items():
        # A single pair is not a multi-question example.
        if len(qa_list) < 2:
            continue
        qa_list = qa_list[:max_qa_per_context]

        input_text = f"generate questions and answers: {context}"
        target_text = "\n".join(
            f"Q{i}: {q}\nA{i}: {a}" for i, (q, a) in enumerate(qa_list, start=1)
        )
        inputs.append(input_text)
        targets.append(target_text)

    return inputs, targets

In [None]:
# Build the examples for each source dataset.
squad_inputs, squad_targets = preprocess_squad_dataset()
sciq_inputs, sciq_targets = preprocess_sciq_dataset()

# Stack them with the SQuAD examples first and the SciQ examples after.
all_inputs = [*squad_inputs, *sciq_inputs]
all_targets = [*squad_targets, *sciq_targets]

# One row per example: the prompt in 'input', the numbered Q/A text in 'target'.
df = pd.DataFrame({'input': all_inputs, 'target': all_targets})

ValueError: Invalid pattern: '**' can only be an entire path component

In [None]:
# Persist the combined examples as CSV, without the pandas index column.
df.to_csv("./qa_dataset.csv", index=False)

In [None]:
# Read the CSV back as a Hugging Face dataset; the loader exposes it under 'train'.
raw_dataset = load_dataset('csv', data_files='qa_dataset.csv')['train']

# Hold out 10% of the rows for validation (90% train); the fixed seed keeps
# the split repeatable across runs.
split_dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, valid_dataset = split_dataset['train'], split_dataset['test']

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Token budgets: longer inputs/targets are truncated, shorter ones are padded.
max_input_length = 512
max_target_length = 256


# Tokenization function
def preprocess_function(example):
    """Tokenize inputs and targets for seq2seq fine-tuning.

    Args:
        example: A mapping with 'input' and 'target' entries. Each entry may be
            a single string or a list of strings (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so they are ignored by the loss instead of being learned as
        tokens to predict.
    """
    model_input = tokenizer(
        example['input'],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=example['target'],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Replace padding ids with -100, the label value excluded from the loss.
        return [(-100 if token == pad_id else token) for token in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of token ids.
        model_input["labels"] = _mask_padding(label_ids)
    else:
        # Batched call: one list of token ids per example.
        model_input["labels"] = [_mask_padding(ids) for ids in label_ids]
    return model_input

# tokenized_dataset = dataset.map(preprocess_function, batched=True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Load the pretrained checkpoint named by `model_name` as a conditional-generation model.
model = T5ForConditionalGeneration.from_pretrained(model_name)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Evaluate and checkpoint once per epoch. The best checkpoint by validation
# loss is reloaded when training ends, so the weights saved afterwards are the
# best ones rather than simply those of the final epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir= "./t5-mqg-base",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    # NOTE(review): T5 checkpoints are reported to be numerically fragile under
    # fp16; consider bf16 where the hardware supports it — confirm.
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower validation loss is better
)

# Both splits are tokenized on the fly with `preprocess_function` in batched mode.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.map(preprocess_function, batched=True),
    eval_dataset=valid_dataset.map(preprocess_function, batched=True),
    tokenizer=tokenizer,
)

Map:   0%|          | 0/16726 [00:00<?, ? examples/s]



Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


In [None]:
# Run the fine-tuning loop configured in `training_args`.
# NOTE(review): the recorded run prompted for a Weights & Biases API key at this
# point — presumably wandb reporting is enabled by default in that environment;
# confirm before running unattended.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mliu-hengky-laurencio[0m ([33mliu-hengky-laurencio-universitas-tarumanagara[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.4863,0.433964
2,0.5321,0.493659
3,0.5359,0.479024


TrainOutput(global_step=12546, training_loss=0.5343335326657082, metrics={'train_runtime': 6010.4309, 'train_samples_per_second': 8.348, 'train_steps_per_second': 2.087, 'total_flos': 3.055628903251968e+16, 'train_loss': 0.5343335326657082, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model and the tokenizer files into the same directory,
# which the pipelines further down load from.
trainer.save_model("t5-mqg-base-model")
tokenizer.save_pretrained("t5-mqg-base-model")

('t5-mqg-base-model/tokenizer_config.json',
 't5-mqg-base-model/special_tokens_map.json',
 't5-mqg-base-model/spiece.model',
 't5-mqg-base-model/added_tokens.json')

In [None]:
from transformers import pipeline

# Load the fine-tuned model and its tokenizer from the saved directory.
pipe = pipeline(
    "text2text-generation",
    model="./t5-mqg-base-model",
    tokenizer="./t5-mqg-base-model",
)

# The prompt carries the same task prefix that was used to build the training inputs.
context = "generate questions and answers: The lungs are the main organs in the human respiratory system. When we inhale, oxygen from the air enters through the nose, passes through the throat, and reaches the lungs. In the lungs, oxygen is exchanged for carbon dioxide in the alveoli, and then carbon dioxide is released when we exhale. This process is called external respiration and is essential for human survival."

# The pipeline returns one dict per input; take the text of the first (only) one.
generated = pipe(context, max_length=256, clean_up_tokenization_spaces=True)
output = generated[0]['generated_text']

print(output)

Device set to use cuda:0


Q1: What are the main organs in the human respiratory system? A1: The lungs Q2: What is the process called when oxygen enters through the nose? A2: external respiration Q3: What is the process called when oxygen is exchanged for carbon dioxide? A3: carbon dioxide Q4: What is the process called when oxygen is exchanged for carbon dioxide? A4: alveoli Q5: What is the process called when carbon dioxide is released? A5: exhale


In [None]:
# Recursively bundle the saved model directory into one archive (presumably for download).
!zip -r t5-mqg-base-model.zip t5-mqg-base-model

  adding: t5-mqg-base-model/ (stored 0%)
  adding: t5-mqg-base-model/tokenizer_config.json (deflated 94%)
  adding: t5-mqg-base-model/added_tokens.json (deflated 83%)
  adding: t5-mqg-base-model/model.safetensors (deflated 8%)
  adding: t5-mqg-base-model/special_tokens_map.json (deflated 85%)
  adding: t5-mqg-base-model/config.json (deflated 63%)
  adding: t5-mqg-base-model/training_args.bin (deflated 51%)
  adding: t5-mqg-base-model/spiece.model (deflated 48%)
  adding: t5-mqg-base-model/generation_config.json (deflated 29%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!unzip t5-mqg-base-model.zip

Archive:  t5-mqg-base-model.zip
   creating: t5-mqg-base-model/
  inflating: t5-mqg-base-model/tokenizer_config.json  
  inflating: t5-mqg-base-model/added_tokens.json  
  inflating: t5-mqg-base-model/model.safetensors  
  inflating: t5-mqg-base-model/special_tokens_map.json  
  inflating: t5-mqg-base-model/config.json  
  inflating: t5-mqg-base-model/training_args.bin  
  inflating: t5-mqg-base-model/spiece.model  
  inflating: t5-mqg-base-model/generation_config.json  


In [None]:
# Display the validation split's columns and row count.
valid_dataset

Dataset({
    features: ['input', 'target'],
    num_rows: 1859
})

In [None]:
!pip install evaluate rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8886a04c5f2b4e5ce70341d03e3a78a5fea6d2a5bf4cf5b1b2c31619e185dc46
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from transformers import pipeline
from datasets import Dataset
import evaluate
from tqdm import tqdm

In [None]:
# Load the fine-tuned model for generation on the first CUDA device.
pipe = pipeline(
    "text2text-generation",
    model="./t5-mqg-base-model",
    tokenizer="./t5-mqg-base-model",
    device=0
)

# Metrics used to compare generated text against the gold targets.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Materialise the columns as plain Python lists so that slicing them into
# batches and passing them to the metrics always deals with list-of-str values.
# NOTE(review): newer `datasets` releases reportedly return a lazy column object
# from `dataset['name']` rather than a list — confirm against the installed version.
inputs = list(valid_dataset['input'])
targets = list(valid_dataset['target'])

Device set to use cuda:0


In [None]:
batch_size = 16
predictions = []

print("Generating predictions in batch...")
# Walk over the validation prompts in consecutive slices of `batch_size`.
batch_starts = range(0, len(inputs), batch_size)
for start in tqdm(batch_starts, desc="Predicting"):
    chunk = inputs[start:start + batch_size]
    generated = pipe(
        chunk,
        max_length=256,
        clean_up_tokenization_spaces=True,
        batch_size=batch_size,
    )
    # Each result is a dict; keep only the generated text, in input order.
    for result in generated:
        predictions.append(result['generated_text'])

Generating predictions in batch...


Predicting: 100%|██████████| 117/117 [06:38<00:00,  3.40s/it]


In [None]:
# The gold target strings of the validation split act as the metric references.
references = targets

In [None]:
# BLEU of the generated strings against their gold targets.
bleu_result = bleu.compute(predictions=predictions, references=references)
print(f"BLEU: {bleu_result['bleu']:.4f}")
print("")

# ROUGE over the same pairs; the three variants are printed one per line.
rouge_result = rouge.compute(predictions=predictions, references=references)
print(
    f"ROUGE-1: {rouge_result['rouge1']:.4f}",
    f"ROUGE-2: {rouge_result['rouge2']:.4f}",
    f"ROUGE-L: {rouge_result['rougeL']:.4f}",
    sep="\n",
)

BLEU: 0.2003
ROUGE-1: 0.4624
ROUGE-2: 0.1755
ROUGE-L: 0.3418
