In [None]:
# Mount Google Drive at /content/drive; the dataset files read later and the
# final model destination both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!pip install datasets

In [None]:
import torch
from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Trainer, TrainingArguments

In [None]:
# Load the training and test splits from the JSON files on Google Drive.
# The test split is also what the Trainer below receives as its eval set.
TRAIN_JSON = "/content/drive/MyDrive/NQuAD_train.json"
TEST_JSON = "/content/drive/MyDrive/NQuAD_test.json"

train_data = load_dataset("json", data_files={"train": TRAIN_JSON})
eval_data = load_dataset("json", data_files={"test": TEST_JSON})

train_dataset, eval_dataset = train_data["train"], eval_data["test"]

In [None]:
# Report how many examples each split contains.
print(
    f"Size of the training set: {len(train_dataset)}",
    f"Size of the testing set: {len(eval_dataset)}",
    sep="\n",
)

In [None]:
# mT5 tokenizer and model
# Both are loaded from the pretrained checkpoint named by `model_name`. The
# tokenizer is used by the preprocessing and evaluation cells; the model is
# the one handed to the Trainer and later saved.
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# data preprocessing
def preprocess_function(examples, max_input_length=1024, max_target_length=2):
    """Turn a batch of records into tokenized prompts and label ids.

    Intended for `Dataset.map(..., batched=True)`, but it also accepts a plain
    dict of lists.

    Args:
        examples: Mapping of column name -> list of values. The columns read
            are "sentences_containing_the_numeral_in_answer_options",
            "question_stem", "answer_options" and "target_num".
        max_input_length: Token limit for each prompt; longer prompts are
            truncated.
        max_target_length: Token limit for each answer; longer answers are
            truncated.
            NOTE(review): the tokenizer presumably appends an end-of-sequence
            token that counts toward this limit, so the default of 2 would
            keep only the first sub-word piece of an answer. If answers can
            span several pieces, raise this together with the generation
            length used at evaluation time - confirm on real data.

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry
        holding the (unpadded) target token ids for each record.
    """
    # The prompt text (including its leading whitespace) is part of what the
    # model is trained on, so it is kept exactly as is.
    inputs = [
        f'''
        Choose a correct answer to the following questions.
        Context: {c}
        Question: {q}
        Options: {opts}
        Answer: '''
        for c, q, opts in zip(
            examples["sentences_containing_the_numeral_in_answer_options"],
            examples["question_stem"],
            examples["answer_options"]
        )
    ]
    targets = [str(ans) for ans in examples["target_num"]]

    # No padding here. Padding inside a batched map stretches every row to the
    # longest prompt of its whole map batch and stores label padding as real
    # token ids. The notebook's DataCollatorForSeq2Seq pads each training batch
    # on the fly and fills label padding with -100 instead.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
import numpy as np

# Input Example: a one-record batch in the column layout that
# preprocess_function reads, used to eyeball the tokenized prompt and target.
example_context = "隨著iPhone 5、三星Galaxy S4(見附圖)等高階智慧型手機銷售不如預期，也讓市場轉而看好中、低階機種的成長力道，並對晶圓代工40奈米製程的需求轉趨樂觀。"
example_question = "___nm需求趨緊俏，大摩大升聯電/中芯目標價"

example_data = {
    "sentences_containing_the_numeral_in_answer_options": [example_context],
    "question_stem": [example_question],
    "answer_options": [[22, 28, 30, 40]],
    "target_num": [40],
}

processed_data = preprocess_function(example_data)

# Decode the first (and only) record back to text.
first_input_ids = processed_data["input_ids"][0]
first_label_ids = processed_data["labels"][0]
print("Input Example:", tokenizer.decode(first_input_ids))
print("Target Example:", tokenizer.decode(first_label_ids))


In [None]:
# Preprocessing
# Always start from the raw splits rather than from `train_dataset` /
# `eval_dataset` themselves: this cell strips the text columns that
# preprocess_function needs, so feeding its own output back in would make a
# second run of the cell fail.
columns_to_remove = ["news_article", "question_stem", "answer_options", "ans", "target_num", "sentences_containing_the_numeral_in_answer_options"]

train_dataset = (
    train_data["train"]
    .map(preprocess_function, batched=True)
    .remove_columns(columns_to_remove)  # keep only the tokenized columns
)
eval_dataset = (
    eval_data["test"]
    .map(preprocess_function, batched=True)
    .remove_columns(columns_to_remove)
)

# Convert to PyTorch format
train_dataset.set_format("torch")
eval_dataset.set_format("torch")

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, EarlyStoppingCallback

# Clear GPU memory
torch.cuda.empty_cache()

# Define the data collator: pads each batch dynamically and marks label
# padding with -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100
)

# Evaluate and checkpoint on the same schedule. Early stopping and
# load_best_model_at_end both act on evaluation results, so the step with the
# best eval_loss must also have a checkpoint on disk; saving less often than
# evaluating can leave the best step unrecoverable. save_total_limit still
# bounds disk usage.
eval_and_save_steps = 50

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    run_name="mt5-finetuning-run1",
    report_to="none",
    eval_strategy="steps",
    eval_steps=eval_and_save_steps,
    logging_steps=100,
    logging_dir="./logs",
    logging_first_step=True,
    logging_nan_inf_filter=False,
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=eval_and_save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_accumulation_steps=4,
    predict_with_generate=False,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # Stop once eval_loss has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Start training
trainer.train()


In [None]:
################# Model Save

In [None]:
# Write the model weights and the tokenizer files into one local directory
output_dir = "./mt5_finetuned_model_new"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


In [None]:
# Save to Google Drive
import shutil

drive_model_dir = "/content/drive/MyDrive/mt5_finetuned_model"

# shutil.move() nests the source folder inside the destination when the
# destination already exists (and raises on the run after that). Copying with
# dirs_exist_ok=True and then removing the local folder keeps the "move"
# outcome while letting this cell be re-run after a new training run.
shutil.copytree(output_dir, drive_model_dir, dirs_exist_ok=True)
shutil.rmtree(output_dir)

print("Model saved to Google Drive!")


In [None]:
#######################Evaluate

In [None]:
# Evaluation dataset that keeps the "answer_options" column, which
# evaluate_samples reads for every sample.
columns_remove = ["news_article", "question_stem", "ans", "target_num", "sentences_containing_the_numeral_in_answer_options"]
eval_dataset_new = (
    eval_data["test"]
    .map(preprocess_function, batched=True)
    .remove_columns(columns_remove)
)
eval_dataset_new.set_format("torch")


In [None]:
def _options_as_strings(options):
    """Return the answer options as a list of plain strings."""
    # Tensors / arrays (e.g. integer options under a torch-formatted dataset)
    # become Python scalars first, so "40" can be matched against 40.
    if hasattr(options, "tolist"):
        options = options.tolist()
    return [str(opt.item() if hasattr(opt, "item") else opt) for opt in options]


def evaluate_samples(dataset, model, tokenizer, max_length=2, max_retries=3):
    """Generate an answer for every sample and collect target/prediction pairs.

    Args:
        dataset: Iterable of samples. Each sample must provide "input_ids" and
            "labels" as 1-D tensors and "answer_options" as a sequence (or
            tensor) of options; options are compared as strings.
        model: Model used for generation; must expose `eval()`, `device` and
            `generate()`.
        tokenizer: Tokenizer whose `decode()` turns token ids back into text.
        max_length: Passed to `model.generate` as `max_length`.
            NOTE(review): for encoder-decoder models this limit presumably
            also counts the decoder start token, leaving room for a single
            generated token at the default of 2 - confirm.
        max_retries: Maximum number of `generate` calls per sample while the
            decoded prediction is empty.

    Returns:
        A list of dicts with the keys "input", "target", "prediction" (raw
        decoded output) and "corrected_prediction" (the raw output if it is
        one of the options, otherwise the option sharing the most characters
        with it). Samples whose decoded target is not among their options are
        skipped and therefore absent from the list.
    """
    results = []
    model.eval()
    with torch.no_grad():
        for sample in dataset:
            # Retrieve and process input, target, and options
            input_ids = sample["input_ids"].unsqueeze(0).to(model.device)  # Add batch dimension
            # Decoding does not need the labels on the model's device.
            target_text = tokenizer.decode(sample["labels"], skip_special_tokens=True).strip()
            options = _options_as_strings(sample["answer_options"])

            # Skip this sample if the target is not in the options
            if target_text not in options:
                continue

            predicted_text = ""
            retries = 0

            # Generate predictions and handle empty predictions.
            # NOTE(review): unless sampling is enabled in the model's generation
            # settings, a retry presumably reproduces the same output - confirm.
            while not predicted_text.strip() and retries < max_retries:
                outputs = model.generate(
                    input_ids=input_ids,
                    max_length=max_length,
                    early_stopping=False
                )
                predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
                retries += 1

            # Ensure the prediction is in the options
            if predicted_text in options:
                corrected_prediction = predicted_text
            else:
                # Closest option = smallest symmetric difference between the
                # character sets of the option and the prediction.
                corrected_prediction = min(
                    options, key=lambda opt: len(set(opt) ^ set(predicted_text))
                )

            # Append results if prediction is valid
            if corrected_prediction:
                results.append({
                    "input": tokenizer.decode(input_ids[0], skip_special_tokens=True),
                    "target": target_text,
                    "prediction": predicted_text,
                    "corrected_prediction": corrected_prediction,
                })

    return results


In [None]:
def calculate_accuracy(results, prediction_key="prediction"):
    """Return the fraction of results whose prediction equals the target.

    Args:
        results: Sequence of dicts, each holding a "target" entry and an entry
            named by `prediction_key`.
        prediction_key: Which entry to score. The default "prediction" scores
            the raw model output; pass "corrected_prediction" to score the
            option-snapped prediction instead.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when `results` is empty.
    """
    total = len(results)
    # Ensure total is not 0
    if total == 0:
        return 0.0

    correct = sum(
        1 for result in results if result["target"] == result[prediction_key]
    )
    return correct / total


In [None]:
# Optional: evaluate on a small subset first as a quick sanity check.
# from torch.utils.data import Subset
#
# subset_indices = list(range(50))  # Select the first 50 samples
# subset_eval_dataset = Subset(eval_dataset_new, subset_indices)


In [None]:
# Generate predictions for the evaluation set
results = evaluate_samples(
    dataset=eval_dataset_new,
    model=model,
    tokenizer=tokenizer,
)
# Score them and report the accuracy as a percentage
accuracy = calculate_accuracy(results=results)
print(f"Accuracy: {accuracy:.2%}")


In [None]:
#################

In [None]:
# Check: print the first 50 evaluated samples for manual inspection
for i, result in enumerate(results[:50]):
    print(
        f"Sample {i + 1}:",
        f"Input: {result['input']}",
        f"Target: {result['target']}",
        f"Prediction: {result['prediction']}",
        "-" * 50,
        sep="\n",
    )