In [None]:
import json

import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# One JSON file per split, read from the current working directory.
train_data = _read_json('train.json')
validation_data = _read_json('validation.json')
test_data = _read_json('test.json')

In [None]:
# Define your dataset class
class TranslationDataset(Dataset):
    """Map-style dataset that tokenizes English -> Chinese pairs on access.

    Args:
        data: Indexable collection whose items are mappings with an ``"en"``
            (source text) and a ``"zh"`` (target text) entry.
        tokenizer: Callable tokenizer that accepts ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and returns a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors of
            shape ``(1, max_length)``. Its ``pad_token_id`` attribute is used
            to mask label padding.
        max_length: Length every source and target sequence is padded or
            truncated to.
    """

    # Label value skipped by the cross-entropy loss of Hugging Face seq2seq
    # models (see the T5 training documentation).
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed ``max_length`` and return the tokenizer output."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``.

        Every returned tensor is 1-D with ``max_length`` elements. Padding
        positions in ``labels`` are set to ``IGNORE_INDEX`` so the loss is
        computed on real target tokens only.
        """
        example = self.data[idx]
        source = self._encode(example["en"])
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the sequence axis when max_length == 1.
        labels = self._encode(example["zh"])["input_ids"].squeeze(0)

        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            # Without this mask the model is also trained to predict the
            # padding that fills the target up to max_length.
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels
        }

In [None]:
# Load the mT5 model and tokenizer
# Both objects are loaded from the same pretrained checkpoint name.
PRETRAINED_CHECKPOINT = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# Create datasets
# Wrap each raw split in a TranslationDataset; all three share one tokenizer.
train_dataset, validation_dataset, test_dataset = (
    TranslationDataset(split, tokenizer)
    for split in (train_data, validation_data, test_data)
)

**Trainer**

In [None]:
# Define training arguments
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir='./mt5_model',      # where model checkpoints are written
    logging_dir='./mt5_logs',      # where logs are written
    save_total_limit=5,            # maximum number of checkpoints kept on disk
    push_to_hub=False,             # do not upload the model to the Hugging Face Hub

    # --- Optimisation ---
    num_train_epochs=10,           # number of training epochs
    per_device_train_batch_size=4, # training batch size per device
    gradient_accumulation_steps=2, # steps accumulated before each optimizer update
    weight_decay=0.01,             # weight-decay coefficient

    # --- Evaluation and logging cadence ---
    evaluation_strategy="steps",   # evaluate on a step interval
    eval_steps=2500,               # evaluate every 2500 steps
    logging_steps=2500,            # log every 2500 steps
    per_device_eval_batch_size=4,  # evaluation batch size per device
    eval_accumulation_steps=2,     # eval steps accumulated before outputs move from GPU to CPU
)

# Custom data collator function
def data_collator(batch):
    """Stack per-example tensors into batch tensors.

    Args:
        batch: Sequence of mappings, each holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors of matching shape.

    Returns:
        Dict with the same three keys, each value being the examples'
        tensors stacked along a new leading dimension.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("input_ids", "attention_mask", "labels")
    }
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    """Compute token-level accuracy over the non-ignored label positions.

    Args:
        p: Object with ``predictions`` and ``label_ids`` attributes.
            ``predictions`` is either logits with a trailing vocabulary axis,
            token ids with the same shape as ``label_ids``, or a tuple whose
            first element is one of those.

    Returns:
        ``{'accuracy': float}``: the fraction of positions whose label is not
        -100 where the predicted token id equals the label. ``0.0`` when
        every position is ignored.
    """
    predictions, labels = p.predictions, p.label_ids

    # Some models return a tuple (logits, extra outputs); only the first
    # element is scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits have one more axis (vocabulary) than the labels; reduce over that
    # last axis. Predictions that are already token ids are left untouched.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # -100 marks positions that must not be scored (ignored by the loss).
    scored = labels != -100
    total = int(scored.sum())
    if total == 0:
        return {'accuracy': 0.0}

    correct = int((predictions[scored] == labels[scored]).sum())
    return {
        'accuracy': correct / total,
    }


# Create Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Run fine-tuning; `model` is updated in place.
trainer.train()

# Write the fine-tuned weights and the tokenizer files to the same directory
# so both can be reloaded together with from_pretrained.
tokenizer.save_pretrained("./mt5_translation_model")
model.save_pretrained("./mt5_translation_model")

In [None]:
# Score the Trainer-fine-tuned model on the held-out test split.
results = trainer.evaluate(eval_dataset=test_dataset)

# Show the metrics dictionary returned by evaluate().
print("Evaluation Results:", results)

**Seq2SeqTrainer**

In [None]:
# Define training arguments for Seq2SeqTrainer
training_args_seq2seq = Seq2SeqTrainingArguments(
    # --- Output locations ---
    output_dir='./mt5_model_seq2seq',  # where model checkpoints are written
    logging_dir='./mt5_logs_seq2seq',  # where logs are written
    save_total_limit=5,                # maximum number of checkpoints kept on disk
    push_to_hub=False,                 # do not upload the model to the Hugging Face Hub

    # --- Optimisation ---
    num_train_epochs=10,               # number of training epochs
    per_device_train_batch_size=4,     # training batch size per device
    gradient_accumulation_steps=2,     # steps accumulated before each optimizer update
    weight_decay=0.01,                 # weight-decay coefficient

    # --- Evaluation and logging cadence ---
    evaluation_strategy="steps",       # evaluate on a step interval
    eval_steps=2500,                   # evaluate every 2500 steps
    logging_steps=2500,                # log every 2500 steps
    per_device_eval_batch_size=4,      # evaluation batch size per device
    eval_accumulation_steps=2,         # eval steps accumulated before outputs move to the CPU (not gradient accumulation)
)

# Custom data collator function
def data_collator(batch):
    """Collate a list of tokenized examples into stacked batch tensors.

    Args:
        batch: Sequence of mappings, each holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors of matching shape.

    Returns:
        Dict with the same three keys; each value stacks the examples'
        tensors along a new leading dimension.
    """
    stacked = {}
    for field in ("input_ids", "attention_mask", "labels"):
        stacked[field] = torch.stack([sample[field] for sample in batch])
    return stacked

# Start from a fresh pretrained checkpoint. The `model` object is the one
# handed to the Trainer section, which fine-tunes it in place; reusing it here
# would continue training those weights instead of giving an independent
# Seq2SeqTrainer run.
model_seq2seq = T5ForConditionalGeneration.from_pretrained("google/mt5-small")

# Create Seq2SeqTrainer
seq2seq_trainer = Seq2SeqTrainer(
    model=model_seq2seq,
    args=training_args_seq2seq,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)

# Train the model using Seq2SeqTrainer
seq2seq_trainer.train()

# Save the Seq2SeqTrainer-trained weights and the tokenizer side by side so
# both can be reloaded together with from_pretrained.
model_seq2seq.save_pretrained("./mt5_translation_model_seq2seq")
tokenizer.save_pretrained("./mt5_translation_model_seq2seq")

In [None]:
# Score the Seq2SeqTrainer-fine-tuned model on the held-out test split.
seq2seq_results = seq2seq_trainer.evaluate(eval_dataset=test_dataset)

# Show the metrics dictionary returned by evaluate().
print("seq2seq_Evaluation Results:", seq2seq_results)

**Inference**

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Reload both fine-tuned variants from the directories written after training.
# Tokenizers first, then the model weights.
tokenizer = T5Tokenizer.from_pretrained("./mt5_translation_model")
tokenizer_seq2seq = T5Tokenizer.from_pretrained("./mt5_translation_model_seq2seq")
model = T5ForConditionalGeneration.from_pretrained("./mt5_translation_model")
model_seq2seq = T5ForConditionalGeneration.from_pretrained("./mt5_translation_model_seq2seq")

def translate_text(input_text, model, tokenizer, max_length=512, max_new_tokens=256):
    """Translate ``input_text`` with a fine-tuned seq2seq model.

    Args:
        input_text: Source sentence to translate.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_length: Maximum number of *input* tokens; longer inputs are
            truncated.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The first generated sequence decoded to text with special tokens
        removed.
    """
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )
    # Keep the inputs on the model's device so this also works once the model
    # has been moved to a GPU.
    input_ids = input_ids.to(model.device)

    # Without an explicit budget, generate() falls back to the generation
    # config's default length limit (20 tokens in Hugging Face Transformers),
    # which cuts longer translations short.
    output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

**Trainer**

In [None]:
# Translate one sample sentence with the Trainer-fine-tuned model.
input_text = "hello,how are you today?"
translated_text = translate_text(input_text, model=model, tokenizer=tokenizer)

# Show the source sentence followed by the model output.
print(f"Input Text: {input_text}")
print(f"Translated Text: {translated_text}")

**Seq2SeqTrainer**

In [None]:
# Translate the same sample sentence with the Seq2SeqTrainer-fine-tuned model.
input_text = "hello,how are you today?"
translated_text = translate_text(
    input_text,
    model=model_seq2seq,
    tokenizer=tokenizer_seq2seq,
)

# Show the source sentence followed by the model output.
print(f"Input Text: {input_text}")
print(f"Translated Text: {translated_text}")