In [None]:
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Pre-trained Marian checkpoint (ja -> en) shared by the tokenizer and the model
model_name = "Helsinki-NLP/opus-mt-ja-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Parallel corpus; only its "train" split is requested from the hub
data = load_dataset("NilanE/ParallelFiction-Ja_En-100k", split="train")

# Hold out 10% of the rows, with a fixed seed so the split is repeatable
dataset = data.train_test_split(test_size=0.1, seed=42)
train_data, test_data = dataset['train'], dataset['test']

def preprocess_function(examples, max_source_length=512, max_target_length=128):
    """Tokenize a batch of Japanese/English sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a ``'src'`` list (Japanese source text)
            and a ``'trg'`` list (English target text).
        max_source_length: Length every source sequence is truncated/padded to.
        max_target_length: Length every target sequence is truncated/padded to.

    Returns:
        The source encoding returned by ``tokenizer`` with an added
        ``"labels"`` entry: the target token ids, where every padding
        position has been replaced by ``-100``.
    """
    inputs = examples['src']  # Japanese text
    targets = examples['trg']  # English text

    # Tokenize the source text
    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        truncation=True,
        padding="max_length",
    )

    # Tokenize the target text as labels. `text_target=` is the supported
    # replacement for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Padding positions must not contribute to the loss: -100 is the index
    # the cross-entropy loss ignores, so swap every pad id for it.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_token_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs





In [12]:
# Apply preprocess_function to the training split, one batch at a time
tokenized_train_data = train_data.map(function=preprocess_function, batched=True)

# Write the tokenized split to ./tokenized_data
tokenized_train_data.save_to_disk(dataset_path="./tokenized_data")


Saving the dataset (5/5 shards): 100%|██████████| 95443/95443 [00:02<00:00, 46849.67 examples/s]


In [17]:
# Define training arguments. Evaluation stays disabled: the evaluation
# strategy already defaults to "no", so the deprecated `evaluation_strategy`
# keyword (renamed `eval_strategy` in newer releases) is simply not passed.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    save_strategy="steps",  # Save checkpoints periodically
    save_steps=500,  # Save a checkpoint every 500 steps
    save_total_limit=3,  # Keep only the last 3 checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use FP16 if a GPU is available
    logging_dir="./logs",
)

# Initialize the trainer (training data only; no eval dataset is supplied).
# NOTE(review): recent transformers releases appear to warn that `tokenizer=`
# is deprecated in favour of `processing_class=` -- confirm against the
# installed version before switching, since older releases only accept
# `tokenizer=`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,  # Training data
    tokenizer=tokenizer,
)


  trainer = Seq2SeqTrainer(


In [19]:
# Run fine-tuning from the beginning
trainer.train()

# Write the fine-tuned model first, then the tokenizer, into one directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Fine-tune the model, resuming from the newest checkpoint when one exists
import os
from transformers.trainer_utils import get_last_checkpoint

# `resume_from_checkpoint=True` raises when the output directory contains no
# checkpoint, so look the checkpoint up explicitly and fall back to a fresh
# run (resume_from_checkpoint=None) when there is nothing to resume from.
checkpoint_dir = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
