In [None]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import nltk
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
nltk.download('wordnet')  # Required for METEOR score
import random
import os

# Load the pre-trained seq2seq checkpoint and its matching tokenizer.
model_name = "Helsinki-NLP/opus-mt-mul-en"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# A tokenizer holds no tensors and is never placed on a device, so no
# `device_map` argument is passed here; only the model is moved below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

def preprocess_function(examples):
    """Tokenize one batch of source/target pairs for seq2seq fine-tuning.

    Args:
        examples: A batched mapping with two parallel lists of strings:
            ``examples['src']`` (source sentences) and ``examples['trg']``
            (target sentences).

    Returns:
        The tokenizer output for the source texts (padded/truncated to 512
        tokens) with an extra ``"labels"`` entry: the target token ids,
        also padded/truncated to 512, where every padding position is
        replaced by -100.

    NOTE(review): the ``>>jpn<<`` prefix is kept exactly as in the original
    code. Language tokens of this form usually select the *target* language
    on multi-target OPUS-MT models -- confirm it is wanted for this model.
    """
    # Source text, each sentence prefixed with the language token.
    inputs = [">>jpn<< " + src for src in examples['src']]
    # Target text.
    targets = examples['trg']

    # Tokenize the source side.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Tokenize the target side. `text_target=` is the supported replacement
    # for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Because labels are padded to a fixed length, padding positions must be
    # set to -100 (the index the cross-entropy loss ignores); otherwise the
    # model is trained to predict pad tokens for most of every sequence.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


In [None]:
# Report whether PyTorch can see a CUDA device (prints True or False).
cuda_visible = torch.cuda.is_available()
print(cuda_visible)

In [None]:
# Fetch the "train" split of the parallel corpus.
data = load_dataset(path="NilanE/ParallelFiction-Ja_En-100k", split="train")

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
dataset = data.train_test_split(seed=42, test_size=0.1)
train_data, test_data = dataset["train"], dataset["test"]

In [None]:
# Tokenize the training split in batches via `preprocess_function`.
tokenized_train_data = train_data.map(function=preprocess_function, batched=True)

# Persist the tokenized split to disk.
tokenized_train_data.save_to_disk("./tokenized_data")

# Show the first tokenized example as a sanity check.
print(tokenized_train_data[0])


In [None]:
# Fine-tuning configuration: training only (no evaluation pass), with
# periodic checkpointing.
training_args = Seq2SeqTrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Checkpointing: write one every 500 steps, keep only the last 3
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    # Evaluation is disabled for this run
    evaluation_strategy="no",
    predict_with_generate=True,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # FP16 mixed precision only when a GPU is available
    fp16=torch.cuda.is_available(),
)

# Build the trainer around the loaded model and the tokenized training split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=original_model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_data,
)


In [None]:


# Directory searched for checkpoints (the same path that is passed as
# `output_dir` in the training arguments).
checkpoint_dir = "./results"

# Resuming only works when at least one `checkpoint-<step>` sub-directory is
# present. A merely non-empty directory (e.g. holding logs or other files) is
# not enough: `trainer.train(resume_from_checkpoint=True)` raises when it
# finds no valid checkpoint, so look for real checkpoint folders.
checkpoint_prefix = "checkpoint-"
has_checkpoint = os.path.isdir(checkpoint_dir) and any(
    entry.startswith(checkpoint_prefix)
    and entry[len(checkpoint_prefix):].isdigit()
    and os.path.isdir(os.path.join(checkpoint_dir, entry))
    for entry in os.listdir(checkpoint_dir)
)

if has_checkpoint:
    print("Checkpoint found! Resuming training from checkpoint...")
    trainer.train(resume_from_checkpoint=True)
else:
    print("No checkpoint found. Starting training from scratch...")
    trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be
# reloaded together from one directory.
original_model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
