# **Import Library**

In [3]:
# import libs
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
import re
from datasets import Dataset, DatasetDict
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.metrics import bleu_score
import wandb

# **Load dataset**

In [4]:
train_filepaths=[
    r'/kaggle/input/pho-mt/train.en',
    r'/kaggle/input/pho-mt/train.vi'
]
dev_filepaths=[
    r'/kaggle/input/pho-mt/dev.en',
    r'/kaggle/input/pho-mt/dev.vi'
]
test_filepaths=[
    r'/kaggle/input/pho-mt/test.en',
    r'/kaggle/input/pho-mt/test.vi'
]

In [5]:
BATCH_SIZE=32
lower=True
SRC_LANGUAGE = 'en'
TGT_LANGUAGE = 'vi'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
load_model = True
save_model = True

In [6]:
# Function to load data from files
def load_data(en_path, vi_path):
    with open(en_path, encoding='utf-8') as f:
        en_data = f.readlines()
    with open(vi_path, encoding='utf-8') as f:
        vi_data = f.readlines()
    return {'en': en_data, 'vi': vi_data}

# Load train, dev, and test data
train_data = load_data(train_filepaths[0], train_filepaths[1])
dev_data = load_data(dev_filepaths[0], dev_filepaths[1])
test_data = load_data(test_filepaths[0], test_filepaths[1])

# Create DatasetDict
datasets = DatasetDict({
    'train': Dataset.from_dict(train_data),
    'validation': Dataset.from_dict(dev_data),
    'test': Dataset.from_dict(test_data)
})
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['vi', 'en'],
        num_rows: 2977999
    })
    validation: Dataset({
        features: ['vi', 'en'],
        num_rows: 18719
    })
    test: Dataset({
        features: ['vi', 'en'],
        num_rows: 19151
    })
})


In [7]:
# from datasets import load_dataset

# # Function to sample a fraction of the dataset
# def sample_dataset(dataset, fraction=1/500):
#     return dataset.train_test_split(test_size=(1 - fraction))['train']

# # Sample train, validation, and test sets
# sampled_train = sample_dataset(datasets['train'])
# sampled_validation = sample_dataset(datasets['validation'])
# sampled_test = datasets['test']

# # Combine the sampled datasets into a new DatasetDict
# sampled_dataset = DatasetDict({
#     'train': sampled_train,
#     'validation': sampled_validation,
#     'test': sampled_test
# })

# # Display the sampled dataset information
# print(sampled_dataset)

DatasetDict({
    train: Dataset({
        features: ['vi', 'en'],
        num_rows: 5955
    })
    validation: Dataset({
        features: ['vi', 'en'],
        num_rows: 37
    })
    test: Dataset({
        features: ['vi', 'en'],
        num_rows: 19151
    })
})


**Import model**

In [8]:
# Load the pretrained ViT5 model
from transformers import T5ForConditionalGeneration, T5Tokenizer
model_name = "VietAI/vit5-base"  # replace with the actual model name if available
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [9]:
# Preprocess the dataset
def preprocess_function(examples):
    inputs = ["translate English to Vietnamese: " + ex for ex in examples['en']]
    targets = [ex for ex in examples['vi']]
    model_inputs = tokenizer(inputs, max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/5955 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/19151 [00:00<?, ? examples/s]

In [10]:
# Define function to collate data samples into batch tensors
def generate_batch(batch):
    src_batch = [sample['input_ids'] for sample in batch]
    tgt_batch = [sample['labels'] for sample in batch]
    src_batch = pad_sequence(src_batch, padding_value=1)
    tgt_batch = pad_sequence(tgt_batch, padding_value=1)
    return {'input_ids': src_batch, 'labels': tgt_batch}

# Create DataLoaders
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
val_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

# **Training**

In [11]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps = 1000,
    save_strategy="steps", # Change to "steps" to save after a certain number of steps
    save_steps=1000, # Save after every 1000 steps
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16 = True
)

In [12]:
# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)
wandb.login(key = "657caa4a9ec74a7425c69683dc166f64282e7513")
wandb.init(project = "MT")
# Train the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvuduchung3103[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss


TrainOutput(global_step=94, training_loss=3.6187279883851398, metrics={'train_runtime': 110.3888, 'train_samples_per_second': 53.946, 'train_steps_per_second': 0.852, 'total_flos': 453293029785600.0, 'train_loss': 3.6187279883851398, 'epoch': 1.0})

In [13]:
# Save the model
trainer.save_model("./fine-tuned-vit5")
print("Model saved")

Model saved
