In [None]:
!pip install pyvi

In [None]:
# import libs
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
import re
from datasets import Dataset, DatasetDict
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.metrics import bleu_score
from pyvi import ViTokenizer
import wandb

In [None]:
# Parallel-corpus files for each split, stored as [English path, Vietnamese path].
train_filepaths = ['/kaggle/input/pho-mt/train.en', '/kaggle/input/pho-mt/train.vi']
dev_filepaths = ['/kaggle/input/pho-mt/dev.en', '/kaggle/input/pho-mt/dev.vi']
test_filepaths = ['/kaggle/input/pho-mt/test.en', '/kaggle/input/pho-mt/test.vi']

In [None]:
# Notebook-wide settings.
BATCH_SIZE = 16  # batch_size handed to every DataLoader built below
lower = True  # NOTE(review): not referenced in the visible cells - confirm it is still needed
SRC_LANGUAGE, TGT_LANGUAGE = 'en', 'vi'
# Use the GPU whenever CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
load_model = True  # NOTE(review): not referenced in the visible cells - confirm it is still needed

In [None]:
def vi_tokenizer(sentence):
    """Tokenize a Vietnamese sentence into a list of strings.

    The sentence is passed through ``ViTokenizer.tokenize``, the result is
    split on whitespace, and every underscore inside a piece is replaced by
    a space.

    Args:
        sentence: Text handed to ``ViTokenizer.tokenize``.

    Returns:
        list[str]: One entry per whitespace-separated piece of the
        segmented text, with ``'_'`` replaced by ``' '``.
    """
    segmented = ViTokenizer.tokenize(sentence)
    return [piece.replace('_', ' ') for piece in segmented.split()]

# Tokenizer lookup keyed by language code: spaCy's `en_core_web_sm` pipeline
# for English, the local `vi_tokenizer` for Vietnamese.
token_transform = dict(
    en=get_tokenizer('spacy', language='en_core_web_sm'),
    vi=vi_tokenizer,
)

In [None]:
# Function to load data from files
def load_data(en_path, vi_path):
    """Read a line-aligned English/Vietnamese corpus from two UTF-8 files.

    Line ``i`` of ``en_path`` is paired with line ``i`` of ``vi_path``.
    The trailing newline is removed from every line so sentences do not
    carry a stray ``'\\n'`` into tokenization or scoring.

    Args:
        en_path: Path of the English text file (one sentence per line).
        vi_path: Path of the Vietnamese text file (one sentence per line).

    Returns:
        dict: ``{'en': [...], 'vi': [...]}`` with equally long lists of str.

    Raises:
        ValueError: If the two files contain a different number of lines,
            i.e. the corpus is not line-aligned.
        OSError: If either file cannot be opened.
    """
    def _read_lines(path):
        # Iterate the handle instead of using str.splitlines(): splitlines
        # also breaks on characters such as U+2028, which would silently
        # shift the sentence pairing between the two files.
        with open(path, encoding='utf-8') as f:
            return [line.rstrip('\n') for line in f]

    vi_data = _read_lines(vi_path)
    en_data = _read_lines(en_path)
    if len(en_data) != len(vi_data):
        raise ValueError(
            f"Parallel files are not line-aligned: {en_path} has {len(en_data)} "
            f"lines but {vi_path} has {len(vi_data)}"
        )
    return {'en': en_data, 'vi': vi_data}

# Read the three splits; every *_filepaths list is [English path, Vietnamese path].
train_data, dev_data, test_data = (
    load_data(en_path, vi_path)
    for en_path, vi_path in (train_filepaths, dev_filepaths, test_filepaths)
)

# Wrap each split's column dict in a Dataset and gather them in a DatasetDict.
datasets = DatasetDict({
    split_name: Dataset.from_dict(columns)
    for split_name, columns in (
        ('train', train_data),
        ('validation', dev_data),
        ('test', test_data),
    )
})
print(datasets)

In [None]:
# Ids for the special symbols: unknown, padding, begin-of-sequence, end-of-sequence.
# NOTE(review): these are hard-coded here rather than read from a tokenizer or
# vocabulary object - confirm they match whatever vocabulary they are used with.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)

In [None]:
# Load the ViT5 tokenizer and the fine-tuned ViT5 weights
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Identifier the tokenizer is loaded from.
model_name = "VietAI/vit5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# The model weights come from a local directory, not from `model_name`.
model = T5ForConditionalGeneration.from_pretrained(
    '/kaggle/input/vit5-base/fine-tuned-vit5'
)

In [None]:
from datasets import load_dataset

# Function to sample a fraction of the dataset
def sample_dataset(dataset, fraction=1/500, seed=42):
    """Return a reproducible random subset holding ``fraction`` of ``dataset``.

    Args:
        dataset: Object exposing ``train_test_split(test_size=..., seed=...)``
            whose result is indexable by ``'train'`` (e.g. a
            ``datasets.Dataset``).
        fraction: Share of rows to keep. Values ``>= 1`` return ``dataset``
            unchanged instead of requesting an empty test split.
        seed: Seed forwarded to ``train_test_split`` so that re-running the
            notebook selects the same rows.

    Returns:
        The ``'train'`` part of the split, or ``dataset`` itself when
        ``fraction >= 1``.

    Raises:
        ValueError: If ``fraction`` is not positive.
    """
    if fraction <= 0:
        raise ValueError(f"fraction must be positive, got {fraction}")
    if fraction >= 1:
        # test_size would be <= 0, which train_test_split rejects.
        return dataset
    return dataset.train_test_split(test_size=(1 - fraction), seed=seed)['train']

# Shrink train and validation with sample_dataset's default fraction;
# the test split is carried over in full.
sampled_train, sampled_validation = (
    sample_dataset(datasets[split_name]) for split_name in ('train', 'validation')
)
sampled_test = datasets['test']

# Re-assemble the three splits under their original names.
sampled_dataset = DatasetDict(dict(
    train=sampled_train,
    validation=sampled_validation,
    test=sampled_test,
))

# Show the resulting split sizes and columns.
print(sampled_dataset)

In [None]:
# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for the seq2seq model.

    Each English sentence is prefixed with the task prompt. Sources and
    targets are both tokenized by the module-level ``tokenizer``, truncated
    and padded to exactly 64 tokens.

    Args:
        examples: Mapping with an ``'en'`` and a ``'vi'`` sequence of str.

    Returns:
        The tokenizer output for the prompted sources, with the target
        ``input_ids`` stored under ``"labels"``.
    """
    prompt = "translate English to Vietnamese: "
    encode_options = dict(max_length=64, truncation=True, padding="max_length")

    model_inputs = tokenizer(
        [prompt + sentence for sentence in examples['en']],
        **encode_options,
    )
    target_encoding = tokenizer(list(examples['vi']), **encode_options)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_datasets = sampled_dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Define function to collate data samples into batch tensors
def generate_batch(batch):
    """Collate tokenized samples into padded, batch-first LongTensors.

    Args:
        batch: List of samples, each a mapping with ``'input_ids'`` and
            ``'labels'`` (lists of ints or tensors) and optionally
            ``'attention_mask'``.

    Returns:
        dict: ``'input_ids'`` and ``'labels'`` tensors of shape
        ``(batch, longest_sequence)``; ``'attention_mask'`` is added when
        every sample provides one.
    """
    # Samples were padded by `tokenizer` in preprocess_function, so any extra
    # padding must use the same id rather than a hand-written constant.
    pad_id = tokenizer.pad_token_id

    def _stack(key, padding_value):
        # pad_sequence only accepts tensors; dataset rows arrive as plain lists.
        rows = [torch.as_tensor(sample[key], dtype=torch.long) for sample in batch]
        # batch_first=True yields (batch, seq) instead of pad_sequence's
        # default (seq, batch).
        return pad_sequence(rows, batch_first=True, padding_value=padding_value)

    collated = {
        'input_ids': _stack('input_ids', pad_id),
        'labels': _stack('labels', pad_id),
    }
    if batch and all('attention_mask' in sample for sample in batch):
        # Padded positions are masked out with 0.
        collated['attention_mask'] = _stack('attention_mask', 0)
    return collated

# Create DataLoaders
# Only the training loader is shuffled; validation and test keep the dataset
# order so evaluation runs visit the samples in a stable, repeatable sequence.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
val_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)

In [None]:
# Seq2seq data collator built from the notebook's tokenizer and model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
!pip install transformers[torch] datasets torch pyvi sacrebleu accelerate

In [None]:
import sacrebleu
import numpy as np
# Inference functions
def infer(text, model, tokenizer):
    """Translate one English sentence to Vietnamese with beam search.

    Args:
        text: English source sentence (without the task prompt).
        model: Seq2seq model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: Tokenizer matching ``model``; called to encode the
            prompt and used to decode the generated ids.

    Returns:
        str: Decoded translation with special tokens removed.
    """
    model.eval()
    input_text = "translate English to Vietnamese: " + text
    # A single sentence needs no padding; only truncate to the 128-token limit.
    encoded = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True)

    # Inputs must sit on the same device as the weights, otherwise generate()
    # fails as soon as the model has been moved to a GPU.
    device = model.device
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
def compute_metrics(predictions, references):
    """Return the sacreBLEU corpus score of ``predictions`` against ``references``.

    Args:
        predictions: List of hypothesis strings.
        references: List of reference strings; it is passed to
            ``sacrebleu.corpus_bleu`` as the only reference stream.

    Returns:
        The ``score`` attribute of the sacreBLEU result.
    """
    reference_streams = [references]
    return sacrebleu.corpus_bleu(predictions, reference_streams).score

In [None]:
def infer_test_set(model, tokenizer, tokenized_dataset, times, skip_small_text=0, sample=500):
    """Translate samples of a tokenized split and report the mean sentence BLEU.

    Each row's ``input_ids`` and ``labels`` are decoded back to text, the
    source is translated ``times`` times with ``infer``, and the best BLEU
    of those attempts (from ``compute_metrics``) is kept. The reported value
    is the arithmetic mean of the per-sentence best scores, not a
    corpus-level BLEU.

    Args:
        model: Model forwarded to ``infer``.
        tokenizer: Tokenizer used for decoding rows and forwarded to ``infer``.
        tokenized_dataset: Iterable of rows with ``"input_ids"`` and
            ``"labels"``.
        times: Number of translation attempts per sentence (must be >= 1).
        skip_small_text: When > 0, sources with fewer characters than this
            are skipped and do not count towards ``sample``.
        sample: Maximum number of sentences to evaluate.

    Returns:
        float: Mean of the best per-sentence BLEU scores, or ``nan`` when no
        sentence was evaluated.

    Raises:
        ValueError: If ``times`` is smaller than 1.
    """
    if times < 1:
        raise ValueError(f"times must be at least 1, got {times}")

    prompt = "translate English to Vietnamese: "
    count = 0
    # A plain list avoids np.append, which copies the whole array on every call.
    best_scores = []

    for text_dict in tokenized_dataset:
        if count == sample:
            break

        text = tokenizer.decode(text_dict["input_ids"], skip_special_tokens=True).replace(prompt, "")
        label = tokenizer.decode(text_dict["labels"], skip_special_tokens=True)

        if skip_small_text > 0 and len(text) < skip_small_text:
            continue
        count += 1

        outputs = [infer(text, model, tokenizer) for _ in range(times)]
        scores = [compute_metrics([output], [label]) for output in outputs]
        # max() over indices returns the first best attempt, like list.index(max(...)).
        best = max(range(times), key=scores.__getitem__)
        best_scores.append(scores[best])

        if count % 2000 == 0:
            print("Bleu Score: ", scores[best])
            print("Input: " + text)
            print("Prediction: " + outputs[best])
            print("Label: " + label)
            print()

    print(f"Length: {len(best_scores)}")
    if not best_scores:
        # Nothing passed the filters: the mean is undefined, so say so
        # instead of emitting a NumPy "mean of empty slice" warning.
        print("No samples were evaluated; mean BLEU is undefined.")
        return float("nan")

    mean_bleu = np.asarray(best_scores, dtype=float).mean()
    if skip_small_text > 0:
        print(f"MEAN BLEU SCORE with loop= {times}, skip all text with length shorter than {skip_small_text}: {mean_bleu}")
    else:
        print(f"MEAN BLEU SCORE with loop= {times}, do not skip small text: {mean_bleu}")
    print("BLEU SCORE: ", mean_bleu)
    return float(mean_bleu)

In [None]:
# Score the test split: one attempt per sentence, sources shorter than
# 40 characters are skipped, at most 19000 sentences are evaluated.
infer_test_set(
    model,
    tokenizer,
    tokenized_datasets['test'],
    times=1,
    skip_small_text=40,
    sample=19000,
)
# Two blank lines after the report.
print("\n")