In [1]:
import os
import time
from tqdm import tqdm
import json

In [None]:
# Remote archive of the OPUS-Tatoeba ru-vi corpus. Kept for reference: none of
# the cells shown here download from it.
ru_vi_url = "https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/ru-vi.txt.zip"

# Local folder for the test corpus; created up front (no error if it exists).
data_dir = "./dataset/test_data/OPUS-Tatoeba"
os.makedirs(data_dir, exist_ok=True)

# Load MyTranslator

In [None]:
from Translators import OpusTranslator, NLLB200Translator, MBART50Translator

In [18]:
# Translation direction for this run (alternative value: "ru2vi").
direction = "vi2ru"

# Checkpoint under evaluation. Other checkpoints this notebook has been
# pointed at:
#   "Helsinki-NLP/opus-mt-ru-vi"
#   "Helsinki-NLP/opus-mt-vi-ru"
#   "facebook/nllb-200-distilled-600M"
model_name = "facebook/mbart-large-50-many-to-many-mmt"

In [19]:
# Map each recognised checkpoint-name prefix to a factory for its wrapper
# class. The Opus wrapper is built from the model name alone; the NLLB and
# mBART wrappers additionally receive the translation direction.
_translator_factories = (
    ("Helsinki-NLP/opus-mt", lambda name: OpusTranslator(name)),
    ("facebook/nllb", lambda name: NLLB200Translator(name, direction=direction)),
    ("facebook/mbart", lambda name: MBART50Translator(name, direction=direction)),
)

for _prefix, _make_translator in _translator_factories:
    if model_name.startswith(_prefix):
        translator = _make_translator(model_name)
        break
else:
    # Only reached when the loop finished without `break`, i.e. no prefix matched.
    raise ValueError(f"Unsupported model name: {model_name}")

# Load TestData

In [20]:
from MyDataset import TransDataset

In [21]:
# Both corpus files share the same stem inside data_dir and differ only in
# their language suffix (".ru" / ".vi").
_corpus_stem = f"{data_dir}/Tatoeba.ru-vi"
ru_path = f"{_corpus_stem}.ru"
vi_path = f"{_corpus_stem}.vi"

In [22]:
# The dataset is built with the translator's own tokenizer and the same
# direction setting that was used to create the translator.
tokenizer = translator.tokenizer
myDataset = TransDataset(
    ru_path,
    vi_path,
    tokenizer,
    direction=direction,
)

# Inference

In [23]:
# Pull the source sentences and their target translations out of the dataset.
sources, targets = myDataset.sources, myDataset.targets

# Display the first (source, target) pair as a quick sanity check.
(sources[0], targets[0])

('Tôi phải đi ngủ.', 'Мне пора идти спать.')

In [24]:
# Translate every source sentence one at a time and time the whole run.
predictions = []

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()
for sentence in tqdm(sources, desc="Translating"):
    predictions.append(translator.translate(sentence))
end_time = time.perf_counter()

total_time = end_time - start_time

# Guard the division so an empty dataset reports 0.0 instead of raising
# ZeroDivisionError.
num_sources = len(sources)
average_time = total_time / num_sources if num_sources else 0.0

print(f"Inference time: {total_time} seconds")
print(f"Average time per sentence: {average_time} seconds")

Translating: 100%|██████████| 313/313 [00:45<00:00,  6.81it/s]

Inference time: 45.95970892906189 seconds
Average time per sentence: 0.146836130763776 seconds





In [25]:
# Display the first three translations next to their corresponding targets.
predictions[:3], targets[:3]

(['Я должен спать.', 'Что вы делаете?', 'Что это?'],
 ['Мне пора идти спать.', 'Что ты делаешь?', 'Что это?'])

# Load Evaluator

In [26]:
from Evaluator import Evaluator

In [27]:
# Score the translations against the targets with the project's Evaluator.
# NOTE(review): the metric and the reference format compute() expects are
# defined in Evaluator, which is not visible here — confirm a flat list of
# target strings is what it wants.
evaluator = Evaluator()
eval_results = evaluator.compute(predictions, targets)

In [28]:
# Pull the value stored under "score" out of the evaluation results and
# report it as the BLEU score.
score = eval_results["score"]
print(f"BLEU score: {score}")

BLEU score: 6.76488028685308


In [29]:
# Use the part of the model id after the last "/" (the whole id when there is
# no "/") as a short tag for naming log artefacts.
model_log = model_name.rpartition("/")[2]

# One log folder per (model, direction) combination.
log_dir = f"./logs/{model_log}_{direction}"
log_file_path = os.path.join(log_dir, f"log_{model_log}.txt")
os.makedirs(log_dir, exist_ok=True)

In [30]:
# Collect the run summary (model id, timing figures, dataset size, BLEU) and
# write it to the log file as pretty-printed JSON.
# NOTE(review): num_sentences comes from len(myDataset) while average_time was
# computed over len(sources) — confirm the two are always equal.
results = {
    "model_name": model_name,
    "average_time": average_time,
    "total_time": total_time,
    "num_sentences": len(myDataset),
    "BLEU_score": score,
}

# ensure_ascii=False keeps any non-ASCII text readable in the UTF-8 file.
# (The stray trailing backticks after json.dump(...) were a syntax error and
# have been removed.)
with open(log_file_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)
print(f"Results saved to {log_file_path}")

Results saved to ./logs/mbart-large-50-many-to-many-mmt_vi2ru/log_mbart-large-50-many-to-many-mmt.txt


In [31]:
# Dump every (prediction, target) pair to a text file, one pretty-printed JSON
# object per pair, each followed by a newline.
predictions_file_path = os.path.join(log_dir, f"predictions_{model_log}.txt")
with open(predictions_file_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps({"pred": hypothesis, "target": reference}, indent=4, ensure_ascii=False) + "\n"
        for hypothesis, reference in zip(predictions, targets)
    )
print(f"Predictions saved to {predictions_file_path}")

Predictions saved to ./logs/mbart-large-50-many-to-many-mmt_vi2ru/predictions_mbart-large-50-many-to-many-mmt.txt
