# Machine Translation

In [None]:
# Print the runtime's GPU status (nvidia-smi) and CPU description (lscpu) so the
# timings reported further down can be related to the hardware they ran on.
!nvidia-smi
!lscpu

Sun Jul  4 09:46:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0
!pip install clean-text[gpl]==0.4.0
!pip install sacrebleu==1.5.1

Collecting hazm==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K     |█                               | 10kB 21.7MB/s eta 0:00:01[K     |██                              | 20kB 17.3MB/s eta 0:00:01[K     |███                             | 30kB 15.1MB/s eta 0:00:01[K     |████▏                           | 40kB 14.3MB/s eta 0:00:01[K     |█████▏                          | 51kB 7.4MB/s eta 0:00:01[K     |██████▏                         | 61kB 8.6MB/s eta 0:00:01[K     |███████▎                        | 71kB 9.1MB/s eta 0:00:01[K     |████████▎                       | 81kB 9.4MB/s eta 0:00:01[K     |█████████▎                      | 92kB 9.6MB/s eta 0:00:01[K     |██████████▍                     | 102kB 8.0MB/s eta 0:00:01[K     |███████████▍                    | 112kB 8.0MB/s eta 0:00:01[K     |████████████▍                   | 122kB 8.0MB/s et

In [None]:
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [None]:
# Import required packages
import os
import gc
import re
import hazm
import time
import json
import sacrebleu
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer

from cleantext import clean

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Report the versions of the main libraries, framed by blank lines.
print()
for package_label, package in (('numpy', np), ('pandas', pd), ('transformers', transformers), ('torch', torch)):
    print(package_label, package.__version__)
print()

# Pick the device every later cell runs on: the GPU when CUDA is usable,
# otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
class MachineTranslationDataset(torch.utils.data.Dataset):
    """PyTorch dataset pairing source sentences with their reference translations.

    Only the source sentence is tokenized; the reference translation is carried
    through as raw text so it can be compared against generated output later.

    Args:
        original_text: indexable collection of source sentences.
        translated_text: indexable collection of reference translations,
            aligned with ``original_text``.
        tokenizer: callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: length every source sentence is truncated/padded to.
    """

    def __init__(self, original_text, translated_text, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.original_text = original_text
        self.translated_text = translated_text

    def __len__(self):
        """Number of samples, taken from the source sentences."""
        return len(self.original_text)

    def __getitem__(self, item):
        """Return one sample as a dict.

        Keys: ``original`` and ``translated`` (raw entries at ``item``),
        ``input_ids`` and ``attention_mask`` (flattened tokenizer tensors).
        """
        source_sentence = self.original_text[item]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        encoded = self.tokenizer(source_sentence, **tokenizer_options)
        return {
            'original': source_sentence,
            'translated': self.translated_text[item],
            # The tokenizer returns a leading batch axis; flatten drops it.
            'input_ids': encoded.input_ids.flatten(),
            'attention_mask': encoded.attention_mask.flatten(),
        }


class MachineTranslation:
    """Load a pretrained translation model and run inference / BLEU evaluation with it.

    Attributes set by ``__init__``:
        normalizer: a ``hazm.Normalizer`` instance (created here; the methods
            of this class do not call it).
        model_name: checkpoint identifier handed to ``from_pretrained``.
        tokenizer, model, config: objects loaded from ``model_name``.
        id2label, label2id: only set when ``model_type`` is not ``"mt5"``.
    """

    def __init__(self, model_name, model_type):
        """Load tokenizer, model and config for ``model_name``.

        Args:
            model_name: checkpoint name or path understood by ``from_pretrained``.
            model_type: ``"mt5"`` (case-insensitive) loads the MT5 classes; any
                other value falls back to the ``Auto*`` classes.
        """
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        if model_type.lower() == "mt5":
            self.tokenizer = MT5Tokenizer.from_pretrained(model_name)
            self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
            self.config = MT5Config.from_pretrained(self.model_name)
        else:
            # NOTE(review): this branch loads a sequence-classification model, which
            # the mt5_* methods below cannot drive -- confirm whether it is intended.
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.id2label = self.config.id2label
            self.label2id = self.config.label2id

    def _prepare_model(self, device):
        """Free cached memory and place the model on ``device``.

        ``Module.to(device)`` honours the exact device requested (including a
        GPU index, and CPU after an earlier GPU run), so the model always ends
        up on the same device the input tensors are moved to.
        """
        gc.collect()
        torch.cuda.empty_cache()
        self.model.to(device)

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        """Read a tab-separated test file into two parallel lists.

        Args:
            dataset_name: ``"mizan"`` (columns original, translation) or
                ``"combined"`` (columns original, translation, source);
                case-insensitive.
            dataset_file: path of the headerless TSV file.
            **kwargs: for ``"combined"`` only, ``source=<value>`` keeps just the
                rows whose ``source`` column equals that value.

        Returns:
            ``(original, translation)`` as lists, or ``None`` (after printing a
            message) when the file is missing or the dataset name is unsupported.
        """
        dataset_key = dataset_name.lower()
        if dataset_key == "mizan":
            column_names = ['original', 'translation']
        elif dataset_key == "combined":
            column_names = ['original', 'translation', 'source']
        else:
            print(f'unsupported dataset name: {dataset_name}')
            return
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return

        data = pd.read_csv(dataset_file, delimiter="\t", names=column_names, header=None)
        if dataset_key == "combined" and 'source' in kwargs:
            data = data[data['source'] == kwargs['source']]
        original = data['original'].values.tolist()
        translation = data['translation'].values.tolist()
        print(f'test part:\n #original: {len(original)}, #translation: {len(translation)}')
        return original, translation

    def load_dataset_file(self, dataset_name, dataset_file, **kwargs):
        """Read a Quran/Bible TSV file and carve out a fixed 10% test split.

        Each non-blank line must hold the source text and the translation(s)
        separated by a tab; blank lines are skipped.

        Args:
            dataset_name: ``"quran"`` or ``"bible"`` (case-insensitive).
            dataset_file: path of the UTF-8 TSV file.
            **kwargs: accepted for signature compatibility; not used.

        Returns:
            ``(original, translation, original_test, translation_test)``, or
            ``None`` (after printing a message) when the file is missing or the
            dataset name is unsupported.
        """
        if dataset_name.lower() not in ("quran", "bible"):
            print(f'unsupported dataset name: {dataset_name}')
            return
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return

        original, translation = [], []
        with open(dataset_file, encoding="utf8") as infile:
            for line in infile:
                stripped = line.strip()
                if not stripped:
                    # e.g. a trailing empty line; previously an IndexError
                    continue
                parts = stripped.split('\t')
                original.append(parts[0])
                translation.append(parts[1])
        print(f'all data:\n #original: {len(original)}, #translation: {len(translation)}')

        # random_state is fixed so the same test split is produced on every run.
        _, original_test, _, translation_test = train_test_split(original, translation, test_size=0.1,
                                                                 random_state=1)
        print(f'test part:\n #original: {len(original_test)}, #translation: {len(translation_test)}')
        return original, translation, original_test, translation_test

    def mt5_machine_translation_inference(self, input_text, device, **generate_kwargs):
        """Translate a batch of sentences and return the decoded strings.

        Args:
            input_text: a sentence or list of sentences to translate.
            device: ``torch.device`` the model and inputs are placed on.
            **generate_kwargs: optional extra arguments forwarded to
                ``model.generate`` (for example ``max_length`` to lift the
                default output-length cap). With none given, the ``generate``
                call is identical to the previous behaviour.

        Returns:
            List of decoded translations, or ``None`` if model/tokenizer are unset.
        """
        if self.model is None or self.tokenizer is None:
            print('Something wrong has been happened!')
            return

        tokenized_batch = self.tokenizer(
            input_text,
            padding=True,
            return_tensors="pt"
        )

        self._prepare_model(device)

        input_ids = tokenized_batch.input_ids.to(device)
        attention_mask = tokenized_batch.attention_mask.to(device)
        with torch.no_grad():
            outputs = self.model.generate(input_ids=input_ids,
                                          attention_mask=attention_mask,
                                          **generate_kwargs)
        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predictions

    def mt5_evaluation(self, input_text, translated_text, device, max_length, split_reference=None, batch_size=4,
                       **generate_kwargs):
        """Translate ``input_text`` in batches, time it, and score it with BLEU.

        Args:
            input_text: list of source sentences.
            translated_text: list of reference translations aligned with
                ``input_text``; one entry may hold several references joined by
                ``split_reference``.
            device: ``torch.device`` the model and inputs are placed on.
            max_length: length each source sentence is truncated/padded to.
            split_reference: separator between multiple references inside one
                ``translated_text`` entry; ``None`` means a single reference.
            batch_size: number of samples per generation call.
            **generate_kwargs: optional extra arguments forwarded to
                ``model.generate``; none by default.

        Returns:
            List of ``(original, reference_text, prediction, sample_bleu)``
            tuples; ``[]`` for empty input; ``None`` if the model/tokenizer are
            unset or the two input lists differ in length. The corpus-level
            BLEU score and timings are printed.
        """
        if self.model is None or self.tokenizer is None:
            print('Something wrong has been happened!')
            return
        if len(input_text) != len(translated_text):
            print('length of inputs and its translations is not equal!!')
            return
        if len(input_text) == 0:
            # Nothing to score; avoids a division by zero in the timing summary.
            print('no samples to evaluate!')
            return []

        dataset = MachineTranslationDataset(original_text=input_text, translated_text=translated_text,
                                            tokenizer=self.tokenizer, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#original_text:{len(input_text)}, #translated_text:{len(translated_text)}')
        print("#batch:", len(data_loader))

        # Largest number of references any single sample carries.
        if split_reference is None:
            max_num_ref = 1
        else:
            max_num_ref = max(len(ref.split(split_reference)) for ref in translated_text)
        print("#maximum_translation_reference:", max_num_ref)

        self._prepare_model(device)

        total_time = 0
        output_predictions = []
        # golden_translations[j] is the j-th reference stream (one entry per sample).
        golden_translations = [[] for _ in range(max_num_ref)]
        predicted_translations = []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)

            # Only the generation call itself is timed; gradients are not needed.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model.generate(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                                **generate_kwargs)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = self.tokenizer.batch_decode(b_outputs, skip_special_tokens=True)

            for i in range(len(b_input_ids)):
                reference_text = batch['translated'][i]
                if split_reference is None:
                    sample_golden_translation = [reference_text]
                else:
                    sample_golden_translation = reference_text.split(split_reference)
                sample_generated_translation = b_predictions[i]
                bleu_score = sacrebleu.corpus_bleu(sys_stream=[sample_generated_translation],
                                                   ref_streams=[[g] for g in sample_golden_translation]).score
                output_predictions.append((batch['original'][i], reference_text, b_predictions[i], bleu_score))

                # Samples with fewer references are padded with '' so every
                # reference stream keeps exactly one entry per sample.
                for j in range(max_num_ref):
                    if j < len(sample_golden_translation):
                        golden_translations[j].append(sample_golden_translation[j])
                    else:
                        golden_translations[j].append('')
                predicted_translations.append(sample_generated_translation)

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_text))

        # evaluate
        print("BLEU Score: {}".format(sacrebleu.corpus_bleu(
            sys_stream=predicted_translations, ref_streams=golden_translations).score))
        return output_predictions


In [None]:
# Checkpoint evaluated throughout this notebook; the name is reused later to
# build the output file names.
model_name = 'persiannlp/mt5-base-parsinlu-opus-translation_fa_en'
mt_model = MachineTranslation(model_type="mt5", model_name=model_name)
print(mt_model.config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4309802.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=375.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=696.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2329703923.0, style=ProgressStyle(descr…


MT5Config {
  "_name_or_path": "/home/patrick/hugging_face/t5/mt5-base",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.7.0",
  "use_cache": true,
  "vocab_size": 250112
}



## Sample Inference

In [None]:
# A handful of Persian sentences used as a quick smoke test of the model.
# The last expression is the list of generated translations, shown as the cell output.
input_list = [
  "ستایش خدای را که پروردگار جهانیان است.",
  "در هاید پارک کرنر بر گلدانی ایستاده موعظه می‌کند؛",
  "وی از تمامی بلاگرها، سازمان‌ها و افرادی که از وی پشتیبانی کرده‌اند، تشکر کرد.",
  "مشابه سال ۲۰۰۱، تولید آمونیاک بی آب در ایالات متحده در سال ۲۰۰۰ تقریباً ۱۷،۴۰۰،۰۰۰ تن (معادل بدون آب) با مصرف ظاهری ۲۲،۰۰۰،۰۰۰ تن و حدود ۴۶۰۰۰۰۰ با واردات خالص مواجه شد. ",
  "می خواهم دکترای علوم کامپیوتر راجع به شبکه های اجتماعی را دنبال کنم، چالش حل نشده در شبکه های اجتماعی چیست؟"
]
mt_model.mt5_machine_translation_inference(input_list, device)

['Adoration of God, the Lord of the world.',
 'At the High End of the Park, Conrad stands on a vase preaching;',
 'She thanked all the bloggers, organizations, and men who had supported her.',
 'In 2000, the lack of water ammonia in the United States was almost',
 'I want to follow the computer science doctorate on social networks. What is the unsolved challenge']

## Mizan Dataset


In [None]:
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/mizan/mizan_test_fa_en.tsv
!ls

--2021-07-04 13:35:27--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/mizan/mizan_test_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2296459 (2.2M) [text/plain]
Saving to: ‘mizan_test_fa_en.tsv.1’


2021-07-04 13:35:27 (29.6 MB/s) - ‘mizan_test_fa_en.tsv.1’ saved [2296459/2296459]

adc.json
mizan_test_fa_en.tsv
mizan_test_fa_en.tsv.1
sample_data
translation_combined_fa_en
translation_fa-en_combined_all_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_all_split-refs_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_bible_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_mizan_p

In [None]:
# Load the Mizan test pairs, then show the first five entries and the size of
# each side as a sanity check.
mizan_original_text, mizan_translated_text = mt_model.load_dataset_test_file(
    dataset_file="mizan_test_fa_en.tsv",
    dataset_name="mizan",
)
for mizan_column in (mizan_original_text, mizan_translated_text):
    print(mizan_column[:5])
for mizan_column in (mizan_original_text, mizan_translated_text):
    print(len(mizan_column))

test part:
 #original: 10000, #translation: 10000
['این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.', 'عاقبت گفت: سزار! تو از این موضوع نگران شده\u200cای؛', 'بسیار متأسفم که نسبت به آن احساس بی اعتمادی می\u200cکنی؛', 'اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.', 'مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی\u200cدانم؛']
['a sure sign with him of mental agitation.', ';Cesare, you are worried about this affair,; she said at last.', ';I am very sorry you feel so despondent over it;', 'but I could decide only as seemed right to me.;', ';It is not the affair,; he answered, sullenly; ;I know nothing about it,']
10000
10000


In [None]:
# Translate the first five Mizan sentences as a quick qualitative check.
mizan_sample = mizan_original_text[:5]
mt_model.mt5_machine_translation_inference(mizan_sample, device)

['It was a decisive sign of her emotion, and she began to walk about the',
 'You are worried about that, Cesare, she said at last.',
 'I am very sorry that you feel distrustful of it;',
 "But I couldn't decide just because I thought it right.",
 "It's not that, said Martini, frowning. I don't know about"]

In [None]:
# GPU (nvidia-smi) and CPU (lscpu) snapshot taken right before the Mizan evaluation cell.
!nvidia-smi
!lscpu

Sun Jul  4 13:36:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    47W /  70W |   3606MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the full Mizan test split (single reference per sentence).
evaluation_output = mt_model.mt5_evaluation(
    input_text=mizan_original_text,
    translated_text=mizan_translated_text,
    device=device,
    max_length=512,
    batch_size=128,
)

#original_text:10000, #translated_text:10000
#batch: 79
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 8.366775169000903
inference time for step 1: 8.601903914001014
inference time for step 2: 8.521423998998216
inference time for step 3: 8.18495311300103
inference time for step 4: 8.00950059799834
inference time for step 5: 7.896409086999483
inference time for step 6: 7.856865707000907
inference time for step 7: 7.877456219999658
inference time for step 8: 7.903208613999595
inference time for step 9: 7.989156920999449
inference time for step 10: 8.034687993000261
inference time for step 11: 8.101502176001304
inference time for step 12: 8.129379607000374
inference time for step 13: 8.132988900999408
inference time for step 14: 8.096770193000339
inference time for step 15: 8.058559381999657
inference time for step 16: 8.016212171000006
inference time for step 17: 7.998820540999077
inference time for step 18: 8.004155681001066
inference time f

In [None]:
# Show the first 25 results as tab-separated rows:
# source, reference, generated translation, per-sample BLEU.
for result_row in evaluation_output[:25]:
    print(*result_row, sep='\t')

این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.	a sure sign with him of mental agitation.	It was a decisive sign of her emotion, and she began to walk about the	3.21858262703621
عاقبت گفت: سزار! تو از این موضوع نگران شده‌ای؛	;Cesare, you are worried about this affair,; she said at last.	You are worried about that, Cesare, she said at last.	34.5576382124923
بسیار متأسفم که نسبت به آن احساس بی اعتمادی می‌کنی؛	;I am very sorry you feel so despondent over it;	I am very sorry that you feel distrustful of it;	28.947421495675087
اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.	but I could decide only as seemed right to me.;	But I couldn't decide just because I thought it right.	4.839576869824698
مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی‌دانم؛	;It is not the affair,; he answered, sullenly; ;I know nothing about it,	It's not that, said Martini, frowning. I don't know about	2.821711598725708
هنگامی که تو قبول می‌کنی در کاری شرکت جوی

In [None]:
# Write every Mizan result as one tab-separated line, then upload the file to
# Google Drive under the same name.
output_file_name = "translation_fa-en_mizan_test_{}_outputs.txt".format(model_name.replace('/', '-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(*result_row) for result_row in evaluation_output
    )

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## Quran Dataset

In [None]:
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/quran/quran_fa_en.tsv
!ls

--2021-07-04 13:47:33--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/quran/quran_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9309105 (8.9M) [text/plain]
Saving to: ‘quran_fa_en.tsv’


2021-07-04 13:47:34 (53.7 MB/s) - ‘quran_fa_en.tsv’ saved [9309105/9309105]

adc.json
mizan_test_fa_en.tsv
mizan_test_fa_en.tsv.1
quran_fa_en.tsv
sample_data
translation_combined_fa_en
translation_fa-en_combined_all_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_all_split-refs_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_bible_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_mizan_pers

In [None]:
# Load the whole Quran corpus plus its 10% test split, and print the first
# entry of each of the four lists.
(quran_original_all, quran_translated_all,
 quran_original_test, quran_translated_test) = mt_model.load_dataset_file(
    dataset_file="quran_fa_en.tsv",
    dataset_name="quran",
)
for quran_sentences in (quran_original_all, quran_translated_all, quran_original_test, quran_translated_test):
    print(quran_sentences[0])

all data:
 #original: 6236, #translation: 6236
test part:
 #original: 624, #translation: 624
به نام خداوند رحمتگر مهربان
In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.
حقا که انسان سرکشى مى‌کند،
And yet, but yet man is rebellious,///Yes indeed, man is surely rebellious.///No indeed; surely Man waxes insolent,///By no means: Verily man exorbitateth.///Nay! Verily, man does transgress all bounds (in disbelief and evil deed, etc.).///In fact, man oversteps all bounds.///Nay, surely man transgresses;///Nay! Verily, ma

In [None]:
# Translate the first five sentences of the Quran test split as a quick qualitative check.
quran_sample = quran_original_test[:5]
mt_model.mt5_machine_translation_inference(quran_sample, device)

["It's a shame to be a rebel",
 'Then, if you lie, what is her bag? they said.',
 'Then go to her and say, We are two messengers of Thy Lord, then',
 'Indeed, those who have been expelled and died, though the nails',
 'My Lord, said she, my bones are weak and my hair is white with']

### Test set (10% hold-out)

In [None]:
# GPU (nvidia-smi) and CPU (lscpu) snapshot taken right before the Quran test-split evaluation cell.
!nvidia-smi
!lscpu

Sun Jul  4 13:47:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    47W /  70W |   3594MiB / 15109MiB |     53%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the Quran test split; each reference entry holds several
# translations separated by '///'.
evaluation_output = mt_model.mt5_evaluation(
    input_text=quran_original_test,
    translated_text=quran_translated_test,
    device=device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:624, #translated_text:624
#batch: 5
#maximum_translation_reference: 9
Start to evaluate test data ...
inference time for step 0: 7.988629239998772
inference time for step 1: 8.058328048000476
inference time for step 2: 8.121238005998748
inference time for step 3: 8.157142791000297
inference time for step 4: 7.140269774999979
total inference time: 39.46560785999827
total inference time / #samples: 0.06324616644230492
BLEU Score: 9.764575337764299


In [None]:
# Show the first 25 results as tab-separated rows:
# source, references, generated translation, per-sample BLEU.
for result_row in evaluation_output[:25]:
    print(*result_row, sep='\t')

حقا که انسان سرکشى مى‌کند،	And yet, but yet man is rebellious,///Yes indeed, man is surely rebellious.///No indeed; surely Man waxes insolent,///By no means: Verily man exorbitateth.///Nay! Verily, man does transgress all bounds (in disbelief and evil deed, etc.).///In fact, man oversteps all bounds.///Nay, surely man transgresses;///Nay! Verily, man does transgress.///Nay, but man doth transgress all bounds,	It's a shame to be a rebel	3.283637368030198
گفتند: «پس، اگر دروغ بگویید، کیفرش چیست؟»	"What should be the punishment," they were asked, "in case you are liars?"///They said, “And what shall be the punishment for it, if you are liars?”///They said, 'And what shall be its recompense if you are liars?'///They said: what shall be the meed of him, if ye are found liars!///They [Yusuf's (Joseph) men] said: "What then shall be the penalty of him, if you are (proved to be) liars."///They said, “What shall be his punishment, if you are lying?”///The officials said: "If you are lying, what

In [None]:
# Write every Quran test-split result as one tab-separated line, then upload
# the file to Google Drive under the same name.
output_file_name = "translation_fa-en_quran_test_{}_outputs.txt".format(model_name.replace('/', '-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(*result_row) for result_row in evaluation_output
    )

# Authenticate again and rebuild the Drive client before uploading.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Full dataset

In [None]:
# GPU (nvidia-smi) and CPU (lscpu) snapshot taken right before the full-corpus Quran evaluation cell.
!nvidia-smi
!lscpu

Sun Jul  4 13:48:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    35W /  70W |   9750MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the entire Quran corpus; each reference entry holds several
# translations separated by '///'.
evaluation_output = mt_model.mt5_evaluation(
    input_text=quran_original_all,
    translated_text=quran_translated_all,
    device=device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:6236, #translated_text:6236
#batch: 49
#maximum_translation_reference: 9
Start to evaluate test data ...
inference time for step 0: 8.07924799600005
inference time for step 1: 8.091072122999321
inference time for step 2: 8.091438134999407
inference time for step 3: 8.079227563999666
inference time for step 4: 8.081269636000798
inference time for step 5: 8.137243930001205
inference time for step 6: 8.102312872000766
inference time for step 7: 8.133919571999286
inference time for step 8: 8.122920267000154
inference time for step 9: 8.114621116999842
inference time for step 10: 8.136166786998729
inference time for step 11: 8.093243310000616
inference time for step 12: 8.09830978899845
inference time for step 13: 8.11622275300033
inference time for step 14: 8.067702284999541
inference time for step 15: 8.101806357999521
inference time for step 16: 8.09454076899965
inference time for step 17: 8.07355334299973
inference time for step 18: 8.062287095001011
inference time for st

In [None]:
# Preview the first 25 rows: source text, reference(s), model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	In the name of the beneficent Lord	24.549475440235113
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise 

In [None]:
# Write the full-Quran evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_quran_all_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

## Bible Dataset

In [None]:
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/bible/bible_fa_en.tsv
!ls

--2021-07-04 13:55:24--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/bible/bible_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10023337 (9.6M) [text/plain]
Saving to: ‘bible_fa_en.tsv’


2021-07-04 13:55:25 (27.7 MB/s) - ‘bible_fa_en.tsv’ saved [10023337/10023337]

adc.json
bible_fa_en.tsv
mizan_test_fa_en.tsv
mizan_test_fa_en.tsv.1
quran_fa_en.tsv
sample_data
translation_combined_fa_en
translation_fa-en_combined_all_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_all_split-refs_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_bible_persiannlp-mt5-base-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_

In [None]:
# Load the Bible corpus: the full parallel lists plus the test split.
(bible_original_all,
 bible_translated_all,
 bible_original_test,
 bible_translated_test) = mt_model.load_dataset_file(
    dataset_name="bible",
    dataset_file="bible_fa_en.tsv",
)

# Print the first entry of each list as a quick sanity check.
# NOTE(review): in the recorded output the first test source and the first test
# reference do not look like translations of each other — verify that the TSV
# rows / load_dataset_file keep source and reference aligned.
for sentences in (bible_original_all, bible_translated_all,
                  bible_original_test, bible_translated_test):
  print(sentences[0])

all data:
 #original: 31020, #translation: 31020
test part:
 #original: 3102, #translation: 3102
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.
و چون قربانی های سوختنی تمام شد، پادشاه و جمیع حاضرین با وی رکوع کرده، سجده نمودند.
And of the sons of Elizaphan; Shimri, and Jeiel: and of the sons of Asaph; Zechariah, and Mattaniah:


In [None]:
# Smoke-test translation on the first five Bible test sentences.
# NOTE(review): the recorded outputs end abruptly (mid-sentence); presumably the
# generation length is capped inside mt5_machine_translation_inference — confirm.
bible_test_preview = bible_original_test[:5]
mt_model.mt5_machine_translation_inference(bible_test_preview, device)

['and when the burning victims were over, the king and the whole company',
 'And you order the Children of Israel to bring to you a controlled olive oil for light',
 'and the Beni Joad, Dibban, Aherfurth, and Arar.',
 'So, since you have a case of arguments in the beginning of days, will',
 'and take a little of it, and tie them in her lap.']

### test set

In [None]:
# Record GPU (nvidia-smi) and CPU (lscpu) state before the Bible test-split evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 13:55:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    44W /  70W |   3614MiB / 15109MiB |     66%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Score the model on the Bible test split (no split_reference is passed here).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_test,
    bible_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:3102, #translated_text:3102
#batch: 25
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 7.8997574580007495
inference time for step 1: 8.169048843999917
inference time for step 2: 8.496352142999967
inference time for step 3: 8.59380192000026
inference time for step 4: 8.298068955999042
inference time for step 5: 8.10789097199995
inference time for step 6: 7.974594044000696
inference time for step 7: 7.889892172001055
inference time for step 8: 7.888511260000087
inference time for step 9: 7.935229069998968
inference time for step 10: 7.96956802800014
inference time for step 11: 8.037250575000144
inference time for step 12: 8.09429644500051
inference time for step 13: 8.139275554998676
inference time for step 14: 8.1286289210002
inference time for step 15: 8.107955678000508
inference time for step 16: 8.064976851999745
inference time for step 17: 8.061161939000158
inference time for step 18: 8.026893892998487
inference time for st

In [None]:
# Preview the first 25 rows: source text, reference, model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

و چون قربانی های سوختنی تمام شد، پادشاه و جمیع حاضرین با وی رکوع کرده، سجده نمودند.	And of the sons of Elizaphan; Shimri, and Jeiel: and of the sons of Asaph; Zechariah, and Mattaniah:	and when the burning victims were over, the king and the whole company	2.0788628272808367
«و تو بنی‌اسرائیل را امر فرما که روغن زیتون مصفی و کوبیده شده برای روشنایی نزد توبیاورند تا چراغها دائم روشن شود.در خیمه اجتماع، بیرون پرده‌ای که در برابر شهادت است، هارون و پسرانش از شام تا صبح، به حضورخداوند آن را درست کنند. و این برای بنی‌اسرائیل نسلا بعد نسل فریضه ابدی باشد.	All the pillars round about the court shall be filleted with silver; their hooks shall be of silver, and their sockets of brass.	And you order the Children of Israel to bring to you a controlled olive oil for light	1.6017504241305096
وبنی جاد، دیبون و عطاروت و عروعیر.	And Moses said unto them, If the children of Gad and the children of Reuben will pass with you over Jordan, every man armed to battle, before the LORD, and the land shall be su

In [None]:
# Write the Bible test-split evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_bible_test_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

### all dataset

In [None]:
# Record GPU (nvidia-smi) and CPU (lscpu) state before the full-Bible evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 13:58:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0    35W /  70W |   9754MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Score the model on the complete Bible corpus (no split_reference is passed here).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_all,
    bible_translated_all,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:31020, #translated_text:31020
#batch: 243
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 8.059537223998632
inference time for step 1: 8.06286885100053
inference time for step 2: 8.096230234999894
inference time for step 3: 8.123316519000582
inference time for step 4: 8.114157522000824
inference time for step 5: 8.072635743999854
inference time for step 6: 8.057830245999867
inference time for step 7: 8.058285680001063
inference time for step 8: 8.019618468999397
inference time for step 9: 8.015217889000269
inference time for step 10: 8.029196637000496
inference time for step 11: 8.039915216000736
inference time for step 12: 8.037425647999044
inference time for step 13: 8.04259995999928
inference time for step 14: 8.042018732001452
inference time for step 15: 8.062709119001738
inference time for step 16: 8.051262816999952
inference time for step 17: 8.077349279999908
inference time for step 18: 8.061997118000363
inference time 

In [None]:
# Preview the first 25 rows: source text, reference, model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was empty and dark on the sledge, and the soul of God sw	2.8428173313330687
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light up.' And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	And God saw the light that was shining, and God drew the light from the d	34.712833726393406
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he c

In [None]:
# Write the full-Bible evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_bible_all_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

## translation_combined

In [None]:
!mkdir translation_combined_fa_en
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/translation_combined_fa_en/test.tsv -O translation_combined_fa_en/test.tsv
!ls translation_combined_fa_en

--2021-07-04 09:49:31--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/translation_combined_fa_en/test.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22332746 (21M) [text/plain]
Saving to: ‘translation_combined_fa_en/test.tsv’


2021-07-04 09:49:32 (137 MB/s) - ‘translation_combined_fa_en/test.tsv’ saved [22332746/22332746]

test.tsv


### All combined test set

In [None]:
# Load the complete combined test set (no `source` filter).
combined_original_test, combined_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
)

# Print the first source sentence and its reference as a quick check.
for sentences in (combined_original_test, combined_translated_test):
  print(sentences[0])

test part:
 #original: 47738, #translation: 47738
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.


In [None]:
# Record GPU (nvidia-smi) and CPU (lscpu) state before the combined test-set evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 09:49:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    29W /  70W |   3400MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Score the model on the whole combined test set, treating each reference
# string as a single reference (no split_reference is passed here).
evaluation_output = mt_model.mt5_evaluation(
    combined_original_test,
    combined_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:47738, #translated_text:47738
#batch: 373
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 7.123153103999982
inference time for step 1: 7.138013588999968
inference time for step 2: 7.1837293880000175
inference time for step 3: 7.2552586959999985
inference time for step 4: 7.36088403399998
inference time for step 5: 7.47188553999996
inference time for step 6: 7.622355150000033
inference time for step 7: 7.684437299999956
inference time for step 8: 7.757815830999959
inference time for step 9: 7.8348062979999895
inference time for step 10: 7.898155283999984
inference time for step 11: 7.966328658999998
inference time for step 12: 8.034286443999974
inference time for step 13: 8.07167466200002
inference time for step 14: 8.125991316000011
inference time for step 15: 8.110899457999949
inference time for step 16: 8.079648611000039
inference time for step 17: 8.029199832000018
inference time for step 18: 8.01818091299998
inference time

In [None]:
# Preview the first 25 rows: source text, reference, model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was empty and dark on the sledge, and the soul of God sw	2.8428173313330687
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light up.' And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	And God saw the light that was shining, and God drew the light from the d	34.712833726393406
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he c

In [None]:
# Write the combined test-set evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_all_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

In [None]:
# Re-score the combined test set, this time passing split_reference='///'
# so that '///'-separated reference strings are split.
evaluation_output = mt_model.mt5_evaluation(
    combined_original_test,
    combined_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:47738, #translated_text:47738
#batch: 373
#maximum_translation_reference: 25
Start to evaluate test data ...
inference time for step 0: 7.965248831000281
inference time for step 1: 8.201935458000207
inference time for step 2: 8.49456716800023
inference time for step 3: 8.593090368000048
inference time for step 4: 8.289852853999946
inference time for step 5: 8.08547224799986
inference time for step 6: 7.966275131999737
inference time for step 7: 7.91225688600025
inference time for step 8: 7.8982734309997795
inference time for step 9: 7.932194549000087
inference time for step 10: 7.994465075999869
inference time for step 11: 8.071606621999763
inference time for step 12: 8.116091438999774
inference time for step 13: 8.116110413000115
inference time for step 14: 8.116616958999657
inference time for step 15: 8.074506931000087
inference time for step 16: 8.02482785300026
inference time for step 17: 8.025334750000184
inference time for step 18: 8.024434130999907
inference time 

In [None]:
# Preview the first 25 rows: source text, reference(s), model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was empty and dark on the sledge, and the soul of God sw	2.8428173313330687
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light up.' And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	And God saw the light that was shining, and God drew the light from the d	34.712833726393406
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he c

In [None]:
# Write the split-reference combined evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_all_split-refs_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

### Bible subset

In [None]:
# Load only the rows of the combined test file whose source is 'bible_fa_en'.
# NOTE(review): this rebinds bible_original_test / bible_translated_test, which
# the "Bible Dataset" section also assigns (3102 rows there vs 31020 here per
# the recorded outputs). Distinct names would avoid stale-state confusion.
bible_original_test, bible_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='bible_fa_en',
)

# Print the first source sentence and its reference as a quick check.
for sentences in (bible_original_test, bible_translated_test):
  print(sentences[0])

test part:
 #original: 31020, #translation: 31020
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.


In [None]:
# Record GPU (nvidia-smi) and CPU (lscpu) state before the Bible-subset evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 12:33:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    33W /  70W |   9550MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Score the model on the Bible subset of the combined test set.
evaluation_output = mt_model.mt5_evaluation(
    bible_original_test,
    bible_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:31020, #translated_text:31020
#batch: 243
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 7.771509326999876
inference time for step 1: 7.732253968000805
inference time for step 2: 7.806578101000923
inference time for step 3: 7.876541589999761
inference time for step 4: 7.978815063999718
inference time for step 5: 8.052519263999784
inference time for step 6: 8.110075369000697
inference time for step 7: 8.153144391999376
inference time for step 8: 8.14708280800005
inference time for step 9: 8.121651097000722
inference time for step 10: 8.07994028900066
inference time for step 11: 8.03103804300008
inference time for step 12: 7.984988610998698
inference time for step 13: 7.998037483999724
inference time for step 14: 8.007555184000012
inference time for step 15: 8.010569337999186
inference time for step 16: 8.021552924999924
inference time for step 17: 8.049819408000985
inference time for step 18: 8.035510309999154
inference time f

In [None]:
# Preview the first 25 rows: source text, reference, model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was empty and dark on the sledge, and the soul of God sw	2.8428173313330687
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light up.' And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	And God saw the light that was shining, and God drew the light from the d	34.712833726393406
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he c

In [None]:
# Write the Bible-subset evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_bible_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

### Mizan subset

In [None]:
# Load only the rows of the combined test file whose source is 'mizan_test_fa_en'.
mizan_original_test, mizan_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='mizan_test_fa_en',
)

# Print the first source sentence and its reference as a quick check.
for sentences in (mizan_original_test, mizan_translated_test):
  print(sentences[0])

test part:
 #original: 10000, #translation: 10000
این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.
a sure sign with him of mental agitation.


In [None]:
# Record GPU (nvidia-smi) and CPU (lscpu) state before the Mizan-subset evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 13:08:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    32W /  70W |   9752MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Score the model on the Mizan subset of the combined test set.
evaluation_output = mt_model.mt5_evaluation(
    mizan_original_test,
    mizan_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:10000, #translated_text:10000
#batch: 79
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 7.507030077000309
inference time for step 1: 7.584973842000181
inference time for step 2: 7.680028483000569
inference time for step 3: 7.7342257349992
inference time for step 4: 7.761801612999989
inference time for step 5: 7.807817631999569
inference time for step 6: 7.860172823000539
inference time for step 7: 7.927743008000107
inference time for step 8: 7.993969204999303
inference time for step 9: 8.077550428000905
inference time for step 10: 8.095709208000699
inference time for step 11: 8.098879828001373
inference time for step 12: 8.071405975000744
inference time for step 13: 8.040303214998858
inference time for step 14: 8.026126520999242
inference time for step 15: 8.025495717000013
inference time for step 16: 7.9921363630001
inference time for step 17: 7.994392672999311
inference time for step 18: 7.996710683000856
inference time for

In [None]:
# Preview the first 25 rows: source text, reference, model output, BLEU score.
preview_rows = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_rows:
  columns = (text, true_translation, generated_translation, bleu_score)
  print('{}\t{}\t{}\t{}'.format(*columns))

این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.	a sure sign with him of mental agitation.	It was a decisive sign of her emotion, and she began to walk about the	3.21858262703621
عاقبت گفت: سزار! تو از این موضوع نگران شده‌ای؛	;Cesare, you are worried about this affair,; she said at last.	You are worried about that, Cesare, she said at last.	34.5576382124923
بسیار متأسفم که نسبت به آن احساس بی اعتمادی می‌کنی؛	;I am very sorry you feel so despondent over it;	I am very sorry that you feel distrustful of it;	28.947421495675087
اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.	but I could decide only as seemed right to me.;	But I couldn't decide just because I thought it right.	4.839576869824698
مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی‌دانم؛	;It is not the affair,; he answered, sullenly; ;I know nothing about it,	It's not that, said Martini, frowning. I don't know about	2.821711598725708
هنگامی که تو قبول می‌کنی در کاری شرکت جوی

In [None]:
# Write the Mizan-subset evaluation rows to a model-specific TSV file.
model_tag = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_mizan_{}_outputs.txt".format(model_tag)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    row = (text, true_translation, generated_translation, bleu_score)
    output_file.write('{}\t{}\t{}\t{}\n'.format(*row))

# Authenticate, build a Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
drive_metadata = {'title': output_file_name}
upload = drive.CreateFile(drive_metadata)
upload.SetContentFile(output_file_name)
upload.Upload()

### QQP subset

In [None]:
# Load only the rows of the combined test file whose source is 'qqp_test_fa_en'.
qqp_original_test, qqp_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='qqp_test_fa_en',
)

# Print the first source sentence and its reference as a quick check.
for sentences in (qqp_original_test, qqp_translated_test):
  print(sentences[0])

test part:
 #original: 489, #translation: 489
آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Does the spirit world exist? If yes, who is the initiator and controller of this spirit realm?


In [None]:
# Record GPU (nvidia-smi) and CPU (lscpu) state before the qqp-subset evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 13:19:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    34W /  70W |   9752MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Score the model on the qqp subset of the combined test set.
evaluation_output = mt_model.mt5_evaluation(
    qqp_original_test,
    qqp_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:489, #translated_text:489
#batch: 4
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 8.058886218001135
inference time for step 1: 8.356204624000384
inference time for step 2: 8.550859928998761
inference time for step 3: 6.96855494600095
total inference time: 31.93450571700123
total inference time / #samples: 0.0653057376625792
BLEU Score: 26.897897164732818


In [None]:
# Preview the first 25 evaluation records, one tab-separated row per record:
# source text, reference translation, generated translation, score.
preview_records = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_records:
  preview_row = '{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score)
  print(preview_row)

آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟	Does the spirit world exist? If yes, who is the initiator and controller of this spirit realm?	Is there a Soul World? If yes, who is the creative and controlling representative	29.986344793788927
چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟	How can I stop watching porn videos?	How can I stop watching porn?	72.89545183625967
آیا قرار است دونالد ترامپ رئیس جمهور بعدی ایالات متحده باشد؟	Is Donald Trump going to be the next US President?	Is Donald Trump going to be the next president of the United States?	52.960749334062214
چگونه می توانم سوالی را در این باره بپرسم؟	How do I ask a question on this?	How can I ask a question about it?	33.03164318013809
مضرات لیست پیوندی چیست؟	What are the disadvantages of linked lists?	What are the advantages of a linked list?	21.10534063187263
اگر از تولید مثل افراد با ضریب هوشی ضعیف جلوگیری کنیم، آیا در نهایت تکامل انسان ها باهوش تر می شوند؟	If we prevent people with low IQ 

In [None]:
# Write every QQP evaluation record to a tab-separated text file, then upload
# that file to Google Drive under the same title.
# '/' in the model name is replaced with '-' before it is embedded in the
# file name.
file_safe_model_name = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_qqp_{}_outputs.txt".format(file_safe_model_name)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_row = '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
    output_file.write(output_row)

# Authenticate, then build the Drive client from the application-default
# credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create the Drive entry, attach the local file as its content and send it.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Quran subset

In [None]:
# Load the rows of the combined fa-en test file that are tagged with the
# 'quran_fa_en' source, then print the first source/reference pair as a quick
# sanity check.
quran_original_test, quran_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='quran_fa_en',
)
for test_split in (quran_original_test, quran_translated_test):
  print(test_split[0])

test part:
 #original: 6229, #translation: 6229
به نام خداوند رحمتگر مهربان
In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.


In [None]:
# Print the current GPU status (nvidia-smi) and the CPU description (lscpu)
# ahead of the Quran evaluation run.
!nvidia-smi
!lscpu

Sun Jul  4 13:20:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    34W /  70W |   9750MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run mt5_evaluation on the Quran test pairs without `split_reference`.
# NOTE(review): the log for this run reports
# "#maximum_translation_reference: 1", so each '///'-joined reference string
# appears to be scored as a single reference - confirm against mt5_evaluation.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:6229, #translated_text:6229
#batch: 49
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 7.896840942999916
inference time for step 1: 8.079135514000882
inference time for step 2: 8.3332555220004
inference time for step 3: 8.37134233299912
inference time for step 4: 8.2620477319997
inference time for step 5: 8.177748050000446
inference time for step 6: 8.069168559999525
inference time for step 7: 8.015882790001342
inference time for step 8: 7.992925300999559
inference time for step 9: 8.032004325999878
inference time for step 10: 8.05433059799907
inference time for step 11: 8.099501471000622
inference time for step 12: 8.112245001000701
inference time for step 13: 8.1292317919997
inference time for step 14: 8.080362003000118
inference time for step 15: 8.107661722999183
inference time for step 16: 8.099896807998448
inference time for step 17: 8.05894172400076
inference time for step 18: 8.05606653399991
inference time for step 19

In [None]:
# Preview the first 25 evaluation records, one tab-separated row per record:
# source text, reference translation, generated translation, score.
preview_records = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_records:
  preview_row = '{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score)
  print(preview_row)

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	In the name of the beneficent Lord	3.738880136981003e-07
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Prai

In [None]:
# Write every Quran evaluation record to a tab-separated text file, then
# upload that file to Google Drive under the same title.
# '/' in the model name is replaced with '-' before it is embedded in the
# file name.
file_safe_model_name = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_quran_{}_outputs.txt".format(file_safe_model_name)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_row = '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
    output_file.write(output_row)

# Authenticate, then build the Drive client from the application-default
# credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create the Drive entry, attach the local file as its content and send it.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

In [None]:
# Run mt5_evaluation on the Quran test pairs again, this time passing
# split_reference='///'. The Quran reference strings join their alternative
# translations with '///'; presumably this makes each alternative count as a
# separate reference (the log reports "#maximum_translation_reference: 25")
# - confirm against mt5_evaluation.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:6229, #translated_text:6229
#batch: 49
#maximum_translation_reference: 25
Start to evaluate test data ...
inference time for step 0: 8.14692511100111
inference time for step 1: 8.445976470999085
inference time for step 2: 8.640504502000113
inference time for step 3: 8.396306055999958
inference time for step 4: 8.174118106000606
inference time for step 5: 8.062042096000368
inference time for step 6: 7.931859498999984
inference time for step 7: 7.9621764099993015
inference time for step 8: 7.971205972000462
inference time for step 9: 8.030079922000368
inference time for step 10: 8.1229986620001
inference time for step 11: 8.132766050999635
inference time for step 12: 8.159352138998656
inference time for step 13: 8.156484357999943
inference time for step 14: 8.084165414000381
inference time for step 15: 8.054679411001416
inference time for step 16: 8.03859575199931
inference time for step 17: 8.001057494999259
inference time for step 18: 8.006591197001399
inference time for

In [None]:
# Preview the first 25 evaluation records, one tab-separated row per record:
# source text, reference translation, generated translation, score.
preview_records = evaluation_output[:25]
for text, true_translation, generated_translation, bleu_score in preview_records:
  preview_row = '{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score)
  print(preview_row)

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	In the name of the beneficent Lord	24.549475440235113
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise 

In [None]:
# Write every record of the split-reference Quran evaluation to a
# tab-separated text file, then upload that file to Google Drive under the
# same title.
# '/' in the model name is replaced with '-' before it is embedded in the
# file name.
file_safe_model_name = model_name.replace('/','-')
output_file_name = "translation_fa-en_combined_quran_split-refs_{}_outputs.txt".format(file_safe_model_name)
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_row = '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
    output_file.write(output_row)

# Authenticate, then build the Drive client from the application-default
# credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create the Drive entry, attach the local file as its content and send it.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## QQP Dataset

## TEP Dataset


## OPUS Dataset

## Global Voices Dataset