# Machine Translation

In [1]:
# Record the GPU (nvidia-smi) and CPU (lscpu) details of the runtime this notebook runs on.
!nvidia-smi
!lscpu

Thu Jul  8 07:34:55 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Pinned third-party dependencies for this notebook.
# %pip (instead of !pip) installs into the environment of the running kernel,
# and the extras specifier is quoted so the shell never treats [gpl] as a glob.
%pip install hazm==0.7.0
%pip install seqeval==1.2.2
%pip install sentencepiece==0.1.96
%pip install transformers==4.7.0
%pip install "clean-text[gpl]==0.4.0"
%pip install sacrebleu==1.5.1

Collecting hazm==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K     |████████████████████████████████| 317kB 7.7MB/s 
[?25hCollecting libwapiti>=0.2.1; platform_system != "Windows"
[?25l  Downloading https://files.pythonhosted.org/packages/bc/0f/1c9b49bb49821b5856a64ea6fac8d96a619b9f291d1f06999ea98a32c89c/libwapiti-0.2.1.tar.gz (233kB)
[K     |████████████████████████████████| 235kB 14.4MB/s 
[?25hCollecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 15.1MB/s 
Building wheels for collected packages: libwapiti, nltk
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154012 sha256=bc8ee9749479cb494

In [3]:
# Install PyDrive and build an authenticated Google Drive client (`drive`);
# later cells use the same calls to upload the evaluation result files.
# NOTE(review): PyDrive is installed without a version pin, unlike the other dependencies.
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [4]:
# Import required packages
import os
import gc
import re
import hazm
import time
import json
import sacrebleu
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer

from cleantext import clean

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Show which library versions this run used (one "name version" line each,
# framed by blank lines).
print()
print(
    *(f'{library_name} {library.__version__}'
      for library_name, library in (('numpy', np), ('pandas', pd),
                                    ('transformers', transformers), ('torch', torch))),
    sep='\n',
)
print()

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
class MachineTranslationDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over parallel source sentences and reference translations.

    Each item is a dict with the raw source sentence (``'original'``), its raw
    reference (``'translated'``) and the tokenized source (``'input_ids'``,
    ``'attention_mask'``), both flattened to 1-D. Only the source side is
    tokenized; every example is padded/truncated to ``max_length`` tokens.
    """

    def __init__(self, original_text, translated_text, tokenizer, max_length):
        """Store the parallel texts, the tokenizer and the fixed sequence length."""
        self.original_text, self.translated_text = original_text, translated_text
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from the source-side collection."""
        return len(self.original_text)

    def __getitem__(self, item):
        """Return the sample at index ``item`` as a dict of raw texts and encoded tensors."""
        source_sentence = self.original_text[item]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        encoded = self.tokenizer(source_sentence, **tokenizer_options)
        # return_tensors="pt" yields a leading batch axis of size 1; flatten drops it.
        return {
            'original': source_sentence,
            'translated': self.translated_text[item],
            'input_ids': encoded.input_ids.flatten(),
            'attention_mask': encoded.attention_mask.flatten(),
        }


class MachineTranslation:
    """Load a pretrained model and run translation inference / BLEU evaluation with it.

    The instance keeps the tokenizer, model and config loaded from ``model_name``
    and offers helpers that read the tab-separated dataset files used in this
    notebook.
    """

    def __init__(self, model_name, model_type):
        """Load tokenizer, model and config for ``model_name``.

        Args:
            model_name: Identifier or path handed to the ``from_pretrained`` loaders.
            model_type: ``"mt5"`` (case-insensitive) loads the MT5 classes; any other
                value falls back to the ``Auto*`` sequence-classification classes and
                additionally exposes ``id2label`` / ``label2id`` from the config.
        """
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        if model_type.lower() == "mt5":
            self.tokenizer = MT5Tokenizer.from_pretrained(model_name)
            self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
            self.config = MT5Config.from_pretrained(self.model_name)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.id2label = self.config.id2label
            self.label2id = self.config.label2id

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        """Read a header-less, tab-separated test file into two parallel lists.

        Args:
            dataset_name: ``"mizan"`` (columns: original, translation) or
                ``"combined"`` (columns: original, translation, source); compared
                case-insensitively.
            dataset_file: Path of the TSV file.
            **kwargs: ``source`` (optional, ``"combined"`` only) keeps just the rows
                whose third column equals this value.

        Returns:
            ``(original, translation)`` lists, or ``None`` when the dataset name is
            not supported or the file does not exist (a message is printed in the
            latter case).
        """
        columns_by_dataset = {
            "mizan": ['original', 'translation'],
            "combined": ['original', 'translation', 'source'],
        }
        dataset_key = dataset_name.lower()
        if dataset_key not in columns_by_dataset:
            return
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return

        data = pd.read_csv(dataset_file, delimiter="\t", names=columns_by_dataset[dataset_key], header=None)
        if dataset_key == "combined" and 'source' in kwargs:
            data = data[data['source'] == kwargs['source']]
        original = data['original'].values.tolist()
        translation = data['translation'].values.tolist()
        print(f'test part:\n #original: {len(original)}, #translation: {len(translation)}')
        return original, translation

    def load_dataset_file(self, dataset_name, dataset_file, **kwargs):
        """Read a ``source<TAB>translation`` file and carve out a fixed 10% test split.

        Args:
            dataset_name: ``"quran"`` or ``"bible"`` (case-insensitive).
            dataset_file: Path of the UTF-8 TSV file.
            **kwargs: Accepted for signature compatibility; not used.

        Returns:
            ``(original, translation, original_test, translation_test)``; the test
            lists come from ``train_test_split(test_size=0.1, random_state=1)``.
            ``None`` when the dataset name is not supported or the file does not
            exist (a message is printed in the latter case).
        """
        if dataset_name.lower() not in ("quran", "bible"):
            return
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return

        original, translation = [], []
        with open(dataset_file, encoding="utf8") as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    # A blank line (e.g. a stray trailing newline) carries no pair;
                    # skipping it avoids an IndexError on parts[1] below.
                    continue
                parts = line.split('\t')
                original.append(parts[0])
                translation.append(parts[1])
        print(f'all data:\n #original: {len(original)}, #translation: {len(translation)}')

        _, original_test, _, translation_test = train_test_split(original, translation, test_size=0.1,
                                                                 random_state=1)
        print(f'test part:\n #original: {len(original_test)}, #translation: {len(translation_test)}')
        return original, translation, original_test, translation_test

    def mt5_machine_translation_inference(self, input_text, device, **generate_kwargs):
        """Translate a batch of sentences and return the decoded predictions.

        Args:
            input_text: Sentence or list of sentences passed to the tokenizer.
            device: ``torch.device`` the input tensors are moved to; the model is
                moved to CUDA when the device is not the CPU.
            **generate_kwargs: Optional extra keyword arguments forwarded verbatim
                to ``self.model.generate`` (for example ``max_length`` or
                ``num_beams``). When omitted, ``generate`` is called exactly as
                before and uses its own defaults.
                NOTE(review): the sample translations in this notebook end
                mid-sentence, presumably because of the default generation length
                limit - confirm and pass ``max_length`` if longer outputs are needed.

        Returns:
            List of decoded strings (special tokens removed), or ``None`` when the
            model or tokenizer is missing.
        """
        # Identity checks avoid calling __len__ on the tokenizer/model just to test presence.
        if self.model is None or self.tokenizer is None:
            print('Something wrong has been happened!')
            return

        tokenized_batch = self.tokenizer(
            input_text,
            padding=True,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        input_ids = tokenized_batch.input_ids.to(device)
        attention_mask = tokenized_batch.attention_mask.to(device)
        outputs = self.model.generate(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      **generate_kwargs)
        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predictions

    def mt5_evaluation(self, input_text, translated_text, device, max_length, split_reference=None, batch_size=4,
                       **generate_kwargs):
        """Translate ``input_text`` in batches, time the generation and report BLEU.

        Args:
            input_text: List of source sentences.
            translated_text: List of reference strings, one per source sentence.
            device: ``torch.device`` used for the input tensors (and CUDA model).
            max_length: Token length every source sentence is padded/truncated to.
            split_reference: Optional separator; when given, each reference string
                is split on it into several alternative references.
            batch_size: Number of sentences per generation call.
            **generate_kwargs: Optional extra keyword arguments forwarded verbatim
                to ``self.model.generate``; omitted means unchanged behaviour.

        Returns:
            List of ``(original, reference, prediction, sentence_bleu)`` tuples, an
            empty list when there is nothing to evaluate, or ``None`` when the model
            or tokenizer is missing or the two input lists differ in length.
            Timings and the corpus BLEU score are printed.
        """
        if self.model is None or self.tokenizer is None:
            print('Something wrong has been happened!')
            return
        if len(input_text) != len(translated_text):
            print('length of inputs and its translations is not equal!!')
            return
        if len(input_text) == 0:
            # Nothing to translate: stop before the per-sample average divides by zero.
            print('no samples to evaluate!')
            return []

        dataset = MachineTranslationDataset(original_text=input_text, translated_text=translated_text,
                                            tokenizer=self.tokenizer, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#original_text:{len(input_text)}, #translated_text:{len(translated_text)}')
        print("#batch:", len(data_loader))

        # Largest number of alternative references attached to any single sample.
        if split_reference is None:
            max_num_ref = 1
        else:
            max_num_ref = max(len(ref.split(split_reference)) for ref in translated_text)
        print("#maximum_translation_reference:", max_num_ref)

        gc.collect()
        torch.cuda.empty_cache()
        # Tell pytorch to run this model on the GPU.
        if device.type != 'cpu':
            self.model.cuda()

        total_time = 0
        output_predictions = []
        # golden_translations[j] is the j-th reference stream (one entry per sample).
        golden_translations, predicted_translations = [[] for _ in range(max_num_ref)], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            # move tensors to GPU if CUDA is available
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)

            # generate() returns predicted token ids; no labels are passed, so no loss
            # is computed. Only the generate() call itself is timed.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model.generate(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                                **generate_kwargs)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = self.tokenizer.batch_decode(b_outputs, skip_special_tokens=True)

            for i in range(len(b_input_ids)):
                reference_text = batch['translated'][i]
                if split_reference is None:
                    sample_golden_translation = [reference_text]
                else:
                    sample_golden_translation = reference_text.split(split_reference)
                sample_generated_translation = b_predictions[i]
                # Sentence-level score: a one-sentence corpus against all of its references.
                bleu_score = sacrebleu.corpus_bleu(sys_stream=[sample_generated_translation],
                                                   ref_streams=[[g] for g in sample_golden_translation]).score
                output_predictions.append(
                    (batch['original'][i], reference_text, sample_generated_translation, bleu_score))

                # Samples with fewer references are padded with '' so that every
                # reference stream keeps exactly one entry per prediction.
                for j in range(max_num_ref):
                    golden_translations[j].append(
                        sample_golden_translation[j] if j < len(sample_golden_translation) else '')
                predicted_translations.append(sample_generated_translation)

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_text))

        # evaluate
        print("BLEU Score: {}".format(sacrebleu.corpus_bleu(
            sys_stream=predicted_translations, ref_streams=golden_translations).score))
        return output_predictions


In [6]:
# Model identifier; it is reused further down to build the output file names.
model_name='persiannlp/mt5-large-parsinlu-opus-translation_fa_en'
# Loads the MT5 tokenizer, model and config for that identifier, then shows the config.
mt_model = MachineTranslation(model_name=model_name, model_type="mt5")
print(mt_model.config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4309802.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=376.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=698.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4918521655.0, style=ProgressStyle(descr…


MT5Config {
  "_name_or_path": "/home/patrick/hugging_face/t5/mt5-large",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.7.0",
  "use_cache": true,
  "vocab_size": 250112
}



## Sample Inference

In [None]:
input_list = [
  "ستایش خدای را که پروردگار جهانیان است.",
  "در هاید پارک کرنر بر گلدانی ایستاده موعظه می‌کند؛",
  "وی از تمامی بلاگرها، سازمان‌ها و افرادی که از وی پشتیبانی کرده‌اند، تشکر کرد.",
  "مشابه سال ۲۰۰۱، تولید آمونیاک بی آب در ایالات متحده در سال ۲۰۰۰ تقریباً ۱۷،۴۰۰،۰۰۰ تن (معادل بدون آب) با مصرف ظاهری ۲۲،۰۰۰،۰۰۰ تن و حدود ۴۶۰۰۰۰۰ با واردات خالص مواجه شد. ",
  "می خواهم دکترای علوم کامپیوتر راجع به شبکه های اجتماعی را دنبال کنم، چالش حل نشده در شبکه های اجتماعی چیست؟"
]
# Translate the sample sentences; the returned list is displayed as the cell output.
# NOTE(review): the last two outputs below stop mid-sentence - presumably the default
# generation length limit of generate(); confirm before judging translation quality.
mt_model.mt5_machine_translation_inference(input_list, device)

['the praise of God, the Lord of the world.',
 'At the Hyde Park Corner, Carpenter is preaching on a vase;',
 'He thanked all the bloggers, organizations, and people who had supported him.',
 'Similarly in 2001, the production of waterless ammonia in the United States was',
 'I want to pursue my degree in Computer Science on social networks, what is the']

## Mizan Dataset


In [None]:
# -nc (no-clobber): skip the download when the file is already present, so re-running
# this cell does not create a numbered duplicate (mizan_test_fa_en.tsv.1) next to the
# copy that the loading cell actually reads.
!wget -nc https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/mizan/mizan_test_fa_en.tsv
!ls

--2021-07-05 13:10:14--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/mizan/mizan_test_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2296459 (2.2M) [text/plain]
Saving to: ‘mizan_test_fa_en.tsv’


2021-07-05 13:10:14 (18.3 MB/s) - ‘mizan_test_fa_en.tsv’ saved [2296459/2296459]

adc.json
mizan_test_fa_en.tsv
sample_data
translation_combined_fa_en
translation_fa-en_combined_bible_persiannlp-mt5-large-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_mizan_persiannlp-mt5-large-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_qqp_persiannlp-mt5-large-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_quran_persiannlp-mt5-large-parsinlu-opus

In [None]:
# Load the Mizan test pairs (source sentences and reference translations) from the downloaded TSV.
mizan_original_text, mizan_translated_text = mt_model.load_dataset_test_file(dataset_name="mizan", dataset_file="mizan_test_fa_en.tsv")
# Peek at the first five pairs and check that both lists have the same length.
print(mizan_original_text[:5])
print(mizan_translated_text[:5])
print(len(mizan_original_text))
print(len(mizan_translated_text))

test part:
 #original: 10000, #translation: 10000
['این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.', 'عاقبت گفت: سزار! تو از این موضوع نگران شده\u200cای؛', 'بسیار متأسفم که نسبت به آن احساس بی اعتمادی می\u200cکنی؛', 'اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.', 'مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی\u200cدانم؛']
['a sure sign with him of mental agitation.', ';Cesare, you are worried about this affair,; she said at last.', ';I am very sorry you feel so despondent over it;', 'but I could decide only as seemed right to me.;', ';It is not the affair,; he answered, sullenly; ;I know nothing about it,']
10000
10000


In [None]:
# Quick sanity check: translate the first five Mizan sentences before the full evaluation.
mt_model.mt5_machine_translation_inference(mizan_original_text[:5], device)

['This was a definite sign of his excitement, and he began to pace the',
 'You are worried about this, Cesare, she said at last;',
 'I am very sorry that you feel distrust of it;',
 "but I couldn't decide because it seemed right to me.",
 "It's not about that, I don't know, Martini answered, fr"]

In [None]:
# Snapshot GPU (nvidia-smi) and CPU (lscpu) state right before the timed Mizan evaluation.
!nvidia-smi
!lscpu

Mon Jul  5 13:10:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    26W /  70W |   6008MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run the timed BLEU evaluation over the whole Mizan test set
# (a single reference per sentence, 64 sentences per generation call).
evaluation_output = mt_model.mt5_evaluation(
    input_text=mizan_original_text,
    translated_text=mizan_translated_text,
    device=device,
    max_length=512,
    batch_size=64,
)

#original_text:10000, #translated_text:10000
#batch: 157
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.00325888900079
inference time for step 1: 11.017670426999757
inference time for step 2: 11.07389337099994
inference time for step 3: 11.12354136400063
inference time for step 4: 11.1750854790007
inference time for step 5: 11.246063066000715
inference time for step 6: 11.284334967000177
inference time for step 7: 11.344476137999663
inference time for step 8: 11.357302088999859
inference time for step 9: 11.402133399000377
inference time for step 10: 11.399558916000387
inference time for step 11: 11.45953197900053
inference time for step 12: 11.454211731999749
inference time for step 13: 11.47087694900074
inference time for step 14: 11.493651388998842
inference time for step 15: 11.531069239001226
inference time for step 16: 11.527194061000046
inference time for step 17: 11.570545988999584
inference time for step 18: 11.59270880800068
in

In [None]:
# Show the first 25 results as tab-separated rows:
# source, reference, model output, sentence-level BLEU.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(text, true_translation, generated_translation, bleu_score, sep='\t')

این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.	a sure sign with him of mental agitation.	This was a definite sign of his excitement, and he began to pace the	3.4585921141027356
عاقبت گفت: سزار! تو از این موضوع نگران شده‌ای؛	;Cesare, you are worried about this affair,; she said at last.	You are worried about this, Cesare, she said at last;	35.31760084168881
بسیار متأسفم که نسبت به آن احساس بی اعتمادی می‌کنی؛	;I am very sorry you feel so despondent over it;	I am very sorry that you feel distrust of it;	28.947421495675087
اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.	but I could decide only as seemed right to me.;	but I couldn't decide because it seemed right to me.	38.09694917244036
مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی‌دانم؛	;It is not the affair,; he answered, sullenly; ;I know nothing about it,	It's not about that, I don't know, Martini answered, fr	5.099408508435378
هنگامی که تو قبول می‌کنی در کاری شرکت جویی قطعا

In [None]:
# Write one tab-separated line per sample: source, reference, model output, sentence-level BLEU.
# '/' in the model name is replaced so it can be used inside a file name.
output_file_name = "translation_fa-en_mizan_test_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write('{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score))
# Authenticate again and rebuild the PyDrive client, then upload the results file to Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## Quran Dataset

In [None]:
# -nc (no-clobber): skip the download when the file is already present, so re-running
# this cell does not create a numbered duplicate (quran_fa_en.tsv.1) next to the copy
# that the loading cell actually reads.
!wget -nc https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/quran/quran_fa_en.tsv
!ls

--2021-07-06 12:20:48--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/quran/quran_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9309105 (8.9M) [text/plain]
Saving to: ‘quran_fa_en.tsv’


2021-07-06 12:20:48 (128 MB/s) - ‘quran_fa_en.tsv’ saved [9309105/9309105]

adc.json  quran_fa_en.tsv  sample_data


In [None]:
# Load every Quran pair plus the fixed test split returned by load_dataset_file.
quran_original_all, quran_translated_all, quran_original_test, quran_translated_test = mt_model.load_dataset_file(dataset_name="quran", dataset_file="quran_fa_en.tsv")
# Peek at the first pair of the full data and of the test split.
print(quran_original_all[0])
print(quran_translated_all[0])
print(quran_original_test[0])
print(quran_translated_test[0])

all data:
 #original: 6236, #translation: 6236
test part:
 #original: 624, #translation: 624
به نام خداوند رحمتگر مهربان
In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.
حقا که انسان سرکشى مى‌کند،
And yet, but yet man is rebellious,///Yes indeed, man is surely rebellious.///No indeed; surely Man waxes insolent,///By no means: Verily man exorbitateth.///Nay! Verily, man does transgress all bounds (in disbelief and evil deed, etc.).///In fact, man oversteps all bounds.///Nay, surely man transgresses;///Nay! Verily, ma

In [None]:
# Quick sanity check: translate the first five sentences of the Quran test split.
mt_model.mt5_machine_translation_inference(quran_original_test[:5], device)

['It is really a rebellion',
 "They said, 'Then, if you lie, what is the sentence?",
 "So go to him and say, 'We are the two messengers of your Lord",
 'Indeed, those who have become heathen and died in heathen',
 'My Lord, said he, my bones have become weak, and my hair is']

### Test set (10% hold-out split)

In [None]:
# Snapshot GPU (nvidia-smi) and CPU (lscpu) state right before the timed evaluation of the Quran test split.
!nvidia-smi
!lscpu

Tue Jul  6 12:21:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W /  70W |   6054MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run the timed BLEU evaluation over the Quran test split; each reference string
# holds several alternative translations separated by '///'.
evaluation_output = mt_model.mt5_evaluation(
    input_text=quran_original_test,
    translated_text=quran_translated_test,
    device=device,
    max_length=512,
    split_reference='///',
    batch_size=64,
)

#original_text:624, #translated_text:624
#batch: 10
#maximum_translation_reference: 9
Start to evaluate test data ...
inference time for step 0: 10.866318652000018
inference time for step 1: 10.99902750299998
inference time for step 2: 11.073077566999984
inference time for step 3: 11.165705927999966
inference time for step 4: 11.255002242000046
inference time for step 5: 11.310379074000025
inference time for step 6: 11.425524596999935
inference time for step 7: 11.488068639999938
inference time for step 8: 11.546032896000042
inference time for step 9: 8.733491746000027
total inference time: 109.86262884499996
total inference time / #samples: 0.17606190520032045
BLEU Score: 11.81974086849965


In [None]:
# Show the first 25 results as tab-separated rows:
# source, reference(s), model output, sentence-level BLEU.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(text, true_translation, generated_translation, bleu_score, sep='\t')

حقا که انسان سرکشى مى‌کند،	And yet, but yet man is rebellious,///Yes indeed, man is surely rebellious.///No indeed; surely Man waxes insolent,///By no means: Verily man exorbitateth.///Nay! Verily, man does transgress all bounds (in disbelief and evil deed, etc.).///In fact, man oversteps all bounds.///Nay, surely man transgresses;///Nay! Verily, man does transgress.///Nay, but man doth transgress all bounds,	It is really a rebellion	8.745825313180626
گفتند: «پس، اگر دروغ بگویید، کیفرش چیست؟»	"What should be the punishment," they were asked, "in case you are liars?"///They said, “And what shall be the punishment for it, if you are liars?”///They said, 'And what shall be its recompense if you are liars?'///They said: what shall be the meed of him, if ye are found liars!///They [Yusuf's (Joseph) men] said: "What then shall be the penalty of him, if you are (proved to be) liars."///They said, “What shall be his punishment, if you are lying?”///The officials said: "If you are lying, what w

In [None]:
# Write one tab-separated line per sample: source, reference(s), model output, sentence-level BLEU.
# '/' in the model name is replaced so it can be used inside a file name.
output_file_name = "translation_fa-en_quran_test_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write('{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score))
# Authenticate again and rebuild the PyDrive client, then upload the results file to Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Full dataset

In [None]:
# Snapshot GPU (nvidia-smi) and CPU (lscpu) state right before the timed evaluation of the full Quran data.
!nvidia-smi
!lscpu

Tue Jul  6 12:23:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    29W /  70W |  13054MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run the timed BLEU evaluation over every Quran pair; each reference string
# holds several alternative translations separated by '///'.
evaluation_output = mt_model.mt5_evaluation(
    input_text=quran_original_all,
    translated_text=quran_translated_all,
    device=device,
    max_length=512,
    split_reference='///',
    batch_size=64,
)

#original_text:6236, #translated_text:6236
#batch: 98
#maximum_translation_reference: 9
Start to evaluate test data ...
inference time for step 0: 11.494835861999945
inference time for step 1: 11.583734705999973
inference time for step 2: 11.667045998000049
inference time for step 3: 11.787798978999945
inference time for step 4: 11.883785883999963
inference time for step 5: 11.867800160999991
inference time for step 6: 11.91991203100008
inference time for step 7: 11.930954306999979
inference time for step 8: 11.939887208999949
inference time for step 9: 11.986570874999984
inference time for step 10: 12.056072315999927
inference time for step 11: 12.07955954700003
inference time for step 12: 12.08717556199997
inference time for step 13: 12.116710911999917
inference time for step 14: 12.140323092000017
inference time for step 15: 12.136947064000083
inference time for step 16: 12.163429685000096
inference time for step 17: 12.22781145600004
inference time for step 18: 12.189903631999982
i

In [None]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	In the name of the good God	25.694343649393552
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise be to A

In [None]:
# Persist the full-Quran evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_quran_all_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference(s), generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## Bible Dataset

In [None]:
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/bible/bible_fa_en.tsv
!ls

--2021-07-06 12:44:10--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/bible/bible_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10023337 (9.6M) [text/plain]
Saving to: ‘bible_fa_en.tsv’


2021-07-06 12:44:11 (148 MB/s) - ‘bible_fa_en.tsv’ saved [10023337/10023337]

adc.json
bible_fa_en.tsv
quran_fa_en.tsv
sample_data
translation_fa-en_quran_all_persiannlp-mt5-large-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_quran_test_persiannlp-mt5-large-parsinlu-opus-translation_fa_en_outputs.txt


In [None]:
# Load the Bible corpus: the full set plus its test portion (source and target lists for each).
(bible_original_all,
 bible_translated_all,
 bible_original_test,
 bible_translated_test) = mt_model.load_dataset_file(
    dataset_name="bible",
    dataset_file="bible_fa_en.tsv",
)
# Sanity check: show the first source/target pair of each split.
# NOTE(review): in the recorded output the first *test* pair does not look like a translation of
# each other (and the model's own output matches the Persian side, not the reference) --
# verify that load_dataset_file keeps the test split aligned.
print(
    bible_original_all[0],
    bible_translated_all[0],
    bible_original_test[0],
    bible_translated_test[0],
    sep='\n',
)

all data:
 #original: 31020, #translation: 31020
test part:
 #original: 3102, #translation: 3102
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.
و چون قربانی های سوختنی تمام شد، پادشاه و جمیع حاضرین با وی رکوع کرده، سجده نمودند.
And of the sons of Elizaphan; Shimri, and Jeiel: and of the sons of Asaph; Zechariah, and Mattaniah:


In [None]:
# Smoke test: translate the first five Bible test sentences and display the generated strings.
mt_model.mt5_machine_translation_inference(
    bible_original_test[:5],
    device,
)

['and when the burning sacrifices were over, the king and all present knee',
 'And order the Children of Israel to bring to you stained and pulverized olive oil for',
 'Beninjae, Dibban, Ataraut, and Aridhair.',
 'So, having a case in the early days, do you appoint those',
 'take a little of it, and tie them up in your lap.']

### test set

In [None]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Tue Jul  6 12:44:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    41W /  70W |   6074MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the Bible test split (no split_reference passed, unlike the Quran runs).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_test,
    bible_translated_test,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:3102, #translated_text:3102
#batch: 49
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.84078446500007
inference time for step 1: 11.882279812999968
inference time for step 2: 11.93553352899994
inference time for step 3: 11.954951797000149
inference time for step 4: 12.023386095999967
inference time for step 5: 12.031369515000051
inference time for step 6: 12.052006599000151
inference time for step 7: 12.107233317999999
inference time for step 8: 12.133044715000096
inference time for step 9: 12.147898576999978
inference time for step 10: 12.16442962699989
inference time for step 11: 12.186904063999918
inference time for step 12: 12.216903351999918
inference time for step 13: 12.250871404000009
inference time for step 14: 12.26152230699995
inference time for step 15: 12.27969672800009
inference time for step 16: 12.297048546000042
inference time for step 17: 12.282341680000172
inference time for step 18: 12.325129248000167
in

In [None]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

و چون قربانی های سوختنی تمام شد، پادشاه و جمیع حاضرین با وی رکوع کرده، سجده نمودند.	And of the sons of Elizaphan; Shimri, and Jeiel: and of the sons of Asaph; Zechariah, and Mattaniah:	and when the burning sacrifices were over, the king and all present knee	2.0788628272808367
«و تو بنی‌اسرائیل را امر فرما که روغن زیتون مصفی و کوبیده شده برای روشنایی نزد توبیاورند تا چراغها دائم روشن شود.در خیمه اجتماع، بیرون پرده‌ای که در برابر شهادت است، هارون و پسرانش از شام تا صبح، به حضورخداوند آن را درست کنند. و این برای بنی‌اسرائیل نسلا بعد نسل فریضه ابدی باشد.	All the pillars round about the court shall be filleted with silver; their hooks shall be of silver, and their sockets of brass.	And order the Children of Israel to bring to you stained and pulverized olive oil for	1.7227831347538063
وبنی جاد، دیبون و عطاروت و عروعیر.	And Moses said unto them, If the children of Gad and the children of Reuben will pass with you over Jordan, every man armed to battle, before the LORD, and the land shall be 

In [None]:
# Persist the Bible test-set evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_bible_test_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference, generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### all dataset

In [None]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Tue Jul  6 13:00:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    29W /  70W |  13406MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the full Bible dataset (no split_reference passed).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_all,
    bible_translated_all,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:31020, #translated_text:31020
#batch: 485
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.371538138999767
inference time for step 1: 11.502435934999994
inference time for step 2: 11.668586969999978
inference time for step 3: 11.815305209999678
inference time for step 4: 11.9069189920001
inference time for step 5: 12.060128422999696
inference time for step 6: 12.214114941999924
inference time for step 7: 12.342226693999692
inference time for step 8: 12.423770515999877
inference time for step 9: 12.45345667399988
inference time for step 10: 12.37940123999988
inference time for step 11: 12.342795864999971
inference time for step 12: 12.313985696000145
inference time for step 13: 12.351822595999693
inference time for step 14: 12.40213668999968
inference time for step 15: 12.412954604000333
inference time for step 16: 12.420817631000318
inference time for step 17: 12.426855868000075
inference time for step 18: 12.422315192000042

In [None]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	and the land was empty and deserted, and the darkness on the swamp	2.067469046683853
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light upon it!'And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light which was good, and God separated the light from the	28.93253943064403
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness 

In [None]:
# Persist the full-Bible evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_bible_all_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference, generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## translation_combined

In [7]:
!mkdir translation_combined_fa_en
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/translation_combined_fa_en/test.tsv -O translation_combined_fa_en/test.tsv
!ls translation_combined_fa_en

--2021-07-08 07:40:18--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/translation_combined_fa_en/test.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22332746 (21M) [text/plain]
Saving to: ‘translation_combined_fa_en/test.tsv’


2021-07-08 07:40:20 (175 MB/s) - ‘translation_combined_fa_en/test.tsv’ saved [22332746/22332746]

test.tsv


### All combined test set

In [10]:
# Load every row of the combined test file (no source filter passed).
combined_original_test, combined_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
)
# Sanity check: show the first source/target pair.
print(combined_original_test[0], combined_translated_test[0], sep='\n')

test part:
 #original: 47738, #translation: 47738
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.


In [None]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Wed Jul  7 12:19:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the whole combined test set, treating each reference cell as a single reference
# (split_reference is not passed here; a later cell repeats the run with '///' splitting).
evaluation_output = mt_model.mt5_evaluation(
    combined_original_test,
    combined_translated_test,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:47738, #translated_text:47738
#batch: 746
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.999718928999982
inference time for step 1: 12.219806820999906
inference time for step 2: 12.507507610000062
inference time for step 3: 12.618496002000029
inference time for step 4: 12.467314069000054
inference time for step 5: 12.32206978299996
inference time for step 6: 12.247183161999942
inference time for step 7: 12.229623206000042
inference time for step 8: 12.298171466999975
inference time for step 9: 12.370794174000025
inference time for step 10: 12.422565609999992
inference time for step 11: 12.420679536999955
inference time for step 12: 12.38677261600003
inference time for step 13: 12.366073646000018
inference time for step 14: 12.328986246
inference time for step 15: 12.338163618999943
inference time for step 16: 12.326076052000076
inference time for step 17: 12.355466498999931
inference time for step 18: 12.368564201000027
in

In [None]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	and the land was empty and deserted, and the darkness on the swamp	2.067469046683853
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light upon it!'And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light which was good, and God separated the light from the	28.93253943064403
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness 

In [None]:
# Persist the combined test-set evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_combined_all_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference, generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

In [8]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Thu Jul  8 07:40:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# Re-run the combined test-set evaluation, this time splitting each reference cell on '///'
# so rows with several reference translations are scored against all of them.
evaluation_output = mt_model.mt5_evaluation(
    combined_original_test,
    combined_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=64,
)

#original_text:47738, #translated_text:47738
#batch: 746
#maximum_translation_reference: 25
Start to evaluate test data ...
inference time for step 0: 11.127033530000006
inference time for step 1: 11.02012589200001
inference time for step 2: 11.065410416999953
inference time for step 3: 11.17801886199993
inference time for step 4: 11.344220734999908
inference time for step 5: 11.533076628000003
inference time for step 6: 11.716434865999986
inference time for step 7: 11.64711636200002
inference time for step 8: 11.684676963000015
inference time for step 9: 11.743896370000016
inference time for step 10: 11.73269771899993
inference time for step 11: 11.861390123999968
inference time for step 12: 11.938760141999978
inference time for step 13: 12.056396380000024
inference time for step 14: 12.105518913000083
inference time for step 15: 12.16715608100003
inference time for step 16: 12.183407135000039
inference time for step 17: 12.25266543500004
inference time for step 18: 12.302418783000007

In [12]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	and the land was empty and deserted, and the darkness on the swamp	2.067469046683853
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light upon it!'And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light which was good, and God separated the light from the	28.93253943064403
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness 

In [13]:
# Persist the split-reference combined evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_combined_all_split-refs_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference(s), generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Bible subset

In [None]:
# Load only the rows of the combined test file whose source is 'bible_fa_en'.
# NOTE: this rebinds bible_original_test / bible_translated_test, which the "Bible Dataset"
# section bound earlier to a different (smaller) test split.
bible_original_test, bible_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='bible_fa_en',
)
# Sanity check: show the first source/target pair.
print(bible_original_test[0], bible_translated_test[0], sep='\n')

test part:
 #original: 31020, #translation: 31020
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.


In [None]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Mon Jul  5 11:30:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    28W /  70W |  13054MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the Bible subset of the combined test file (no split_reference passed).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_test,
    bible_translated_test,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:31020, #translated_text:31020
#batch: 485
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.595222237999224
inference time for step 1: 11.679540075999284
inference time for step 2: 11.792648941000152
inference time for step 3: 11.794719575000272
inference time for step 4: 11.680625977999625
inference time for step 5: 11.626483283000198
inference time for step 6: 11.587056344999837
inference time for step 7: 11.601875301000291
inference time for step 8: 11.616125268999895
inference time for step 9: 11.65312242499931
inference time for step 10: 11.658557729999302
inference time for step 11: 11.66715585899965
inference time for step 12: 11.661329338000542
inference time for step 13: 11.648068808000062
inference time for step 14: 11.627559303999988
inference time for step 15: 11.607864135000455
inference time for step 16: 11.605099317000168
inference time for step 17: 11.629676709000705
inference time for step 18: 11.629689833000

In [None]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	In the first place, God created heavens and earth.	13.492767333412544
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	and the land was empty and deserted, and the darkness on the swamp	2.067469046683853
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light upon it!'And it was light.	25.897672591713206
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light which was good, and God separated the light from the	28.93253943064403
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness 

In [None]:
# Persist the combined/Bible-subset evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_combined_bible_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference, generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Mizan subset

In [None]:
# Load only the rows of the combined test file whose source is 'mizan_test_fa_en'.
mizan_original_test, mizan_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='mizan_test_fa_en',
)
# Sanity check: show the first source/target pair.
print(mizan_original_test[0], mizan_translated_test[0], sep='\n')

test part:
 #original: 10000, #translation: 10000
این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.
a sure sign with him of mental agitation.


In [None]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Mon Jul  5 10:59:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    28W /  70W |  13054MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the Mizan subset of the combined test file (no split_reference passed).
evaluation_output = mt_model.mt5_evaluation(
    mizan_original_test,
    mizan_translated_test,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:10000, #translated_text:10000
#batch: 157
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.57766757499985
inference time for step 1: 11.642236697000044
inference time for step 2: 11.75371066300022
inference time for step 3: 11.741548130999945
inference time for step 4: 11.667130430999805
inference time for step 5: 11.589377700000114
inference time for step 6: 11.569780247999915
inference time for step 7: 11.573171590999664
inference time for step 8: 11.585412203000033
inference time for step 9: 11.64654785599987
inference time for step 10: 11.63354227200034
inference time for step 11: 11.636476932000278
inference time for step 12: 11.632033561999833
inference time for step 13: 11.62626284299995
inference time for step 14: 11.611862935000318
inference time for step 15: 11.614613536999968
inference time for step 16: 11.618632323999918
inference time for step 17: 11.616072734999761
inference time for step 18: 11.63443811199977


In [None]:
# Preview the first 25 evaluation rows (source, reference, generated, BLEU) as tab-separated lines.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.	a sure sign with him of mental agitation.	This was a definite sign of his excitement, and he began to pace the	3.4585921141027356
عاقبت گفت: سزار! تو از این موضوع نگران شده‌ای؛	;Cesare, you are worried about this affair,; she said at last.	You are worried about this, Cesare, she said at last;	35.31760084168881
بسیار متأسفم که نسبت به آن احساس بی اعتمادی می‌کنی؛	;I am very sorry you feel so despondent over it;	I am very sorry that you feel distrust of it;	28.947421495675087
اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.	but I could decide only as seemed right to me.;	but I couldn't decide because it seemed right to me.	38.09694917244036
مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی‌دانم؛	;It is not the affair,; he answered, sullenly; ;I know nothing about it,	It's not about that, I don't know, Martini answered, fr	5.099408508435378
هنگامی که تو قبول می‌کنی در کاری شرکت جویی قطعا

In [None]:
# Persist the combined/Mizan-subset evaluation rows as tab-separated text, then push the file to Google Drive.
output_file_name = f"translation_fa-en_combined_mizan_{model_name.replace('/','-')}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source text, reference, generated translation, BLEU score.
  for text, true_translation, generated_translation, bleu_score in evaluation_output:
    output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Authenticate, then hand the application-default credentials to the Drive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# The Drive file gets the same title as the local file name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### QQP subset

In [None]:
# Load only the rows of the combined test file whose source is 'qqp_test_fa_en'.
qqp_original_test, qqp_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='qqp_test_fa_en',
)
# Sanity check: show the first source/target pair.
print(qqp_original_test[0], qqp_translated_test[0], sep='\n')

test part:
 #original: 489, #translation: 489
آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Does the spirit world exist? If yes, who is the initiator and controller of this spirit realm?


In [None]:
# Record the GPU (nvidia-smi) and CPU (lscpu) state of the runtime before the next evaluation run.
!nvidia-smi
!lscpu

Mon Jul  5 10:55:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    29W /  70W |  13406MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate on the QQP subset of the combined test file (no split_reference passed).
evaluation_output = mt_model.mt5_evaluation(
    qqp_original_test,
    qqp_translated_test,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:489, #translated_text:489
#batch: 8
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.570670686000085
inference time for step 1: 11.660449064999739
inference time for step 2: 11.756614751999678
inference time for step 3: 11.71026837599993
inference time for step 4: 11.645137368000178
inference time for step 5: 11.585708417000205
inference time for step 6: 11.548544283999945
inference time for step 7: 7.502226803000212
total inference time: 88.97961975099997
total inference time / #samples: 0.18196241257873205
BLEU Score: 30.413704153909947


In [None]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, reference translation, generated translation, BLEU score.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  preview_line = '{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score)
  print(preview_line)

آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟	Does the spirit world exist? If yes, who is the initiator and controller of this spirit realm?	Is there a world of souls? And if so, who is the inventor and	14.179666986700974
چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟	How can I stop watching porn videos?	How can I stop watching porn?	72.89545183625967
آیا قرار است دونالد ترامپ رئیس جمهور بعدی ایالات متحده باشد؟	Is Donald Trump going to be the next US President?	Is Donald Trump to be the next president of the United States?	28.917849332325716
چگونه می توانم سوالی را در این باره بپرسم؟	How do I ask a question on this?	How can I ask a question about it?	33.03164318013809
مضرات لیست پیوندی چیست؟	What are the disadvantages of linked lists?	What are the threats of a membership list?	20.164945583740657
اگر از تولید مثل افراد با ضریب هوشی ضعیف جلوگیری کنیم، آیا در نهایت تکامل انسان ها باهوش تر می شوند؟	If we prevent people with low IQ from reproducing, would 

In [None]:
# Export every evaluated QQP sample to a tab-separated text file. The file name
# embeds the model name with '/' replaced by '-' so it stays a valid file name.
output_file_name = "translation_fa-en_combined_qqp_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source, reference, generated translation, BLEU score.
  output_file.writelines(
      '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
      for text, true_translation, generated_translation, bleu_score in evaluation_output
  )

# Authenticate, then build a Drive client from the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file titled like the local file and upload the local contents.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Quran subset

In [None]:
# Load the test split of the combined dataset restricted to source='quran_fa_en',
# then show the first source sentence followed by its reference translation(s).
quran_original_test, quran_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='quran_fa_en',
)
for test_sentences in (quran_original_test, quran_translated_test):
  print(test_sentences[0])

test part:
 #original: 6229, #translation: 6229
به نام خداوند رحمتگر مهربان
In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.


In [None]:
# Record the GPU status (nvidia-smi) and CPU details (lscpu) of this runtime
# next to the Quran evaluation timings.
!nvidia-smi
!lscpu

Mon Jul  5 10:16:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Evaluate the Quran subset without passing split_reference.
# NOTE(review): the reference strings hold several translations joined by '///';
# presumably each joined string is scored as one reference here — confirm in mt5_evaluation.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    batch_size=64,
)

#original_text:6229, #translated_text:6229
#batch: 98
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 11.217349980999984
inference time for step 1: 11.068350295999949
inference time for step 2: 11.11855355199998
inference time for step 3: 11.28292511799998
inference time for step 4: 11.413917484999956
inference time for step 5: 11.462326780000012
inference time for step 6: 11.55546157699996
inference time for step 7: 11.637482667999961
inference time for step 8: 11.727926102999959
inference time for step 9: 11.75071390100004
inference time for step 10: 11.721739075000073
inference time for step 11: 11.641416146999973
inference time for step 12: 11.61774131900006
inference time for step 13: 11.612824163000028
inference time for step 14: 11.633572975999982
inference time for step 15: 11.659815800999922
inference time for step 16: 11.666822068999977
inference time for step 17: 11.703171312000109
inference time for step 18: 11.680530526999974
in

In [None]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, reference translation(s), generated translation, BLEU score.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  preview_line = '{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score)
  print(preview_line)

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	In the name of the good God	3.913243333339487e-07
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise be t

In [None]:
# Export every evaluated Quran sample to a tab-separated text file. The file name
# embeds the model name with '/' replaced by '-' so it stays a valid file name.
output_file_name = "translation_fa-en_combined_quran_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source, reference(s), generated translation, BLEU score.
  output_file.writelines(
      '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
      for text, true_translation, generated_translation, bleu_score in evaluation_output
  )

# Authenticate, then build a Drive client from the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file titled like the local file and upload the local contents.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

In [None]:
# Re-run the Quran evaluation, this time passing split_reference='///'.
# NOTE(review): presumably this splits each reference string on '///' so the
# individual translations are scored as separate references — confirm in mt5_evaluation.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=64,
)

#original_text:6229, #translated_text:6229
#batch: 98
#maximum_translation_reference: 25
Start to evaluate test data ...
inference time for step 0: 11.575048884000125
inference time for step 1: 11.709057020000046
inference time for step 2: 11.812044664000041
inference time for step 3: 11.80501277899998
inference time for step 4: 11.72563246499999
inference time for step 5: 11.632038786000066
inference time for step 6: 11.593827462000036
inference time for step 7: 11.638240441000107
inference time for step 8: 11.65340986000001
inference time for step 9: 11.680953434000003
inference time for step 10: 11.715457715999946
inference time for step 11: 11.715445311999929
inference time for step 12: 11.684907289999956
inference time for step 13: 11.673085465999975
inference time for step 14: 11.660965529000123
inference time for step 15: 11.644028690000141
inference time for step 16: 11.65454769300004
inference time for step 17: 11.682826076000083
inference time for step 18: 11.677530341000192


In [None]:
# Preview the first 25 samples of the split-reference run, one tab-separated row each:
# source text, reference translation(s), generated translation, BLEU score.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
  preview_line = '{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score)
  print(preview_line)

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	In the name of the good God	25.694343649393552
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise be to A

In [None]:
# Export the split-reference Quran results to their own tab-separated text file.
# The file name embeds the model name with '/' replaced by '-' so it stays a valid file name.
output_file_name = "translation_fa-en_combined_quran_split-refs_{}_outputs.txt".format(model_name.replace('/','-'))
with open(output_file_name, "w", encoding='utf8') as output_file:
  # One line per sample: source, reference(s), generated translation, BLEU score.
  output_file.writelines(
      '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
      for text, true_translation, generated_translation, bleu_score in evaluation_output
  )

# Authenticate, then build a Drive client from the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create a Drive file titled like the local file and upload the local contents.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## QQP Dataset

## TEP Dataset


## OPUS Dataset

## Global Voices Dataset