# Machine Translation

In [1]:
# Show the GPU status (nvidia-smi) and the CPU details (lscpu) of the current runtime.
!nvidia-smi
!lscpu

Sun Jul  4 08:21:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Install the third-party packages used by this notebook, each pinned to an exact version.
!pip install hazm==0.7.0
!pip install seqeval==1.2.2
!pip install sentencepiece==0.1.96
!pip install transformers==4.7.0
!pip install clean-text[gpl]==0.4.0
!pip install sacrebleu==1.5.1

Collecting hazm==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K     |█                               | 10kB 22.6MB/s eta 0:00:01[K     |██                              | 20kB 16.1MB/s eta 0:00:01[K     |███                             | 30kB 13.8MB/s eta 0:00:01[K     |████▏                           | 40kB 12.7MB/s eta 0:00:01[K     |█████▏                          | 51kB 6.9MB/s eta 0:00:01[K     |██████▏                         | 61kB 6.7MB/s eta 0:00:01[K     |███████▎                        | 71kB 7.6MB/s eta 0:00:01[K     |████████▎                       | 81kB 8.4MB/s eta 0:00:01[K     |█████████▎                      | 92kB 8.2MB/s eta 0:00:01[K     |██████████▍                     | 102kB 6.7MB/s eta 0:00:01[K     |███████████▍                    | 112kB 6.7MB/s eta 0:00:01[K     |████████████▍                   | 122kB 6.7MB/s et

In [3]:
# Install PyDrive (NOTE(review): the version is not pinned, unlike the packages above).
!pip install PyDrive
import os
import IPython.display as ipd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then build a Google Drive client from the
# application-default credentials; `drive` is used later to upload result files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [4]:
# Import required packages
import os
import gc
import re
import hazm
import time
import json
import sacrebleu
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import transformers
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer

from cleantext import clean

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Report the versions of the core libraries this notebook runs with.
print()
for _label, _module in (('numpy', np), ('pandas', pd), ('transformers', transformers), ('torch', torch)):
    print(_label, _module.__version__)
print()

# Choose the compute device: fall back to the CPU unless CUDA is usable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))



numpy 1.19.5
pandas 1.1.5
transformers 4.7.0
torch 1.9.0+cu102

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
class MachineTranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing source sentences with their reference translations.

    Each sample is tokenized lazily in ``__getitem__``; only the source sentence
    is tokenized, the reference translation is passed through as a raw string.
    """

    def __init__(self, original_text, translated_text, tokenizer, max_length):
        # Parallel sequences: index i of both refers to the same sample.
        self.original_text, self.translated_text = original_text, translated_text
        # Callable tokenizer and the fixed length every source is padded/truncated to.
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The number of samples is driven by the source side.
        return len(self.original_text)

    def __getitem__(self, item):
        """Return the raw texts plus the flattened token ids and attention mask of sample ``item``."""
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        source_sentence = self.original_text[item]
        tokenized = self.tokenizer(source_sentence, **tokenizer_options)
        # The tokenizer output carries a leading batch axis; flatten() removes it
        # so that a DataLoader can stack samples into a batch.
        return {
            'original': source_sentence,
            'translated': self.translated_text[item],
            'input_ids': tokenized.input_ids.flatten(),
            'attention_mask': tokenized.attention_mask.flatten(),
        }


class MachineTranslation:
    """Wrap a pretrained translation model with dataset loading, inference and BLEU evaluation.

    The constructor loads tokenizer, model and config via ``from_pretrained``.
    The ``mt5_*`` methods tokenize text, call ``model.generate`` and decode the
    generated ids back to strings.
    """

    def __init__(self, model_name, model_type):
        """Load tokenizer, model and config for ``model_name``.

        Args:
            model_name: checkpoint identifier passed to ``from_pretrained``.
            model_type: ``"mt5"`` (case-insensitive) selects the MT5 classes;
                any other value falls back to the ``Auto*`` classes.
        """
        self.normalizer = hazm.Normalizer()
        self.model_name = model_name
        if model_type.lower() == "mt5":
            self.tokenizer = MT5Tokenizer.from_pretrained(model_name)
            self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
            self.config = MT5Config.from_pretrained(self.model_name)
        else:
            # NOTE(review): this branch loads a sequence-classification model, which the
            # mt5_* generation methods below do not look designed for - confirm intent.
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.id2label = self.config.id2label
            self.label2id = self.config.label2id

    def load_dataset_test_file(self, dataset_name, dataset_file, **kwargs):
        """Read a tab-separated test file and return ``(original, translation)`` lists.

        Args:
            dataset_name: ``"mizan"`` (two columns: original, translation) or
                ``"combined"`` (three columns: original, translation, source);
                matched case-insensitively.
            dataset_file: path of the TSV file (no header row).
            **kwargs: for ``"combined"``, an optional ``source`` value keeps only
                the rows whose ``source`` column equals it.

        Returns:
            A tuple of two lists, or ``None`` when the dataset name is not
            supported or the file does not exist (a message is printed).
        """
        name = dataset_name.lower()
        if name not in ("mizan", "combined"):
            # Previously this case returned None silently, which surfaced later as
            # an opaque "cannot unpack NoneType" error in the caller.
            print(f'{dataset_name} is not a supported test dataset!')
            return None
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return None

        if name == "mizan":
            data = pd.read_csv(dataset_file, delimiter="\t", names=['original', 'translation'], header=None)
        else:
            data = pd.read_csv(dataset_file, delimiter="\t", names=['original', 'translation', 'source'],
                               header=None)
            if 'source' in kwargs:
                data = data[data['source'] == kwargs['source']]

        original = data['original'].values.tolist()
        translation = data['translation'].values.tolist()
        print(f'test part:\n #original: {len(original)}, #translation: {len(translation)}')
        return original, translation

    def load_dataset_file(self, dataset_name, dataset_file, **kwargs):
        """Read a full tab-separated dataset and derive a 10% test split from it.

        Args:
            dataset_name: ``"quran"`` or ``"bible"`` (case-insensitive).
            dataset_file: path of a UTF-8 file with ``original<TAB>translation`` lines.
            **kwargs: accepted for signature compatibility; not used.

        Returns:
            ``(original, translation, original_test, translation_test)``, or ``None``
            when the dataset name is not supported or the file does not exist.
        """
        if dataset_name.lower() not in ("quran", "bible"):
            print(f'{dataset_name} is not a supported dataset!')
            return None
        if not os.path.exists(dataset_file):
            print(f'{dataset_file} not exists!')
            return None

        original, translation = [], []
        skipped = 0
        with open(dataset_file, encoding="utf8") as infile:
            for line in infile:
                parts = line.strip().split('\t')
                # A blank line or a line without a tab has no translation field;
                # indexing parts[1] on it used to raise IndexError.
                if len(parts) < 2:
                    skipped += 1
                    continue
                original.append(parts[0])
                translation.append(parts[1])
        if skipped:
            print(f'skipped {skipped} line(s) without a tab-separated translation')
        print(f'all data:\n #original: {len(original)}, #translation: {len(translation)}')

        # Fixed random_state keeps the test split reproducible across runs.
        _, original_test, _, translation_test = train_test_split(original, translation, test_size=0.1,
                                                                 random_state=1)
        print(f'test part:\n #original: {len(original_test)}, #translation: {len(translation_test)}')
        return original, translation, original_test, translation_test

    def mt5_machine_translation_inference(self, input_text, device, **generate_kwargs):
        """Translate a batch of sentences and return the decoded predictions.

        Args:
            input_text: sentence or list of sentences handed to the tokenizer.
            device: ``torch.device`` on which the model and inputs are placed.
            **generate_kwargs: optional extra arguments forwarded to
                ``model.generate`` (e.g. ``max_length``, ``num_beams``). With none
                given the model's default generation settings apply; the sample
                outputs in this notebook end mid-sentence under those defaults.

        Returns:
            List of decoded strings, or ``None`` if model/tokenizer are missing.
        """
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return

        tokenized_batch = self.tokenizer(
            input_text,
            padding=True,
            return_tensors="pt"
        )

        gc.collect()
        torch.cuda.empty_cache()
        # Put the model on the same device as the inputs. `.cuda()` always picked the
        # default GPU and never moved the model back for a CPU device.
        self.model.to(device)

        input_ids = tokenized_batch.input_ids.to(device)
        attention_mask = tokenized_batch.attention_mask.to(device)
        outputs = self.model.generate(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      **generate_kwargs)
        predictions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return predictions

    def mt5_evaluation(self, input_text, translated_text, device, max_length, split_reference=None, batch_size=4,
                       **generate_kwargs):
        """Translate ``input_text`` in batches and score it against ``translated_text`` with BLEU.

        Prints per-batch and total generation time and the corpus-level BLEU score.

        Args:
            input_text: list of source sentences.
            translated_text: list of reference strings, parallel to ``input_text``.
            device: ``torch.device`` on which the model and inputs are placed.
            max_length: length the tokenized sources are padded/truncated to.
            split_reference: optional separator; when given, each reference string
                is split on it into several alternative references.
            batch_size: number of samples per generation batch.
            **generate_kwargs: optional extra arguments forwarded to ``model.generate``.

        Returns:
            List of ``(original, reference_string, prediction, sample_bleu)`` tuples;
            ``[]`` for empty input; ``None`` if model/tokenizer are missing or the
            two input lists differ in length.
        """
        if not self.model or not self.tokenizer:
            print('Something wrong has been happened!')
            return
        if len(input_text) != len(translated_text):
            print('length of inputs and its translations is not equal!!')
            return
        if len(input_text) == 0:
            # Nothing to score; also avoids dividing by zero in the timing report.
            print('no samples to evaluate!')
            return []

        dataset = MachineTranslationDataset(original_text=input_text, translated_text=translated_text,
                                            tokenizer=self.tokenizer, max_length=max_length)
        data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
        print(f'#original_text:{len(input_text)}, #translated_text:{len(translated_text)}')
        print("#batch:", len(data_loader))

        # The corpus-level score needs one reference stream per reference slot, so
        # find the largest number of references any single sample has.
        if split_reference is None:
            max_num_ref = 1
        else:
            max_num_ref = 0
            for ref in translated_text:
                max_num_ref = max(max_num_ref, len(ref.split(split_reference)))
        print("#maximum_translation_reference:", max_num_ref)

        gc.collect()
        torch.cuda.empty_cache()
        # Put the model on the same device as the inputs (see the inference method).
        self.model.to(device)

        total_time = 0
        output_predictions = []
        golden_translations, predicted_translations = [[] for _ in range(max_num_ref)], []
        print("Start to evaluate test data ...")
        for step, batch in enumerate(data_loader):
            b_input_ids = batch['input_ids'].to(device)
            b_attention_mask = batch['attention_mask'].to(device)

            # Generate token ids for the batch; only the generate() call is timed.
            with torch.no_grad():
                start = time.monotonic()
                b_outputs = self.model.generate(input_ids=b_input_ids, attention_mask=b_attention_mask,
                                                **generate_kwargs)
                end = time.monotonic()
                total_time += end - start
                print(f'inference time for step {step}: {end - start}')

            b_predictions = self.tokenizer.batch_decode(b_outputs, skip_special_tokens=True)

            for i in range(len(b_input_ids)):
                if split_reference is None:
                    sample_golden_translation = [batch['translated'][i]]
                else:
                    sample_golden_translation = batch['translated'][i].split(split_reference)
                sample_generated_translation = b_predictions[i]
                # Per-sample score: every reference becomes its own one-sentence stream.
                bleu_score = sacrebleu.corpus_bleu(sys_stream=[sample_generated_translation],
                                                   ref_streams=[[g] for g in sample_golden_translation]).score
                output_predictions.append((batch['original'][i], batch['translated'][i], b_predictions[i], bleu_score))

                # Samples with fewer references than max_num_ref are padded with ''
                # so that all reference streams stay aligned with the predictions.
                num_refs = len(sample_golden_translation)
                for j in range(max_num_ref):
                    golden_translations[j].append(sample_golden_translation[j] if j < num_refs else '')
                predicted_translations.append(sample_generated_translation)

        print("total inference time:", total_time)
        print("total inference time / #samples:", total_time / len(input_text))

        # evaluate
        print("BLEU Score: {}".format(sacrebleu.corpus_bleu(
            sys_stream=predicted_translations, ref_streams=golden_translations).score))
        return output_predictions


In [6]:
# Checkpoint to evaluate; the same name is reused later to build the output file names.
model_name = 'persiannlp/mt5-small-parsinlu-opus-translation_fa_en'
mt_model = MachineTranslation(model_type="mt5", model_name=model_name)
print(mt_model.config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4309802.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=383.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=609.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1200773595.0, style=ProgressStyle(descr…


MT5Config {
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.7.0",
  "use_cache": true,
  "vocab_size": 250112
}



## Sample Inference

In [7]:
# A handful of Persian sentences to sanity-check the model before the full evaluation.
input_list = [
    "ستایش خدای را که پروردگار جهانیان است.",
    "در هاید پارک کرنر بر گلدانی ایستاده موعظه می‌کند؛",
    "وی از تمامی بلاگرها، سازمان‌ها و افرادی که از وی پشتیبانی کرده‌اند، تشکر کرد.",
    "مشابه سال ۲۰۰۱، تولید آمونیاک بی آب در ایالات متحده در سال ۲۰۰۰ تقریباً ۱۷،۴۰۰،۰۰۰ تن (معادل بدون آب) با مصرف ظاهری ۲۲،۰۰۰،۰۰۰ تن و حدود ۴۶۰۰۰۰۰ با واردات خالص مواجه شد. ",
    "می خواهم دکترای علوم کامپیوتر راجع به شبکه های اجتماعی را دنبال کنم، چالش حل نشده در شبکه های اجتماعی چیست؟",
]
mt_model.mt5_machine_translation_inference(input_text=input_list, device=device)

['the admiration of God, which is the Lord of the world.',
 'At the Ford Park, the Crawford Park stands on a vase;',
 'He thanked all the bloggers, the organizations, and the people who supported him',
 'similar to the year 2001, the economy of ammonia in the United States in the',
 'I want to follow the computer experts on social networks, what is the unsolved problem in']

## Mizan Dataset


In [42]:
# Download the Mizan fa->en test file into the working directory, then list the directory.
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/mizan/mizan_test_fa_en.tsv
!ls

--2021-07-04 09:27:16--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/mizan/mizan_test_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2296459 (2.2M) [text/plain]
Saving to: ‘mizan_test_fa_en.tsv’


2021-07-04 09:27:16 (42.0 MB/s) - ‘mizan_test_fa_en.tsv’ saved [2296459/2296459]

adc.json
mizan_test_fa_en.tsv
sample_data
translation_combined_fa_en
translation_fa-en_combined_all_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_all_split-refs_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_bible_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_mizan_persiannlp-mt5-small-pars

In [43]:
# Load the Mizan test pairs, then preview the first five of each side and their sizes.
mizan_original_text, mizan_translated_text = mt_model.load_dataset_test_file(
    dataset_name="mizan",
    dataset_file="mizan_test_fa_en.tsv",
)
print(mizan_original_text[0:5])
print(mizan_translated_text[0:5])
print(len(mizan_original_text))
print(len(mizan_translated_text))

test part:
 #original: 10000, #translation: 10000
['این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.', 'عاقبت گفت: سزار! تو از این موضوع نگران شده\u200cای؛', 'بسیار متأسفم که نسبت به آن احساس بی اعتمادی می\u200cکنی؛', 'اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.', 'مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی\u200cدانم؛']
['a sure sign with him of mental agitation.', ';Cesare, you are worried about this affair,; she said at last.', ';I am very sorry you feel so despondent over it;', 'but I could decide only as seemed right to me.;', ';It is not the affair,; he answered, sullenly; ;I know nothing about it,']
10000
10000


In [44]:
# Translate the first five Mizan sentences as a quick qualitative check.
mt_model.mt5_machine_translation_inference(
    input_text=mizan_original_text[:5],
    device=device,
)

['This was a decisive sign of his excitement, and he began to pace the',
 'You are worried about this, Cesar, he said at last;',
 'I am very sorry you feel distrustful of that;',
 'But I only could not decide why it seemed to me right.',
 "It's not for that, I don't know about it, said Martini,"]

In [45]:
# Show the GPU status (nvidia-smi) and the CPU details (lscpu) before the Mizan evaluation.
!nvidia-smi
!lscpu

Sun Jul  4 09:27:24 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    29W /  70W |   2276MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [46]:
# Score the model on the whole Mizan test set (one reference per sentence).
evaluation_output = mt_model.mt5_evaluation(
    mizan_original_text,
    mizan_translated_text,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:10000, #translated_text:10000
#batch: 79
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.042021274000035
inference time for step 1: 1.9639375239999026
inference time for step 2: 1.9641387159999795
inference time for step 3: 1.9682919950000723
inference time for step 4: 1.9718505759997242
inference time for step 5: 1.9799732320002477
inference time for step 6: 1.9782218969999121
inference time for step 7: 1.9919918720001988
inference time for step 8: 1.9903656530000262
inference time for step 9: 1.9900314910000816
inference time for step 10: 1.9919953980001992
inference time for step 11: 1.9945894139996199
inference time for step 12: 2.0020797389997824
inference time for step 13: 2.0066130330001215
inference time for step 14: 2.0238946260001285
inference time for step 15: 2.01059998799974
inference time for step 16: 2.027595424000083
inference time for step 17: 2.0294923060000656
inference time for step 18: 2.02874377299986
i

In [47]:
# Show the first 25 results as tab-separated rows: source, reference, prediction, BLEU.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
    print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.	a sure sign with him of mental agitation.	This was a decisive sign of his excitement, and he began to pace the	3.4585921141027356
عاقبت گفت: سزار! تو از این موضوع نگران شده‌ای؛	;Cesare, you are worried about this affair,; she said at last.	You are worried about this, Cesar, he said at last;	24.275696005907676
بسیار متأسفم که نسبت به آن احساس بی اعتمادی می‌کنی؛	;I am very sorry you feel so despondent over it;	I am very sorry you feel distrustful of that;	43.98917247584221
اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.	but I could decide only as seemed right to me.;	But I only could not decide why it seemed to me right.	8.998265635882948
مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی‌دانم؛	;It is not the affair,; he answered, sullenly; ;I know nothing about it,	It's not for that, I don't know about it, said Martini,	8.800046366764844
هنگامی که تو قبول می‌کنی در کاری شرکت جویی قطعا

In [48]:
# Write every evaluated sample as one tab-separated line: source, reference, prediction, BLEU.
output_file_name = f"translation_fa-en_mizan_test_{'-'.join(model_name.split('/'))}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for text, true_translation, generated_translation, bleu_score in evaluation_output:
        output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Re-authenticate and rebuild the Drive client before uploading the result file.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Upload the file to Google Drive under the same name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## Quran Dataset

In [49]:
# Download the Quran fa->en file into the working directory, then list the directory.
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/quran/quran_fa_en.tsv
!ls

--2021-07-04 09:30:51--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/quran/quran_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9309105 (8.9M) [text/plain]
Saving to: ‘quran_fa_en.tsv’


2021-07-04 09:30:52 (105 MB/s) - ‘quran_fa_en.tsv’ saved [9309105/9309105]

adc.json
mizan_test_fa_en.tsv
quran_fa_en.tsv
sample_data
translation_combined_fa_en
translation_fa-en_combined_all_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_all_split-refs_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_bible_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_mizan_persiannlp-mt5-small-pars

In [50]:
# Load the full Quran data plus its held-out test split, then preview the first pair of each.
(quran_original_all,
 quran_translated_all,
 quran_original_test,
 quran_translated_test) = mt_model.load_dataset_file(
    dataset_name="quran",
    dataset_file="quran_fa_en.tsv",
)
print(quran_original_all[0])
print(quran_translated_all[0])
print(quran_original_test[0])
print(quran_translated_test[0])

all data:
 #original: 6236, #translation: 6236
test part:
 #original: 624, #translation: 624
به نام خداوند رحمتگر مهربان
In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.
حقا که انسان سرکشى مى‌کند،
And yet, but yet man is rebellious,///Yes indeed, man is surely rebellious.///No indeed; surely Man waxes insolent,///By no means: Verily man exorbitateth.///Nay! Verily, man does transgress all bounds (in disbelief and evil deed, etc.).///In fact, man oversteps all bounds.///Nay, surely man transgresses;///Nay! Verily, ma

In [51]:
# Translate the first five Quran test sentences as a quick qualitative check.
mt_model.mt5_machine_translation_inference(
    input_text=quran_original_test[:5],
    device=device,
)

['Really, a man strikes',
 'So, what is Kiche, if you lie? they said.',
 'So go to him and say, We are sent by the Lord, then send the children of',
 'In fact, those who are already sheathed and slain, though',
 'My bones are broken, said he, and my head is gone from the white']

### Test set

In [52]:
# Show the GPU status (nvidia-smi) and the CPU details (lscpu) before the Quran test-set evaluation.
!nvidia-smi
!lscpu

Sun Jul  4 09:31:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    30W /  70W |   2286MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [53]:
# Score the Quran test split; each reference string holds several translations joined by '///'.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:624, #translated_text:624
#batch: 5
#maximum_translation_reference: 9
Start to evaluate test data ...
inference time for step 0: 2.1067595300000903
inference time for step 1: 2.039256612000827
inference time for step 2: 2.05511945499984
inference time for step 3: 2.047922384999765
inference time for step 4: 1.8273850370005675
total inference time: 10.07644301900109
total inference time / #samples: 0.016148145863783797
BLEU Score: 7.620352655531135


In [54]:
# Show the first 25 results as tab-separated rows: source, references, prediction, BLEU.
for text, true_translation, generated_translation, bleu_score in evaluation_output[:25]:
    print(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}')

حقا که انسان سرکشى مى‌کند،	And yet, but yet man is rebellious,///Yes indeed, man is surely rebellious.///No indeed; surely Man waxes insolent,///By no means: Verily man exorbitateth.///Nay! Verily, man does transgress all bounds (in disbelief and evil deed, etc.).///In fact, man oversteps all bounds.///Nay, surely man transgresses;///Nay! Verily, man does transgress.///Nay, but man doth transgress all bounds,	Really, a man strikes	10.400597689005304
گفتند: «پس، اگر دروغ بگویید، کیفرش چیست؟»	"What should be the punishment," they were asked, "in case you are liars?"///They said, “And what shall be the punishment for it, if you are liars?”///They said, 'And what shall be its recompense if you are liars?'///They said: what shall be the meed of him, if ye are found liars!///They [Yusuf's (Joseph) men] said: "What then shall be the penalty of him, if you are (proved to be) liars."///They said, “What shall be his punishment, if you are lying?”///The officials said: "If you are lying, what wil

In [55]:
# Write every evaluated sample as one tab-separated line: source, references, prediction, BLEU.
output_file_name = f"translation_fa-en_quran_test_{'-'.join(model_name.split('/'))}_outputs.txt"
with open(output_file_name, "w", encoding='utf8') as output_file:
    for text, true_translation, generated_translation, bleu_score in evaluation_output:
        output_file.write(f'{text}\t{true_translation}\t{generated_translation}\t{bleu_score}\n')

# Re-authenticate and rebuild the Drive client before uploading the result file.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Upload the file to Google Drive under the same name.
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Full dataset

In [56]:
# Show the GPU status (nvidia-smi) and the CPU details (lscpu) before the full-dataset evaluation.
!nvidia-smi
!lscpu

Sun Jul  4 09:31:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    30W /  70W |   5486MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [57]:
# Score the complete Quran dataset; each reference string holds several translations joined by '///'.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_all,
    quran_translated_all,
    device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:6236, #translated_text:6236
#batch: 49
#maximum_translation_reference: 9
Start to evaluate test data ...
inference time for step 0: 2.0835500699995464
inference time for step 1: 2.059299357999407
inference time for step 2: 2.059763465999822
inference time for step 3: 2.05785883200042
inference time for step 4: 2.0676188170000387
inference time for step 5: 2.0622238920004747
inference time for step 6: 2.070185491999837
inference time for step 7: 2.0708840279994547
inference time for step 8: 2.0757185190004748
inference time for step 9: 2.0836335859994506
inference time for step 10: 2.0856044470001507
inference time for step 11: 2.088109024000005
inference time for step 12: 2.08396753299985
inference time for step 13: 2.1027404659998865
inference time for step 14: 2.08158288300001
inference time for step 15: 2.0995972560003793
inference time for step 16: 2.104206365000209
inference time for step 17: 2.1098067850007283
inference time for step 18: 2.0969095819991708
inferenc

In [58]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	called God, a kind pilgrim	7.0550047212602784
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise be to Al

In [59]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_quran_all_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## Bible Dataset

In [60]:
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/bible/bible_fa_en.tsv
!ls

--2021-07-04 09:33:35--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/bible/bible_fa_en.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10023337 (9.6M) [text/plain]
Saving to: ‘bible_fa_en.tsv’


2021-07-04 09:33:35 (75.6 MB/s) - ‘bible_fa_en.tsv’ saved [10023337/10023337]

adc.json
bible_fa_en.tsv
mizan_test_fa_en.tsv
quran_fa_en.tsv
sample_data
translation_combined_fa_en
translation_fa-en_combined_all_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_all_split-refs_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_bible_persiannlp-mt5-small-parsinlu-opus-translation_fa_en_outputs.txt
translation_fa-en_combined_mizan_persi

In [61]:
# Load the Bible fa-en file: the loader returns the full source/translation lists
# followed by the source/translation lists of its test part.
(
    bible_original_all,
    bible_translated_all,
    bible_original_test,
    bible_translated_test,
) = mt_model.load_dataset_file(dataset_name="bible", dataset_file="bible_fa_en.tsv")

# Show the first entry of each list as a quick sanity check.
# NOTE(review): in the saved output the first test pair does not look like the same
# verse in both languages — confirm that load_dataset_file keeps the test split aligned.
for samples in (bible_original_all, bible_translated_all, bible_original_test, bible_translated_test):
    print(samples[0])

all data:
 #original: 31020, #translation: 31020
test part:
 #original: 3102, #translation: 3102
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.
و چون قربانی های سوختنی تمام شد، پادشاه و جمیع حاضرین با وی رکوع کرده، سجده نمودند.
And of the sons of Elizaphan; Shimri, and Jeiel: and of the sons of Asaph; Zechariah, and Mattaniah:


In [62]:
# Translate the first five Bible test sentences as a smoke test; the returned list
# is the cell's displayed output.
mt_model.mt5_machine_translation_inference(
    bible_original_test[:5],
    device,
)

['and when the burning victims were finished, the king and the crew',
 'And you command the devil to bring the artificial and broken olive oil to light to illumina',
 'And Benjamin, Dreben, and the gorgeous, and the wonderful.',
 'So, when you are a pioneer in the preparations of the days,',
 'and took some of it, tie them in his lap.']

### test set

In [63]:
# Print the runtime hardware details: GPU status via nvidia-smi, CPU details via lscpu.
# Presumably kept as a record of the machine state before the timed evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 09:33:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    35W /  70W |   2290MiB / 15109MiB |     32%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [64]:
# Evaluate the Bible test split (no split_reference is passed for this dataset).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_test,
    bible_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:3102, #translated_text:3102
#batch: 25
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.087289905999569
inference time for step 1: 2.0975067509998553
inference time for step 2: 2.0969609609992403
inference time for step 3: 2.110342902999946
inference time for step 4: 2.103591246000178
inference time for step 5: 2.0992588880008043
inference time for step 6: 2.1061763110001266
inference time for step 7: 2.1117317669995828
inference time for step 8: 2.1142815860002884
inference time for step 9: 2.1069754399995873
inference time for step 10: 2.1102114740006073
inference time for step 11: 2.127489513999535
inference time for step 12: 2.1254415329995027
inference time for step 13: 2.1172533270000713
inference time for step 14: 2.1234785449996707
inference time for step 15: 2.1301760729993475
inference time for step 16: 2.1314575959995636
inference time for step 17: 2.1262985439998374
inference time for step 18: 2.131460521000008
in

In [65]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

و چون قربانی های سوختنی تمام شد، پادشاه و جمیع حاضرین با وی رکوع کرده، سجده نمودند.	And of the sons of Elizaphan; Shimri, and Jeiel: and of the sons of Asaph; Zechariah, and Mattaniah:	and when the burning victims were finished, the king and the crew	1.9819296757939446
«و تو بنی‌اسرائیل را امر فرما که روغن زیتون مصفی و کوبیده شده برای روشنایی نزد توبیاورند تا چراغها دائم روشن شود.در خیمه اجتماع، بیرون پرده‌ای که در برابر شهادت است، هارون و پسرانش از شام تا صبح، به حضورخداوند آن را درست کنند. و این برای بنی‌اسرائیل نسلا بعد نسل فریضه ابدی باشد.	All the pillars round about the court shall be filleted with silver; their hooks shall be of silver, and their sockets of brass.	And you command the devil to bring the artificial and broken olive oil to light to illumina	1.7726282342575568
وبنی جاد، دیبون و عطاروت و عروعیر.	And Moses said unto them, If the children of Gad and the children of Reuben will pass with you over Jordan, every man armed to battle, before the LORD, and the land shall be s

In [66]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_bible_test_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Full dataset

In [67]:
# Print the runtime hardware details: GPU status via nvidia-smi, CPU details via lscpu.
# Presumably kept as a record of the machine state before the timed evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 09:34:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    32W /  70W |   5486MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [68]:
# Evaluate the full Bible set (no split_reference is passed for this dataset).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_all,
    bible_translated_all,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:31020, #translated_text:31020
#batch: 243
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.1573257339996417
inference time for step 1: 2.1466076189999512
inference time for step 2: 2.137467421999645
inference time for step 3: 2.145867489000011
inference time for step 4: 2.1448594769999545
inference time for step 5: 2.155225423000047
inference time for step 6: 2.1552952100000766
inference time for step 7: 2.1422046990001036
inference time for step 8: 2.156299399999625
inference time for step 9: 2.1581051240000306
inference time for step 10: 2.15467075900051
inference time for step 11: 2.162933910999527
inference time for step 12: 2.150268727999901
inference time for step 13: 2.1669516579995616
inference time for step 14: 2.1686706529999356
inference time for step 15: 2.1709006979999685
inference time for step 16: 2.172352339000099
inference time for step 17: 2.1648480150006435
inference time for step 18: 2.1673862070001633
inf

In [69]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	At first, God created the heavens and the earth.	33.932513407933634
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was thick and barren, and the darkness upon the abyss	2.2309139608141733
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light,' and light came.	20.544097977075833
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light that separated Nicholas and the light from the darkness.	38.00213082631731
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he called 

In [70]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_bible_all_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## translation_combined

In [8]:
!mkdir translation_combined_fa_en
!wget https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/translation_combined_fa_en/test.tsv -O translation_combined_fa_en/test.tsv
!ls translation_combined_fa_en

--2021-07-04 08:23:20--  https://media.githubusercontent.com/media/persiannlp/parsinlu/master/data/translation/translation_combined_fa_en/test.tsv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22332746 (21M) [text/plain]
Saving to: ‘translation_combined_fa_en/test.tsv’


2021-07-04 08:23:21 (49.2 MB/s) - ‘translation_combined_fa_en/test.tsv’ saved [22332746/22332746]

test.tsv


### All combined test set

In [9]:
# Load the whole combined test file (no source filter) as parallel source/translation lists.
combined_original_test, combined_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
)

# Show the first source sentence and its reference as a quick sanity check.
for samples in (combined_original_test, combined_translated_test):
    print(samples[0])

test part:
 #original: 47738, #translation: 47738
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.


In [10]:
# Print the runtime hardware details: GPU status via nvidia-smi, CPU details via lscpu.
# Presumably kept as a record of the machine state before the timed evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 08:23:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    29W /  70W |   2284MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# Evaluate the combined test set with each reference string used as-is
# (no split_reference is passed in this run).
evaluation_output = mt_model.mt5_evaluation(
    combined_original_test,
    combined_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:47738, #translated_text:47738
#batch: 373
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.091683829999994
inference time for step 1: 2.0270710430000065
inference time for step 2: 2.0240018850000183
inference time for step 3: 2.029663889999995
inference time for step 4: 2.033348383999993
inference time for step 5: 2.0334560889999977
inference time for step 6: 2.046868193999984
inference time for step 7: 2.057520645000011
inference time for step 8: 2.058251490999993
inference time for step 9: 2.0617738820000113
inference time for step 10: 2.089576026000003
inference time for step 11: 2.0839093649999825
inference time for step 12: 2.0986377820000257
inference time for step 13: 2.09828685399998
inference time for step 14: 2.1077807290000123
inference time for step 15: 2.111834270000003
inference time for step 16: 2.1148191880000127
inference time for step 17: 2.117034832999991
inference time for step 18: 2.143497317999987
infere

In [12]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	At first, God created the heavens and the earth.	33.932513407933634
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was thick and barren, and the darkness upon the abyss	2.2309139608141733
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light,' and light came.	20.544097977075833
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light that separated Nicholas and the light from the darkness.	38.00213082631731
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he called 

In [13]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_combined_all_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

In [14]:
# Re-run the combined evaluation, this time passing split_reference='///'
# (presumably so '///'-joined references are scored as separate references).
evaluation_output = mt_model.mt5_evaluation(
    combined_original_test,
    combined_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:47738, #translated_text:47738
#batch: 373
#maximum_translation_reference: 25
Start to evaluate test data ...
inference time for step 0: 2.1677942930000427
inference time for step 1: 2.1384664370000337
inference time for step 2: 2.155013159000191
inference time for step 3: 2.16478986900006
inference time for step 4: 2.1847436789998937
inference time for step 5: 2.194149445999983
inference time for step 6: 2.2075318289998904
inference time for step 7: 2.2303339270001743
inference time for step 8: 2.2319534089999706
inference time for step 9: 2.261742280000135
inference time for step 10: 2.2636621090000517
inference time for step 11: 2.2822910940001293
inference time for step 12: 2.2955995439999697
inference time for step 13: 2.2961520519997975
inference time for step 14: 2.300228839000056
inference time for step 15: 2.2804506330001004
inference time for step 16: 2.2690980200000013
inference time for step 17: 2.2494360609998694
inference time for step 18: 2.2462007949998224

In [15]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	At first, God created the heavens and the earth.	33.932513407933634
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was thick and barren, and the darkness upon the abyss	2.2309139608141733
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light,' and light came.	20.544097977075833
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light that separated Nicholas and the light from the darkness.	38.00213082631731
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he called 

In [16]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_combined_all_split-refs_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Bible subset

In [19]:
# Load only the rows of the combined test file whose source is 'bible_fa_en'.
# NOTE(review): this rebinds bible_original_test / bible_translated_test, which the
# "Bible Dataset" section also assigns from load_dataset_file (3102 rows there versus
# 31020 here in the saved outputs) — cell order decides which one later cells see.
bible_original_test, bible_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='bible_fa_en',
)

# Show the first source sentence and its reference as a quick sanity check.
for samples in (bible_original_test, bible_translated_test):
    print(samples[0])

test part:
 #original: 31020, #translation: 31020
در ابتدا، خدا آسمانها و زمین را آفرید.
In the beginning God created the heaven and the earth.


In [18]:
# Print the runtime hardware details: GPU status via nvidia-smi, CPU details via lscpu.
# Presumably kept as a record of the machine state before the timed evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 08:54:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    32W /  70W |   5614MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [20]:
# Evaluate the Bible rows of the combined test set (no split_reference is passed).
evaluation_output = mt_model.mt5_evaluation(
    bible_original_test,
    bible_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:31020, #translated_text:31020
#batch: 243
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.152768174999892
inference time for step 1: 2.1367592259998673
inference time for step 2: 2.136720312000307
inference time for step 3: 2.1574017889997776
inference time for step 4: 2.173385137999958
inference time for step 5: 2.1824309149997134
inference time for step 6: 2.203710742000112
inference time for step 7: 2.2214197010002863
inference time for step 8: 2.226055178000024
inference time for step 9: 2.2597820969999702
inference time for step 10: 2.2639104200002294
inference time for step 11: 2.283014258000094
inference time for step 12: 2.2973186330000317
inference time for step 13: 2.302683636999973
inference time for step 14: 2.317754267000055
inference time for step 15: 2.301708561000396
inference time for step 16: 2.2828785989995595
inference time for step 17: 2.27037558200027
inference time for step 18: 2.2581441450001876
infer

In [21]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

در ابتدا، خدا آسمانها و زمین را آفرید.	In the beginning God created the heaven and the earth.	At first, God created the heavens and the earth.	33.932513407933634
وزمین تهی و بایر بود و تاریکی بر روی لجه. و روح خدا سطح آبها را فرو گرفت.	And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.	The land was thick and barren, and the darkness upon the abyss	2.2309139608141733
و خدا گفت: «روشنایی بشود.» و روشنایی شد.	And God said, Let there be light: and there was light.	And God said, 'Light,' and light came.	20.544097977075833
و خدا روشنایی را دید که نیکوست و خداروشنایی را از تاریکی جدا ساخت.	And God saw the light, that it was good: and God divided the light from the darkness.	and God saw the light that separated Nicholas and the light from the darkness.	38.00213082631731
و خداروشنایی را روز نامید و تاریکی را شب نامید. وشام بود و صبح بود، روزی اول.	And God called the light Day, and the darkness he called 

In [22]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_combined_bible_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Mizan subset

In [23]:
# Load only the rows of the combined test file whose source is 'mizan_test_fa_en'.
mizan_original_test, mizan_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='mizan_test_fa_en',
)

# Show the first source sentence and its reference as a quick sanity check.
for samples in (mizan_original_test, mizan_translated_test):
    print(samples[0])

test part:
 #original: 10000, #translation: 10000
این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.
a sure sign with him of mental agitation.


In [24]:
# Print the runtime hardware details: GPU status via nvidia-smi, CPU details via lscpu.
# Presumably kept as a record of the machine state before the timed evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 09:04:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    32W /  70W |   5488MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [25]:
# Evaluate the Mizan rows of the combined test set (no split_reference is passed).
evaluation_output = mt_model.mt5_evaluation(
    mizan_original_test,
    mizan_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:10000, #translated_text:10000
#batch: 79
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.17341979899993
inference time for step 1: 2.1459954359997937
inference time for step 2: 2.157544874999985
inference time for step 3: 2.1542418089998137
inference time for step 4: 2.1593375369998284
inference time for step 5: 2.174609495000368
inference time for step 6: 2.1773286019997613
inference time for step 7: 2.1762374180002553
inference time for step 8: 2.1801534430001084
inference time for step 9: 2.1903784189998987
inference time for step 10: 2.1756005830002323
inference time for step 11: 2.1934740910000983
inference time for step 12: 2.1841002830001344
inference time for step 13: 2.185636704999979
inference time for step 14: 2.1918059750000793
inference time for step 15: 2.187395825999829
inference time for step 16: 2.1854342230003567
inference time for step 17: 2.1889372440000443
inference time for step 18: 2.1966355690001365
i

In [26]:
# Preview the first 25 evaluated samples, one tab-separated row each:
# source text, true translation, generated translation, bleu_score.
for sample in evaluation_output[:25]:
    text, true_translation, generated_translation, bleu_score = sample
    print('{}\t{}\t{}\t{}'.format(text, true_translation, generated_translation, bleu_score))

این نشان قاطعی از هیجان وی بود شروع به قدم زدن در اتاق کرد.	a sure sign with him of mental agitation.	This was a decisive sign of his excitement, and he began to pace the	3.4585921141027356
عاقبت گفت: سزار! تو از این موضوع نگران شده‌ای؛	;Cesare, you are worried about this affair,; she said at last.	You are worried about this, Cesar, he said at last;	24.275696005907676
بسیار متأسفم که نسبت به آن احساس بی اعتمادی می‌کنی؛	;I am very sorry you feel so despondent over it;	I am very sorry you feel distrustful of that;	43.98917247584221
اما من فقط بدان علت نتوانستم تصمیم بگیرم که آن کار به نظرم صحیح رسید.	but I could decide only as seemed right to me.;	But I only could not decide why it seemed to me right.	8.998265635882948
مارتینی، با اخم، پاسخ داد: به خاطر آن موضوع نیست، من در آن باره چیزی نمی‌دانم؛	;It is not the affair,; he answered, sullenly; ;I know nothing about it,	It's not for that, I don't know about it, said Martini,	8.800046366764844
هنگامی که تو قبول می‌کنی در کاری شرکت جویی قطعا

In [27]:
# Persist the evaluation rows as a tab-separated text file named after the dataset and model.
output_file_name = "translation_fa-en_combined_mizan_{}_outputs.txt".format(
    model_name.replace('/', '-')
)
with open(output_file_name, "w", encoding='utf8') as output_file:
    output_file.writelines(
        '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
        for text, true_translation, generated_translation, bleu_score in evaluation_output
    )

# Authenticate, build the Drive client, and upload the file under the same title.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### QQP subset

In [29]:
# Load only the rows of the combined test file whose source is 'qqp_test_fa_en'.
qqp_original_test, qqp_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='qqp_test_fa_en',
)

# Show the first source sentence and its reference as a quick sanity check.
for samples in (qqp_original_test, qqp_translated_test):
    print(samples[0])

test part:
 #original: 489, #translation: 489
آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Does the spirit world exist? If yes, who is the initiator and controller of this spirit realm?


In [30]:
# Print the runtime hardware details: GPU status via nvidia-smi, CPU details via lscpu.
# Presumably kept as a record of the machine state before the timed evaluation below.
!nvidia-smi
!lscpu

Sun Jul  4 09:13:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    27W /  70W |   5486MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [31]:
# Evaluate the QQP rows of the combined test set (no split_reference is passed).
evaluation_output = mt_model.mt5_evaluation(
    qqp_original_test,
    qqp_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:489, #translated_text:489
#batch: 4
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.0153533840002638
inference time for step 1: 1.9716617499998392
inference time for step 2: 1.976241638999909
inference time for step 3: 1.643590163999761
total inference time: 7.606846936999773
total inference time / #samples: 0.015555924206543503
BLEU Score: 21.809350563050895


In [32]:
# Preview the first 25 evaluated QQP samples, one tab-separated row per sample:
# source text, reference translation, generated translation, BLEU score.
for (text, true_translation,
     generated_translation, bleu_score) in evaluation_output[:25]:
  print('{}\t{}\t{}\t{}'.format(
      text, true_translation, generated_translation, bleu_score))

آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟	Does the spirit world exist? If yes, who is the initiator and controller of this spirit realm?	Is there a soul world? If yes, who is the scholar and observer of	31.706699280572998
چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟	How can I stop watching porn videos?	How can I stop watching the operas?	54.10822690539397
آیا قرار است دونالد ترامپ رئیس جمهور بعدی ایالات متحده باشد؟	Is Donald Trump going to be the next US President?	Is Donald Trump going to be the next president of the United States?	52.960749334062214
چگونه می توانم سوالی را در این باره بپرسم؟	How do I ask a question on this?	How can I ask a question about this?	36.88939732334405
مضرات لیست پیوندی چیست؟	What are the disadvantages of linked lists?	What are the facts of the list?	23.356898886410015
اگر از تولید مثل افراد با ضریب هوشی ضعیف جلوگیری کنیم، آیا در نهایت تکامل انسان ها باهوش تر می شوند؟	If we prevent people with low IQ from reproducing,

In [33]:
# Name the QQP result file after the model; '/' in the model name is replaced by '-'.
output_file_name = "translation_fa-en_combined_qqp_{}_outputs.txt".format(
    model_name.replace('/','-')
)

# Dump every evaluated sample as one tab-separated line
# (source text, reference translation, generated translation, BLEU score).
with open(output_file_name, "w", encoding='utf8') as output_file:
  output_file.writelines(
      '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
      for text, true_translation, generated_translation, bleu_score in evaluation_output
  )

# Authenticate, build a Drive client from the application-default credentials,
# and upload the file under the same title as its local name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

### Quran subset

In [34]:
# Load only the rows tagged with source 'quran_fa_en' from the combined test file.
quran_original_test, quran_translated_test = mt_model.load_dataset_test_file(
    dataset_name="combined",
    dataset_file="translation_combined_fa_en/test.tsv",
    source='quran_fa_en',
)
# Sanity check: show the first source sentence and its reference string
# (for this subset the reference holds several alternatives joined by '///').
print(quran_original_test[0])
print(quran_translated_test[0])

test part:
 #original: 6229, #translation: 6229
به نام خداوند رحمتگر مهربان
In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.


In [35]:
# Record the GPU state (nvidia-smi) and CPU details (lscpu) right before the Quran evaluation,
# so the inference timings reported below can be related to the hardware in use.
!nvidia-smi
!lscpu

Sun Jul  4 09:14:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    28W /  70W |   5486MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [36]:
# Evaluate the Quran subset without passing `split_reference`.
# NOTE(review): judging by the "#maximum_translation_reference: 1" log line, each
# '///'-joined reference string is then scored as a single reference — confirm in mt5_evaluation.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    batch_size=128,
)

#original_text:6229, #translated_text:6229
#batch: 49
#maximum_translation_reference: 1
Start to evaluate test data ...
inference time for step 0: 2.0317915699997684
inference time for step 1: 1.989276171000256
inference time for step 2: 1.9798178569999436
inference time for step 3: 1.9880618409997624
inference time for step 4: 1.9899037109998972
inference time for step 5: 1.9873305030000665
inference time for step 6: 1.993514487000084
inference time for step 7: 2.005122125000071
inference time for step 8: 2.0066699459998745
inference time for step 9: 2.008153495999977
inference time for step 10: 2.0111697470001673
inference time for step 11: 2.0210044830000697
inference time for step 12: 2.0261364040002263
inference time for step 13: 2.019960216999607
inference time for step 14: 2.0199978380001085
inference time for step 15: 2.016430073000265
inference time for step 16: 2.0159661759998926
inference time for step 17: 2.014175244999933
inference time for step 18: 2.030596216000049
infer

In [37]:
# Preview the first 25 evaluated Quran samples (references not split), one tab-separated row each:
# source text, reference translation(s), generated translation, BLEU score.
for (text, true_translation,
     generated_translation, bleu_score) in evaluation_output[:25]:
  print('{}\t{}\t{}\t{}'.format(
      text, true_translation, generated_translation, bleu_score))

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	called God, a kind pilgrim	5.3494999618159915e-09
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise be t

In [38]:
# Name the Quran result file after the model; '/' in the model name is replaced by '-'.
output_file_name = "translation_fa-en_combined_quran_{}_outputs.txt".format(
    model_name.replace('/','-')
)

# Dump every evaluated sample as one tab-separated line
# (source text, reference translation(s), generated translation, BLEU score).
with open(output_file_name, "w", encoding='utf8') as output_file:
  output_file.writelines(
      '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
      for text, true_translation, generated_translation, bleu_score in evaluation_output
  )

# Authenticate, build a Drive client from the application-default credentials,
# and upload the file under the same title as its local name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

In [39]:
# Re-run the Quran evaluation, this time passing split_reference='///'.
# NOTE(review): the log reports "#maximum_translation_reference: 25" for this call, so the
# '///'-separated alternatives are presumably scored as separate references — confirm in mt5_evaluation.
evaluation_output = mt_model.mt5_evaluation(
    quran_original_test,
    quran_translated_test,
    device,
    max_length=512,
    split_reference='///',
    batch_size=128,
)

#original_text:6229, #translated_text:6229
#batch: 49
#maximum_translation_reference: 25
Start to evaluate test data ...
inference time for step 0: 2.1166077129996665
inference time for step 1: 2.060220830000162
inference time for step 2: 2.078608401999645
inference time for step 3: 2.068843753999772
inference time for step 4: 2.0774083279998194
inference time for step 5: 2.073223901000347
inference time for step 6: 2.08196193699996
inference time for step 7: 2.0862643680002293
inference time for step 8: 2.073144844000126
inference time for step 9: 2.085531349000121
inference time for step 10: 2.0872527500000615
inference time for step 11: 2.087724962000266
inference time for step 12: 2.091001548999884
inference time for step 13: 2.0938281189996815
inference time for step 14: 2.084533628999907
inference time for step 15: 2.104037359999893
inference time for step 16: 2.1007484589999876
inference time for step 17: 2.0934511639998163
inference time for step 18: 2.103701344000001
inference

In [40]:
# Preview the first 25 samples of the split-reference Quran run, one tab-separated row each:
# source text, reference translation(s), generated translation, BLEU score.
for (text, true_translation,
     generated_translation, bleu_score) in evaluation_output[:25]:
  print('{}\t{}\t{}\t{}'.format(
      text, true_translation, generated_translation, bleu_score))

به نام خداوند رحمتگر مهربان	In the name of Allah, most benevolent, ever-merciful.///Allah - beginning with the name of - the Most Gracious, the Most Merciful.///In the Name of God, the Merciful, the Compassionate///In the name of Allah, the Compassionate, the Merciful.///In the Name of Allah, the Most Beneficent, the Most Merciful.///In the name of God, the Gracious, the Merciful.///In the name of Allah, the Merciful, the Compassionate///In the Name of Allah, the Most Gracious, the Most Merciful.///In the name of Allah, Most Gracious, Most Merciful.	called God, a kind pilgrim	7.0550047212602784
ستایش خدایى را که پروردگار جهانیان،	ALL PRAISE BE to Allah, Lord of all the worlds,///All praise is to Allah, the Lord Of The Creation.///Praise belongs to God, the Lord of all Being,///All praise unto Allah, the Lord of all the worlds.///All the praises and thanks be to Allah, the Lord of the 'Alamin (mankind, jinns and all that exists).///Praise be to God, Lord of the Worlds.///Praise be to Al

In [41]:
# Name the split-reference Quran result file after the model; '/' in the model name is replaced by '-'.
output_file_name = "translation_fa-en_combined_quran_split-refs_{}_outputs.txt".format(
    model_name.replace('/','-')
)

# Dump every evaluated sample as one tab-separated line
# (source text, reference translation(s), generated translation, BLEU score).
with open(output_file_name, "w", encoding='utf8') as output_file:
  output_file.writelines(
      '{}\t{}\t{}\t{}\n'.format(text, true_translation, generated_translation, bleu_score)
      for text, true_translation, generated_translation, bleu_score in evaluation_output
  )

# Authenticate, build a Drive client from the application-default credentials,
# and upload the file under the same title as its local name.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
upload = drive.CreateFile({'title': output_file_name})
upload.SetContentFile(output_file_name)
upload.Upload()

## QQP Dataset

## TEP Dataset


## OPUS Dataset

## Global Voices Dataset