In [1]:
import os
import pandas as pd

In [2]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` with newline characters stripped."""
    # Always decode as UTF-8: relying on the platform default encoding can fail or
    # mis-decode non-ASCII characters on systems whose locale is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip("\n") for line in f]


def _build_bidirectional_frame(data_path, split):
    """Build the two-direction translation frame for one split (``<split>.trg`` / ``<split>.src``)."""
    myanmar_text = _read_lines(os.path.join(data_path, split + ".trg"))
    english_text = _read_lines(os.path.join(data_path, split + ".src"))

    if len(myanmar_text) != len(english_text):
        # zip() below stops at the shorter file; surface the mismatch instead of hiding it.
        import warnings

        warnings.warn(
            f"{split}.trg has {len(myanmar_text)} lines but {split}.src has "
            f"{len(english_text)}; extra lines are ignored and pairs may be misaligned."
        )

    rows = []
    for myanmar, english in zip(myanmar_text, english_text):
        # Each sentence pair contributes one example per translation direction.
        rows.append(["translate myanmar to english", myanmar, english])
        rows.append(["translate english to myanmar", english, myanmar])

    return pd.DataFrame(rows, columns=["prefix", "input_text", "target_text"])


def prepare_translation_datasets(data_path):
    """Load the parallel corpus under ``data_path`` into training and evaluation frames.

    The directory must contain ``train.trg``/``train.src`` and ``test.trg``/``test.src``,
    where line *i* of a ``.trg`` file (Myanmar) pairs with line *i* of the matching
    ``.src`` file (English). All files are read as UTF-8.

    Args:
        data_path: Directory holding the four corpus files.

    Returns:
        A ``(train_df, eval_df)`` tuple of DataFrames with the columns ``prefix``,
        ``input_text`` and ``target_text``. Every sentence pair yields two consecutive
        rows: Myanmar-to-English first, then English-to-Myanmar.

    Raises:
        OSError: If one of the corpus files cannot be opened.
    """
    train_df = _build_bidirectional_frame(data_path, "train")
    eval_df = _build_bidirectional_frame(data_path, "test")

    return train_df, eval_df

In [3]:
# Build the train/eval frames from the corpus files in the Kaggle input directory.
# NOTE(review): the path is hard-coded to a Kaggle dataset mount; adjust it when running elsewhere.
train_df, eval_df = prepare_translation_datasets("/kaggle/input/datamt")

In [4]:
# Preview the training frame; rows alternate between the two translation directions.
train_df

Unnamed: 0,prefix,input_text,target_text
0,translate myanmar to english,ပြင် သစ် နိုင် ငံ ပါ ရီ မြို့ ပါ့ ဒက်စ် ပ ရင့်...,Italy have defeated Portugal 31-5 in Pool C of...
1,translate english to myanmar,Italy have defeated Portugal 31-5 in Pool C of...,ပြင် သစ် နိုင် ငံ ပါ ရီ မြို့ ပါ့ ဒက်စ် ပ ရင့်...
2,translate myanmar to english,အန် ဒ ရီ ယာ မာ စီ သည် အီ တ လီ အ တွက် စမ်း သပ် ...,Andrea Masi opened the scoring in the fourth m...
3,translate english to myanmar,Andrea Masi opened the scoring in the fourth m...,အန် ဒ ရီ ယာ မာ စီ သည် အီ တ လီ အ တွက် စမ်း သပ် ...
4,translate myanmar to english,ပ ထ မ တစ် ဝက် ၏ တော် တော် များ များ အ တွက် က စ...,Despite controlling the game for much of the f...
...,...,...,...
79995,translate english to myanmar,The Arakan Army is engaged in frequent clashes...,သင် တို့ ပြည် နယ် မှာ မြန် မာ့ တပ် မ တော် နဲ့ ...
79996,translate myanmar to english,ဒီ ပ ဋိ ပက္ခ တွေ ကို သင် အ မြင် က ဘာ လဲ ။,What is your opinion of this conflict?
79997,translate english to myanmar,What is your opinion of this conflict?,ဒီ ပ ဋိ ပက္ခ တွေ ကို သင် အ မြင် က ဘာ လဲ ။
79998,translate myanmar to english,ကျောက် တော် နဲ့ ပ လက် ဝ မြို့ နယ် တွေ မှာ A A ...,AA the death toll is considerable on the side ...


In [5]:
# Create the output directory for the TSV files. exist_ok=True keeps this cell
# re-runnable (no error when "data" already exists) and avoids a shell dependency.
os.makedirs("data", exist_ok=True)

In [6]:
# Persist both splits as tab-separated files. index=False keeps the positional index
# out of the files, so reading them back does not add a spurious "Unnamed: 0" column.
train_df.to_csv("data/train.tsv", sep="\t", index=False)
eval_df.to_csv("data/eval.tsv", sep="\t", index=False)

In [7]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting streamlit
  Downloading streamlit-1.17.0-py2.py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Collecting watchdog
  Downloading watchdog-2.2.1-py3-none-manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.0/79.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck>=0.1.dev5
  Downloading pydeck-0.8.0-py2.py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━

In [8]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m894.0 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.3.1
[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
import logging
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


# Configure the root logger at INFO level, but restrict the "transformers" logger
# to WARNING and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [10]:
# Reload the persisted splits as plain strings. dtype=str together with
# keep_default_na=False keeps every cell exactly as written: empty fields stay ""
# and sentences such as "NA" or "null" are not parsed as missing values (which a
# later cast to str would turn into the literal text "nan").
train_df = pd.read_csv("data/train.tsv", sep="\t", dtype=str, keep_default_na=False)
eval_df = pd.read_csv("data/eval.tsv", sep="\t", dtype=str, keep_default_na=False)

# Blank out the task prefix on both splits.
# NOTE(review): presumably so the model sees only the raw input text — confirm against
# how simpletransformers combines prefix and input_text.
train_df["prefix"] = ""
eval_df["prefix"] = ""

In [11]:
# Training configuration for the mT5 fine-tuning run.
model_args = T5Args()
# NOTE(review): presumably the per-sequence token limit — confirm against the simpletransformers docs.
model_args.max_seq_length = 96
model_args.train_batch_size = 10
model_args.eval_batch_size = 10
model_args.num_train_epochs = 5
model_args.evaluate_during_training = True
# NOTE(review): the logged run has 8000 steps per epoch (40000 in total), so a 50000-step
# interval is never reached; the logged evaluations fall on epoch boundaries instead.
model_args.evaluate_during_training_steps = 50000
model_args.use_multiprocessing = False
model_args.fp16 = False
# Checkpointing / caching switches.
model_args.save_steps = -1
model_args.save_eval_checkpoints = False
model_args.no_cache = True
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.preprocess_inputs = False
model_args.num_return_sequences = 1
# Optional Weights & Biases logging, left disabled:
# model_args.wandb_project = "MT5 Myanmar-English Translation"

# Start from the pretrained "google/mt5-base" checkpoint.
model = T5Model("mt5", "google/mt5-base", args=model_args)

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

In [12]:
# Fine-tune the model on train_df; eval_df is supplied for the evaluation passes
# enabled by evaluate_during_training.
model.train_model(train_df, eval_data=eval_df)

  0%|          | 0/80000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2036 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2036 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2036 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2036 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/2036 [00:00<?, ?it/s]

(40000,
 {'global_step': [8000, 16000, 24000, 32000, 40000],
  'eval_loss': [1.711438494570115,
   1.5541025744933707,
   1.4911320618554658,
   1.4698903204179277,
   1.4876211180406458],
  'train_loss': [1.6560382843017578,
   1.8986091613769531,
   1.337767243385315,
   1.036765694618225,
   1.09765625]})

In [13]:
import logging
import sacrebleu
import pandas as pd
from simpletransformers.t5 import T5Model, T5Args


# Root logger at INFO level; "transformers" logger restricted to WARNING and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


# Fresh argument object for inference.
# NOTE(review): these look like generation settings (output length cap, beam search) —
# confirm their exact semantics in the simpletransformers docs.
model_args = T5Args()
model_args.max_length = 512
model_args.length_penalty = 1
model_args.num_beams = 10

# Load the model from the local "outputs" directory.
# NOTE(review): presumably where train_model wrote the fine-tuned weights — confirm output_dir.
model = T5Model("mt5", "outputs", args=model_args)

In [14]:
# Reload the evaluation split as plain strings. dtype=str with keep_default_na=False
# keeps every cell exactly as written: empty fields stay "" and sentences such as
# "NA" or "null" are not parsed as missing values and then stringified to "nan".
eval_df = pd.read_csv("data/eval.tsv", sep="\t", dtype=str, keep_default_na=False)

# English -> Myanmar: inputs and references. The references are wrapped in an outer
# list because sacrebleu.corpus_bleu takes a list of reference streams.
myanmar_truth = [eval_df.loc[eval_df["prefix"] == "translate english to myanmar", "target_text"].tolist()]
to_myanmar = eval_df.loc[eval_df["prefix"] == "translate english to myanmar", "input_text"].tolist()

# Myanmar -> English: inputs and references, in the same layout.
english_truth = [eval_df.loc[eval_df["prefix"] == "translate myanmar to english", "target_text"].tolist()]
to_english = eval_df.loc[eval_df["prefix"] == "translate myanmar to english", "input_text"].tolist()


In [15]:
# Translate the English inputs and score the output against the Myanmar references
# with corpus-level BLEU.
myanmar_preds = model.predict(to_myanmar)

eng_mya_bleu = sacrebleu.corpus_bleu(myanmar_preds, myanmar_truth)
print("--------------------------")
print("English to Myanmar: ", eng_mya_bleu.score)

# Repeat for the opposite direction: Myanmar inputs scored against the English references.
english_preds = model.predict(to_english)

mya_eng_bleu = sacrebleu.corpus_bleu(english_preds, english_truth)
print("Myanmar to English: ", mya_eng_bleu.score)

Generating outputs:   0%|          | 0/128 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1018 [00:00<?, ?it/s]

--------------------------
English to Myanmar:  25.75466731867631


Generating outputs:   0%|          | 0/128 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/1018 [00:00<?, ?it/s]

Myanmar to English:  17.316397547264945
