## Data preparation

In [1]:
import os
import jsonlines
import json
import pandas as pd

In [2]:
# Directory holding the parallel RU/EN text files; the cells below list it
# and open every file found inside it.
path = (
    'ru-en_20/ru-en-release'
)

In [3]:
# Partition the directory listing into English and Russian file names.
# A name whose 6th character from the end is 'e' is treated as English;
# every other entry (whatever it is) lands in the Russian list.
# NOTE(review): presumably names end in something like "en.txt" / "ru.txt" —
# confirm, since stray entries would silently be counted as Russian.
files_en, files_ru = [], []

for name in os.listdir(path):
    bucket = files_en if name[-6] == 'e' else files_ru
    bucket.append(name)

In [4]:
# Put both name lists into lexicographic order so that position k in one
# list can be paired with position k in the other by the next cell.
files_en.sort()
files_ru.sort()

In [5]:
# Accumulator for every training record; later cells keep appending to it.
results = []

# zip() stops at the shorter list, so a count mismatch would otherwise go
# unnoticed and could leave the EN/RU files paired with the wrong partner.
if len(files_en) != len(files_ru):
    print(f'warning: {len(files_en)} EN files vs {len(files_ru)} RU files; '
          'pairs may be misaligned')

# Pair the k-th English file with the k-th Russian file (both lists are
# sorted) and store their full contents as one translation record.
for en_name, ru_name in zip(files_en, files_ru):
    # Explicit UTF-8 so reading Cyrillic text does not depend on the
    # platform's default locale encoding.
    with open(os.path.join(path, en_name), 'r', encoding='utf-8') as f:
        en = f.read()
    with open(os.path.join(path, ru_name), 'r', encoding='utf-8') as f:
        ru = f.read()
    results.append({"translation": {"ru": ru, "en": en}})

In [6]:
# Raw lines of the existing training file (each keeps its trailing newline).
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('train_test/train.jsonl', 'r', encoding='utf-8') as f:
    train_data = f.readlines()

In [7]:
# Every entry already in `results` is a dict, and the final cell serializes
# each entry with the jsonlines writer. Appending the raw text of a line
# would make the writer emit it as a quoted JSON string rather than an
# object, so each line is parsed back into a Python object first.
# (Presumably the records use the same {"translation": ...} shape — verify.)
for line in train_data:
    if line.strip():  # ignore blank lines such as a trailing empty one
        results.append(json.loads(line))

In [11]:
# Raw lines of the WMT20 English file (trailing newlines are kept).
# Explicit UTF-8 keeps the read independent of the platform's locale.
with open('wmt20test/medline_ru2en_en.txt', 'r', encoding='utf-8') as f:
    en_wmt_20 = f.readlines()

In [12]:
# Turn each raw line into its list of tab-separated fields. The last field
# still carries the trailing newline; it is stripped where the text is used.
# Note: this rebinds the same name, so the cell cannot be run twice in a row.
en_wmt_20 = [line.split('\t') for line in en_wmt_20]

In [13]:
# Raw lines of the WMT20 Russian file (trailing newlines are kept).
# Explicit UTF-8 so Cyrillic text is decoded the same way on every platform.
with open('wmt20test/medline_ru2en_ru.txt', 'r', encoding='utf-8') as f:
    ru_wmt_20 = f.readlines()

In [14]:
# Turn each raw line into its list of tab-separated fields. The last field
# still carries the trailing newline; it is stripped where the text is used.
# Note: this rebinds the same name, so the cell cannot be run twice in a row.
ru_wmt_20 = [line.split('\t') for line in ru_wmt_20]

In [15]:
# Load the WMT20 alignment table as a tab-separated DataFrame.
# NOTE(review): `alignment` is not referenced again in the visible cells;
# the later cells work with a separate list named `align`.
alignment = pd.read_csv(
    'wmt20test/ru-en_align_validation.tsv',
    sep='\t',
)

In [22]:
# Raw lines of the WMT20 mapping file (trailing newlines are kept).
# Explicit UTF-8 keeps the read independent of the platform's locale.
with open('wmt20test/ruen_mapping.txt', 'r', encoding='utf-8') as f:
    mapping = f.readlines()

In [23]:
# Drop the line terminator, then break each mapping line into its
# tab-separated fields. Rebinds `mapping`, so the cell is not re-runnable.
mapping = [line.strip('\n').split('\t') for line in mapping]

In [24]:
# Lookup from the first field of every mapping row to its second field.
# If a first field repeats, the last row wins.
mapping_dict = {fields[0]: fields[1] for fields in mapping}

In [25]:
# Build the WMT20 alignment rows inside this cell: no earlier visible cell
# defines `align` (only the `alignment` DataFrame), so on a fresh kernel the
# original comprehension over `align` raised NameError. The file is read the
# same way the WMT21 section reads its alignment file.
# NOTE(review): assumes the TSV has no header row, as the WMT21 cells do for
# the file of the same name — confirm.
with open('wmt20test/ru-en_align_validation.tsv', 'r', encoding='utf-8') as f:
    align_rows_wmt20 = [line.strip('\n').split('\t') for line in f if line.strip()]

# Keep three values per row: field 1 translated through mapping_dict, then
# fields 2 and 3 unchanged.
align = [[mapping_dict[row[1]], row[2], row[3]] for row in align_rows_wmt20]

In [26]:
# One empty inner dict per distinct first field (used as the document id by
# the following cells) of the English rows; the next cell fills them in.
unique_doc_ids = {row[0]: {} for row in en_wmt_20}

In [27]:
# Populate the per-document dicts: second field -> third field with the
# trailing newline removed.
for row in en_wmt_20:
    doc_id = row[0]
    sentences = unique_doc_ids[doc_id]
    sentences[row[1]] = row[2].strip('\n')

In [28]:
# One empty inner dict per distinct first field (used as the document id by
# the following cells) of the Russian rows; the next cell fills them in.
unique_doc_ids_ru = {row[0]: {} for row in ru_wmt_20}

In [29]:
# Populate the per-document dicts: second field -> third field with the
# trailing newline removed.
for row in ru_wmt_20:
    doc_id = row[0]
    sentences = unique_doc_ids_ru[doc_id]
    sentences[row[1]] = row[2].strip('\n')

In [33]:
# Turn WMT20 alignment rows into translation records.
# Each row of `align` is [doc id, RU sentence key, EN sentence key].
res_wmt2020 = []
skipped_wmt2020 = 0  # rows whose doc id or sentence key has no stored text

for row in align:
    # Rows containing the literal value 'omitted' carry no usable pair.
    if 'omitted' in row:
        continue
    doc_id, sent_ru, sent_en = row
    try:
        en = unique_doc_ids[doc_id][sent_en]
        ru = unique_doc_ids_ru[doc_id][sent_ru]
    except KeyError:
        # Still best-effort (the row is dropped), but counted so that the
        # amount of lost data is visible instead of silently ignored.
        skipped_wmt2020 += 1
        continue
    res_wmt2020.append({"translation": {"ru": ru, "en": en}})

print(f'WMT20: kept {len(res_wmt2020)} pairs, '
      f'skipped {skipped_wmt2020} rows with unknown ids')

In [34]:
# Add all WMT20 pairs to the combined record list, preserving their order.
results.extend(res_wmt2020)

In [36]:
# Raw lines of the WMT21 English file (trailing newlines are kept).
# Explicit UTF-8 keeps the read independent of the platform's locale.
with open('wmt21test/medline_ru2en_en.txt', 'r', encoding='utf-8') as f:
    en_wmt_21 = f.readlines()

In [37]:
# Turn each raw line into its list of tab-separated fields. The last field
# still carries the trailing newline; it is stripped where the text is used.
# Note: this rebinds the same name, so the cell cannot be run twice in a row.
en_wmt_21 = [line.split('\t') for line in en_wmt_21]

In [38]:
# Raw lines of the WMT21 Russian file (trailing newlines are kept).
# Explicit UTF-8 so Cyrillic text is decoded the same way on every platform.
with open('wmt21test/medline_ru2en_ru.txt', 'r', encoding='utf-8') as f:
    ru_wmt_21 = f.readlines()

In [39]:
# Turn each raw line into its list of tab-separated fields.
# Note: this rebinds the same name, so the cell cannot be run twice in a row.
ru_wmt_21 = [line.split('\t') for line in ru_wmt_21]

In [40]:
# Keep only the first three fields of every Russian row and remove the
# trailing newline from the third one.
ru_wmt_21 = [
    [fields[0], fields[1], fields[2].strip('\n')]
    for fields in ru_wmt_21
]

In [42]:
# Raw lines of the WMT21 mapping file (trailing newlines are kept).
# Explicit UTF-8 keeps the read independent of the platform's locale.
with open('wmt21test/ruen_mapping.txt', 'r', encoding='utf-8') as f:
    mapping = f.readlines()

In [43]:
# Drop the line terminator, then break each mapping line into its
# tab-separated fields. Rebinds `mapping`, so the cell is not re-runnable.
mapping = [line.strip('\n').split('\t') for line in mapping]

In [44]:
# Lookup from the first field of every mapping row to its second field,
# replacing the dict built for the previous test set.
# If a first field repeats, the last row wins.
mapping_dict = {fields[0]: fields[1] for fields in mapping}

In [41]:
# Raw lines of the WMT21 alignment file (trailing newlines are kept).
# Explicit UTF-8 keeps the read independent of the platform's locale.
with open('wmt21test/ru-en_align_validation.tsv', 'r', encoding='utf-8') as f:
    align = f.readlines()

In [45]:
# Step 1: strip the newline and split each alignment line on tabs.
align = [line.strip('\n').split('\t') for line in align]
# Step 2: keep three values per row — field 1 translated through
# mapping_dict, then fields 2 and 3 unchanged.
# Both steps rebind `align`, so this cell cannot be run twice in a row.
align = [
    [mapping_dict[fields[1]], fields[2], fields[3]]
    for fields in align
]

In [46]:
# One empty inner dict per distinct first field (used as the document id by
# the following cells) of the WMT21 English rows; replaces the WMT20 lookup.
unique_doc_ids = {row[0]: {} for row in en_wmt_21}

In [47]:
# Populate the per-document dicts: second field -> third field with the
# trailing newline removed.
for row in en_wmt_21:
    doc_id = row[0]
    sentences = unique_doc_ids[doc_id]
    sentences[row[1]] = row[2].strip('\n')

In [48]:
# One empty inner dict per distinct first field (used as the document id by
# the following cells) of the WMT21 Russian rows; replaces the WMT20 lookup.
unique_doc_ids_ru = {row[0]: {} for row in ru_wmt_21}

In [49]:
# Populate the per-document dicts: second field -> third field. The strip
# is a no-op when the newline was already removed from the row.
for row in ru_wmt_21:
    doc_id = row[0]
    sentences = unique_doc_ids_ru[doc_id]
    sentences[row[1]] = row[2].strip('\n')

In [51]:
# Turn WMT21 alignment rows into translation records.
# Each row of `align` is read here as [doc id, EN sentence key, RU sentence key].
# NOTE(review): the WMT20 cell reads the same two columns in the opposite
# order (RU key first, EN key second). The original order is kept here;
# confirm against the alignment file which of the two cells is correct.
res_wmt2021 = []
skipped_wmt2021 = 0  # rows whose doc id or sentence key has no stored text

for row in align:
    # Rows containing the literal value 'omitted' carry no usable pair.
    if 'omitted' in row:
        continue
    doc_id, sent_en, sent_ru = row
    try:
        en = unique_doc_ids[doc_id][sent_en]
        ru = unique_doc_ids_ru[doc_id][sent_ru]
    except KeyError:
        # Still best-effort (the row is dropped), but counted so that the
        # amount of lost data is visible instead of silently ignored.
        skipped_wmt2021 += 1
        continue
    res_wmt2021.append({"translation": {"ru": ru, "en": en}})

print(f'WMT21: kept {len(res_wmt2021)} pairs, '
      f'skipped {skipped_wmt2021} rows with unknown ids')

In [52]:
# Add all WMT21 pairs to the combined record list, preserving their order.
results.extend(res_wmt2021)

In [53]:
# Serialize every collected record as one JSON line of train.jsonl in the
# current working directory, overwriting any existing file of that name.
with jsonlines.open('train.jsonl', mode='w') as writer:
    writer.write_all(results)

## Training

In [None]:
# Fetch the Hugging Face transformers sources; the following cells install
# the package from this checkout and run its example translation script.
# No tag or commit is pinned, so whatever the default branch holds is used.
!git clone https://github.com/huggingface/transformers.git

In [None]:
# Switch into the cloned repository and install it quietly from source.
# Assumes the clone above landed in /content — TODO confirm for non-Colab runs.
%cd /content/transformers/
!pip install . -q

In [None]:
# Install the extra dependencies listed for the translation example script.
!pip install -r /content/transformers/examples/pytorch/translation/requirements.txt -q

In [None]:
# Fine-tune the Helsinki-NLP/opus-mt-ru-en checkpoint (ru -> en) with the
# transformers example script: 10 epochs, train batch size 2 and eval batch
# size 4 per device, a checkpoint every 1000 steps under the Drive output
# directory (which is overwritten), sequences capped at 512 tokens.
# NOTE(review): --train_file points at train.json on Drive, whereas the data
# preparation section writes train.jsonl to the working directory — confirm
# how that file reaches this location and under which name.
# NOTE(review): no --eval_steps is given; presumably evaluation then follows
# --logging_steps (5000) — verify against the TrainingArguments docs.
!python /content/transformers/examples/pytorch/translation/run_translation.py \
    --model_name_or_path 'Helsinki-NLP/opus-mt-ru-en' \
    --do_train \
    --do_eval \
    --save_strategy steps \
    --source_lang ru \
    --num_train_epochs 10 \
    --target_lang en \
    --max_source_length 512 \
    --max_target_length 512 \
    --val_max_target_length 512 \
    --train_file '/content/drive/MyDrive/data_contest/train.json' \
    --validation_file '/content/drive/MyDrive/data_contest/val.json' \
    --output_dir '/content/drive/MyDrive/contest_train' \
    --per_device_train_batch_size=2 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --pad_to_max_length False \
    --save_steps 1000 \
    --evaluation_strategy steps \
    --logging_steps 5000 \
    --predict_with_generate