## Data preparation

In [87]:
import os
import jsonlines
import json
import pandas as pd

from collections import defaultdict

In [88]:
def lang_data_parse(path):
    """Parse a tab-separated sentence file into a nested lookup table.

    Every usable line holds at least three tab-separated fields; the first
    three are used as document id, sentence id and sentence text.

    Args:
        path: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A ``defaultdict(dict)`` shaped like ``{doc_id: {sentence_id: text}}``.
        Because it is a defaultdict, looking up an unknown ``doc_id``
        returns (and stores) an empty dict instead of raising ``KeyError``.
    """
    unique_doc_ids = defaultdict(dict)

    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # able to decode non-ASCII (e.g. Cyrillic) text.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) < 3:
                # Blank or truncated line (such as a trailing newline at the
                # end of the file): there is nothing to index, so skip it
                # instead of failing with IndexError.
                continue
            doc_id, sent_id = fields[0], fields[1]
            unique_doc_ids[doc_id][sent_id] = fields[2].strip('\n')

    return unique_doc_ids

In [89]:
def data_add_test(path_en, path_ru, path_alignment, path_mapping):
    """Load the pieces of a test set: sentence files, id mapping, alignment.

    Args:
        path_en: Sentence file handed to ``lang_data_parse`` (first result).
        path_ru: Sentence file handed to ``lang_data_parse`` (second result).
        path_alignment: Tab-separated alignment file. Columns 1, 2 and 3
            (0-based) of every row are used; column 0 is ignored.
        path_mapping: Tab-separated file; column 0 is mapped to column 1.

    Returns:
        A tuple ``(align, unique_doc_ids_en, unique_doc_ids_ru)``.
        ``align`` is a list of ``[mapped_id, col2, col3]`` rows, where
        ``mapped_id`` is alignment column 1 translated through the mapping
        file. The other two items are the ``lang_data_parse`` results for
        ``path_en`` and ``path_ru``.

    Raises:
        KeyError: If an alignment row refers to an id that is missing from
            the mapping file.
    """
    unique_doc_ids_en = lang_data_parse(path_en)
    unique_doc_ids_ru = lang_data_parse(path_ru)

    # Whitespace-only lines carry no data and would otherwise raise
    # IndexError below, so they are dropped while reading.
    with open(path_mapping, 'r', encoding='utf-8') as f:
        mapping_rows = [line.rstrip('\n').split('\t') for line in f if line.strip()]
    mapping_dict = {row[0]: row[1] for row in mapping_rows}

    with open(path_alignment, 'r', encoding='utf-8') as f:
        align_rows = [line.rstrip('\n').split('\t') for line in f if line.strip()]

    # Replace the id used by the alignment file with its mapped counterpart.
    align = [[mapping_dict[row[1]], row[2], row[3]] for row in align_rows]

    return align, unique_doc_ids_en, unique_doc_ids_ru

Adding wmt20 train data:

In [2]:
# Directory with the wmt20 ru-en release files; the following cells list it
# and read every file found there.
path = 'ru-en_20/ru-en-release'

In [3]:
# Split the directory listing into English and Russian file names.
# A name counts as English when its 6th character from the end is 'e';
# every other name is treated as Russian.
# NOTE(review): presumably names end in "en.<3-char ext>" / "ru.<3-char ext>";
# names shorter than 6 characters raise IndexError, and any unrelated file in
# the directory lands in the Russian list - confirm the directory contents.
files_en, files_ru = [], []

for file_name in os.listdir(path):
    is_english = file_name[-6] == 'e'
    (files_en if is_english else files_ru).append(file_name)

In [4]:
# Put both listings in lexicographic order. The next cell pairs the two lists
# by position, so presumably matching en/ru files are expected to share an
# index after sorting - TODO confirm against the file naming scheme.
files_en.sort()
files_ru.sort()

In [78]:
# Build one {"translation": {"ru": ..., "en": ...}} record per en/ru file
# pair; the whole content of each file becomes one side of the record.
results = []

# zip() stops at the shorter list without complaint, and a single extra file
# shifts every later pair, so refuse to continue on a length mismatch.
assert len(files_en) == len(files_ru), (
    f"unpaired files: {len(files_en)} en vs {len(files_ru)} ru"
)

for name_en, name_ru in zip(files_en, files_ru):
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(os.path.join(path, name_en), 'r', encoding='utf-8') as f:
        en = f.read()
    with open(os.path.join(path, name_ru), 'r', encoding='utf-8') as f:
        ru = f.read()
    results.append({"translation": {"ru": ru, "en": en}})

Adding the contest's train data:

In [6]:
# Read the contest training file as a list of raw text lines
# (presumably one JSON document per line, given the .jsonl extension).
# Explicit UTF-8 so decoding does not depend on the platform default.
with open('train_test/train.jsonl', 'r', encoding='utf-8') as f:
    train_data = f.readlines()

In [7]:
# Parse every line into a Python object before adding it to `results`.
# Appending the raw text would make the jsonlines writer serialise the record
# as a quoted JSON string instead of a JSON object, unlike the dict records
# that the other cells append to the same list.
for el in train_data:
    line = el.strip()
    if line:  # ignore blank lines, e.g. a trailing newline at end of file
        results.append(json.loads(line))

Adding wmt20 test data:

In [90]:
# Load the wmt20 test set: alignment rows plus the sentence lookup tables
# built from the *_en.txt and *_ru.txt files.
align_20, unique_doc_ids_20, unique_doc_ids_ru_20 = data_add_test(
    path_en='wmt20test/medline_ru2en_en.txt',
    path_ru='wmt20test/medline_ru2en_ru.txt',
    path_alignment='wmt20test/ru-en_align_validation.tsv',
    path_mapping='wmt20test/ruen_mapping.txt',
)

In [91]:
# Turn every usable wmt20 alignment row into a ru/en translation record.
res_20 = []

for row in align_20:
    # Rows with a field equal to 'omitted' are skipped.
    if 'omitted' in row:
        continue

    # Row layout used here: [doc_id, Russian sentence id, English sentence id].
    doc_id, sent_ru, sent_en = row

    try:
        en = unique_doc_ids_20[doc_id][sent_en]
        ru = unique_doc_ids_ru_20[doc_id][sent_ru]
    except KeyError:
        # A sentence id that is absent from the lookup tables drops the row
        # silently. NOTE(review): consider counting these skipped rows.
        continue

    res_20.append({"translation": {"ru": ru, "en": en}})

In [81]:
# Add the wmt20 test records to the combined list, preserving their order.
results.extend(res_20)

Adding wmt21 test data:

In [82]:
# Load the wmt21 test set: alignment rows plus the sentence lookup tables
# built from the *_en.txt and *_ru.txt files.
align_21, unique_doc_ids_21, unique_doc_ids_ru_21 = data_add_test(
    path_en='wmt21test/medline_ru2en_en.txt',
    path_ru='wmt21test/medline_ru2en_ru.txt',
    path_alignment='wmt21test/ru-en_align_validation.tsv',
    path_mapping='wmt21test/ruen_mapping.txt',
)

In [83]:
# Turn every usable wmt21 alignment row into a ru/en translation record.
res_21 = []

for el in align_21:
    # Rows with a field equal to 'omitted' are skipped.
    if 'omitted' not in el:
        try:
            doc_id = el[0]
            # Same column order as the wmt20 cell: the alignment file is a
            # ru-en file for a ru2en test set, so the first sentence-id column
            # is read as Russian and the second as English. The two indices
            # were previously swapped here, which pairs the wrong sentences
            # whenever the two ids differ.
            # NOTE(review): confirm the wmt21 alignment file really uses the
            # same column order as wmt20.
            sent_ru = el[1]
            sent_en = el[2]
            en = unique_doc_ids_21[doc_id][sent_en]
            ru = unique_doc_ids_ru_21[doc_id][sent_ru]
            res_21.append({"translation": {"ru": ru, "en": en}})
        except KeyError:
            # A sentence id that is absent from the lookup tables drops the
            # row silently.
            pass

In [85]:
# Add the wmt21 test records to the combined list, preserving their order.
results.extend(res_21)

Combining and saving to a single jsonlines file:

In [86]:
# Dump every collected record to 'train.jsonl' in the working directory,
# one JSON document per line, in list order.
with jsonlines.open('train.jsonl', mode='w') as out_file:
    out_file.write_all(results)

## Training

In [None]:
# Fetch the Hugging Face transformers sources into the current directory;
# the cells below install the package from this checkout and run its
# translation example script.
!git clone https://github.com/huggingface/transformers.git

In [None]:
# Switch to the checkout and install the package from source (-q: quiet).
# NOTE(review): assumes the clone above was made while in /content - confirm.
%cd /content/transformers/
!pip install . -q

In [None]:
# Install the extra requirements listed for the translation example (-q: quiet).
!pip install -r /content/transformers/examples/pytorch/translation/requirements.txt -q

In [None]:
# Fine-tune 'Helsinki-NLP/opus-mt-ru-en' (source ru, target en) with the
# run_translation.py example script: 10 epochs, train batch size 2 and eval
# batch size 4 per device, sequences limited to 512 tokens, a checkpoint every
# 1000 steps, output written to the contest_train folder.
# NOTE(review): the data-preparation section writes 'train.jsonl' to the
# working directory, whereas this command reads train.json / val.json from
# /content/drive/MyDrive/data_contest; the copy/rename step, the validation
# split and the Drive mount are not shown in this notebook - confirm.
# NOTE(review): no --eval_steps is passed; check which evaluation interval the
# installed transformers version uses with --evaluation_strategy steps.
!python /content/transformers/examples/pytorch/translation/run_translation.py \
    --model_name_or_path 'Helsinki-NLP/opus-mt-ru-en' \
    --do_train \
    --do_eval \
    --save_strategy steps \
    --source_lang ru \
    --num_train_epochs 10 \
    --target_lang en \
    --max_source_length 512 \
    --max_target_length 512 \
    --val_max_target_length 512 \
    --train_file '/content/drive/MyDrive/data_contest/train.json' \
    --validation_file '/content/drive/MyDrive/data_contest/val.json' \
    --output_dir '/content/drive/MyDrive/contest_train' \
    --per_device_train_batch_size=2 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --pad_to_max_length False \
    --save_steps 1000 \
    --evaluation_strategy steps \
    --logging_steps 5000 \
    --predict_with_generate