## Running the trained model on the test data

In [1]:
import json
import jsonlines
from nltk import sent_tokenize
from transformers import pipeline, AutoTokenizer,AutoModelForSeq2SeqLM

Initializing the model and the tokenizer:

In [2]:
# Hugging Face model id; from_pretrained will download it.
# Any other model path of your own can be used here instead.
config_path = 'Helsinki-NLP/opus-mt-ru-en'
tokenizer = AutoTokenizer.from_pretrained(config_path)
model = AutoModelForSeq2SeqLM.from_pretrained(config_path)

Opening the test data:

In [3]:
# Collect the Russian source text of every record in the test file.
data = []

with jsonlines.open('test.jsonl', 'r') as reader:
    data.extend(record['translation']['ru'] for record in reader)

Writing the translation function:

In [4]:
# Pipelines already built, keyed by (task, id(model), id(tokenizer)).
# Each cached pipeline was constructed with its model and tokenizer, so reusing
# it avoids rebuilding the same pipeline for every text.
_TRANSLATOR_CACHE = {}


def _get_translator(task, cur_model, cur_tokenizer):
    """Return a translation pipeline for this task/model/tokenizer, building it only once."""
    key = (task, id(cur_model), id(cur_tokenizer))
    if key not in _TRANSLATOR_CACHE:
        _TRANSLATOR_CACHE[key] = pipeline(task, model=cur_model, tokenizer=cur_tokenizer)
    return _TRANSLATOR_CACHE[key]


def get_translation(text, lang, cur_model, cur_tokenizer, max_words=100):
    """Translate ``text`` into the target language ``lang``.

    Texts longer than ``max_words`` whitespace-separated words are split into
    sentences, translated one sentence at a time, and re-joined with single
    spaces; shorter texts are translated in one call.

    Args:
        text: Source text to translate (a string).
        lang: Target language. ``'ru'`` selects the ``translation_en_to_ru``
            task; any other value selects ``translation_ru_to_en``.
        cur_model: Model object handed to the ``transformers`` pipeline.
        cur_tokenizer: Tokenizer object handed to the ``transformers`` pipeline.
        max_words: Word-count threshold above which the text is translated
            sentence by sentence. Defaults to 100.

    Returns:
        The translated text as a string. Empty or whitespace-only input yields
        an empty string without calling the model.
    """
    if lang == 'ru':
        task, source_language = "translation_en_to_ru", 'english'
    else:
        task, source_language = "translation_ru_to_en", 'russian'

    # Nothing to translate; skip the model call so blank input maps to blank output.
    if not text.strip():
        return ''

    translator = _get_translator(task, cur_model, cur_tokenizer)

    if len(text.split()) > max_words:
        # Split with the sentence model of the *source* language, since that is
        # the language the text is written in.
        sentences = sent_tokenize(text, language=source_language)
        return ' '.join(translator(sent)[0]['translation_text'] for sent in sentences)

    return translator(text)[0]['translation_text']

Running the translation:

In [None]:
# Translate every source text into English and keep each source/translation
# pair nested under a "translation" key.
preds = []

for source_text in data:
    english_text = get_translation(source_text, 'en', cur_model=model, cur_tokenizer=tokenizer)
    record = {"translation": {"ru": source_text, "en": english_text}}
    preds.append(record)
    print(english_text)

Saving the predictions into the desired format:

In [None]:
# Write every prediction record to preds.jsonl.
with jsonlines.open('preds.jsonl', mode='w') as writer:
    writer.write_all(preds)