In [None]:
# --- Imports -----------------------------------------------------------------
# All dependencies used by the notebook are imported once, here.
import torch
import os
from datasets import load_dataset, load_metric
import sentencepiece
import re
import numpy as np
import pandas as pd
import transformers
import nltk
from tqdm import notebook
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianMTModel,MarianTokenizer
from translate.storage.tmx import tmxfile
# NOTE: an incomplete `from transformers import` statement (no names listed)
# used to follow here; it was a SyntaxError that prevented this whole cell
# from running, so it has been dropped.

# --- Configuration -----------------------------------------------------------
# Turn off the Weights & Biases integration for the Trainer.
os.environ["WANDB_DISABLED"]="true"
# Pretrained checkpoint that is fine-tuned below.
model_checkpoint = "Helsinki-NLP/opus-mt-ru-en"
# Evaluation metric used by `compute_metrics`.
# NOTE(review): `load_metric` is reported as deprecated in recent `datasets`
# releases — confirm against the installed version.
metric = load_metric("sacrebleu")
# Translation-memory file holding the parallel ru/en segments.
TMX_FILE_NAME = 'Letters.tmx'

In [None]:
# Authenticate against the Hugging Face Hub from inside the notebook; the
# training arguments below set push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Parse the TMX translation memory into a two-column (ru, en) DataFrame.
with open(TMX_FILE_NAME, 'rb') as fin:
    # The original line carried a pasted REPL continuation prompt ("...") at
    # column 0, which made the `with` body a syntax error.
    tmx_file = tmxfile(fin, 'ru', 'en')

source = []
translation = []
for node in tmx_file.unit_iter():
    # A unit lacking either side cannot serve as a training pair (and would
    # come back from the CSV round-trip as a missing value), so skip it.
    if not node.source or not node.target:
        continue
    source.append(node.source)
    translation.append(node.target)

# Build the frame directly from named columns; this also works when no
# units were found, where assigning two column names to an empty transposed
# frame would raise.
dataset = pd.DataFrame({'ru': source, 'en': translation})


In [None]:
# Persist the parallel corpus as a ';'-separated CSV without the row index.
dataset.to_csv('letter.csv', sep=';', index=False)

In [None]:
# Read the CSV written above back in as a `datasets` object ('train' split).
ds = load_dataset(
    'csv',
    data_files='letter.csv',
    delimiter=';',
    split='train',
)

In [None]:
# Hold out 20% of the pairs for evaluation. A fixed seed makes the shuffle,
# and therefore the reported scores, reproducible across kernel restarts.
ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [None]:
# Sanity check of the metric: predictions identical to their references.
# Each prediction gets a one-element list of references.
fake_preds = ["hello there", "general kenobi"]
fake_labels = [[sentence] for sentence in fake_preds]
metric.compute(references=fake_labels, predictions=fake_preds)

In [None]:
# Maximum token lengths passed to the tokenizer for source and target text.
max_input_length = max_target_length = 128
# Language codes; used when naming the fine-tuned model's output directory.
source_lang, target_lang = "ru", "en"

def preprocess_function(examples):
    """Tokenize a batch of ru/en pairs into model inputs with labels.

    Args:
        examples: batch mapping with an "ru" list (source sentences) and an
            "en" list (target sentences).

    Returns:
        The tokenizer output for the source sentences, with a "labels" entry
        holding the token ids of the target sentences.
    """
    inputs = examples["ru"]
    targets = examples["en"]
    # Truncate explicitly, exactly as is done for the targets below, so that
    # over-long sources are cut at max_input_length by request rather than by
    # an implicit fallback.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
    
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Load the pretrained seq2seq model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Tokenize both splits, calling preprocess_function on batches of examples.
tokenized_datasets = ds.map(preprocess_function,batched=True)

In [None]:
# Training configuration for the fine-tuning run.
batch_size = 16
# Last path component of the checkpoint id, used in the output directory name.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}-lett",
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
    push_to_hub=True,
)

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A pair (predictions, references): a list of stripped predictions and
        a list in which each stripped reference is wrapped in its own
        one-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: pair (predictions, labels) of token-id arrays; the
            predictions may themselves be a tuple, in which case its first
            element is used.

    Returns:
        Dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # Replace -100 before decoding, as it is not a valid token id. This is
    # done for the predictions as well as the labels: generated sequences can
    # be padded with the same sentinel, which would otherwise break decoding
    # and be counted as real tokens in gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of each prediction, not counting padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer: model, hyper-parameters, data splits, batching and
# the evaluation metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Wrap the trainer's collator so every batch it yields is converted to a
# plain dict.
old_collator = trainer.data_collator


def _collate_as_dict(data):
    """Run the original collator and return its result as a plain dict."""
    return dict(old_collator(data))


trainer.data_collator = _collate_as_dict

In [None]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Save the trained model via the trainer, passing 'letter' as the destination.
trainer.save_model('letter')

In [None]:
from transformers import pipeline

# Build a translation pipeline from the published fine-tuned model, read one
# line of text from the user and print the pipeline's output for it.
pipe = pipeline(
    task="translation",
    model="Kovalev/opus-mt-ru-en-finetuned-ru-to-en-lett",
)
text = input()
print(pipe(text))