In [None]:
import os
import urllib
import urllib.request

import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when CUDA reports one as available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
DATA_PATH = '../data'
DATA_URL = 'https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input'

# Make sure the data directory exists, then fetch every split that is not
# already on disk. The check is per file rather than per directory, so a
# directory that exists but is empty or incomplete still gets its missing files.
os.makedirs(DATA_PATH, exist_ok=True)
for file_name in ('train.tsv', 'dev.tsv', 'test.tsv'):
    file_path = os.path.join(DATA_PATH, file_name)
    if os.path.isfile(file_path):
        continue
    # Download under a temporary name and rename only once the transfer has
    # finished, so an interrupted download is never mistaken for a full file.
    partial_path = file_path + '.part'
    urllib.request.urlretrieve(f'{DATA_URL}/{file_name}', partial_path)
    os.replace(partial_path, file_path)

In [None]:
# Read the train and dev splits (tab-separated). The 'index' column is dropped
# from the train split only; the dev split is used as read.
train_path = os.path.join(DATA_PATH, 'train.tsv')
val_path = os.path.join(DATA_PATH, 'dev.tsv')
train_data = pd.read_csv(train_path, sep='\t')
train_data = train_data.drop(columns=['index'])
val_data = pd.read_csv(val_path, sep='\t')

# Stack both splits into one frame with a fresh 0..n-1 index.
df = pd.concat([train_data, val_data], ignore_index=True)

# Plain Python lists of the source sentences and their first neutral rewrite.
toxic_inputs = df['toxic_comment'].to_list()
neutral_inputs = df['neutral_comment1'].to_list()

In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint, place the
# model on the selected device and switch it to evaluation mode.
checkpoint = 'Helsinki-NLP/opus-mt-ru-en'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)
model = model.eval()

In [None]:
def translate_text(text):
    """Translate one text with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input string to translate.

    Returns:
        str: The first generated sequence decoded back to text, with special
        tokens stripped.
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length, so an unusually long comment is shortened instead of being passed
    # to the model at a length it may not support.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    # Move every input tensor to the device the model was placed on.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        # A single text goes in, so the first generated row is its translation.
        outputs = model.generate(**inputs)[0]

    return tokenizer.decode(outputs, skip_special_tokens=True)

In [None]:
# Translate the toxic sentences one at a time, with a progress bar.
toxic_en = []
for toxic_text in tqdm(toxic_inputs):
    toxic_en.append(translate_text(toxic_text))

# Same for the neutral rewrites.
neutral_en = []
for neutral_text in tqdm(neutral_inputs):
    neutral_en.append(translate_text(neutral_text))

# Save the translated pairs side by side as a CSV without the row index.
translated_pairs = pd.DataFrame({'toxic_comment': toxic_en, 'neutral_comment': neutral_en})
output_path = os.path.join(DATA_PATH, 'data_en.csv')
translated_pairs.to_csv(output_path, index=False)