# Детоксификация текстов на русском языке с помощью модели RuT5

In [2]:
import torch

# Pick the compute device: fall back to the CPU unless CUDA is usable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is present: select it, then report how many GPUs exist and the
    # name of the device with index 0.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [3]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.22.0


In [4]:
!pip install transformers
!pip install datasets==2.2.2
!pip install razdel
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.3 MB/s[0m eta [36m0:00:0

In [5]:
import os
import pandas as pd
import numpy as np
from argparse import ArgumentParser
from functools import partial
from shutil import rmtree
from tqdm import trange

import numpy as np
from datasets import load_metric
from razdel import tokenize
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel
)

from datasets import Dataset, DatasetDict

In [6]:
# Set the TOKENIZERS_PARALLELISM environment variable to "false"
# (presumably to avoid tokenizer fork warnings when DataLoader workers are
# used -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE(review): despite its name, N_SEEDS is passed as the `seed` value of the
# training arguments further down, i.e. it is the seed itself, not a count.
N_SEEDS = 10
# Number of training epochs.
N_EPOCHS = 30
# Learning rate passed to the training arguments.
LR_VALUE = 5e-5
#DECAY_VALUES = (1e-4, 1e-2, 0.1)
#BATCH_SIZES = (32, 64)
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32

## Функции для оценки качества
На базе https://github.com/s-nlp/russe_detox_2022/blob/main/evaluation/ru_detoxification_evaluation.ipynb

Мы будем использовать `Style Transfer Accuracy (STA)`, `Meaning Preservation Score (SIM)`, `Fluency Score (FL)` и `Joint Score (J)`.

In [7]:
def load_model(model_name=None, model=None, tokenizer=None,
               model_class=AutoModelForSequenceClassification, use_cuda=True):
    """Return a ``(model, tokenizer)`` pair, loading whichever part is missing.

    Args:
        model_name: identifier passed to ``from_pretrained`` for any part that
            has to be loaded.
        model: an already constructed model; when given it is returned as is
            (it is not moved to another device).
        tokenizer: an already constructed tokenizer; when given it is returned
            as is.
        model_class: class whose ``from_pretrained`` builds the model.
        use_cuda: move a freshly loaded model to CUDA when CUDA is available.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        ValueError: if a part must be loaded but ``model_name`` is None.
    """
    if model is None:
        if model_name is None:
            raise ValueError('Either model or model_name should be provided')
        model = model_class.from_pretrained(model_name)
        move_to_gpu = torch.cuda.is_available() and use_cuda
        if move_to_gpu:
            model.cuda()

    # A caller-supplied tokenizer short-circuits the loading below.
    if tokenizer is not None:
        return model, tokenizer

    if model_name is None:
        raise ValueError('Either tokenizer or model_name should be provided')
    return model, AutoTokenizer.from_pretrained(model_name)

### Style Transfer Accuracy (STA)

In [8]:
def prepare_target_label(model, target_label):
    """Resolve ``target_label`` to a key of ``model.config.id2label``.

    Accepted forms, checked in this order:
      1. a value that is already a key of ``id2label`` (returned unchanged);
      2. a label name found in ``label2id`` (mapped to its id);
      3. a numeric string whose integer value is a key of ``id2label``.

    Args:
        model: object exposing ``config.id2label`` and ``config.label2id``.
        target_label: class id, class name, or numeric string.

    Returns:
        The resolved class id.

    Raises:
        ValueError: if the label cannot be resolved. This includes non-string
            values such as ``None`` or an unknown integer, which previously
            failed with AttributeError on ``.isnumeric()``.
    """
    id2label = model.config.id2label
    label2id = model.config.label2id

    if target_label in id2label:
        return target_label
    if target_label in label2id:
        return label2id[target_label]
    # Only strings have .isnumeric(); guard so other types reach the ValueError.
    if (
        isinstance(target_label, str)
        and target_label.isnumeric()
        and int(target_label) in id2label
    ):
        return int(target_label)
    raise ValueError(f'target_label "{target_label}" is not in model labels or ids: {id2label}.')

In [9]:
def classify_texts(model, tokenizer, texts, second_texts=None, target_label=None, batch_size=32, verbose=False):
    """Return the probability of ``target_label`` for every text (or text pair).

    Texts are tokenized and scored in batches of ``batch_size``; the model's
    logits are soft-maxed over the last axis and the ``target_label`` column
    is kept.

    Args:
        model: classifier whose call returns an object with ``.logits``.
        tokenizer: tokenizer matching ``model``.
        texts: sliceable sequence of strings (list, tuple, pandas Series, ...).
        second_texts: optional second sequence, paired element-wise with
            ``texts`` for sentence-pair classification.
        target_label: class id/name, resolved by ``prepare_target_label``.
        batch_size: number of texts per forward pass.
        verbose: show a progress bar when True.

    Returns:
        1-D numpy array with one probability per input text. An empty input
        yields an empty array instead of raising inside ``np.concatenate``.
    """
    def _as_batch(chunk):
        # Tokenizers accept str/list/tuple; convert other sliceables
        # (e.g. pandas Series, numpy arrays) to a plain list.
        if isinstance(chunk, (str, list, tuple)):
            return chunk
        return list(chunk)

    target_label = prepare_target_label(model, target_label)

    # np.concatenate raises on an empty list, so handle "no texts" explicitly.
    if len(texts) == 0:
        return np.array([], dtype=np.float32)

    res = []
    tq = trange if verbose else range
    for i in tq(0, len(texts), batch_size):
        inputs = [_as_batch(texts[i:i+batch_size])]
        if second_texts is not None:
            inputs.append(_as_batch(second_texts[i:i+batch_size]))
        inputs = tokenizer(*inputs, return_tensors='pt', padding=True, truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            preds = torch.softmax(model(**inputs).logits, -1)[:, target_label].cpu().numpy()
        res.append(preds)
    return np.concatenate(res)

In [10]:
def rotation_calibration(data, coef=1.0, px=1, py=1, minimum=0, maximum=1):
    """Rescale ``data`` around the pivot ``(px, py)`` and optionally clip it.

    The value ``px`` is mapped to ``py`` and distances from ``px`` are
    multiplied by ``coef``. The result is then bounded below by ``minimum``
    and above by ``maximum``; passing None disables that bound.

    Args:
        data: scalar or numpy array of raw scores.
        coef: slope applied around the pivot.
        px: pivot on the input axis.
        py: value the pivot is mapped to.
        minimum: lower bound, or None for no lower bound.
        maximum: upper bound, or None for no upper bound.

    Returns:
        The calibrated score(s).
    """
    calibrated = py + coef * (data - px)
    lower_bounded = calibrated if minimum is None else np.maximum(minimum, calibrated)
    return lower_bounded if maximum is None else np.minimum(maximum, lower_bounded)

In [11]:
def evaluate_style(
    model,
    tokenizer,
    texts,
    target_label=1,  # 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=False
):
    """Score how strongly each text belongs to the ``target_label`` style.

    The classifier probability of ``target_label`` is computed for every text
    and then passed through ``rotation_calibration`` with coefficient 0.90.

    Args:
        model: style classifier.
        tokenizer: tokenizer matching ``model``.
        texts: sequence of strings to score.
        target_label: class id or name to score against.
        batch_size: number of texts per forward pass.
        verbose: show a progress bar when True.

    Returns:
        Numpy array with one calibrated score per text.
    """
    resolved_label = prepare_target_label(model, target_label)
    probabilities = classify_texts(
        model,
        tokenizer,
        texts,
        target_label=resolved_label,
        batch_size=batch_size,
        verbose=verbose,
    )
    calibrated = rotation_calibration(probabilities, 0.90)
    return calibrated

In [12]:
# Classifier and tokenizer used for the STA metric (passed to evaluate_style).
style_model, style_tokenizer = load_model('SkolkovoInstitute/russian_toxicity_classifier')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Meaning Preservation Score (SIM)

In [13]:
def encode_cls(texts, model, tokenizer, batch_size=32, verbose=False):
    """Embed texts with the model's pooled output, L2-normalised per row.

    Args:
        texts: sliceable sequence of strings.
        model: encoder whose call returns an object with ``pooler_output``.
        tokenizer: tokenizer matching ``model``.
        batch_size: number of texts per forward pass.
        verbose: show a progress bar when True.

    Returns:
        Numpy array with one normalised embedding row per input text.
    """
    batch_starts = trange if verbose else range
    chunks = []
    for start in batch_starts(0, len(texts), batch_size):
        chunk = texts[start: start + batch_size]
        with torch.no_grad():
            encoded = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True)
            encoded = encoded.to(model.device)
            pooled = model(**encoded).pooler_output
            # Unit-length rows let a plain dot product act as cosine similarity.
            unit_rows = torch.nn.functional.normalize(pooled)
            chunks.append(unit_rows.cpu().numpy())
    return np.concatenate(chunks)

In [14]:
def evaluate_cosine_similarity(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    batch_size=32,
    verbose=False,
):
    """Score meaning preservation between original and rewritten texts.

    Both sets are embedded with ``encode_cls`` (unit-length rows), so the
    row-wise dot product is their cosine similarity. That value is then passed
    through ``rotation_calibration`` with coefficient 1.50.

    Args:
        model: sentence encoder.
        tokenizer: tokenizer matching ``model``.
        original_texts: source sentences.
        rewritten_texts: rewritten sentences, aligned with ``original_texts``.
        batch_size: number of texts per forward pass.
        verbose: show a progress bar when True.

    Returns:
        Numpy array with one calibrated similarity per text pair.
    """
    original_vectors = encode_cls(
        original_texts, model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose
    )
    rewritten_vectors = encode_cls(
        rewritten_texts, model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose
    )
    cosine = np.sum(original_vectors * rewritten_vectors, axis=1)
    return rotation_calibration(cosine, 1.50)

In [15]:
# Encoder and tokenizer used for the SIM metric; loaded with AutoModel rather
# than the default classification class because encode_cls reads pooler_output.
meaning_model, meaning_tokenizer = load_model('cointegrated/LaBSE-en-ru', model_class=AutoModel)

Downloading (…)lve/main/config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/516M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Fluency score (FL)

In [16]:
def evaluate_cola_relative(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    target_label=1,
    batch_size=32,
    verbose=False,
    maximum=0,
):
    """Score the fluency of rewritten texts relative to the originals.

    For each pair the classifier probability of ``target_label`` is computed
    for both texts and the difference ``rewritten - original`` is taken. The
    difference is capped from above by ``maximum``, then calibrated with
    ``rotation_calibration`` (coefficient 1.15, pivot ``px=0``).

    Args:
        model: acceptability classifier.
        tokenizer: tokenizer matching ``model``.
        original_texts: source sentences.
        rewritten_texts: rewritten sentences, aligned with ``original_texts``.
        target_label: class id or name whose probability is compared.
        batch_size: number of texts per forward pass.
        verbose: show a progress bar when True.
        maximum: upper cap for the raw difference; None disables the cap.
            With the default of 0 an improvement over the original earns no
            bonus. Previously any non-None value was treated as 0.

    Returns:
        Numpy array with one calibrated fluency score per text pair.
    """
    target_label = prepare_target_label(model, target_label)
    original_scores = classify_texts(
        model, tokenizer,
        original_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    rewritten_scores = classify_texts(
        model, tokenizer,
        rewritten_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    scores = rewritten_scores - original_scores
    if maximum is not None:
        # Honour the requested cap instead of a hard-coded 0.
        scores = np.minimum(maximum, scores)
    return rotation_calibration(scores, 1.15, px=0)

In [17]:
# Classifier and tokenizer used for the FL metric.
# NOTE(review): `cola_tolenizer` looks like a misspelling of `cola_tokenizer`;
# the same spelling is used wherever this tokenizer is passed on, so any rename
# has to update every occurrence together.
cola_model, cola_tolenizer = load_model('SkolkovoInstitute/rubert-base-corruption-detector')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Функция `compute_metrics`

In [18]:
def compute_metrics(p, tokenizer):
    """Decode predictions and labels and compute STA, SIM, FL and Joint.

    Note that SIM and FL are computed between the predictions and the decoded
    *labels* (``p.label_ids``), because that is all this function receives.

    Args:
        p: evaluation output exposing ``predictions`` and ``label_ids`` arrays
            of token ids.
        tokenizer: tokenizer used to decode both arrays.

    Returns:
        Dict with the mean "STA", "SIM", "FL" and "Joint" scores.
    """
    predictions = p.predictions
    # Some models return a tuple; the first element is taken here.
    # (assumes it holds the generated ids -- TODO confirm)
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding marker that the tokenizer cannot decode; the labels
    # below get the same treatment.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    sta = evaluate_style(
        model = style_model,
        tokenizer = style_tokenizer,
        texts = preds,
        target_label=0,  # 1 is toxic, 0 is neutral
        batch_size=32,
        verbose=True
        )

    similarity = evaluate_cosine_similarity(
        model = meaning_model,
        tokenizer = meaning_tokenizer,
        original_texts = labels,
        rewritten_texts = preds,
        batch_size=32,
        verbose=True,
        )

    fluency = evaluate_cola_relative(
        model = cola_model,
        tokenizer = cola_tolenizer,
        original_texts = labels,
        rewritten_texts = preds,
        target_label=1,
        batch_size=32,
        verbose=True
        )

    # Joint is the per-sentence product of the three scores, averaged below.
    joint = sta * similarity * fluency

    result = {"STA": np.mean(sta), "SIM": np.mean(similarity), "FL": np.mean(fluency), "Joint": np.mean(joint)}

    return result


## Загрузка данных

In [19]:
def preprocess_examples(examples, tokenizer):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: batch mapping with a "toxic_comment" column and, optionally,
            a "neutral_comment1" column.
        tokenizer: tokenizer applied to both columns (without padding).

    Returns:
        The tokenizer output for "toxic_comment", extended with:
          * "labels": token ids of "neutral_comment1", when that column exists;
          * "length": number of razdel tokens in each toxic sentence.
    """
    toxic_sentences = examples["toxic_comment"]
    encoded = tokenizer(toxic_sentences, padding=False)

    if "neutral_comment1" in examples:
        neutral_sentences = examples["neutral_comment1"]
        encoded["labels"] = tokenizer(neutral_sentences, padding=False)["input_ids"]

    encoded["length"] = [
        sum(1 for _ in tokenize(sentence)) for sentence in toxic_sentences
    ]
    return encoded

In [20]:
# Location of the TSV splits; DATA_DIR must end with a path separator because
# the file names are appended to it directly.
DATA_DIR = '/content/'
TRAIN_FILE = f"{DATA_DIR}train_preprocessed.tsv"
DEV_FILE = f"{DATA_DIR}dev.tsv"
TEST_FILE = f"{DATA_DIR}test.tsv"


def read_splits(*, as_datasets):
    """Read the train, dev and test TSV files.

    Args:
        as_datasets: when true, wrap the splits in a ``DatasetDict`` with the
            keys "train", "dev" and "test"; otherwise return the raw frames.

    Returns:
        ``DatasetDict`` or a ``(train_df, dev_df, test_df)`` tuple of pandas
        DataFrames, depending on ``as_datasets``.
    """
    train_df, dev_df, test_df = [
        pd.read_csv(path, sep='\t') for path in (TRAIN_FILE, DEV_FILE, TEST_FILE)
    ]

    if not as_datasets:
        return train_df, dev_df, test_df

    return DatasetDict(
        train=Dataset.from_pandas(train_df),
        dev=Dataset.from_pandas(dev_df),
        test=Dataset.from_pandas(test_df),
    )

## Токенизация

In [21]:
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
)

In [22]:
# Checkpoint identifier used for both the tokenizer and the seq2seq model.
MODEL_NAME = 'ai-forever/ruT5-base'

In [23]:
# Module-level tokenizer; besides preprocessing and training it is also read
# as a global inside paraphrase().
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [24]:
# Read the train/dev/test TSV files as a DatasetDict.
splits = read_splits(as_datasets=True)

# Tokenize every split in batches with preprocess_examples; the raw
# "toxic_comment" column is removed from the result.
tokenized_splits = splits.map(
      partial(preprocess_examples, tokenizer=tokenizer),
      batched=True,
      remove_columns=["toxic_comment"],
      keep_in_memory=True,
  )

# Collator that builds seq2seq batches, configured with pad_to_multiple_of=8.
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)

# NOTE(review): the two names below look like leftovers from a multi-run
# search (the original comment here read "seed, lr, wd, bs").
# `dev_metrics_per_run` is reassigned to a tuple in the training cell and
# `best_joint` is not read anywhere in this notebook view.
dev_metrics_per_run = np.empty((1, 4))

best_joint = -float("inf")

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Training loop

In [25]:
# Load the pretrained seq2seq model that will be fine-tuned.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best "eval_Joint" value when training ends.
training_args = Seq2SeqTrainingArguments(
            # NOTE(review): the f-prefix has no placeholders; a plain string would do.
            output_dir=f"checkpoints/",
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LR_VALUE,
            num_train_epochs=N_EPOCHS,
            lr_scheduler_type="constant",
            optim="adamw_torch",
            save_strategy="epoch",
            save_total_limit=1,
            # NOTE(review): N_SEEDS (10) is used as the seed value itself here.
            seed=N_SEEDS,
            fp16=True,
            dataloader_num_workers=4,
            # presumably relies on the "length" column added by
            # preprocess_examples -- TODO confirm
            group_by_length=True,
            report_to="none",
            load_best_model_at_end=True,
            metric_for_best_model="eval_Joint",
            # NOTE(review): no generation length/beam settings are passed, so
            # evaluation presumably uses the model's default generation
            # settings -- confirm they are long enough for these sentences.
            predict_with_generate=True,
            )

# compute_metrics needs the tokenizer to decode ids, hence the partial.
trainer = Seq2SeqTrainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_splits["train"],
                eval_dataset=tokenized_splits["dev"],
                compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
                tokenizer=tokenizer,
                data_collator=data_collator,
                )

train_result = trainer.train()
print("train", train_result.metrics)


# Re-score the dev split with the final model; predict() reports its metrics
# under the "test_" prefix (see the keys read below).
dev_predictions = trainer.predict(test_dataset=tokenized_splits["dev"])
print("dev", dev_predictions.metrics)
# Replaces the placeholder array created earlier with a (STA, SIM, FL, Joint) tuple.
dev_metrics_per_run = (
                    dev_predictions.metrics["test_STA"],
                    dev_predictions.metrics["test_SIM"],
                    dev_predictions.metrics["test_FL"],
                    dev_predictions.metrics["test_Joint"],
                )


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Sta,Sim,Fl,Joint
1,No log,1.571485,0.598734,0.69314,0.871072,0.378054
2,No log,1.469107,0.664068,0.709448,0.872987,0.425254
3,1.869300,1.408549,0.723986,0.721789,0.880318,0.476253
4,1.869300,1.424934,0.736998,0.718732,0.89148,0.4845
5,1.087500,1.400888,0.769759,0.723608,0.897335,0.512467
6,1.087500,1.457947,0.785783,0.729603,0.89335,0.522861
7,1.087500,1.444631,0.776063,0.728216,0.895226,0.520222
8,0.852200,1.430438,0.794757,0.73222,0.897424,0.529653
9,0.852200,1.496592,0.795437,0.730826,0.902685,0.533336
10,0.697800,1.487756,0.794543,0.733831,0.890553,0.528399


100%|██████████| 25/25 [00:01<00:00, 17.32it/s]
100%|██████████| 25/25 [00:01<00:00, 15.99it/s]
100%|██████████| 25/25 [00:01<00:00, 16.83it/s]
100%|██████████| 25/25 [00:01<00:00, 17.71it/s]
100%|██████████| 25/25 [00:01<00:00, 18.17it/s]
100%|██████████| 25/25 [00:01<00:00, 17.96it/s]
100%|██████████| 25/25 [00:01<00:00, 15.75it/s]
100%|██████████| 25/25 [00:01<00:00, 16.72it/s]
100%|██████████| 25/25 [00:01<00:00, 16.97it/s]
100%|██████████| 25/25 [00:01<00:00, 17.36it/s]
100%|██████████| 25/25 [00:01<00:00, 18.86it/s]
100%|██████████| 25/25 [00:01<00:00, 15.76it/s]
100%|██████████| 25/25 [00:01<00:00, 17.43it/s]
100%|██████████| 25/25 [00:01<00:00, 16.90it/s]
100%|██████████| 25/25 [00:01<00:00, 18.62it/s]
100%|██████████| 25/25 [00:01<00:00, 17.43it/s]
100%|██████████| 25/25 [00:01<00:00, 15.72it/s]
100%|██████████| 25/25 [00:01<00:00, 16.14it/s]
100%|██████████| 25/25 [00:01<00:00, 17.26it/s]
100%|██████████| 25/25 [00:01<00:00, 17.57it/s]
100%|██████████| 25/25 [00:01<00:00, 22.

train {'train_runtime': 3515.9889, 'train_samples_per_second': 56.357, 'train_steps_per_second': 1.766, 'total_flos': 8710600271339520.0, 'train_loss': 0.6091955458292446, 'epoch': 30.0}




100%|██████████| 25/25 [00:01<00:00, 21.89it/s]
100%|██████████| 25/25 [00:01<00:00, 15.13it/s]
100%|██████████| 25/25 [00:01<00:00, 19.24it/s]
100%|██████████| 25/25 [00:01<00:00, 16.71it/s]
100%|██████████| 25/25 [00:01<00:00, 22.00it/s]


dev {'test_loss': 1.9621657133102417, 'test_STA': 0.8264595866203308, 'test_SIM': 0.7269046306610107, 'test_FL': 0.9154664874076843, 'test_Joint': 0.5595974922180176, 'test_runtime': 43.7997, 'test_samples_per_second': 18.265, 'test_steps_per_second': 0.571}


In [26]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [27]:
# NOTE(review): this path is under /content/gdrive, but the Google Drive mount
# in the preceding cell is commented out -- confirm Drive is mounted first.
trainer.save_model('/content/gdrive/My Drive/model_ruT5_5e-5_withPreprocessing')

## Инференс
На базе https://github.com/s-nlp/russe_detox_2022/blob/main/baselines/t5/t5_inference.ipynb

In [28]:
def paraphrase(text, model, n=None, max_length='auto', temperature=0.0, beams=3):
    """Generate rewritten versions of one text or a batch of texts.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        text: a single string or a sequence of strings.
        model: seq2seq model providing ``generate`` and ``device``.
        n: number of sequences to return per input; falsy means one.
        max_length: generation length limit; 'auto' derives it from the
            padded input length (1.2x plus 10 tokens).
        temperature: forwarded to ``generate`` (sampling itself is disabled).
        beams: number of beams for beam search.

    Returns:
        A single string when ``text`` is a string and ``n`` is falsy;
        otherwise a list of decoded strings.
    """
    single_input = isinstance(text, str)
    batch = [text] if single_input else text
    input_ids = tokenizer(batch, return_tensors='pt', padding=True)['input_ids'].to(model.device)

    if max_length == 'auto':
        max_length = int(input_ids.shape[1] * 1.2) + 10

    generation_options = dict(
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=3.0,
        max_length=max_length,
        bad_words_ids=[[2]],  # unk
        num_beams=beams,
    )
    generated = model.generate(input_ids, **generation_options)

    decoded = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]
    if single_input and not n:
        return decoded[0]
    return decoded

In [29]:
# Reload the dev split as a plain DataFrame for inference and scoring.
dev = pd.read_csv('/content/dev.tsv', sep='\t')

In [30]:
# Source sentences to detoxify (a pandas Series).
toxic_inputs = dev['toxic_comment']

In [31]:
from tqdm.auto import tqdm, trange

In [32]:
# One output sentence per input sentence, in the same order as toxic_inputs.
para_results = []
# Start indices of batches whose generation failed, kept for later inspection.
problematic_batch = []
batch_size = 8

for i in tqdm(range(0, len(toxic_inputs), batch_size)):
    batch = list(toxic_inputs[i:i + batch_size])
    try:
        para_results.extend(paraphrase(batch, model, temperature=0.0))
    except Exception as e:
        # Best-effort fallback: keep the untouched input sentences so that
        # para_results stays a flat list of strings aligned with the inputs.
        # The previous code appended the whole Series slice as one element.
        print(i, repr(e))
        problematic_batch.append(i)
        para_results.extend(batch)

  0%|          | 0/100 [00:00<?, ?it/s]



In [33]:
# Persist the generated sentences, one per line, next to the saved model.
with open('/content/gdrive/My Drive/model_ruT5_5e-5_withPreprocessing/rut5_5e5_dev.txt', mode='w', encoding='utf-8') as file:
    file.write(''.join([sentence + '\n' for sentence in para_results]))

In [34]:
# Alias used by the evaluation cells below (same list object, not a copy).
preds = para_results

## Оценка
Переиспользуем еще раз функции для подсчета качества.

### Style Transfer Accuracy (STA)

In [35]:
# Per-sentence style score of the generated texts with respect to label 0.
accuracy = evaluate_style(
    model = style_model,
    tokenizer = style_tokenizer,
    texts = preds,
    target_label=0,  # 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

In [36]:
# Report the mean STA over all generated sentences.
print(f'Style transfer accuracy (STA):  {np.mean(accuracy)}')

Style transfer accuracy (STA):  0.8275251984596252


### Meaning Preservation Score (SIM)

In [37]:
# Per-sentence meaning preservation between the toxic inputs and the rewrites.
# The Series is converted to a list before being passed on.
similarity = evaluate_cosine_similarity(
    model = meaning_model,
    tokenizer = meaning_tokenizer,
    original_texts = list(toxic_inputs),
    rewritten_texts = preds,
    batch_size=32,
    verbose=True,
    )

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [38]:
# Report the mean SIM over all sentence pairs.
print(f'Meaning preservation (SIM):  {np.mean(similarity)}')

Meaning preservation (SIM):  0.8176755309104919


### Fluency score (FL)

In [39]:
# Per-sentence fluency of the rewrites relative to the toxic inputs.
fluency = evaluate_cola_relative(
    model = cola_model,
    tokenizer = cola_tolenizer,
    original_texts = list(toxic_inputs),
    rewritten_texts = preds,
    target_label=1,
    batch_size=32,
    verbose=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [40]:
# Report the mean FL over all sentence pairs.
print(f'Fluency score (FL):  {np.mean(fluency)}')

Fluency score (FL):  0.8776353597640991


### Joint score (J)

In [41]:
# Per-sentence joint score: element-wise product of the three metric arrays.
joint = accuracy * similarity * fluency

In [42]:
# Report the mean of the per-sentence joint scores.
print(f'Joint score (J):   {np.mean(joint)}')

Joint score (J):   0.5930470824241638


### ChrF1 with references

In [43]:
from nltk.translate.chrf_score import corpus_chrf

In [44]:
# Missing references become empty strings, so every row yields exactly three
# reference entries.
df = dev.fillna('')
neutral_references = [
    [first, second, third]
    for first, second, third in zip(
        df['neutral_comment1'], df['neutral_comment2'], df['neutral_comment3']
    )
]

In [45]:
# ChrF of the predictions against the references; each reference entry is a
# list of three strings (empty where a reference is missing).
corpus_chrf(neutral_references, preds)

0.571609580671046