# Детоксификация текстов на русском языке с помощью модели RuGPT3

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.22.0


In [3]:
!pip install transformers
!pip install datasets==2.2.2
!pip install razdel
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.9 MB/s[0m eta [36m0:00:0

In [4]:
import os
import pandas as pd
import numpy as np
from argparse import ArgumentParser
from functools import partial
from shutil import rmtree
from tqdm import trange

import numpy as np
from datasets import load_metric, load_dataset
from razdel import tokenize
from transformers import (
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel
)

from datasets import Dataset, DatasetDict

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

N_SEEDS = 10
N_EPOCHS = 5
#LR_VALUES = (3e-5, 5e-5)
LR_VALUE = 5e-5
#DECAY_VALUES = (1e-4, 1e-2, 0.1)
#BATCH_SIZES = (32, 64)
BATCH_SIZE = 16

## Загрузка и подготовка данных

In [6]:
DATA_DIR = '/content/'
TRAIN_FILE = DATA_DIR + "train.tsv"
DEV_FILE = DATA_DIR + "dev.tsv"
TEST_FILE = DATA_DIR + "test.tsv"


train_df, dev_df, test_df = map(partial(pd.read_csv, sep='\t'), (TRAIN_FILE, DEV_FILE, TEST_FILE))

# concatenate datasets
train1 = train_df[~train_df['neutral_comment1'].isna()][['toxic_comment', 'neutral_comment1']]
train2 = train_df[~train_df['neutral_comment2'].isna()][['toxic_comment', 'neutral_comment2']]
train3 = train_df[~train_df['neutral_comment3'].isna()][['toxic_comment', 'neutral_comment3']]
train2.columns = ['toxic_comment', 'neutral_comment1']
train3.columns = ['toxic_comment', 'neutral_comment1']
train_df = pd.concat([train1, train2, train3], ignore_index=True)
train_df['combined'] = '<toxic>' + train_df['toxic_comment'] + '</toxic>' + ' >>>>> ' + '<neutral>' + train_df['neutral_comment1'] + '</neutral>'
with open('train_combined.txt', 'w', encoding='utf-8') as f:
  for line in list(train_df['combined']):
    f.write(line + '\n')

dev1 = dev_df[~dev_df['neutral_comment1'].isna()][['toxic_comment', 'neutral_comment1']]
dev2 = dev_df[~dev_df['neutral_comment2'].isna()][['toxic_comment', 'neutral_comment2']]
dev3 = dev_df[~dev_df['neutral_comment3'].isna()][['toxic_comment', 'neutral_comment3']]
dev2.columns = ['toxic_comment', 'neutral_comment1']
dev3.columns = ['toxic_comment', 'neutral_comment1']
dev_df = pd.concat([dev1, dev2, dev3], ignore_index=True)
dev_df['combined'] = '<toxic>' + dev_df['toxic_comment'] + '</toxic>' + ' >>>>> ' + '<neutral>' + dev_df['neutral_comment1'] + '</neutral>'
with open('dev_combined.txt', 'w', encoding='utf-8') as f:
  for line in dev_df['combined']:
    f.write(line + '\n')

In [7]:
dataset = load_dataset("text", data_files={"train": "/content/train_combined.txt", "dev": "/content/dev_combined.txt", "test": TEST_FILE})



Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-fbc0b6fd6b324847/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-fbc0b6fd6b324847/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Токенизация

In [8]:
MODEL_NAME = 'ai-forever/rugpt3small_based_on_gpt2'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)lve/main/config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def preprocess_examples(examples):
    result = tokenizer(examples['text'], padding=False)
    #result = tokenizer(examples, truncation=True, max_length=256,)
    return result

In [11]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

tokenized_dataset = dataset.map(
    preprocess_examples,
    batched=True,
    remove_columns=["text"],
    keep_in_memory=True,
)


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Training loop

In [12]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, pad_token_id=tokenizer.eos_token_id)

training_args = TrainingArguments(
            output_dir=f"checkpoints/",
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LR_VALUE,
            num_train_epochs=N_EPOCHS,
            lr_scheduler_type="constant",
            optim="adamw_torch",
            save_strategy="epoch",
            save_total_limit=1,
            seed=N_SEEDS,
            dataloader_num_workers=4,
            group_by_length=True,
            report_to="none",
            load_best_model_at_end=True,
            fp16=True,
            #use_cpu=True,
            #bf16=True
            )
trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["dev"],
                tokenizer=tokenizer,
                data_collator=data_collator,
                )

train_result = trainer.train()
print("train", train_result.metrics)


Downloading pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.8952,1.972577
2,1.6325,2.024904
3,1.2513,2.117942
4,1.0697,2.252867
5,0.9676,2.364754


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

train {'train_runtime': 668.9419, 'train_samples_per_second': 83.251, 'train_steps_per_second': 5.21, 'total_flos': 1700092523520000.0, 'train_loss': 1.3055958204666203, 'epoch': 5.0}


In [13]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [14]:

trainer.save_model('/content/gdrive/My Drive/model_ruGPT3_noEDA_5epochs')

## Инференс


In [15]:
def paraphrase_sentence(text, model, n=None, max_length='auto', temperature=0.0, beams=3):
    text = '<toxic>'+text+'</toxic> >>>>> <neutral>'
    inputs = tokenizer(text, return_tensors='pt', padding=True)['input_ids'].to(model.device)
    if max_length == 'auto':
        max_length = int(inputs.shape[1] * 1.2) + 10
    result = model.generate(
        inputs,
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=5.0,
        max_length=max_length,
        bad_words_ids=[[2]],  # unk
        num_beams=beams,
    )
    text = tokenizer.decode(result[0], skip_special_tokens=True)
    text = text.split('<neutral>')[1].split('</ne')[0]
    return text

In [16]:
def paraphrase_batch(texts, model, n=None, max_length='auto', temperature=0.0, beams=3):
    result = []
    for text in texts:
        result.append(paraphrase_sentence(text, model=model, n=n, max_length=max_length, temperature=temperature, beams=3))
    return result

In [17]:
dev = pd.read_csv('/content/dev.tsv', sep='\t')
toxic_inputs = dev['toxic_comment']

In [18]:
from tqdm.auto import tqdm, trange

In [19]:
para_results = []
problematic_batch = [] #if something goes wrong you can track such bathces
batch_size = 8

for i in tqdm(range(0, len(toxic_inputs), batch_size)):
    batch = [sentence for sentence in toxic_inputs[i:i + batch_size]]
    try:
        para_results.extend(paraphrase_batch(batch, model, temperature=0.0))
    except Exception as e:
        print(i)
        para_results.append(toxic_inputs[i:i + batch_size])

  0%|          | 0/100 [00:00<?, ?it/s]



In [20]:
with open('/content/rutgpt3_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines([sentence+'\n' for sentence in para_results])

In [21]:
preds = para_results

In [24]:
import gc
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

## Оценка
На базе https://github.com/s-nlp/russe_detox_2022/blob/main/evaluation/ru_detoxification_evaluation.ipynb

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

In [26]:
def load_model(model_name=None, model=None, tokenizer=None,
               model_class=AutoModelForSequenceClassification, use_cuda=True):
    if model is None:
        if model_name is None:
            raise ValueError('Either model or model_name should be provided')
        model = model_class.from_pretrained(model_name)
        if torch.cuda.is_available() and use_cuda:
            model.cuda()
    if tokenizer is None:
        if model_name is None:
            raise ValueError('Either tokenizer or model_name should be provided')
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

### Style Transfer Accuracy (STA)

In [27]:
def prepare_target_label(model, target_label):
    if target_label in model.config.id2label:
        pass
    elif target_label in model.config.label2id:
        target_label = model.config.label2id.get(target_label)
    elif target_label.isnumeric() and int(target_label) in model.config.id2label:
        target_label = int(target_label)
    else:
        raise ValueError(f'target_label "{target_label}" is not in model labels or ids: {model.config.id2label}.')
    return target_label

In [28]:
def classify_texts(model, tokenizer, texts, second_texts=None, target_label=None, batch_size=32, verbose=False):
    target_label = prepare_target_label(model, target_label)
    res = []
    if verbose:
        tq = trange
    else:
        tq = range
    for i in tq(0, len(texts), batch_size):
        inputs = [texts[i:i+batch_size]]
        if second_texts is not None:
            inputs.append(second_texts[i:i+batch_size])
        inputs = tokenizer(*inputs, return_tensors='pt', padding=True, truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            preds = torch.softmax(model(**inputs).logits, -1)[:, target_label].cpu().numpy()
        res.append(preds)
    return np.concatenate(res)

In [29]:
def rotation_calibration(data, coef=1.0, px=1, py=1, minimum=0, maximum=1):
    result = (data - px) * coef + py
    if minimum is not None:
        result = np.maximum(minimum, result)
    if maximum is not None:
        result = np.minimum(maximum, result)
    return result

In [30]:
def evaluate_style(
    model,
    tokenizer,
    texts,
    target_label=1,  # 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=False
):
    target_label = prepare_target_label(model, target_label)
    scores = classify_texts(
        model,
        tokenizer,
        texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    return rotation_calibration(scores, 0.90)

In [31]:
style_model, style_tokenizer = load_model('SkolkovoInstitute/russian_toxicity_classifier')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [32]:
accuracy = evaluate_style(
    model = style_model,
    tokenizer = style_tokenizer,
    texts = preds,
    target_label=0,  # 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

In [33]:
print(f'Style transfer accuracy (STA):  {np.mean(accuracy)}')

Style transfer accuracy (STA):  0.7790749073028564


### Meaning Preservation Score (SIM)

In [34]:
def encode_cls(texts, model, tokenizer, batch_size=32, verbose=False):
    results = []
    if verbose:
        tq = trange
    else:
        tq = range
    for i in tq(0, len(texts), batch_size):
        batch = texts[i: i + batch_size]
        with torch.no_grad():
            out = model(**tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(model.device))
            embeddings = out.pooler_output
            embeddings = torch.nn.functional.normalize(embeddings).cpu().numpy()
            results.append(embeddings)
    return np.concatenate(results)

In [35]:
def evaluate_cosine_similarity(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    batch_size=32,
    verbose=False,
):
    scores = (
        encode_cls(original_texts, model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose)
        * encode_cls(rewritten_texts, model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose)
    ).sum(1)
    return rotation_calibration(scores, 1.50)

In [36]:
meaning_model, meaning_tokenizer = load_model('cointegrated/LaBSE-en-ru', model_class=AutoModel)

Downloading (…)lve/main/config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/516M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [37]:
similarity = evaluate_cosine_similarity(
    model = meaning_model,
    tokenizer = meaning_tokenizer,
    original_texts = list(toxic_inputs),
    rewritten_texts = preds,
    batch_size=32,
    verbose=True,
    )

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [38]:
print(f'Meaning preservation (SIM):  {np.mean(similarity)}')

Meaning preservation (SIM):  0.496984601020813


### Fluency score (FL)

In [39]:
def evaluate_cola_relative(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    target_label=1,
    batch_size=32,
    verbose=False,
    maximum=0,
):
    target_label = prepare_target_label(model, target_label)
    original_scores = classify_texts(
        model, tokenizer,
        original_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    rewritten_scores = classify_texts(
        model, tokenizer,
        rewritten_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    scores = rewritten_scores - original_scores
    if maximum is not None:
        scores = np.minimum(0, scores)
    return rotation_calibration(scores, 1.15, px=0)

In [40]:
cola_model, cola_tolenizer = load_model('SkolkovoInstitute/rubert-base-corruption-detector')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [41]:
fluency = evaluate_cola_relative(
    model = cola_model,
    tokenizer = cola_tolenizer,
    original_texts = list(toxic_inputs),
    rewritten_texts = preds,
    target_label=1,
    batch_size=32,
    verbose=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [42]:
print(f'Fluency score (FL):  {np.mean(fluency)}')

Fluency score (FL):  0.6454139947891235


### Joint score (J)

In [43]:
joint = accuracy * similarity * fluency

In [44]:
print(f'Joint score (J):   {np.mean(joint)}')

Joint score (J):   0.2661975026130676


### ChrF1 with references

In [45]:
from nltk.translate.chrf_score import corpus_chrf

In [46]:
df = dev.fillna('')
neutral_references = []
for index, row in df.iterrows():
    neutral_references.append([row['neutral_comment1'], row['neutral_comment2'], row['neutral_comment3']])

In [47]:
corpus_chrf(neutral_references, preds)

0.33413664376386853