# RuGPT3: инференс и валидация

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Инференс" data-toc-modified-id="Инференс-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Инференс</a></span></li><li><span><a href="#Оценка" data-toc-modified-id="Оценка-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Оценка</a></span><ul class="toc-item"><li><span><a href="#Style-Transfer-Accuracy-(STA)" data-toc-modified-id="Style-Transfer-Accuracy-(STA)-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Style Transfer Accuracy (STA)</a></span></li><li><span><a href="#Meaning-Preservation-Score-(SIM)" data-toc-modified-id="Meaning-Preservation-Score-(SIM)-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Meaning Preservation Score (SIM)</a></span></li><li><span><a href="#Fluency-Score-(Fl)" data-toc-modified-id="Fluency-Score-(Fl)-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Fluency Score (Fl)</a></span></li><li><span><a href="#Joint-Score-(J)" data-toc-modified-id="Joint-Score-(J)-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Joint Score (J)</a></span></li><li><span><a href="#ChrF1-with-references" data-toc-modified-id="ChrF1-with-references-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>ChrF1 with references</a></span></li></ul></li></ul></div>

## Инференс

In [1]:
# Silence Python warnings for the whole notebook so cell outputs stay readable.
import warnings
warnings.filterwarnings('ignore')
# Lower the verbosity of transformers' own logger.
# NOTE(review): 40 presumably corresponds to the standard ERROR level — confirm.
from transformers.utils import logging
logging.set_verbosity(40)

In [2]:
import torch

# Choose the torch device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # GPU path: select CUDA, then report how many devices exist and the name of device 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# The tokenizer is loaded by Hub name, while the weights are read from a local directory
# (presumably the checkpoint fine-tuned for 20 epochs — the name suggests so).
tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
# NOTE(review): the model is not moved to `device` in this cell — confirm that is intended.
model = AutoModelForCausalLM.from_pretrained("./model_ruGPT3_noEDA_20epochs")

In [4]:
# Second checkpoint loaded from a local directory (presumably the 60-epoch run, judging by the name).
model_60 =  AutoModelForCausalLM.from_pretrained("./model_ruGPT3_noEDA_60epochs")

In [5]:
# Reuse the end-of-sequence token as the padding token; the tokenizer is later called with
# padding=True (presumably it has no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token

In [89]:
def paraphrase_sentence(text, model, n=None, max_length='auto', temperature=0.0, beams=3):
    """Generate a neutral rewrite of one toxic sentence with beam search.

    The sentence is wrapped into the prompt
    ``<toxic>{text}</toxic> >>>>> <neutral>`` and the model continues it.

    Args:
        text: Source sentence.
        model: Causal LM exposing ``generate`` and ``device``.
        n: Number of sequences requested from ``generate`` (``None`` means 1).
            Only the first returned sequence is decoded and returned.
        max_length: Generation length limit in tokens, or ``'auto'`` to derive
            it from the prompt length (``int(len * 1.2) + 10``).
        temperature: Forwarded to ``generate``; sampling itself is disabled.
        beams: Beam-search width.

    Returns:
        The decoded text between ``<neutral>`` and the first ``</ne``.

    Note:
        Relies on the notebook-level ``tokenizer``.
    """
    prompt = '<toxic>' + text + '</toxic> >>>>> <neutral>'
    encoded = tokenizer(prompt, return_tensors='pt', padding=True)
    input_ids = encoded['input_ids'].to(model.device)

    if max_length == 'auto':
        # Leave room for the continuation: 20% over the prompt length plus 10 tokens.
        max_length = int(input_ids.shape[1] * 1.2) + 10

    generation_options = dict(
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=5.0,
        max_length=max_length,
        bad_words_ids=[[2]],  # unk
        num_beams=beams,
    )
    generated = model.generate(input_ids, **generation_options)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Keep what follows the opening tag and stop at the (possibly truncated) closing tag.
    after_open_tag = decoded.split('<neutral>')[1]
    return after_open_tag.split('</ne')[0]

In [93]:
def paraphrase_batch(texts, model, n=None, max_length='auto', temperature=0.0, beams=3):
    """Paraphrase every sentence in ``texts`` one at a time.

    Args:
        texts: Iterable of source sentences.
        model: Causal LM passed through to ``paraphrase_sentence``.
        n, max_length, temperature, beams: Forwarded unchanged to
            ``paraphrase_sentence``.

    Returns:
        A list with one paraphrase per input sentence, in input order.
    """
    # `beams` is forwarded as given; the previous version always passed 3 regardless of the argument.
    return [
        paraphrase_sentence(
            text,
            model=model,
            n=n,
            max_length=max_length,
            temperature=temperature,
            beams=beams,
        )
        for text in texts
    ]

In [9]:
import pandas as pd

In [10]:
# Load the tab-separated dev split; later cells read its `toxic_comment` and
# `neutral_comment1..3` columns.
dev = pd.read_csv('../content/dev.tsv', sep='\t')

In [11]:
# Source sentences to be rewritten, as a plain Python list so they can be sliced into batches.
toxic_inputs = list(dev['toxic_comment'])

In [12]:
from tqdm.auto import tqdm, trange

In [98]:
para_results = []
problematic_batch = []  # start offsets of batches whose generation failed
batch_size = 8

# Paraphrase the dev inputs batch by batch with the first checkpoint (`model`).
for i in tqdm(range(0, len(toxic_inputs), batch_size)):
    batch = toxic_inputs[i:i + batch_size]
    try:
        para_results.extend(paraphrase_batch(batch, model, temperature=0.0))
    except Exception as e:
        # Best-effort fallback: report the failure, remember the batch offset and keep the
        # original sentences so the results stay aligned one-to-one with `toxic_inputs`.
        # `extend` (not `append`) keeps `para_results` a flat list of strings.
        print(i, e)
        problematic_batch.append(i)
        para_results.extend(batch)

  0%|          | 0/100 [00:00<?, ?it/s]

In [100]:
# Save one paraphrase per line. The encoding is set explicitly so the Cyrillic text is written
# as UTF-8 regardless of the platform default.
with open('ruGPT3_20epochs_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in para_results)

In [102]:
para_results60 = []
problematic_batch = []  # start offsets of batches whose generation failed
batch_size = 8

# Paraphrase the dev inputs batch by batch with the second checkpoint (`model_60`).
for i in tqdm(range(0, len(toxic_inputs), batch_size)):
    batch = toxic_inputs[i:i + batch_size]
    try:
        para_results60.extend(paraphrase_batch(batch, model_60, temperature=0.0))
    except Exception as e:
        # Best-effort fallback: report the failure, remember the batch offset and keep the
        # original sentences so the results stay aligned one-to-one with `toxic_inputs`.
        # `extend` (not `append`) keeps `para_results60` a flat list of strings.
        print(i, e)
        problematic_batch.append(i)
        para_results60.extend(batch)

  0%|          | 0/100 [00:00<?, ?it/s]

In [103]:
# Check whether the two checkpoints produced exactly the same paraphrases.
# NOTE(review): identical output from two different checkpoints is worth double-checking.
para_results == para_results60

True

In [105]:
# Save one paraphrase per line. The encoding is set explicitly so the Cyrillic text is written
# as UTF-8 regardless of the platform default.
with open('ruGPT3_60epochs_dev.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in para_results60)

In [106]:
import gc
def cleanup():
    """Run a garbage-collection pass and ask PyTorch to release its unused cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()

# Called once right away, before the evaluation section starts.
cleanup()

## Оценка

In [108]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

In [109]:
def load_model(model_name=None, model=None, tokenizer=None,
               model_class=AutoModelForSequenceClassification, use_cuda=True):
    """Return a ``(model, tokenizer)`` pair, loading whichever part is missing.

    Args:
        model_name: Name or path passed to ``from_pretrained``; needed when
            ``model`` or ``tokenizer`` is not supplied.
        model: Ready model; returned unchanged when given.
        tokenizer: Ready tokenizer; returned unchanged when given.
        model_class: Class whose ``from_pretrained`` builds the model.
        use_cuda: Move a freshly loaded model to the GPU when CUDA is available.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        ValueError: If a missing part cannot be loaded because ``model_name``
            is ``None``.
    """
    name_missing = model_name is None

    if model is None:
        if name_missing:
            raise ValueError('Either model or model_name should be provided')
        model = model_class.from_pretrained(model_name)
        # Only freshly loaded models are moved; a caller-supplied model is left where it is.
        if torch.cuda.is_available() and use_cuda:
            model.cuda()

    if tokenizer is None:
        if name_missing:
            raise ValueError('Either tokenizer or model_name should be provided')
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, tokenizer

### Style Transfer Accuracy (STA)

In [110]:
def prepare_target_label(model, target_label):
    """Resolve ``target_label`` to a class id known to ``model``.

    Accepted forms, checked in this order:
      1. a key of ``model.config.id2label`` (returned as is);
      2. a key of ``model.config.label2id`` (mapped to its id);
      3. a numeric string whose integer value is a key of ``id2label``.

    Args:
        model: Object exposing ``config.id2label`` and ``config.label2id``.
        target_label: Class id, class name, or numeric string.

    Returns:
        The resolved class id.

    Raises:
        ValueError: If the label matches none of the forms above. This now
            includes non-string values such as an unknown int or ``None``,
            which previously failed with ``AttributeError`` on ``.isnumeric()``.
    """
    id2label = model.config.id2label
    label2id = model.config.label2id

    if target_label in id2label:
        return target_label
    if target_label in label2id:
        return label2id.get(target_label)
    # Only strings have `.isnumeric()`; guard so other types fall through to the ValueError.
    if isinstance(target_label, str) and target_label.isnumeric() and int(target_label) in id2label:
        return int(target_label)
    raise ValueError(f'target_label "{target_label}" is not in model labels or ids: {model.config.id2label}.')

In [111]:
def classify_texts(model, tokenizer, texts, second_texts=None, target_label=None, batch_size=32, verbose=False):
    """Return the softmax probability of ``target_label`` for every text.

    Args:
        model: Classifier whose output exposes ``logits``.
        tokenizer: Tokenizer matching ``model``.
        texts: Sequence of input texts.
        second_texts: Optional second sequence, tokenized together with
            ``texts`` as a second positional argument.
        target_label: Label resolved through ``prepare_target_label``.
        batch_size: Number of texts per forward pass.
        verbose: Show a progress bar (``trange``) when true.

    Returns:
        1-D numpy array of probabilities, one per text.
    """
    target_label = prepare_target_label(model, target_label)
    iterate = trange if verbose else range

    chunks = []
    for start in iterate(0, len(texts), batch_size):
        stop = start + batch_size
        segments = [texts[start:stop]]
        if second_texts is not None:
            segments.append(second_texts[start:stop])
        encoded = tokenizer(
            *segments, return_tensors='pt', padding=True, truncation=True, max_length=512
        ).to(model.device)
        with torch.no_grad():
            logits = model(**encoded).logits
            probabilities = torch.softmax(logits, -1)
            target_column = probabilities[:, target_label].cpu().numpy()
        chunks.append(target_column)
    return np.concatenate(chunks)

In [112]:
def rotation_calibration(data, coef=1.0, px=1, py=1, minimum=0, maximum=1):
    """Rescale ``data`` linearly around the point ``(px, py)`` and clip it.

    Computes ``(data - px) * coef + py`` and then clamps the result to
    ``[minimum, maximum]``; a bound set to ``None`` is not applied.

    Args:
        data: Number or numpy array of scores.
        coef: Slope of the linear map.
        px, py: Input value ``px`` is mapped to output value ``py``.
        minimum: Lower clip bound, or ``None``.
        maximum: Upper clip bound, or ``None``.

    Returns:
        The calibrated scores.
    """
    shifted = data - px
    calibrated = shifted * coef + py
    # Lower bound first, then upper bound.
    for bound, clamp in ((minimum, np.maximum), (maximum, np.minimum)):
        if bound is not None:
            calibrated = clamp(bound, calibrated)
    return calibrated

In [113]:
def evaluate_style(
    model,
    tokenizer,
    texts,
    target_label=1,  # 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=False
):
    """Score each text by the calibrated classifier probability of ``target_label``.

    Args:
        model: Style classifier.
        tokenizer: Tokenizer matching ``model``.
        texts: Texts to score.
        target_label: Class whose probability is reported.
        batch_size: Number of texts per forward pass.
        verbose: Show a progress bar when true.

    Returns:
        Numpy array with one score per text, calibrated with coefficient 0.90.
    """
    resolved_label = prepare_target_label(model, target_label)
    probabilities = classify_texts(
        model,
        tokenizer,
        texts,
        target_label=resolved_label,
        batch_size=batch_size,
        verbose=verbose,
    )
    return rotation_calibration(probabilities, 0.90)

In [114]:
# Classifier and tokenizer used for the STA metric; load_model moves the model to the GPU
# when CUDA is available.
style_model, style_tokenizer = load_model('SkolkovoInstitute/russian_toxicity_classifier')

In [116]:
import numpy as np

In [117]:
# Per-sentence STA: calibrated probability that a paraphrase belongs to class 0.
accuracy = evaluate_style(
    model=style_model,
    tokenizer=style_tokenizer,
    texts=para_results,
    target_label=0,  # 1 is toxic, 0 is neutral
    batch_size=32,
    verbose=True,
)
# Report the mean over all paraphrases.
print(f'Style transfer accuracy (STA):  {np.mean(accuracy)}')

  0%|          | 0/25 [00:00<?, ?it/s]

Style transfer accuracy (STA):  0.779522716999054


In [118]:
# Collect garbage and release unused cached CUDA memory before the next metric.
cleanup()

### Meaning Preservation Score (SIM)

In [119]:
def encode_cls(texts, model, tokenizer, batch_size=32, verbose=False):
    """Embed texts with the model's ``pooler_output`` and normalize each vector.

    Args:
        texts: Sequence of input texts.
        model: Encoder whose output exposes ``pooler_output``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of texts per forward pass.
        verbose: Show a progress bar (``trange``) when true.

    Returns:
        Numpy array with one normalized embedding row per text.
    """
    iterate = trange if verbose else range

    collected = []
    for start in iterate(0, len(texts), batch_size):
        chunk = texts[start: start + batch_size]
        with torch.no_grad():
            encoded = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True).to(model.device)
            pooled = model(**encoded).pooler_output
            # Normalized rows let a plain dot product act as cosine similarity downstream.
            unit_rows = torch.nn.functional.normalize(pooled)
            collected.append(unit_rows.cpu().numpy())
    return np.concatenate(collected)

In [120]:
def evaluate_cosine_similarity(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    batch_size=32,
    verbose=False,
):
    """Score how close each rewritten text is to its original in embedding space.

    Both text lists are embedded with ``encode_cls``; the row-wise dot product
    of the normalized embeddings is the cosine similarity, which is then
    calibrated with coefficient 1.50.

    Args:
        model: Sentence encoder.
        tokenizer: Tokenizer matching ``model``.
        original_texts: Source texts.
        rewritten_texts: Rewrites, aligned one-to-one with ``original_texts``.
        batch_size: Number of texts per forward pass.
        verbose: Show progress bars when true.

    Returns:
        Numpy array with one calibrated similarity score per text pair.
    """
    encode_options = dict(model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose)
    original_vectors = encode_cls(original_texts, **encode_options)
    rewritten_vectors = encode_cls(rewritten_texts, **encode_options)
    cosine = (original_vectors * rewritten_vectors).sum(1)
    return rotation_calibration(cosine, 1.50)

In [121]:
# Encoder used for the SIM metric; loaded through AutoModel because encode_cls reads
# `pooler_output` rather than classification logits.
meaning_model, meaning_tokenizer = load_model('cointegrated/LaBSE-en-ru', model_class=AutoModel)

In [122]:
# Per-sentence SIM: calibrated cosine similarity between each source sentence and its paraphrase.
similarity = evaluate_cosine_similarity(
    model=meaning_model,
    tokenizer=meaning_tokenizer,
    original_texts=list(toxic_inputs),
    rewritten_texts=para_results,
    batch_size=32,
    verbose=True,
)
# Report the mean over all sentence pairs.
print(f'Meaning preservation (SIM):  {np.mean(similarity)}')

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Meaning preservation (SIM):  0.49658897519111633


In [123]:
# Collect garbage and release unused cached CUDA memory before the next metric.
cleanup()

### Fluency Score (Fl)

In [124]:
def evaluate_cola_relative(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    target_label=1,
    batch_size=32,
    verbose=False,
    maximum=0,
):
    """Score rewrites by how their classifier probability changes against the originals.

    For each pair the score is ``p(rewritten) - p(original)`` for
    ``target_label``, capped from above at ``maximum`` and then calibrated
    with coefficient 1.15 around ``px=0``.

    Args:
        model: Classifier used for both text lists.
        tokenizer: Tokenizer matching ``model``.
        original_texts: Source texts.
        rewritten_texts: Rewrites, aligned one-to-one with ``original_texts``.
        target_label: Class whose probability is compared.
        batch_size: Number of texts per forward pass.
        verbose: Show progress bars when true.
        maximum: Upper cap applied to the raw difference; ``None`` disables
            the cap. With the default of 0 an improvement over the original
            earns no bonus.

    Returns:
        Numpy array with one calibrated score per text pair.
    """
    target_label = prepare_target_label(model, target_label)
    original_scores = classify_texts(
        model, tokenizer,
        original_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    rewritten_scores = classify_texts(
        model, tokenizer,
        rewritten_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    scores = rewritten_scores - original_scores
    if maximum is not None:
        # Cap at the requested value; the earlier code always capped at 0 whatever `maximum` was.
        scores = np.minimum(maximum, scores)
    return rotation_calibration(scores, 1.15, px=0)

In [125]:
# Classifier and tokenizer used for the fluency metric.
# NOTE(review): `cola_tolenizer` looks like a misspelling of "tokenizer"; the fluency cell uses
# the same spelling, so any rename has to cover both places.
cola_model, cola_tolenizer = load_model('SkolkovoInstitute/rubert-base-corruption-detector')

In [126]:
# Per-sentence fluency: calibrated change in the class-1 probability from source to paraphrase.
fluency = evaluate_cola_relative(
    model=cola_model,
    tokenizer=cola_tolenizer,
    original_texts=list(toxic_inputs),
    rewritten_texts=para_results,
    target_label=1,
    batch_size=32,
    verbose=True,
)
# Report the mean over all sentence pairs.
print(f'Fluency score (FL):  {np.mean(fluency)}')

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Fluency score (FL):  0.6458035111427307


In [127]:
# Collect garbage and release unused cached CUDA memory now that the fluency metric is done.
cleanup()

### Joint Score (J)

In [128]:
# The joint score is the per-sentence product of the three metrics, averaged afterwards
# (not the product of the three averages).
joint = accuracy * similarity * fluency
print(f'Joint score (J):   {np.mean(joint)}')

Joint score (J):   0.265836626291275


### ChrF1 with references

In [129]:
from nltk.translate.chrf_score import corpus_chrf

In [130]:
# Missing references become empty strings so every row yields exactly three reference strings.
df = dev.fillna('')
# One [ref1, ref2, ref3] list per dev row, in row order.
neutral_references = [
    [row['neutral_comment1'], row['neutral_comment2'], row['neutral_comment3']]
    for _, row in df.iterrows()
]

In [131]:
# Corpus-level chrF of the paraphrases against the human neutral references.
# NOTE(review): each reference entry is a list of three strings; check how nltk's chrF
# interprets a list here (it may be treated as a token list rather than as alternatives).
corpus_chrf(neutral_references, para_results)

0.3336086972278492