In [1]:
from google.colab import drive
drive.mount('/content/drive')

DRIVE_PATH = "/content/drive/MyDrive/YandexCup/"

Mounted at /content/drive


Загружаем скачанный классификатор токсичности:

In [2]:
!pip install -q transformers
!pip install -q natasha
!pip install -q pymorphy2
!pip install -q -U pymorphy2-dicts-ru
!pip install -q hnswlib
!pip install -q pymystem3
!pip install -q pymorphy2
!pip install -q -U pymorphy2-dicts-ru

[K     |████████████████████████████████| 2.9 MB 12.6 MB/s 
[K     |████████████████████████████████| 56 kB 6.3 MB/s 
[K     |████████████████████████████████| 636 kB 62.9 MB/s 
[K     |████████████████████████████████| 895 kB 75.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 67.3 MB/s 
[K     |████████████████████████████████| 34.4 MB 56 kB/s 
[K     |████████████████████████████████| 49 kB 8.0 MB/s 
[K     |████████████████████████████████| 55 kB 5.0 MB/s 
[K     |████████████████████████████████| 41 kB 163 kB/s 
[K     |████████████████████████████████| 8.2 MB 46.5 MB/s 
[?25h  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for hnswlib (PEP 517) ... [?25l[?25hdone


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained(DRIVE_PATH + "trained_roberta")

model = AutoModelForSequenceClassification.from_pretrained(DRIVE_PATH + "trained_roberta").cuda()

TOXIC_CLASS=-1
TOKENIZATION_TYPE='sentencepiece'


Ниже функции для применения классификатора

In [4]:
from torch import softmax, sigmoid
import numpy as np


import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm.auto import tqdm
import argparse
from pymystem3 import Mystem
import sys
from collections import Counter
from functools import partial



ALLOWED_ALPHABET=list(map(chr, range(ord('а'), ord('я') + 1)))
ALLOWED_ALPHABET.extend(map(chr, range(ord('a'), ord('z') + 1)))
ALLOWED_ALPHABET.extend(list(map(str.upper, ALLOWED_ALPHABET)))
ALLOWED_ALPHABET = set(ALLOWED_ALPHABET)


def logits_to_toxic_probas(logits):
    if logits.shape[-1] > 1:
        activation = lambda x: softmax(x, -1)
    else:
        activation = sigmoid
    return activation(logits)[:, TOXIC_CLASS].cpu().detach().numpy()


def is_word_start(token):
    if TOKENIZATION_TYPE == 'sentencepiece':
        return token.startswith('▁')
    if TOKENIZATION_TYPE == 'bert':
        return not token.startswith('##')
    raise ValueError("Unknown tokenization type")


def normalize(sentence, max_tokens_per_word=20):
    sentence = ''.join(map(lambda c: c if c.isalpha() else ' ', sentence.lower()))
    ids = tokenizer(sentence)['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(ids)[1:-1]
    
    result = []
    num_continuation_tokens = 0
    for token in tokens:
        if not is_word_start(token):
            num_continuation_tokens += 1
            if num_continuation_tokens < max_tokens_per_word:
                result.append(token.lstrip('#▁'))
        else:
            num_continuation_tokens = 0
            result.extend([' ', token.lstrip('▁#')])
    
    return ''.join(result).strip()

def iterate_batches(data, batch_size=40):
    batch = []
    for x in data:
        batch.append(x)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if len(batch) > 0:
        yield batch

from tqdm.auto import tqdm
def predict_toxicity(sentences, batch_size=5, threshold=0.5, return_scores=False, verbose=True, device='cuda', off_tqdm=False):
    results = []
    tqdm_fn = tqdm if verbose else lambda x, total: x
    for batch in tqdm_fn(iterate_batches(sentences, batch_size), total=np.ceil(len(sentences) / batch_size),  disable=off_tqdm):
        normlized = [normalize(sent, max_tokens_per_word=5) for sent in batch]
        tokenized = tokenizer(normlized, return_tensors='pt', padding=True, max_length=512, truncation=True)
        
        model_input = {key: val.to(device) for key, val in tokenized.items()}
        
        logits = model.to(device)(**model_input).logits
        
        preds = logits_to_toxic_probas(logits)
        
        if not return_scores:
            preds = preds >= threshold
        results.extend(preds)
    return results


Читаем тестовый набор

In [5]:
texts = []
raw_texts = []
with open(DRIVE_PATH + 'public_testset.txt', 'rt') as f:
    for line in f:
        raw_texts.append(line) 
        texts.append(normalize(line))

Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors


Вычисляем токсичность отдельных слов

Посчитаем токсичность для каждого предложения:

In [6]:
line_toxicities = []
line_mean_toxicities = []

for line in tqdm(np.random.choice(raw_texts, 25)):
    pred = predict_toxicity([line], return_scores=True, off_tqdm=True)
    
    line_words = normalize(line).split()
    toxi = [predict_toxicity([word], return_scores=True, off_tqdm=True) for word in line_words]
    
    line_toxicities.append(pred[0])
    line_mean_toxicities.append(np.mean(toxi))



  0%|          | 0/25 [00:00<?, ?it/s]

In [7]:
print(f"На ({len(line_toxicities)} комментах):")
print(np.mean(line_toxicities), f"- средняя токсичность всего комментария")
print(np.mean(line_mean_toxicities), "- cредняя токсичность каждого слова в комментарии")

На (25 комментах):
0.5373261 - средняя токсичность всего комментария
0.18996425 - cредняя токсичность каждого слова в комментарии


In [8]:

line = raw_texts[100]
pred = predict_toxicity([line], return_scores=True, off_tqdm=True)

line_words = normalize(line).split()
toxi = [predict_toxicity([word], return_scores=True, off_tqdm=True) for word in line_words]

print(line)
print(pred)
print(line_words)
print(toxi)



Можно сказать проще - кукуха едет от старческого маразма.

[0.8303647]
['можно', 'сказать', 'проще', 'кукуха', 'едет', 'от', 'старческого', 'маразма']
[[0.07630243], [0.2619401], [0.04409236], [0.56395835], [0.15138271], [0.0645212], [0.2058756], [0.63614625]]


In [9]:
def sort_by_toxicity(words):
    toxicities = predict_toxicity(words, return_scores = True, off_tqdm=True)
    #[токсичность, индекс, слово]
    result = [[toxicities[i], i, words[i]] for i in range(len(words))]
    result.sort()
    return result

line = line = np.random.choice(raw_texts)
pred = predict_toxicity([line], return_scores=True, off_tqdm=True)

line_words = normalize(line).split()

print(sort_by_toxicity(line_words))

[[0.022175852, 0, 'респект'], [0.048265655, 4, 'вообще'], [0.05011496, 6, 'первую'], [0.061193287, 9, 'то'], [0.06933392, 1, 'эти'], [0.07038343, 10, 'ещё'], [0.082705215, 7, 'полосу'], [0.085665196, 5, 'на'], [0.09521889, 3, 'надо'], [0.18159607, 12, 'ставят'], [0.2817572, 8, 'а'], [0.4454826, 11, 'дизы'], [0.5695918, 14, 'людишки'], [0.8388674, 13, 'тупорые'], [0.98567885, 2, 'уродов']]


Ниже читаем эмбеддинги слов и описываем функции их обработки

In [10]:
import gensim
from pymystem3 import Mystem

stemmer = Mystem()
#lemmas = stemmer.lemmatize("Красивая мама красиво мыла раму")

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [11]:
!pip install -q pymorphy2
!pip install -q -U pymorphy2-dicts-ru

from pymystem3 import Mystem
from pymorphy2 import MorphAnalyzer
import pymorphy2


morph = pymorphy2.MorphAnalyzer(lang='ru')

In [12]:
embs_file = np.load(DRIVE_PATH + 'embeddings_with_lemmas.npz', allow_pickle=True)
embs_vectors = embs_file['vectors']
embs_vectors_normed = embs_vectors / np.linalg.norm(embs_vectors, axis=1, keepdims=True)
embs_voc = embs_file['voc'].item()

embs_voc_by_id = [None for i in range(len(embs_vectors))]
for word, idx in embs_voc.items():
    if embs_voc_by_id[idx] is None:
        embs_voc_by_id[idx] = word

In [13]:
words = set(embs_voc.keys())

with torch.inference_mode():
    word_toxicities = predict_toxicity(words, batch_size=100, return_scores=True)

toxicity = dict(zip(words, word_toxicities))

  0%|          | 0/1627.0 [00:00<?, ?it/s]

In [14]:
sorted_toxicity = {k: v for k, v in sorted(toxicity.items(), key=lambda item: item[1], reverse = True)}
for k, v in list(sorted_toxicity.items())[:100]:
    print(k ,v)

благодаренута 0.9906294
пиндостан 0.990629
ублюдочный 0.9906288
пиндосий 0.99062866
долбоёб 0.9906279
черноволосый 0.9906275
ублюдок 0.9906267
долбоёба 0.9906262
пидораса 0.9906254
долбоеба 0.99062496
пиндосский 0.9906238
черножопый 0.9906166
пидорашка 0.99061453
кондор 0.9906143
пидорахо 0.9906136
пиндос 0.9906108
шаболовская 0.9906059
пидорас 0.9906031
уебищный 0.99060285
ебанутый 0.9906
андорра 0.99059266
пидарас 0.99058306
блюзовый 0.9905827
ублюдка 0.99058
выродок 0.99057066
обернутый 0.99057007
шабо 0.9905646
пидарастка 0.9905353
пидорский 0.99051744
сувальдный 0.9905172
выблядок 0.9905165
пиндосия 0.9904979
гейропа 0.9904908
уебка 0.99048233
пинбол 0.9904805
черноголовка 0.99047625
виталий::чуркин 0.990453
швальба 0.99033934
уебанский 0.99020326
2ч::3п::4с::5в::6п::7в::8с::9ч::10п::11с::12в::13п::14в::15с::16ч::17п::18с::19в::20п::21в::22с::23ч::24п::25с::26в::27п::28в::29с::30ч 0.99019426
чуркин 0.99019206
пидараскать 0.9901198
плечелопаточный 0.9900637
выволочь 0.99004364
пида

In [15]:
def get_w2v_indicies(a):
    res = []
    if isinstance(a, str):
        a = a.split()
    for w in a:
        if w in embs_voc:
            res.append(embs_voc[w])
        else:
            
            lemma = morph.parse(w)[0].normal_form
            res.append(embs_voc.get(lemma, None))
    return res

def calc_embs(words):
    words = ' '.join(map(normalize, words))
    inds = get_w2v_indicies(words)

    return [None if i is None else embs_vectors[i] for i in inds]

Сложим эмбеддинги нетоксичных слов в kd-дерево, чтобы можно было близко искать ближайших соседей

In [16]:
print(len(toxicity))

162690


In [17]:
MAX_TOXICITY  = 1

non_toxicity = {word : val for word, val in toxicity.items() if val <= MAX_TOXICITY}

nontoxic_emb_inds = [ind for word, ind in embs_voc.items() if toxicity.get(word, 1.0) <= MAX_TOXICITY]
embs_vectors_normed_nontoxic = embs_vectors_normed[nontoxic_emb_inds]

print(len(non_toxicity))

162690


Функция находит самое близкое нетоксичное слово по предпосчитанным эмбеддингам слов

In [18]:
embs_vectors_normed_nontoxic.shape

(162690, 300)

In [19]:
import hnswlib
import numpy as np
import pickle


num_elements, dim = embs_vectors_normed_nontoxic.shape

# Generating sample data
data = embs_vectors_normed_nontoxic
ids = np.arange(num_elements)

# Declaring index
p = hnswlib.Index(space = 'cosine', dim = dim) # possible options are l2, cosine or ip

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements, ef_construction = 400, M = 16)

# Element insertion (can be called several times):
p.add_items(data, ids)

In [20]:
word = "идиот"
emb_word = calc_embs([word])
for i in p.knn_query(emb_word, k = 15)[0][0]:
    other_word = embs_voc_by_id[nontoxic_emb_inds[i]]
    print(other_word, toxicity.get(other_word))

идиот 0.85655516
дебил 0.97273284
мудак 0.9194301
идиота 0.8899699
долбоеб 0.9877017
идиотка 0.9372305
долбоеба 0.99062496
дурак 0.9634208
дибил 0.9492958
дурачок 0.8315148
глупец 0.5484641
долбое 0.9891151
долбое 0.9891151
кретин 0.8392081
быдло 0.9608194


In [21]:
#A better one

from functools import lru_cache

@lru_cache()
def find_closest_nontoxic(word, word_toxicity, allow_self=False):
    #if toxicity.get(word, 1.0) <= threshold:
    #    return word

    #word_lemmatized = morph.parse(word)[0].normal_form
    if word not in embs_voc:
        #if word_lemmatized not in embs_voc:
            return word_toxicity, word
        #else:
           # word = word_lemmatized
    

    #threshold = min(toxicity.get(word, threshold), threshold)
    
    word_emb = calc_embs([word])
    if word_emb is None or word_emb[0] is None:
        return word_toxicity, word


    query = p.knn_query(word_emb, k = 200)[0][0]


    best_candidate = word
    best_score = word_toxicity

    

    for i in query:
        candidate = embs_voc_by_id[nontoxic_emb_inds[i]]
        similarity = distance_score(word, candidate)
        
        if (1 - toxicity.get(candidate)) + (similarity) > best_score:
            
            best_score = (1 - toxicity.get(candidate)) + (similarity)
            best_candidate = candidate 
            
    return toxicity.get(candidate), best_candidate


Заменяем токсичные слова на ближайшие по эмбеддингам не-токсичные

In [22]:
def detox(line):
    
    words = normalize(line).split()
    sorted_words = sort_by_toxicity(words) #[токсичность, индекс, слово]
    
    
    #cur_toxicity = predict_toxicity([' '.join(words)], return_scores=True, off_tqdm=True)[0]
  
    while sorted_words:
        toxic_toxicity, toxic_idx, toxic_word = sorted_words.pop()
        if toxic_toxicity > 0.77:
            fixed_word = "спасибо"
        else:
            fixed_toxicity, fixed_word = find_closest_nontoxic(toxic_word, toxic_toxicity, allow_self = True)
        words[toxic_idx] = fixed_word
        #if fixed_word != toxic_word:
        #    sorted_words.append([fixed_toxicity, toxic_idx, fixed_word])
        #    sorted_words.sort()
        
        #cur_toxicity = predict_toxicity([' '.join(words)], return_scores=True, off_tqdm=True)[0]
        

    
    return ' '.join(words)

In [23]:
len(nontoxic_emb_inds)

162690

In [None]:
Original = []
Detoxed = []
for i in tqdm(range(0, len(texts), 25)):
    line = texts[i]
    detoxed = detox(line)
    Original.append(line)
    Detoxed.append(detoxed)


new_checker(Original, Detoxed)

  0%|          | 0/100 [00:00<?, ?it/s]

0it [00:00, ?it/s]

average toxicity: 0.44503346
mean lmdiff: 0.9199872424285057
mean distance_score: 0.8360369109706309


43.749843042478545

In [None]:
fixed_texts = list(map(detox, tqdm(texts)))

with open(DRIVE_PATH + 'submits/sorted_fixed.txt', 'wt') as f:
    for text in fixed_texts:
        print(text, file=f)

new_checker(texts, fixed_texts)

  0%|          | 0/2500 [00:00<?, ?it/s]

In [None]:
with open(DRIVE_PATH + 'submits/sorted_fixed.txt', 'wt') as f:
    for text in fixed_texts:
        print(text, file=f)

#Score

In [24]:
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install -q kenlm

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[K     \ 540 kB 5.5 MB/s
[?25hBuilding wheels for collected packages: kenlm
  Building wheel for kenlm (setup.py) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.0.0-cp37-cp37m-linux_x86_64.whl size=2331827 sha256=45bb2a7e71e05bac49bbb48bcc5849e50ee1830404c3e4cc1679c0bb797b652b
  Stored in directory: /tmp/pip-ephem-wheel-cache-8thpmg9t/wheels/3d/aa/02/7b4a2eab5d7a2a9391bd9680dbad6270808a147bc3b7047e4e
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.0.0


In [25]:
import numpy as np

import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import softmax, sigmoid
from tqdm.auto import tqdm
import argparse
from pymystem3 import Mystem
import sys
from collections import Counter
from functools import partial
import kenlm

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/YandexCup/trained_roberta")

model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/YandexCup/trained_roberta").cuda()

lm = kenlm.Model('/content/drive/MyDrive/YandexCup/lm.binary')



TOXIC_CLASS=-1
TOKENIZATION_TYPE='sentencepiece'


ALLOWED_ALPHABET=list(map(chr, range(ord('а'), ord('я') + 1)))
ALLOWED_ALPHABET.extend(map(chr, range(ord('a'), ord('z') + 1)))
ALLOWED_ALPHABET.extend(list(map(str.upper, ALLOWED_ALPHABET)))
ALLOWED_ALPHABET = set(ALLOWED_ALPHABET)


def logits_to_toxic_probas(logits):
    if logits.shape[-1] > 1:
        activation = lambda x: softmax(x, -1)
    else:
        activation = sigmoid
    return activation(logits)[:, TOXIC_CLASS].cpu().detach().numpy()


def is_word_start(token):
    if TOKENIZATION_TYPE == 'sentencepiece':
        return token.startswith('▁')
    if TOKENIZATION_TYPE == 'bert':
        return not token.startswith('##')
    raise ValueError("Unknown tokenization type")


def normalize(sentence, max_tokens_per_word=20):
    def validate_char(c):
        return c in ALLOWED_ALPHABET
    
    sentence = ''.join(map(lambda c: c if validate_char(c) else ' ', sentence.lower()))
    ids = tokenizer(sentence)['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(ids)[1:-1]
    
    result = []
    num_continuation_tokens = 0
    for token in tokens:
        if not is_word_start(token):
            num_continuation_tokens += 1
            if num_continuation_tokens < max_tokens_per_word:
                result.append(token.lstrip('#▁'))
        else:
            num_continuation_tokens = 0
            result.extend([' ', token.lstrip('▁#')])
    
    return ''.join(result).strip()


def iterate_batches(data, batch_size=40):
    batch = []
    for x in data:
        batch.append(x)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if len(batch) > 0:
        yield batch


def predict_toxicity_1(sentences, batch_size=5, threshold=0.5, return_scores=False, verbose=True, device='cuda'):
    results = []
    for batch in iterate_batches(sentences, batch_size):
        normlized = [normalize(sent, max_tokens_per_word=5) for sent in batch]
        tokenized = tokenizer(normlized, return_tensors='pt', padding=True, max_length=512, truncation=True)
    
        logits = model.to(device)(**{key: val.to(device) for key, val in tokenized.items()}).logits
        preds = logits_to_toxic_probas(logits)
        if not return_scores:
            preds = preds >= threshold
        results.extend(preds)
    return results


def get_w2v_indicies_1(a):
    res = []
    if isinstance(a, str):
        a = a.split()
    for w in a:
        if w in embs_voc:
            res.append((w, embs_voc[w]))
        else:
            if w.isalpha():
                lemma = morph.parse(w)[0].normal_form
                res.append((embs_voc.get(lemma), None))
    return res


def load_embeddings(path):
    embs_file = np.load(path, allow_pickle=True)
    embs_vectors = embs_file['vectors']
    embs_voc = embs_file['voc'].item()

    embs_voc_by_id = [None for i in range(len(embs_vectors))]
    for word, idx in embs_voc.items():
        if embs_voc_by_id[idx] is None:
            embs_voc_by_id[idx] = word
    return embs_vectors, embs_voc, embs_voc_by_id


def calc_embs_1(words):
    words = ' '.join(map(normalize, words))
    inds = get_w2v_indicies_1(words)
    return [(w, i if i is None else embs_vectors[i]) for w, i in inds]


def calc_single_embedding_dist(a, b):
    a_s, a_v = a  #слово, вектор
    b_s, b_v = b  #слово, вектор
    if a_s == b_s: #если слова равны
        return 0.0
    if a_v is None or b_v is None: #если одно из векторов нет - дать пизды 
        return 1.0
    a = a_v
    b = b_v
    # inexact match is punished by 0.1
    return 0.1 + 0.9 * (1 - a.dot(b) / np.linalg.norm(a) / np.linalg.norm(b)) / 2


def greedy_match_embs(a, b, max_dist=99999, cache=None, a_ind=0, b_ind=0):
    a_len = len(a) - a_ind #сколько слов мы еще не рассмотрели в а
    b_len = len(b) - b_ind #сколько слов мы еще не рассмотрели в b
    minlen = min(a_len, b_len) #минимальная длина строки
    maxlen = max(a_len, b_len) #минимальная длина строки
    if minlen == 0: 
        return np.minimum(maxlen, max_dist) 
    if maxlen - minlen >= max_dist: 
        return max_dist 
    
    if cache is None:
        cache = {}
    
    cache_key = (a_len, b_len)
    if cache_key in cache:
        return cache[cache_key]
        
    min_dist = max_dist
    
    first_dist = calc_single_embedding_dist(a[a_ind], b[b_ind])
    if max_dist >= first_dist:
        min_dist = np.minimum(min_dist, first_dist + greedy_match_embs(
            a, b, max_dist, cache, a_ind + 1, b_ind + 1
        ))
    
    if first_dist > 0 and max_dist >= 1:
        min_dist = np.minimum(min_dist, 1 + greedy_match_embs(
            a, b, max_dist, cache, a_ind + 1, b_ind
        ))
        min_dist = np.minimum(min_dist, 1 + greedy_match_embs(
            a, b, max_dist, cache, a_ind, b_ind + 1
        ))
    
    cache[cache_key] = min_dist
    
    return min_dist



def calc_semantic_distance(a, b):
    a_embs = calc_embs_1(a)
    b_embs = calc_embs_1(b)
    
    clip_distance = 5  # this clips long computations
    return np.exp(-(greedy_match_embs(a_embs, b_embs, max_dist=clip_distance) / (0.6 * np.log(1 + len(a)))) ** 2)


def distance_score(original, fixed):
    original = original.split()
    fixed = fixed.split()
    
    return calc_semantic_distance(original, fixed)


def compute_lmdiff(original, fixed):
    original_lm_logproba = lm.score(original, bos=True, eos=True)
    fixed_lm_logproba = lm.score(fixed, bos=True, eos=True)
    
    probability_fraction = 10**((fixed_lm_logproba - original_lm_logproba) / 25)
    
    return np.clip(probability_fraction, 0.0, 1.0)


def compute_score(original_sentences, fixed_sentences, threshold=0.5, batch_size=5):
    fixed_toxicities = predict_toxicity_1(fixed_sentences, threshold=threshold, batch_size=batch_size, return_scores=True)
    scores = []
    lmdiffs = []
    emb_dists = []
    for original_sentence, fixed_sentence, fixed_toxicity in tqdm(zip(
        original_sentences, fixed_sentences, fixed_toxicities
    ), miniters=250):
        original_sentence = normalize(original_sentence)
        fixed_sentence = normalize(fixed_sentence)
        
        distance = distance_score(original_sentence, fixed_sentence)
        lmdiff = compute_lmdiff(original_sentence, fixed_sentence)
        
        score = (1 - fixed_toxicity) * distance * lmdiff
        
        lmdiffs.append(lmdiff)
        emb_dists.append(distance)
        scores.append(score)
    
    print('average toxicity:', np.mean(fixed_toxicities), file=sys.stderr)
    print('mean lmdiff:', np.mean(lmdiffs), file=sys.stderr)
    print('mean distance_score:', np.mean(emb_dists), file=sys.stderr)
    
    return np.mean(scores)

'''
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('original_texts', type=argparse.FileType('r'))
    parser.add_argument('fixed_texts', type=argparse.FileType('r'))
    parser.add_argument('--score', type=argparse.FileType('w'))
    parser.add_argument('--model', required=True, type=str)
    parser.add_argument('--embeddings', type=str, required=True)
    parser.add_argument('--lm', type=str, required=True)
    parser.add_argument('--device', type=str, choices=['cuda', 'cpu'], default='cpu')
    
    return parser.parse_args()
'''

def new_checker(original_texts, fixed_texts, device='cuda'):
    original_texts = list(map(str.strip, original_texts))
    fixed_texts = list(map(str.strip, fixed_texts))
    
    assert len(original_texts) == len(fixed_texts)
    
    with torch.inference_mode(True):
        return (100 * compute_score(original_texts, fixed_texts))

In [None]:
original_texts = []

with open(DRIVE_PATH + 'public_testset.txt', 'rt') as f:
    for line in f:
        original_texts.append(normalize(line))

text_for_eval = []
with open(DRIVE_PATH + 'submits/sorted_fixed.txt', 'rt') as f:
    for line in f:
        text_for_eval.append(normalize(line))


new_checker(original_texts, text_for_eval)

Скор, если никак не изменять комментарии:

In [None]:
!python3.7 score.py public_testset.short.txt public_testset.short.txt  --embeddings embeddings_with_lemmas.npz --lm lm.binary --model ./trained_roberta/ --device cuda --score -

Скор бейзлайна:

In [None]:
!python3.7 score.py public_testset.short.txt baseline_fixed.txt  --embeddings embeddings_with_lemmas.npz --lm lm.binary --model ./trained_roberta/ --device cuda --score -

Сохраним данные для бейзлайна online-задачи

##Для Online-решения

In [None]:
!mkdir -p online_baseline

In [None]:
import pickle as pkl

with open('./online_baseline/data.pkl', 'wb') as f:
    pkl.dump(toxicity, f)
    pkl.dump(nontoxic_emb_inds, f)