# Фильтр нецензурной лексики на основе эмбеддингов слов

## Загрузим готовые эмбеддинги для русского языка

In [1]:
import fasttext
import fasttext.util

In [16]:
# Fetch the pretrained Russian fastText model; the call returns the model
# file name ('cc.ru.300.bin' in the recorded output).
# NOTE(review): if_exists='ignore' presumably skips the download when the file
# is already on disk — confirm against the fasttext.util documentation.
fasttext.util.download_model('ru', if_exists='ignore')

'cc.ru.300.bin'

In [2]:
# Load the pretrained model from the file produced by the download step.
ft = fasttext.load_model('cc.ru.300.bin')

In [3]:
# Sanity check: look up the vector of a single word and show its shape
# ((300,) in the recorded output).
ft.get_word_vector('привет').shape

(300,)

## Создадим датасет

In [15]:
from datasets import load_dataset

# Load the "default" configuration of the textdetox multilingual toxicity
# dataset; per the recorded output it contains one split per language
# (en, ru, uk, de, ...).
ds = load_dataset("textdetox/multilingual_toxicity_dataset", "default")

en-00000-of-00001.parquet:   0%|          | 0.00/267k [00:00<?, ?B/s]

ru-00000-of-00001.parquet:   0%|          | 0.00/388k [00:00<?, ?B/s]

uk-00000-of-00001.parquet:   0%|          | 0.00/360k [00:00<?, ?B/s]

de-00000-of-00001.parquet:   0%|          | 0.00/614k [00:00<?, ?B/s]

es-00000-of-00001.parquet:   0%|          | 0.00/641k [00:00<?, ?B/s]

am-00000-of-00001.parquet:   0%|          | 0.00/607k [00:00<?, ?B/s]

zh-00000-of-00001.parquet:   0%|          | 0.00/248k [00:00<?, ?B/s]

ar-00000-of-00001.parquet:   0%|          | 0.00/503k [00:00<?, ?B/s]

hi-00000-of-00001.parquet:   0%|          | 0.00/800k [00:00<?, ?B/s]

it-00000-of-00001.parquet:   0%|          | 0.00/522k [00:00<?, ?B/s]

fr-00000-of-00001.parquet:   0%|          | 0.00/388k [00:00<?, ?B/s]

he-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

hin-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

tt-00000-of-00001.parquet:   0%|          | 0.00/411k [00:00<?, ?B/s]

ja-00000-of-00001.parquet:   0%|          | 0.00/451k [00:00<?, ?B/s]

Generating en split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating ru split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating uk split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating de split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating es split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating am split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating zh split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating ar split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating hi split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating it split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating fr split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating he split:   0%|          | 0/2011 [00:00<?, ? examples/s]

Generating hin split:   0%|          | 0/4363 [00:00<?, ? examples/s]

Generating tt split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating ja split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [27]:
# Keep only the Russian split (5000 rows with 'text' and 'toxic' columns in
# the recorded output).
ru_ds = ds['ru']
# Bare expression: displays the dataset summary as the cell output.
ru_ds

Dataset({
    features: ['text', 'toxic'],
    num_rows: 5000
})

In [29]:
import random

# Materialise the split as a plain list so it can be shuffled in place.
items = list(ru_ds)

# Shuffle with a dedicated, seeded generator: the train/test split (and
# therefore the metrics reported below) stays the same on every run, and the
# global `random` state is left untouched.
SEED = 42
random.Random(SEED).shuffle(items)

# Fraction of the examples that goes into the training set; the remainder is
# held out for testing.
TRAIN = 0.8
split_index = int(TRAIN * len(items))
train_items = items[:split_index]
test_items = items[split_index:]

In [34]:
def save_fasttext(items: list[dict], filename: str) -> None:
    """
    Write a labelled dataset to ``filename`` in fastText supervised format.

    Every item produces exactly one line: the tag ``__label__toxic`` or
    ``__label__clean`` (chosen by the truthiness of ``item["toxic"]``),
    a space, and ``item["text"]``.

    fastText treats each line of the file as a separate example, so line
    breaks inside a text are replaced with spaces. Without that, a
    multi-line text would be split into several examples and the
    continuation lines would carry no label.

    Args:
        items: dicts with a ``"text"`` string and a ``"toxic"`` flag.
        filename: path of the file to create or overwrite (UTF-8).
    """
    with open(filename, 'wt', encoding='utf-8') as f:
        for item in items:
            tag = "__label__toxic" if item["toxic"] else "__label__clean"
            # Keep one example per line: neutralise embedded CR/LF characters.
            text = item["text"].replace("\r", " ").replace("\n", " ")
            f.write(f"{tag} {text}\n")

# Dump both splits to disk; the training and evaluation steps below read
# these files by name.
save_fasttext(train_items, 'train.txt')
save_fasttext(test_items, 'test.txt')

In [56]:
# Train a supervised fastText classifier on the labelled training file.
# wordNgrams=2 adds word bigram features; loss='softmax' selects the plain
# softmax objective; verbose=2 enables progress output during training.
model = fasttext.train_supervised(
    input="train.txt",
    lr=0.2,
    epoch=50,
    wordNgrams=2, 
    verbose=2,
    loss='softmax'
)

# Persist the trained classifier.
# NOTE(review): the file name is spelled "toxity" (not "toxicity"); it is left
# unchanged here because other code may load the model under this name.
model.save_model("toxity_model.bin")

In [57]:
def evaluate(model, test_file):
    """
    Run ``model.test`` on ``test_file`` and print a short report.

    The first element of the result is printed as the sample count, the
    second and third as precision@1 and recall@1 with four decimals.
    Nothing is returned.
    """
    metrics = model.test(test_file)
    print(f"Samples: {metrics[0]}")
    # The two ratio metrics share the same formatting, so emit them in a loop.
    for title, position in (("Precision@1", 1), ("Recall@1", 2)):
        print(f"{title}: {metrics[position]:.4f}")

# Report the metrics of the freshly trained model on the held-out test file.
evaluate(model, "test.txt")

Samples: 1000
Precision@1: 0.7770
Recall@1: 0.7770


In [41]:
# Spot-check the classifier on one insulting and one polite phrase.
# predict() returns a (labels, probabilities) pair; in the recorded output the
# probabilities come out marginally above 1.0.
print(model.predict("ты дурак"))
print(model.predict("спасибо за помощь"))

(('__label__toxic',), array([1.00001001]))
(('__label__clean',), array([1.00000978]))


In [58]:
def classify_toxicity(text, threshold=0.6):
    """
    Classify ``text`` with the module-level fastText ``model``.

    Args:
        text: the string to classify; embedded line breaks are replaced with
            spaces before prediction.
        threshold: minimum score at which a "toxic" prediction is reported
            as toxic.

    Returns:
        dict with
        - "label": predicted label without the ``__label__`` prefix,
        - "score": model confidence as a Python float, capped at 1.0 and
          rounded to 4 decimals,
        - "toxic": Python bool, True only when the label is "toxic" and the
          score reaches ``threshold``.
    """
    # fastText's predict() processes a single line and raises ValueError when
    # the input contains "\n", so flatten multi-line text first.
    single_line = text.replace("\r", " ").replace("\n", " ")
    labels, probs = model.predict(single_line)
    label = labels[0].replace("__label__", "")
    # Convert the numpy scalar to a plain float (keeps the result
    # JSON-serialisable) and cap the tiny overshoot above 1.0 that the model
    # can report.
    score = min(float(probs[0]), 1.0)
    is_toxic = bool(label == "toxic" and score >= threshold)
    return {
        "label": label,
        "score": round(score, 4),
        "toxic": is_toxic
    }

In [59]:
# Example: a rude phrase (classified as toxic in the recorded output).
classify_toxicity('что за дебилизм')

{'label': 'toxic', 'score': 1.0, 'toxic': True}

In [68]:
# Example: a neutral phrase (classified as clean in the recorded output).
classify_toxicity('можешь играть пеккой завтра')

{'label': 'clean', 'score': 0.9991, 'toxic': False}