## Подготовка

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve
import sklearn.metrics
import warnings
warnings.simplefilter("ignore")

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('/datasets/toxic_comments.csv')
display(df.sample(n = 10,random_state = 55))
print(df['toxic'].value_counts())

Unnamed: 0,text,toxic
52665,Daily Scripture \n\n2Th 1:6 1:6 {3} Seeing it ...,0
29740,"Welcome!\n\nHello , and welcome to Wikipedia! ...",0
14703,"""\n\nI left this message\n\n""""Before anyone ba...",0
39248,No problem from me regarding adding the portal...,0
23048,what to do with elitist assholes who do not al...,1
101690,"Well, if this is indeed original research (and...",0
30825,"Here i found something just for you, you many ...",0
13379,"""::I wouldn't. Drop it. It's done. It's WP:O...",0
46881,"Clearly, you both need to lose the mop in orde...",0
80445,"""\n\nTo all Wikipedia Editors of the Jehovah's...",0


0    143346
1     16225
Name: toxic, dtype: int64


In [3]:
train_df, test_df = train_test_split(df, test_size=0.25)

In [4]:
#test_df["toxic"].value_counts()

In [5]:
#train_df["toxic"].value_counts()

Загрузили данные, просмотрели случайные строки из них, разделили датасет на тренировочную и тестовую

## Обучение

In [6]:
snowball = SnowballStemmer(language="english")
stop_words = stopwords.words("english")

def tokenize_sentence(sentence: str, remove_stop_words: bool = True):
    tokens = word_tokenize(sentence, language="russian")
    tokens = [i for i in tokens if i not in string.punctuation]
    if remove_stop_words:
        tokens = [i for i in tokens if i not in stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [7]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))

In [8]:
features = vectorizer.fit_transform(train_df["text"])

In [10]:
%%time
model = LogisticRegression(random_state=0)
model.fit(features, train_df["toxic"])

CPU times: user 6.88 s, sys: 4.04 s, total: 10.9 s
Wall time: 10.9 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
%%time
model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", LogisticRegression(random_state=0))])

CPU times: user 153 µs, sys: 4 µs, total: 157 µs
Wall time: 162 µs


In [11]:
%%time
model_pipeline.fit(train_df["text"], train_df["toxic"])

CPU times: user 3min 47s, sys: 4.18 s, total: 3min 51s
Wall time: 3min 52s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...u)\\b\\w\\w+\\b',
                                 tokenizer=<function <lambda> at 0x7efd7dcd3dd0>,
                                 use_idf=True, vocabulary=None)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    f

In [12]:
print("\nF1:", f1_score(test_df['toxic'], model_pipeline.predict(test_df["text"])))


F1: 0.7315267288612468


значение F1 меры меньше требуемого, попробуем его поднять, за счёт подбора гиперпараметров

In [13]:
#prec, rec, thresholds = precision_recall_curve(y_true=test_df["toxic"], probas_pred=model_pipeline.predict_proba(test_df["text"])[:, 1])

In [14]:
%%time
grid_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", 
     GridSearchCV(
        LogisticRegression(random_state=0),
        param_grid={'C': [0.1, 1, 10.]},
        cv=3,
         verbose=4
        )
    )
])

CPU times: user 212 µs, sys: 4 µs, total: 216 µs
Wall time: 223 µs


In [15]:
%%time
grid_pipeline.fit(train_df["text"], train_df["toxic"])

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=0.1, score=0.928, total=   5.9s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


[CV] ............................... C=0.1, score=0.928, total=   6.0s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.9s remaining:    0.0s


[CV] ............................... C=0.1, score=0.928, total=   5.9s
[CV] C=1 .............................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.8s remaining:    0.0s


[CV] ................................. C=1, score=0.952, total=   8.6s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.952, total=  10.5s
[CV] C=1 .............................................................
[CV] ................................. C=1, score=0.953, total=  10.3s
[CV] C=10.0 ..........................................................
[CV] .............................. C=10.0, score=0.958, total=  16.0s
[CV] C=10.0 ..........................................................
[CV] .............................. C=10.0, score=0.958, total=  16.1s
[CV] C=10.0 ..........................................................
[CV] .............................. C=10.0, score=0.959, total=  15.5s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.6min finished


CPU times: user 4min 45s, sys: 48.1 s, total: 5min 34s
Wall time: 5min 36s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scali

Лучший результат показли при C = 10. Используем его в дальнейшем

In [16]:
%%time
model_pipeline_c_10 = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer=lambda x: tokenize_sentence(x, remove_stop_words=True))),
    ("model", LogisticRegression(random_state=0, C=10.))])

CPU times: user 161 µs, sys: 12 µs, total: 173 µs
Wall time: 179 µs


In [17]:
%%time
model_pipeline_c_10.fit(train_df["text"], train_df["toxic"])

CPU times: user 3min 52s, sys: 7.12 s, total: 3min 59s
Wall time: 4min 3s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...)\\b\\w\\w+\\b',
                                 tokenizer=<function <lambda> at 0x7efd780004d0>,
                                 use_idf=True, vocabulary=None)),
                ('model',
                 LogisticRegression(C=10.0, class_weight=None, dual=False,
                                    f

In [18]:
print("\nF1:", f1_score(test_df['toxic'], model_pipeline_c_10.predict(test_df["text"])))


F1: 0.773162066138289


## Выводы

Обучили модель логистической регрессии, которая может определять токсичные комментарии. Для этого разбили исходные комментарии на токены, затем использовали pipeline, чтобы модели могла принимать на вход целые предложения. Повысили точность f1 меры с помощью подбора гиперпаметра.