# Классификация комментариев

Интернет-магазин запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 

Обучим модель классифицировать комментарии на позитивные и негативные. В нашем распоряжении набор данных с разметкой о токсичности правок.


**Этапы выполнения проекта:**

1. Загрузка и подготовка данных.
2. Обучение разных моделей. 
3. Выводы.


Столбец *text* содержит текст комментария, а *toxic* — целевой признак.

# 1. Подготовка

In [17]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from pymystem3 import Mystem
from catboost import CatBoostClassifier, Pool, cv

In [2]:
comments = pd.read_csv('/datasets/toxic_comments.csv')
comments

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [3]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
text     159571 non-null object
toxic    159571 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [4]:
# Деление на обучающую и тестовую выборки
train_features, test_features, train_target, test_target = train_test_split(
    comments['text'], comments['toxic'], test_size=0.25)

In [5]:
# Создадим корпус комментариев и преобразуем кодировку текста в unicode
corpus_train = train_features.values.astype('U')
corpus_test = test_features.values.astype('U')

In [6]:
m = Mystem()

In [7]:
# Лемматизация текста
def lemmatize(text):
    lemm = m.lemmatize(text)  
    return "".join(lemm)

In [8]:
for i in notebook.tqdm(range(corpus_train.shape[0])):
    corpus_train[i] = lemmatize(corpus_train[i])

HBox(children=(FloatProgress(value=0.0, max=119678.0), HTML(value='')))




In [9]:
for i in notebook.tqdm(range(corpus_test.shape[0])):
    corpus_test[i] = lemmatize(corpus_test[i])

HBox(children=(FloatProgress(value=0.0, max=39893.0), HTML(value='')))




In [10]:
# Английские стоп-слова, от которых надо избавить текст
nltk.download('stopwords')
stop_words = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Вычисление TF-IDF
tf_idf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tf_idf_train = tf_idf_vectorizer.fit_transform(corpus_train)

In [12]:
tf_idf_test = tf_idf_vectorizer.transform(corpus_test)

Данные поделены на выборки и векторизованы.

In [13]:
# Удалим ненужные переменные дабы очистить память
del comments, corpus_train, corpus_test

In [14]:
# Уменьшим размерность массива для обучения градиентному бустингу
svd = TruncatedSVD(n_components=100, random_state=42)

In [15]:
svd.fit(tf_idf_train)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
             random_state=42, tol=0.0)

Ядро юпитера не справилось с обрезанием и до 1000 компонент, удалось сократить до 100. При таком сильном усечении массива не стоит надеятся на высокое значение метрики, но посмотрим что получится.

In [16]:
tf_idf_train_svd = svd.transform(tf_idf_train)

Ниже влоть до пункта 2 идет код работы с BERT для получения эмбеддингов.

In [9]:
#Возьмем 22000 записей, чтобы ядро осилило
train_features_for_emb = train_features.sample(22000)

In [11]:
train_target_for_emb = pd.Series(train_target, index=train_features_for_emb.index)

In [14]:
train_features_for_emb = train_features_for_emb.reset_index(drop=True)
train_target_for_emb = train_target_for_emb.reset_index(drop=True)

In [16]:
pretrained_weights = 'bert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_weights)

In [17]:
# Ограничим длину комментов 512 токенами, так как большее количество в BERT не влезет
tokenized = train_features_for_emb.apply(
    lambda x: tokenizer.encode(x[:512], add_special_tokens=True))

In [18]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [19]:
config = transformers.AutoConfig.from_pretrained(pretrained_model_name_or_path=pretrained_weights)
config.output_hidden_states=True

In [20]:
model = transformers.BertModel.from_pretrained(
    'bert-base-uncased', config=config)

In [None]:
batch_size = 500
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

HBox(children=(FloatProgress(value=0.0, max=44.0), HTML(value='')))

Даже на 22000 записей ядро не выдержало и умерло( и мы остались без эмбеддингов.  
Оно меня выбесило, поэтому переходим к обучению логистической регрессии на TF-IDF.

# 2. Обучение

In [37]:
model = LogisticRegression(random_state=12345, class_weight='balanced')

In [38]:
%%time
model.fit(tf_idf_train, train_target)



CPU times: user 5.78 s, sys: 36.5 ms, total: 5.82 s
Wall time: 5.82 s


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=12345, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
predictions = model.predict(tf_idf_test)
f1_score(test_target, predictions)

0.7553879310344828

Отлично, ну вот все и готово.

In [26]:
#Обучим модель градиентного бустинга из catboost применив кросс-валидацию
cv_comment = Pool(data=tf_idf_train,
              label=train_target)

In [32]:
cv_comment_svd = Pool(data=tf_idf_train_svd,
              label=train_target)

In [20]:
#Параметры модели
params = {'iterations': 1000,
         'depth': 6,
          'loss_function': 'Logloss',
         'eval_metric': 'F1',
          'learning_rate': 0.1,
         'verbose': 20,
         'random_state': 12345}

In [21]:
cv(cv_comment, params, fold_count=3, plot=False, as_pandas=True)

0:	learn: 0.4291132	test: 0.4291765	best: 0.4291765 (0)	total: 2.15s	remaining: 35m 52s
20:	learn: 0.5725016	test: 0.5712742	best: 0.5712742 (20)	total: 24.8s	remaining: 19m 17s
40:	learn: 0.6036546	test: 0.5965714	best: 0.5965714 (40)	total: 49.4s	remaining: 19m 14s
60:	learn: 0.6203272	test: 0.6121795	best: 0.6121795 (60)	total: 1m 10s	remaining: 18m 6s
80:	learn: 0.6325435	test: 0.6220651	best: 0.6220651 (80)	total: 1m 29s	remaining: 16m 52s
100:	learn: 0.6418868	test: 0.6286609	best: 0.6286609 (100)	total: 1m 48s	remaining: 16m 4s
120:	learn: 0.6511747	test: 0.6330834	best: 0.6330834 (120)	total: 2m 6s	remaining: 15m 22s
140:	learn: 0.6602021	test: 0.6378591	best: 0.6378591 (140)	total: 2m 27s	remaining: 14m 58s
160:	learn: 0.6686222	test: 0.6412070	best: 0.6412070 (160)	total: 2m 46s	remaining: 14m 27s
180:	learn: 0.6755148	test: 0.6439013	best: 0.6441409 (179)	total: 3m 5s	remaining: 13m 59s
200:	learn: 0.6818275	test: 0.6462451	best: 0.6466228 (198)	total: 3m 25s	remaining: 13m 

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.429177,0.007167,0.429113,0.002662,0.568556,0.000461,0.568477,0.000609
1,1,0.510174,0.019311,0.509906,0.018087,0.475729,0.003272,0.475518,0.003752
2,2,0.521336,0.014535,0.521932,0.013816,0.410308,0.002642,0.410058,0.003466
3,3,0.505338,0.027578,0.506219,0.022688,0.363197,0.000874,0.362860,0.001314
4,4,0.515134,0.016122,0.515028,0.011787,0.328030,0.001361,0.327612,0.001883
...,...,...,...,...,...,...,...,...,...
995,995,0.669421,0.003800,0.858220,0.003213,0.159546,0.001758,0.075371,0.000235
996,996,0.669253,0.003979,0.858655,0.003115,0.159558,0.001743,0.075329,0.000246
997,997,0.669318,0.003928,0.859103,0.003231,0.159561,0.001739,0.075276,0.000247
998,998,0.669014,0.003943,0.859209,0.003243,0.159558,0.001738,0.075216,0.000259


In [22]:
params2 = {'iterations': 1000,
         'depth': 12,
          'loss_function': 'Logloss',
         'eval_metric': 'F1',
          'learning_rate': 0.1,
         'verbose': 20,
         'random_state': 12345}

In [23]:
cv(cv_comment, params2, fold_count=3, plot=False, as_pandas=True)

0:	learn: 0.5676253	test: 0.5589269	best: 0.5589269 (0)	total: 12s	remaining: 3h 19m 49s
20:	learn: 0.6346613	test: 0.5937427	best: 0.5937427 (20)	total: 3m 58s	remaining: 3h 5m 6s
40:	learn: 0.7061414	test: 0.6132653	best: 0.6132653 (40)	total: 7m 44s	remaining: 3h 1m 6s
60:	learn: 0.7759456	test: 0.6235563	best: 0.6235563 (60)	total: 11m 30s	remaining: 2h 57m 7s
80:	learn: 0.8382217	test: 0.6287549	best: 0.6287549 (80)	total: 15m 17s	remaining: 2h 53m 28s
100:	learn: 0.8882192	test: 0.6322109	best: 0.6331468 (97)	total: 19m 4s	remaining: 2h 49m 45s
120:	learn: 0.9220210	test: 0.6357521	best: 0.6361134 (112)	total: 22m 48s	remaining: 2h 45m 39s
140:	learn: 0.9468698	test: 0.6391859	best: 0.6394615 (137)	total: 26m 32s	remaining: 2h 41m 40s
160:	learn: 0.9661587	test: 0.6397797	best: 0.6403359 (156)	total: 30m 19s	remaining: 2h 38m 4s
180:	learn: 0.9791974	test: 0.6421528	best: 0.6421528 (180)	total: 34m 3s	remaining: 2h 34m 4s
200:	learn: 0.9874288	test: 0.6445350	best: 0.6445350 (200

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.558927,0.004522,0.567625,0.001635,0.564532,0.000147,0.563265,0.000333
1,1,0.555481,0.008856,0.568618,0.002545,0.471683,0.000319,0.468980,0.000374
2,2,0.554462,0.006003,0.567093,0.005595,0.404133,0.002589,0.400545,0.002856
3,3,0.556846,0.006002,0.567622,0.004779,0.352924,0.001583,0.348472,0.001795
4,4,0.562850,0.005845,0.576655,0.002337,0.312923,0.000761,0.307629,0.001132
...,...,...,...,...,...,...,...,...,...
995,995,0.653464,0.007376,0.999053,0.000292,0.228753,0.002589,0.002776,0.000026
996,996,0.653427,0.007351,0.999053,0.000292,0.228780,0.002587,0.002775,0.000027
997,997,0.653535,0.007320,0.999053,0.000292,0.228803,0.002585,0.002772,0.000029
998,998,0.653359,0.007372,0.999053,0.000292,0.228822,0.002572,0.002771,0.000030


Наилучший показатель f1 при depth = 6 и iterations = 991.

In [27]:
# Обучим модель CatBoostClassifier сначало на полном массиве TF-IDF, затем на усеченном
model_catboost = CatBoostClassifier(iterations=991,
         depth=6,
         eval_metric='F1',
         learning_rate=0.1,
         verbose=20,
         random_state=12345)

In [28]:
model_catboost.fit(cv_comment)

0:	learn: 0.4125807	total: 6.4s	remaining: 1h 45m 33s
20:	learn: 0.5161600	total: 1m 54s	remaining: 1h 27m 53s
40:	learn: 0.5790973	total: 3m 43s	remaining: 1h 26m 8s
60:	learn: 0.6070712	total: 5m 31s	remaining: 1h 24m 20s
80:	learn: 0.6281897	total: 7m 22s	remaining: 1h 22m 53s
100:	learn: 0.6449178	total: 9m 13s	remaining: 1h 21m 15s
120:	learn: 0.6607485	total: 11m 4s	remaining: 1h 19m 39s
140:	learn: 0.6731599	total: 12m 53s	remaining: 1h 17m 43s
160:	learn: 0.6837821	total: 14m 44s	remaining: 1h 15m 57s
180:	learn: 0.6918656	total: 16m 34s	remaining: 1h 14m 10s
200:	learn: 0.7025417	total: 18m 24s	remaining: 1h 12m 19s
220:	learn: 0.7126799	total: 20m 13s	remaining: 1h 10m 27s
240:	learn: 0.7196243	total: 22m 3s	remaining: 1h 8m 37s
260:	learn: 0.7236888	total: 23m 52s	remaining: 1h 6m 47s
280:	learn: 0.7295377	total: 25m 44s	remaining: 1h 5m 1s
300:	learn: 0.7339285	total: 27m 35s	remaining: 1h 3m 14s
320:	learn: 0.7374702	total: 29m 27s	remaining: 1h 1m 28s
340:	learn: 0.741684

<catboost.core.CatBoostClassifier at 0x7f8af6c35d50>

In [22]:
tf_idf_test_svd = svd.transform(tf_idf_test)

In [29]:
pool_test = Pool(data=tf_idf_test, label=test_target)

In [30]:
predict = model_catboost.predict(pool_test)
f1_score(test_target, predict)

0.7569942873883113

Ура! Ноутбук справился с полным массивом и ядро не умерло. Хвала тем, кто ее заслуживает.

In [31]:
model_catboost_svd = CatBoostClassifier(iterations=991,
         depth=6,
         eval_metric='F1',
         learning_rate=0.1,
         verbose=20,
         random_state=12345)

In [33]:
model_catboost_svd.fit(cv_comment_svd)

0:	learn: 0.4329339	total: 481ms	remaining: 7m 56s
20:	learn: 0.5802242	total: 9.67s	remaining: 7m 26s
40:	learn: 0.6062653	total: 18.9s	remaining: 7m 17s
60:	learn: 0.6214009	total: 27.5s	remaining: 6m 59s
80:	learn: 0.6306661	total: 36.4s	remaining: 6m 49s
100:	learn: 0.6404144	total: 45.3s	remaining: 6m 39s
120:	learn: 0.6486345	total: 55.1s	remaining: 6m 35s
140:	learn: 0.6562223	total: 1m 3s	remaining: 6m 25s
160:	learn: 0.6625410	total: 1m 12s	remaining: 6m 14s
180:	learn: 0.6688776	total: 1m 21s	remaining: 6m 4s
200:	learn: 0.6745134	total: 1m 30s	remaining: 5m 54s
220:	learn: 0.6802203	total: 1m 38s	remaining: 5m 44s
240:	learn: 0.6855705	total: 1m 47s	remaining: 5m 35s
260:	learn: 0.6910078	total: 1m 56s	remaining: 5m 27s
280:	learn: 0.6951238	total: 2m 5s	remaining: 5m 17s
300:	learn: 0.6986002	total: 2m 14s	remaining: 5m 8s
320:	learn: 0.7030211	total: 2m 23s	remaining: 4m 59s
340:	learn: 0.7089926	total: 2m 32s	remaining: 4m 49s
360:	learn: 0.7110842	total: 2m 40s	remaining

<catboost.core.CatBoostClassifier at 0x7f8af6b02150>

In [34]:
pool_test_svd = Pool(data=tf_idf_test_svd, label=test_target)

In [35]:
predict_svd = model_catboost_svd.predict(pool_test_svd)
f1_score(test_target, predict_svd)

0.6813153042409343

Как и следовало ожидать при сильном обрезании матрицы модель проигрывает в качестве и полученное значение метрики F1 нас не удовлетворяет.

# 3. Выводы

Хорошо себя показала логистическая регрессия при обучении на *TF-IDF*, полученная метрика *F1* = 0.755. Еще большее значение метрики удалось получить при обучение модели *CatBoostClassifier* на той же выборке: *F1* = 0.757. Однако, я бы остановился на модели логистической регрессии, так как время обучения этой модели по сравнению с *CatBoostClassifier* (~ 6c против 1ч30мин у CatBoost) перекрывает незначительную разницу в значениях метрики *F1* для этих моделей.