Цель проекта - определить тональность комментариев, с целью отправить токсичные комментарии на модерацию.

Основная метрика - F1.
Желаемый результат - не менее 0.75.

# 1. Подготовка

In [1]:
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import os

[nltk_data] Downloading package stopwords to C:\Users\Lenovo
[nltk_data]     ThinkPad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
path = os.getcwd()

In [3]:
comments = pd.read_csv(path + '/toxic_comments.csv')
comments.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [4]:
# разделим датасет на тренировочный и тестовый, оставим для тестовой выборки 25% данных
train, test = train_test_split(comments, test_size=0.25, random_state=1234)
train_target = train['toxic']
test_target = test['toxic']

In [5]:
# комментарии в столбце 'text' необходимо предобработать: сначала осуществим стемминг всех слов 
# в комментариях, затем с помощью функции re_sub() очистим их от лишних символов, методы join()и 
# slip() помогут избавиться от лишних пробелов.
stemmer = SnowballStemmer('english')
def stemming(text):
    text = stemmer.stem(text)
    text = ' '.join((re.sub(r'[^a-z]', ' ', text)).split())
    return text
corpus = pd.Series([stemming(text) for text in train['text']])

print(corpus[0:4])
# обработанные комментарии мы сохранили в переменной corpus

0    talk boston tea party howdy i noticed that you...
1    i t report this is a major report on political...
2    metesky hi i noticed your work on george metes...
3    this shouldn t be allowed to continue the auth...
dtype: object


In [6]:
corpus = corpus.astype('U')
# здесь мы изменили кодировку на Unicode

In [7]:
# уберем из корпуса слова, которые не несут смысловую нагрузку
# и посчитаем TF-IDF для нашего корпуса текстов
stopwords = set(nltk_stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf = count_tf_idf.fit_transform(corpus) 
print(tf_idf.shape)
# в переменной tf_idf мы сохранили величины TF-IDF, которые будем использовать в качестве  
# признаков для обучения моделей

(119678, 143536)


In [8]:
# проведем аналогичные преобразования текстов на тестовой выборке
stemmer = SnowballStemmer('english')
def stemming(text):
    text = stemmer.stem(text)
    text = ' '.join((re.sub(r'[^a-z]', ' ', text)).split())
    return text
corpus_test = pd.Series([stemming(text) for text in test['text']])
corpus_test = corpus_test.astype('U')
tf_idf_test = count_tf_idf.transform(corpus_test)
tf_idf_test.shape

(39893, 143536)

In [9]:
# наблюдается дисбаланс классов - положительных ответов целевого признака всего 10%
# при обучении моделей укажем значение 'balanced' для параметра class_weight
comments['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

# 2. Обучение моделей

In [10]:
logreg = LogisticRegression(solver='lbfgs', class_weight='balanced')
logreg.fit(tf_idf, train_target)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
train_pred_lg = logreg.predict(tf_idf)
print('F1 на тренировочной выборке:', f1_score(train_pred_lg, train_target))

F1 на тренировочной выборке: 0.8434411660149307


In [12]:
catboost = CatBoostClassifier()
catboost.fit(tf_idf, train_target)

Learning rate set to 0.079478
0:	learn: 0.6096643	total: 3.24s	remaining: 53m 55s
1:	learn: 0.5442156	total: 5.62s	remaining: 46m 43s
2:	learn: 0.4894510	total: 7.94s	remaining: 43m 58s
3:	learn: 0.4460159	total: 10.2s	remaining: 42m 31s
4:	learn: 0.4096329	total: 12.6s	remaining: 41m 48s
5:	learn: 0.3800085	total: 14.9s	remaining: 41m 9s
6:	learn: 0.3544681	total: 17.2s	remaining: 40m 44s
7:	learn: 0.3349875	total: 19.6s	remaining: 40m 25s
8:	learn: 0.3184662	total: 21.8s	remaining: 40m 3s
9:	learn: 0.3050539	total: 24.3s	remaining: 40m
10:	learn: 0.2931938	total: 26.5s	remaining: 39m 45s
11:	learn: 0.2832266	total: 28.8s	remaining: 39m 31s
12:	learn: 0.2751533	total: 31.1s	remaining: 39m 18s
13:	learn: 0.2681212	total: 33.3s	remaining: 39m 5s
14:	learn: 0.2621316	total: 35.8s	remaining: 39m 12s
15:	learn: 0.2571891	total: 38.1s	remaining: 39m 2s
16:	learn: 0.2518552	total: 40.4s	remaining: 38m 55s
17:	learn: 0.2476476	total: 42.7s	remaining: 38m 48s
18:	learn: 0.2435234	total: 45.2s	

152:	learn: 0.1662341	total: 6m 36s	remaining: 36m 35s
153:	learn: 0.1660572	total: 6m 39s	remaining: 36m 33s
154:	learn: 0.1658582	total: 6m 41s	remaining: 36m 29s
155:	learn: 0.1655967	total: 6m 44s	remaining: 36m 28s
156:	learn: 0.1654233	total: 6m 46s	remaining: 36m 24s
157:	learn: 0.1652426	total: 6m 49s	remaining: 36m 23s
158:	learn: 0.1650243	total: 6m 52s	remaining: 36m 21s
159:	learn: 0.1648188	total: 6m 54s	remaining: 36m 16s
160:	learn: 0.1646020	total: 6m 56s	remaining: 36m 12s
161:	learn: 0.1644045	total: 6m 59s	remaining: 36m 9s
162:	learn: 0.1642456	total: 7m 1s	remaining: 36m 5s
163:	learn: 0.1640669	total: 7m 4s	remaining: 36m 3s
164:	learn: 0.1638889	total: 7m 6s	remaining: 35m 58s
165:	learn: 0.1637016	total: 7m 9s	remaining: 35m 56s
166:	learn: 0.1634891	total: 7m 11s	remaining: 35m 52s
167:	learn: 0.1632341	total: 7m 13s	remaining: 35m 48s
168:	learn: 0.1630349	total: 7m 16s	remaining: 35m 45s
169:	learn: 0.1628948	total: 7m 19s	remaining: 35m 45s
170:	learn: 0.162

301:	learn: 0.1437492	total: 12m 57s	remaining: 29m 57s
302:	learn: 0.1436557	total: 13m	remaining: 29m 54s
303:	learn: 0.1435782	total: 13m 4s	remaining: 29m 56s
304:	learn: 0.1434281	total: 13m 7s	remaining: 29m 54s
305:	learn: 0.1433681	total: 13m 10s	remaining: 29m 53s
306:	learn: 0.1432397	total: 13m 13s	remaining: 29m 51s
307:	learn: 0.1431508	total: 13m 16s	remaining: 29m 49s
308:	learn: 0.1429834	total: 13m 19s	remaining: 29m 47s
309:	learn: 0.1428581	total: 13m 22s	remaining: 29m 46s
310:	learn: 0.1427747	total: 13m 25s	remaining: 29m 44s
311:	learn: 0.1426119	total: 13m 28s	remaining: 29m 42s
312:	learn: 0.1425304	total: 13m 31s	remaining: 29m 40s
313:	learn: 0.1424516	total: 13m 33s	remaining: 29m 38s
314:	learn: 0.1423767	total: 13m 36s	remaining: 29m 35s
315:	learn: 0.1423100	total: 13m 39s	remaining: 29m 33s
316:	learn: 0.1422055	total: 13m 42s	remaining: 29m 31s
317:	learn: 0.1421370	total: 13m 45s	remaining: 29m 29s
318:	learn: 0.1420233	total: 13m 47s	remaining: 29m 27

449:	learn: 0.1314926	total: 20m 6s	remaining: 24m 34s
450:	learn: 0.1314509	total: 20m 8s	remaining: 24m 31s
451:	learn: 0.1313984	total: 20m 11s	remaining: 24m 28s
452:	learn: 0.1313352	total: 20m 14s	remaining: 24m 26s
453:	learn: 0.1312686	total: 20m 17s	remaining: 24m 24s
454:	learn: 0.1312328	total: 20m 20s	remaining: 24m 22s
455:	learn: 0.1311375	total: 20m 23s	remaining: 24m 20s
456:	learn: 0.1310748	total: 20m 27s	remaining: 24m 18s
457:	learn: 0.1309994	total: 20m 30s	remaining: 24m 16s
458:	learn: 0.1309601	total: 20m 33s	remaining: 24m 14s
459:	learn: 0.1309092	total: 20m 36s	remaining: 24m 11s
460:	learn: 0.1308514	total: 20m 39s	remaining: 24m 8s
461:	learn: 0.1308171	total: 20m 41s	remaining: 24m 5s
462:	learn: 0.1307407	total: 20m 44s	remaining: 24m 3s
463:	learn: 0.1306748	total: 20m 47s	remaining: 24m 1s
464:	learn: 0.1305978	total: 20m 50s	remaining: 23m 58s
465:	learn: 0.1305615	total: 20m 52s	remaining: 23m 55s
466:	learn: 0.1305162	total: 20m 54s	remaining: 23m 52

597:	learn: 0.1232161	total: 27m 13s	remaining: 18m 17s
598:	learn: 0.1231903	total: 27m 15s	remaining: 18m 14s
599:	learn: 0.1231607	total: 27m 17s	remaining: 18m 11s
600:	learn: 0.1231337	total: 27m 20s	remaining: 18m 8s
601:	learn: 0.1230970	total: 27m 23s	remaining: 18m 6s
602:	learn: 0.1230724	total: 27m 25s	remaining: 18m 3s
603:	learn: 0.1230402	total: 27m 27s	remaining: 18m
604:	learn: 0.1229796	total: 27m 30s	remaining: 17m 57s
605:	learn: 0.1229193	total: 27m 32s	remaining: 17m 54s
606:	learn: 0.1228934	total: 27m 35s	remaining: 17m 51s
607:	learn: 0.1228484	total: 27m 38s	remaining: 17m 49s
608:	learn: 0.1227687	total: 27m 41s	remaining: 17m 46s
609:	learn: 0.1227016	total: 27m 43s	remaining: 17m 43s
610:	learn: 0.1226757	total: 27m 45s	remaining: 17m 40s
611:	learn: 0.1226101	total: 27m 48s	remaining: 17m 37s
612:	learn: 0.1225863	total: 27m 50s	remaining: 17m 34s
613:	learn: 0.1225615	total: 27m 52s	remaining: 17m 31s
614:	learn: 0.1225368	total: 27m 55s	remaining: 17m 28s

745:	learn: 0.1172247	total: 33m 41s	remaining: 11m 28s
746:	learn: 0.1171578	total: 33m 43s	remaining: 11m 25s
747:	learn: 0.1171192	total: 33m 46s	remaining: 11m 22s
748:	learn: 0.1170521	total: 33m 48s	remaining: 11m 19s
749:	learn: 0.1170318	total: 33m 50s	remaining: 11m 16s
750:	learn: 0.1169449	total: 33m 52s	remaining: 11m 13s
751:	learn: 0.1168874	total: 33m 55s	remaining: 11m 11s
752:	learn: 0.1168420	total: 33m 57s	remaining: 11m 8s
753:	learn: 0.1168227	total: 34m	remaining: 11m 5s
754:	learn: 0.1167798	total: 34m 2s	remaining: 11m 2s
755:	learn: 0.1167026	total: 34m 5s	remaining: 11m
756:	learn: 0.1166397	total: 34m 8s	remaining: 10m 57s
757:	learn: 0.1166150	total: 34m 10s	remaining: 10m 54s
758:	learn: 0.1165961	total: 34m 13s	remaining: 10m 51s
759:	learn: 0.1165673	total: 34m 15s	remaining: 10m 49s
760:	learn: 0.1165136	total: 34m 18s	remaining: 10m 46s
761:	learn: 0.1164947	total: 34m 20s	remaining: 10m 43s
762:	learn: 0.1164532	total: 34m 23s	remaining: 10m 40s
763:	l

895:	learn: 0.1117321	total: 40m 15s	remaining: 4m 40s
896:	learn: 0.1116931	total: 40m 17s	remaining: 4m 37s
897:	learn: 0.1116508	total: 40m 19s	remaining: 4m 34s
898:	learn: 0.1116338	total: 40m 22s	remaining: 4m 32s
899:	learn: 0.1115861	total: 40m 24s	remaining: 4m 29s
900:	learn: 0.1115530	total: 40m 26s	remaining: 4m 26s
901:	learn: 0.1115000	total: 40m 28s	remaining: 4m 23s
902:	learn: 0.1114741	total: 40m 31s	remaining: 4m 21s
903:	learn: 0.1114475	total: 40m 33s	remaining: 4m 18s
904:	learn: 0.1113952	total: 40m 35s	remaining: 4m 15s
905:	learn: 0.1113788	total: 40m 37s	remaining: 4m 12s
906:	learn: 0.1113553	total: 40m 39s	remaining: 4m 10s
907:	learn: 0.1113013	total: 40m 42s	remaining: 4m 7s
908:	learn: 0.1112730	total: 40m 44s	remaining: 4m 4s
909:	learn: 0.1112335	total: 40m 46s	remaining: 4m 2s
910:	learn: 0.1111853	total: 40m 49s	remaining: 3m 59s
911:	learn: 0.1111557	total: 40m 51s	remaining: 3m 56s
912:	learn: 0.1111395	total: 40m 53s	remaining: 3m 53s
913:	learn: 0

<catboost.core.CatBoostClassifier at 0x2a280b47808>

In [13]:
train_pred_cb = catboost.predict(tf_idf)
print('F1 на тренировочной выборке:', f1_score(train_pred_cb, train_target))

F1 на тренировочной выборке: 0.79108873266198


## Тестирование

### Модель LogisticRegression

In [93]:
test_pred_lg = logreg.predict(tf_idf_test)
print('F1 на тестовой выборке:', f1_score(test_pred_lg, test_target))

F1 на тестовой выборке: 0.7525195968645018


### Модель CatBoostClassifier

In [94]:
test_pred_cb = catboost.predict(tf_idf_test)
print('F1 на тестовой выборке:', f1_score(test_pred_cb, test_target))

F1 на тестовой выборке: 0.7362406015037594


# 3. Выводы

Для классификации комментариев на позитивные и негативные мы обучили две модели. В текущих настройках LogisticRegression показала лучший результат тренировочной и на тестовой выборках.