## Подготовка

In [1]:
import pandas as pd

import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from tqdm import tqdm
tqdm.pandas()

In [2]:
try:
    comms = pd.read_csv(r"C:\Users\Кеша-Юнити\Desktop\Практикум\Токсичные комментарии\toxic_comments.csv")
except:
    comms = pd.read_csv('/datasets/toxic_comments.csv')

comms.head(10)

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0
5,5,"""\n\nCongratulations from me as well, use the ...",0
6,6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,7,Your vandalism to the Matt Shirvington article...,0
8,8,Sorry if the word 'nonsense' was offensive to ...,0
9,9,alignment on this subject and which are contra...,0


Создадим список стоп-слов и список для фраз, которые не несут смысловой нагрузки. <p>
Создадим функцию для лемматизации и очистки текста. <p>

In [3]:
trash_list = ['http\S+', '@\S+', '#\S+', '[^\w\s]', '\d+']
stop_words = set(stopwords.words('english'))

def clean_text(text):

    for txt in trash_list:
        text = re.sub(txt, ' ', text)
        text = re.sub(r'[^a-zA-Z ]', ' ', text.lower())

    text = text.lower()
    text = text.split()
    text = [WordNetLemmatizer().lemmatize(word) for word in text if word not in stop_words]
    text = ' '.join(text)

    return text

Применим функцию к столбцу с текстом комментария. <p>

In [4]:
comms['text'] = comms['text'].progress_apply(clean_text) 
comms['text']

100%|██████████| 159292/159292 [00:17<00:00, 9187.28it/s] 


0         explanation edits made username hardcore metal...
1         aww match background colour seemingly stuck th...
2         hey man really trying edit war guy constantly ...
3         make real suggestion improvement wondered sect...
4                             sir hero chance remember page
                                ...                        
159287    second time asking view completely contradicts...
159288                 ashamed horrible thing put talk page
159289    spitzer umm there actual article prostitution ...
159290    look like actually put speedy first version de...
159291    really think understand came idea bad right aw...
Name: text, Length: 159292, dtype: object

Разобъем корпус на обучающую и тестовую выборки. <p>

In [6]:
train, test = train_test_split(comms, test_size=0.2, random_state=12345)

Разобъем выборки на признаки и целевой признак. Используем стратификацию по признаку "токсичности". <p>

In [None]:
features_train, features_test, target_train, target_test = train_test_split(comms['text'], comms['toxic'], test_size=0.2, random_state=12345, stratify=comms['toxic'])
len(features_train), len(features_test)

(127433, 31859)

## Обучение

Создадим пайплайны для векторизации и подбора параметров для каждой модели. <p>

In [8]:
pipeline_lr = Pipeline([
    ('tf_idf', TfidfVectorizer()),
    ('lr', LogisticRegression(random_state=12345)),
])

parameters_lr = {
    'tf_idf__max_df': (0.25, 0.5, 0.75),
    'lr__C': [1, 10, 100]}

In [9]:
pipeline_sgd = Pipeline([
    ('tf_idf', TfidfVectorizer()),
    ('sgd', SGDClassifier(random_state=12345)),
])

parameters_sgd = {
    'tf_idf__max_df': (0.25, 0.5, 0.75),
    'sgd__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

In [10]:
pipeline_svc = Pipeline([
    ('tf_idf', TfidfVectorizer()),
    ('svc', LinearSVC(random_state=12345)),
])

parameters_svc = {
    'tf_idf__max_df': (0.25, 0.5, 0.75),
    'svc__C': [0.1, 1, 10, 100, 1000]}

Обучим несколько моделей и выберем лучшую.

In [11]:
grid_search_lr = GridSearchCV(pipeline_lr, parameters_lr, cv=5, n_jobs=-1, scoring='f1', verbose=10)
grid_search_lr.fit(features_train, target_train)

f1_score_lr = grid_search_lr.best_score_
f1_score_lr

Fitting 5 folds for each of 9 candidates, totalling 45 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7760789414597233

In [12]:
grid_search_sdg = GridSearchCV(pipeline_sgd, parameters_sgd, cv=5, n_jobs=-1, scoring='f1', verbose=10)
grid_search_sdg.fit(features_train, target_train)

f1_score_sdg = grid_search_sdg.best_score_
f1_score_sdg

Fitting 5 folds for each of 21 candidates, totalling 105 fits


0.641375765685461

In [13]:
grid_search_svc = GridSearchCV(pipeline_svc, parameters_svc, cv=5, n_jobs=-1, scoring='f1', verbose=10)
grid_search_svc.fit(features_train, target_train)

f1_score_svc = grid_search_svc.best_score_
f1_score_svc

Fitting 5 folds for each of 15 candidates, totalling 75 fits


0.7816288221722199

Создадим табличку для сравнения результатов.

In [14]:
model_comparison = pd.DataFrame({'Модель': ['Logistic Regression', 'SGDClassifier', 'LinearSVC'],
                                    'F1 score': [f1_score_lr, f1_score_sdg, f1_score_svc]})

model_comparison

Unnamed: 0,Модель,F1 score
0,Logistic Regression,0.776079
1,SGDClassifier,0.641376
2,LinearSVC,0.781629


И объявим победителя.

In [15]:
best_f1 = model_comparison['F1 score'].max().round(2)
best_model = model_comparison["Модель"][model_comparison["F1 score"].idxmax()]

In [16]:
print(f'Лучшей моделью с F1-score равным {best_f1} оказалась {best_model}. Проверим её на тестовой выборке.')

Лучшей моделью с F1-score равным 0.78 оказалась LinearSVC. Проверим её на тестовой выборке.


В ходе обучения использовались несколько моделей с подобранными GridSearchCV параметрами. <p>
Наилучший результат на тренировочной выборке показала модель LinearSVC со значением F1 = 0.78. <p>

## Выводы

Протестируем лучшую модель, в данном случае это LinearSVC.

In [18]:
best_model_predicts = grid_search_svc.predict(features_test)
best_model_f1 = f1_score(target_test, best_model_predicts)
print(f"F1 score: {best_model_f1:.3f}")

F1 score: 0.790


Модель LinearSVC показала адекватный результат на тестовой выборке: Значение F1-меры - 0,790. В связи с этим, можно смело рекомендовать ее зказчику.

## Чек-лист проверки

- [x]  Jupyter Notebook открыт
- [x]  Весь код выполняется без ошибок
- [x]  Ячейки с кодом расположены в порядке исполнения
- [x]  Данные загружены и подготовлены
- [x]  Модели обучены
- [x]  Значение метрики *F1* не меньше 0.75
- [x]  Выводы написаны