# Машинное обучение для текстов

# Задача

Интернет-магазин дал пользователям возможность комментировать и дополнять описания товаров. Необходимо создать инструмент для поиска токсичных комментариев и отправки их на модерацию, то есть обучить модель классифицировать комментарии на позитивные и негативные (токсичные).

# Требования

- метрика f1 должна быть не меньше 0.75

# Шаг 1. Загрузка, анализ и подготовка данных

In [1]:
import pandas as pd
import re
# import numpy as np
import time

# import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


In [2]:
# Load the comments dataset: try the absolute /datasets path first and fall
# back to a copy in the current working directory.
primary_csv = '/datasets/toxic_comments.csv'
fallback_csv = 'toxic_comments.csv'
try:
    data = pd.read_csv(primary_csv)
except FileNotFoundError:
    data = pd.read_csv(fallback_csv)
data

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [3]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [4]:
# Class balance of the target: number of non-toxic (0) vs toxic (1) comments.
data.toxic.value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

In [5]:
# Store the 0/1 target in the smallest integer dtype that fits to save memory.
data['toxic'] = pd.to_numeric(data.toxic, downcast='integer')

Как видно из общего анализа данных, негативных комментариев всего около 10% от общего числа записей (16 225 из 159 571). Этот дисбаланс классов необходимо будет учитывать. Также тип столбца `toxic` изменён на более компактный целочисленный для уменьшения занимаемой памяти.

In [6]:
# Shared text-processing helpers used by clear_prep_text:
# a single WordNet lemmatizer instance and a regex-based word/punctuation
# tokenizer function.
wnl = WordNetLemmatizer()
tokenizer = nltk.tokenize.wordpunct_tokenize

In [7]:
def clear_prep_text(text):
    """Return *text* cleaned, lower-cased and lemmatized as one string.

    Every character that is not an ASCII letter or a space is replaced by a
    space, so apostrophes split contractions ("I'm" becomes "i m"). The result
    is stripped, lower-cased, tokenized with the module-level ``tokenizer``
    and each token is passed through ``wnl.lemmatize``. The lemmas are joined
    back with single spaces.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument,
    so it uses the lemmatizer's default (noun); this is presumably why the
    notebook output shows "was" reduced to "wa" -- confirm whether POS-aware
    lemmatization is wanted.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text)
    normalized = letters_only.strip().lower()
    lemmas = map(wnl.lemmatize, tokenizer(normalized))
    return ' '.join(lemmas)

In [8]:
# Build the cleaned/lemmatized column, then discard the raw text, which is
# not used by any later cell shown in this notebook.
cleaned = data['text'].apply(clear_prep_text)
data['clear_text'] = cleaned
data = data.drop(columns='text')
data

Unnamed: 0,toxic,clear_text
0,0,explanation why the edits made under my userna...
1,0,d aww he match this background colour i m seem...
2,0,hey man i m really not trying to edit war it s...
3,0,more i can t make any real suggestion on impro...
4,0,you sir are my hero any chance you remember wh...
...,...,...
159566,0,and for the second time of asking when your vi...
159567,0,you should be ashamed of yourself that is a ho...
159568,0,spitzer umm there no actual article for prosti...
159569,0,and it look like it wa actually you who put on...


# Шаг 2. Обучение моделей

In [9]:
# Hold out 20% of the rows for the final check. The split is stratified on
# the target so both parts keep the same toxic/non-toxic ratio, and seeded so
# it is repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['toxic'],
    random_state=12345,
)

## SGDClassifier

In [10]:
# Text classification pipeline: token counts -> TF-IDF weights -> linear
# model trained with stochastic gradient descent.
# random_state is pinned (same seed as the train/test split) because
# SGDClassifier shuffles the training data between epochs; without a seed the
# reported F1 changes from run to run.
pipeline_sgd = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("SGD", SGDClassifier(random_state=12345)),
    ]
)

In [11]:
# Grid for the SGD pipeline. Each option holds a single value, so the
# "search" only cross-validates this one configuration.
# Options that were tried and then disabled:
#   vect__max_features: None, 5000, 10000, 50000
#   tfidf__use_idf:     True, False
#   SGD__max_iter:      10, 50, 80
parameters_sgd = {
    "vect__max_df": [0.75],
    "vect__ngram_range": [(1, 2)],  # unigrams and bigrams together
    "tfidf__norm": ["l2"],
    "SGD__max_iter": [20],
    "SGD__alpha": [0.000001],
    "SGD__penalty": ["elasticnet"],
}

In [12]:
# Cross-validated search over parameters_sgd, scored by F1 and run in
# parallel (n_jobs=-1).
gs_sgd = GridSearchCV(
    pipeline_sgd,
    parameters_sgd,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

In [13]:
# Fit the search on the training part and report the wall-clock time, the
# best score found by the search (gs_sgd.best_score_) and the chosen
# parameters.
start = time.time()
gs_sgd.fit(train['clear_text'], train['toxic'])
elapsed = round(time.time() - start, 2)
print(f'Work time: {elapsed}')
print(f'Best F1 score of SGDClassifier on train: {gs_sgd.best_score_}')
print(gs_sgd.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Work time: 44.06
Best F1 score of SGDClassifier on train: 0.7929049905631452
{'SGD__alpha': 1e-06, 'SGD__max_iter': 20, 'SGD__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}


In [14]:
# Score the fitted search object on the held-out test part.
predict = gs_sgd.predict(test['clear_text'])
sgd_test_f1 = f1_score(test['toxic'], predict)
print(f'Best F1 score of SGDClassifier on test: {sgd_test_f1}')

Best F1 score of SGDClassifier on test: 0.7988602078444519


## LogisticRegression

In [15]:
# Same feature extraction as the SGD pipeline (token counts -> TF-IDF), with
# logistic regression as the final estimator.
# NOTE(review): LogisticRegression is left at its default iteration limit and
# warnings are silenced at the top of the notebook, so a convergence warning
# would not be visible -- confirm the solver converges.
pipeline_lr = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("lr", LogisticRegression()),
    ]
)

In [16]:
# Grid for the logistic-regression pipeline. Each option holds a single
# value, so only this one configuration is cross-validated.
# Options that were tried and then disabled:
#   vect__max_features: None, 5000, 10000, 50000
#   tfidf__use_idf:     True, False
parameters_lr = {
    "vect__max_df": [0.75],
    "vect__ngram_range": [(1, 2)],  # unigrams and bigrams together
    "tfidf__norm": ["l2"],
    "lr__C": [100],
}

In [17]:
# Cross-validated search over parameters_lr, scored by F1 and run in
# parallel (n_jobs=-1).
gs_lr = GridSearchCV(
    pipeline_lr,
    parameters_lr,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

In [18]:
# Fit the search on the training part and report the wall-clock time, the
# best score found by the search (gs_lr.best_score_) and the chosen
# parameters.
start = time.time()
gs_lr.fit(train['clear_text'], train['toxic'])
elapsed = round(time.time() - start, 2)
print(f'Work time: {elapsed}')
print(f'Best F1 score of LogisticRegression on train: {gs_lr.best_score_}')
print(gs_lr.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Work time: 143.49
Best F1 score of LogisticRegression on train: 0.7899803589204651
{'lr__C': 100, 'tfidf__norm': 'l2', 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}


In [19]:
# Score the fitted search object on the held-out test part.
predict = gs_lr.predict(test['clear_text'])
lr_test_f1 = f1_score(test['toxic'], predict)
print(f'Best F1 score of LogisticRegression on test: {lr_test_f1}')

Best F1 score of LogisticRegression on test: 0.8028805895159941


<b>Вывод</b>
- данные загружены
- текстовые данные лемматизированы
- подобраны параметры к векторизации и моделям SGDClassifier и LogisticRegression. f1 в обоих случаях составила 0.8 (оставлены только лучшие параметры для уменьшения времени расчетов)