# 1. Подготовка

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import RandomizedSearchCV
from tqdm import notebook
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv ('toxic_comments.csv')

In [3]:
df.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [5]:
def predproc (review):
    review = re.sub("[^A-Za-z]", " ", review)
    review = review.lower()
    review = word_tokenize(review)
    wordnet_lemmatizer = WordNetLemmatizer()
    review = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in review if word not in set(stopwords.words("english"))]
    review = " ".join(review)
    return review      

In [6]:
corpus = []
for i in notebook.tqdm(range(0, len(df))):
    review = predproc(df.text[i])
    corpus.append(review)

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [7]:
corpus = pd.DataFrame(corpus)
df['lem'] = corpus

In [8]:
print("БЫЛО:", df.text[0])
print('-----------------------------------------------------------------------------------------------------------------------')
print("СТАЛО:", df.lem[0])
print("=======================================================================================================================")
print("БЫЛО:", df.text[1000])
print('-----------------------------------------------------------------------------------------------------------------------')
print("СТАЛО:", df.lem[1000])
print("=======================================================================================================================")
print("БЫЛО:", df.text[10000])
print('-----------------------------------------------------------------------------------------------------------------------')
print("СТАЛО:", df.lem[10000])
print("=======================================================================================================================")
print("БЫЛО:", df.text[100000])
print('-----------------------------------------------------------------------------------------------------------------------')
print("СТАЛО:", df.lem[100000])

БЫЛО: Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
-----------------------------------------------------------------------------------------------------------------------
СТАЛО: explanation edit make username hardcore metallica fan revert vandalisms closure gas vote new york dolls fac please remove template talk page since retire
БЫЛО: Rex Mundi 

I've created a stub on Rex Mundi at Rex Mundi High School.  Only thing I know about it is that both my Aunt Donna and Bob Griese went there.  Please add anything you might know about it.

BTW, my dad was a Panther; I live in Princeton myself.
-----------------------------------------------------------------------------------------------------------------------
СТАЛО: rex mundi create stub rex mundi rex mundi high school thing 

In [82]:
data = df.copy()

In [83]:
train, test = train_test_split (data, test_size=0.2)

In [84]:
train, valid = train_test_split (train, test_size=0.2)

In [85]:
corpus_train = train['lem'].values.astype('U')
corpus_valid = valid['lem'].values.astype('U')
corpus_test = test['lem'].values.astype('U')
y_train = train['toxic']
y_valid = valid['toxic']
y_test = test['toxic']

In [86]:
count_tf_idf = TfidfVectorizer()

In [87]:
tf_idf_train = count_tf_idf.fit_transform(corpus_train)
tf_idf_valid = count_tf_idf.transform(corpus_valid)
tf_idf_test = count_tf_idf.transform(corpus_test)

#### Вывод: предобработка текста проведена. Проведена лемматизация слов с помощью библиотеки WordNetLemmatizer, убраны стоп слова согласно библиотеки stopwords английского языка, так же удалены все символы, регистр приведен к нижнему. Так же для обучения моделей текст был преобразован в вектора с помощью TfidfVectorizer.

# 2. Обучение

### DummyClassifier

#### для создания бейслайна адекватности моделей сначала создадим модель DummyClassifier со стратегией "слепого угадывания"

In [88]:
dummy = DummyClassifier (strategy='uniform', random_state=42)

In [89]:
dummy.fit(tf_idf_train, y_train)

DummyClassifier(random_state=42, strategy='uniform')

In [90]:
train_pred = dummy.predict(tf_idf_train)
valid_pred = dummy.predict(tf_idf_valid)
test_pred = dummy.predict(tf_idf_test)

In [91]:
print('F1-score на обучающей выборке:', f1_score (y_train, train_pred))
print('F1-score на валидационной выборке:', f1_score (y_valid, valid_pred))
print('F1-score на тестовой выборке:', f1_score (y_test, test_pred))

F1-score на обучающей выборке: 0.16954887829637702
F1-score на валидационной выборке: 0.17037326050201587
F1-score на тестовой выборке: 0.17157117604242486


### LogisticRegression

In [132]:
logreg = LogisticRegression (penalty='l1', solver='saga')

In [133]:
%%time
logreg.fit(tf_idf_train, y_train)

Wall time: 21.6 s


LogisticRegression(penalty='l1', solver='saga')

In [134]:
train_pred = logreg.predict(tf_idf_train)
valid_pred = logreg.predict(tf_idf_valid)
test_pred = logreg.predict(tf_idf_test)

In [135]:
print('F1-score на обучающей выборке:', f1_score (y_train, train_pred))
print('F1-score на валидационной выборке:', f1_score (y_valid, valid_pred))
print('F1-score на тестовой выборке:', f1_score (y_test, test_pred))

F1-score на обучающей выборке: 0.7918524635287641
F1-score на валидационной выборке: 0.7778258986574275
F1-score на тестовой выборке: 0.765666140073723


### XGBClassifier

In [155]:
xgb = XGBClassifier (random_state=42, booster='dart', max_depth=20, n_estimators=150)

In [156]:
%%time
xgb.fit(tf_idf_train, y_train)

Wall time: 8min 43s


XGBClassifier(booster='dart', max_depth=20, n_estimators=150, random_state=42)

In [157]:
train_pred = xgb.predict(tf_idf_train)
valid_pred = xgb.predict(tf_idf_valid)
test_pred = xgb.predict(tf_idf_test)

In [158]:
print('F1-score на обучающей выборке:', f1_score (y_train, train_pred))
print('F1-score на валидационной выборке:', f1_score (y_valid, valid_pred))
print('F1-score на тестовой выборке:', f1_score (y_test, test_pred))

F1-score на обучающей выборке: 0.8619841486745012
F1-score на валидационной выборке: 0.751228226887003
F1-score на тестовой выборке: 0.7321265313585664


### LGBMClassifier

In [110]:
param = {'max_depth':(1, 5, 10, 15, 20, 25, 30),
        'learning_rate':(0.1, 0.2, 0.3, 0.4, 0.5),
        'n_estimators':(50, 100, 150, 200, 250),
        'boosting_type':['gbdt', 'dart', 'goss']}

In [111]:
lgbm = RandomizedSearchCV (LGBMClassifier(), 
                               param_distributions = param, 
                               n_iter=100, 
                               cv=3, 
                               n_jobs=-1, 
                               random_state=42,
                               verbose=1)

In [112]:
time
lgbm.fit(tf_idf_train, y_train, eval_set=(tf_idf_valid, y_valid), verbose=False)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 32.8min finished


Wall time: 33min 31s


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt', 'dart',
                                                          'goss'],
                                        'learning_rate': (0.1, 0.2, 0.3, 0.4,
                                                          0.5),
                                        'max_depth': (1, 5, 10, 15, 20, 25, 30),
                                        'n_estimators': (50, 100, 150, 200,
                                                         250)},
                   random_state=42, verbose=1)

In [113]:
lgbm.best_params_

{'n_estimators': 250,
 'max_depth': 30,
 'learning_rate': 0.5,
 'boosting_type': 'dart'}

In [114]:
train_pred = lgbm.predict(tf_idf_train)
valid_pred = lgbm.predict(tf_idf_valid)
test_pred = lgbm.predict(tf_idf_test)

In [115]:
print('F1-score на обучающей выборке:', f1_score (y_train, train_pred))
print('F1-score на валидационной выборке:', f1_score (y_valid, valid_pred))
print('F1-score на тестовой выборке:', f1_score (y_test, test_pred))

F1-score на обучающей выборке: 0.8825577714466604
F1-score на валидационной выборке: 0.7848689771766696
F1-score на тестовой выборке: 0.7662650602409639


### CatBoostClassifier

In [116]:
cat = CatBoostClassifier (random_state=42)

In [117]:
%%time
cat.fit(tf_idf_train, y_train, eval_set=(tf_idf_valid, y_valid), early_stopping_rounds=50, use_best_model=True, verbose=250)

Learning rate set to 0.099349
0:	learn: 0.5937403	test: 0.5938197	best: 0.5938197 (0)	total: 660ms	remaining: 10m 59s
250:	learn: 0.1366729	test: 0.1428809	best: 0.1428809 (250)	total: 2m 34s	remaining: 7m 42s
500:	learn: 0.1160828	test: 0.1306922	best: 0.1306922 (500)	total: 5m 10s	remaining: 5m 8s
750:	learn: 0.1033601	test: 0.1247724	best: 0.1247724 (750)	total: 7m 47s	remaining: 2m 35s
999:	learn: 0.0955501	test: 0.1222068	best: 0.1221868 (990)	total: 10m 20s	remaining: 0us

bestTest = 0.1221867809
bestIteration = 990

Shrink model to first 991 iterations.
Wall time: 10min 26s


<catboost.core.CatBoostClassifier at 0x1dc16ad8788>

In [118]:
train_pred = cat.predict(tf_idf_train)
valid_pred = cat.predict(tf_idf_valid)
test_pred = cat.predict(tf_idf_test)

In [119]:
print('F1-score на обучающей выборке:', f1_score (y_train, train_pred))
print('F1-score на валидационной выборке:', f1_score (y_valid, valid_pred))
print('F1-score на тестовой выборке:', f1_score (y_test, test_pred))

F1-score на обучающей выборке: 0.8099146057447045
F1-score на валидационной выборке: 0.7650297291345518
F1-score на тестовой выборке: 0.7459400938289427


# 3. Выводы

#### В данном проекте для предсказания токсичности коментария мы применили такие процедуры предобработки текста как лемматизация,удаление знаков препинания и спец. символов при помощи регулярных выражений, а так же выбрасывали стоп слова и переводили текст в векторный вид для обучения моделей.
#### В качестве бейслайна был выбран DummyClassifier с стратегией слепого угадывания, который показал очень низкую вероятность отгадать класс, все модели показали много лучший результат, что свидетельствует об их адекватности. В качестве основной модели выбрали LGBM как самый быстро обучающийся бустер дающий необходимую точность из условий задания, альтернативно можно использовать логистическую регрессию, которая так же показала отличные результаты в данной задаче.