# Проект для «Викишоп»

## Подготовка

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import f1_score

In [2]:
data = pd.read_csv('/datasets/toxic_comments.csv')

In [3]:
data.head(5)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
text     159571 non-null object
toxic    159571 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [5]:
data.duplicated().sum()

0

In [6]:
data.isna().sum()

text     0
toxic    0
dtype: int64

### Очищаем текст

In [7]:
def clear_text(text):
    lol = re.sub(r'[^a-zA-Z ]', ' ', text) 
    lol2 = " ".join(lol.split())
    return lol2

In [8]:
data['lemm_text'] = data['text'].apply(clear_text)

### Лемматизация

In [9]:
import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


In [10]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w, pos='v') for w in w_tokenizer.tokenize(text)]

In [11]:
data['lemm_text'] = data['lemm_text'].apply(lemmatize_text)

In [12]:
def join(text):
    lemm_text = " ".join(text) 
    return lemm_text

In [13]:
data['lemm_text'] = data['lemm_text'].apply(join)

### Подготовим выборки

In [14]:
data['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

In [15]:
df_train, df_test = train_test_split(data, test_size=0.25, random_state=12345) 

In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119678 entries, 111565 to 77285
Data columns (total 3 columns):
text         119678 non-null object
toxic        119678 non-null int64
lemm_text    119678 non-null object
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [17]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39893 entries, 146790 to 36200
Data columns (total 3 columns):
text         39893 non-null object
toxic        39893 non-null int64
lemm_text    39893 non-null object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [18]:
Х_train = df_train['lemm_text']
y_train = df_train['toxic']
X_test = df_test['lemm_text']

### Lemm to TF-IDF

In [19]:
stopwords = text.ENGLISH_STOP_WORDS

In [20]:
count_tf_idf = TfidfVectorizer(stop_words=stopwords)

In [21]:
tf_idf_model = count_tf_idf.fit(Х_train)
train_tf_idf = count_tf_idf.transform(Х_train)
test_tf_idf = count_tf_idf.transform(X_test)

## Обучение

### CatBoostRegressor

In [22]:
model_c = CatBoostClassifier(iterations=15,
                           learning_rate=1,
                           depth=5)

In [23]:
model_c.fit(train_tf_idf, y_train)

0:	learn: 0.2604647	total: 3.48s	remaining: 48.7s
1:	learn: 0.2343785	total: 6.39s	remaining: 41.5s
2:	learn: 0.2161862	total: 9.48s	remaining: 37.9s
3:	learn: 0.2068241	total: 12.4s	remaining: 34.1s
4:	learn: 0.1988127	total: 15.4s	remaining: 30.8s
5:	learn: 0.1917381	total: 18.3s	remaining: 27.4s
6:	learn: 0.1868503	total: 21.2s	remaining: 24.2s
7:	learn: 0.1827173	total: 24.1s	remaining: 21.1s
8:	learn: 0.1800848	total: 27s	remaining: 18s
9:	learn: 0.1748550	total: 30s	remaining: 15s
10:	learn: 0.1721967	total: 32.9s	remaining: 12s
11:	learn: 0.1694731	total: 35.8s	remaining: 8.94s
12:	learn: 0.1672723	total: 38.7s	remaining: 5.95s
13:	learn: 0.1654878	total: 41.6s	remaining: 2.97s
14:	learn: 0.1630793	total: 44.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f8c72b62350>

In [24]:
pred_c = model_c.predict(test_tf_idf)

### LogisticRegression

In [25]:
model_l = LogisticRegression(C=2.0)
model_l.fit(train_tf_idf, y_train)



LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
pred_l = model_l.predict(test_tf_idf)

### DecisionTreeRegressor

In [27]:
best_model = None
best_result = 1000
y_test = df_test['toxic']
for est in range(1, 20):
    model_t = RandomForestClassifier(random_state=12345, n_estimators=est) 
    model_t.fit(train_tf_idf, y_train) 
    pred_t = model_t.predict(test_tf_idf)
    result = f1_score(y_test, pred_l) 
    if result > best_result:
        best_model = model_t
        best_result = result

print("f1 наилучшей модели на тренировочной выборке:", best_result)

f1 наилучшей модели на тренировочной выборке: 1000


In [28]:
model_t = DecisionTreeClassifier(random_state=12345, max_depth=100)
model_t.fit(train_tf_idf, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=12345, splitter='best')

In [29]:
pred_t = model_t.predict(test_tf_idf)

### RandomForestRegressor

In [30]:
model_f = RandomForestClassifier(random_state=12345, max_depth=100, n_estimators = 25)
model_f.fit(train_tf_idf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)

In [31]:
pred_f = model_f.predict(test_tf_idf)

## Выводы

In [32]:
y_test = df_test['toxic']

In [33]:
f1_l = f1_score(y_test, pred_l)
f1_t = f1_score(y_test, pred_t)
f1_f = f1_score(y_test, pred_f)
f1_c = f1_score(y_test, pred_c)

In [34]:
print("F1 на Логической классификации:", f1_l)
print("F1 на Древе решений:", f1_t)
print("F1 на Случайном лесе:", f1_f)
print("F1 на Катебусте:", f1_c)

F1 на Логической классификации: 0.7619997151402935
F1 на Древе решений: 0.7191586894971013
F1 на Случайном лесе: 0.25649765658287177
F1 на Катебусте: 0.7044006948465547


### Вывод

Проект сделан! Модель успешно преобразовывает коментарии и определяет их токсичность. Для наиболее достоверного определения таксичности коментариев необходимо использовать модель Логической классификации.