# Описание проекта

-Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять -описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения -других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.


In [89]:
import pandas as pd

from pymystem3 import Mystem

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

import re

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from lightgbm import LGBMClassifier

# 1 Загрузил и подготовил данные

In [90]:
try:
    
    data = pd.read_csv('/Users/admin/Desktop/Rudra PRACTICUM/Projects/my_project/Datasets/toxic_comments.csv')
    
except:

    data = pd.read_csv('/datasets/toxic_comments.csv')

In [3]:
data

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [4]:
corpus = data.text

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
lemmatizer = WordNetLemmatizer()

In [7]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [8]:
def lemmatized(text):
    stopword = set(stopwords.words("english"))
    text = " ".join(re.sub(r'[^a-zA-Z]', ' ', text).split())
    word_list = nltk.word_tokenize(text)
    without_stop_words = [word for word in word_list if not word in stopword]
    return " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in without_stop_words])

In [9]:
corpus = corpus.apply(lemmatized).values.astype('U')

In [10]:
stopword = set(stopwords.words("english"))

# Ищем лучшую модель

In [13]:
y = data.toxic

In [14]:
X = corpus

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12345, stratify=y
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.25, random_state=12345, stratify=y_train
)

In [17]:
tf_idf = TfidfVectorizer(stop_words=stopword)
X_train = tf_idf.fit_transform(X_train)
X_valid = tf_idf.transform(X_valid)
X_test = tf_idf.transform(X_test)

In [42]:
def ML(model_ml, features_train, target_train, features_valid, target_valid, parametrs, **fiter):
    clf = model_ml()
    grid = GridSearchCV(clf, parametrs, cv=5, n_jobs=5)
    grid.fit(features_train, target_train)
    g = grid.best_params_
    
    model = model_ml(random_state=12345, **g)
    model.fit(features_train, target_train, **fiter)
    predicted_valid = model.predict(features_valid)
    f1 = f1_score(target_valid, predicted_valid)
    print(f'f1 = {f1}, гиперпараметры {g}')
    
    return model

## Логистическая регрессия

In [83]:
model = LogisticRegression(random_state=12345, class_weight='balanced')

In [84]:
model.fit(X_train, y_train);



In [85]:
predicted = model.predict(X_valid)

In [86]:
f1_score(y_valid, predicted)

0.7408896034297963

## Дерево решений

In [30]:
parametrs_tree = {
    'max_depth': range (90, 110, 1), 
    'class_weight': ['balanced']
}

In [31]:
model_tree = ML(
    DecisionTreeClassifier, 
    X_train, y_train, X_valid, y_valid, 
    parametrs_tree
);

f1 = 0.648038049940547, гиперпараметры {'class_weight': 'balanced', 'max_depth': 100}


## Случайный лес

In [44]:
parametrs_random_tree = {
    'max_depth': range (17,21, 2), 
    'n_estimators': [25, 30, 40, 60],
    'class_weight': ['balanced'],
    'n_jobs': [-1]
}

In [45]:
model_random_tree = ML(
    RandomForestClassifier, 
    X_train, y_train, X_valid, y_valid, 
    parametrs_random_tree
);

f1 = 0.3513265038087733, гиперпараметры {'class_weight': 'balanced', 'max_depth': 19, 'n_estimators': 40, 'n_jobs': -1}


## LGBM

In [87]:
parametrs_LGBM = {
    'max_depth': range (27,33, 1), 
    'n_estimators': [250, 300, 400],
    'n_jobs': [-1]
}

In [88]:
model_LGBM = ML(
    LGBMClassifier, 
    X_train, y_train, X_valid, y_valid, 
    parametrs_LGBM
);

f1 = 0.7712260072626664, гиперпараметры {'max_depth': 29, 'n_estimators': 400, 'n_jobs': -1}


### Проверяю на тестовой выборке

In [92]:
predicted_valid = model.predict(X_test)
f1 = f1_score(y_test, predicted_valid)
f1

0.754607977991747

# Вывод 

- ### лучшей моделью оказалась LGBM. Наихудшей и самой медленно обучающейся случайный лес, ну и саммой быстрообучаемой и близкой нужному значению метрики логистическая регрессия
- ### показатель f1 на тестовой выборке модели LGBM составил 0.754607977991747