# Описание проекта
Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 
Обучим модель классифицировать комментарии на позитивные и негативные. В нашем распоряжении набор данных с разметкой о токсичности правок.
Построим модель со значением метрики качества F1 не меньше 0.75. 


In [2]:
import numpy as np
import pandas as pd
import nltk
import re
import tqdm
import warnings

from lightgbm import LGBMClassifier

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline

from tqdm import notebook

In [3]:
# WordNet corpus: the lexical database that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Punkt tokenizer models; presumably what word_tokenize relies on — confirm for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Open Multilingual WordNet data.
# NOTE(review): recent NLTK releases appear to need this alongside 'wordnet' — confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/admin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Load the labelled comments. The file is looked up next to the notebook
# first and then at the shared /datasets location.
try:
    data = pd.read_csv('toxic_comments.csv', index_col=[0])
except FileNotFoundError:
    # Only a missing local copy should trigger the fallback; parse errors and
    # interrupts must surface instead of being masked by a second read.
    data = pd.read_csv('/datasets/toxic_comments.csv', index_col=[0])

data.head(10)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word 'nonsense' was offensive to ...,0
9,alignment on this subject and which are contra...,0


In [7]:
def clean_text(text):
    """Normalise a raw comment into lowercase, space-separated word characters.

    The text is lowercased, common English contractions are expanded
    (e.g. "can't" -> "cannot", "n't" -> " not", "'re" -> " are"), every
    non-word character is replaced by a space, and runs of whitespace are
    collapsed into a single space.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned string, with no leading or trailing spaces.
    """
    # Order matters: the specific forms ("what's", "can't") must be rewritten
    # before the generic "'s" / "n't" rules get a chance to consume them.
    contractions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )

    text = text.lower()
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Raw strings: "\W" and "\s" are regex classes, not Python string escapes.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [8]:
# Normalise every comment in place (lowercase, expand contractions, strip punctuation).
data['text'] = data['text'].map(clean_text)


In [9]:
def tokenizer(text):
    """Split *text* into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

In [10]:
# Replace each cleaned string with its list of tokens.
data['text'] = data['text'].map(tokenizer)

In [11]:
def lemmatizer(text):
    """Lemmatize every token in *text* with NLTK's WordNet lemmatizer.

    Args:
        text: An iterable of word tokens (despite the name, not a raw string).

    Returns:
        A list holding the lemma of each token, in the original order.
    """
    # Build the lemmatizer once per call instead of once per token.
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token) for token in text]

In [12]:
# Replace each token list with the list of its lemmas.
data['text'] = data['text'].map(lemmatizer)

In [13]:
def clear_text(text):
    """Join a sequence of tokens into one lowercase, space-separated string."""
    return " ".join(text).lower()

In [14]:
# Join the lemmas back into a single string per comment for the vectorizer.
data['text'] = data['text'].map(clear_text)

In [15]:
# Preview the first rows after cleaning, tokenizing, lemmatizing and re-joining.
data.head()

Unnamed: 0,text,toxic
0,explanation why the edits made under my userna...,0
1,d aww he match this background colour i am see...,0
2,hey man i am really not trying to edit war it ...,0
3,more i can not make any real suggestion on imp...,0
4,you sir are my hero any chance you remember wh...,0


In [16]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=12345)

In [17]:
# Preprocessed comment texts of each split; these are the model inputs.
corpus_train = train['text']
corpus_test = test['text']

In [18]:
# Display the training corpus to sanity-check the preprocessing.
corpus_train

97494     bushranger you are a grass with no sense of hu...
4383      need administrative help i have been blocked i...
103777    i would also like to point out that he ha used...
38619       you cant block me you fucking retard brb nigger
128443    i believe that the frequency of the wave need ...
                                ...                        
110090    hahaha i dont live in a lie like you and dont ...
85493                                 march 2006 march 2006
133387    agreed we really should try to stick to the su...
130469    umm killer do you not like that he copied your...
77361      bradford city i am removing unreferanced content
Name: text, Length: 127433, dtype: object

In [19]:
# Stop-word lists; the English one is read via nltk_stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Build TF-IDF features over the preprocessed corpus with English stop words removed.
stopwords = set(nltk_stopwords.words('english'))
# lowercase=False: the text was already lowercased during preprocessing.
count_tf_idf = TfidfVectorizer(stop_words=list(stopwords), lowercase = False)
# Fit on the training split only, then apply the same fitted vectorizer to the test split.
tf_idf_train = count_tf_idf.fit_transform(corpus_train)
tf_idf_test = count_tf_idf.transform(corpus_test)
# NOTE(review): tf_idf_train is later cross-validated directly (decision tree, LightGBM),
# so every validation fold has already contributed to the fitted vectorizer — unlike the
# logistic-regression Pipeline, which refits the vectorizer per fold. Confirm this is acceptable.

In [21]:
# Labels for each split, taken from the 'toxic' column (0/1 in the preview of the data).
target_train = train['toxic']
target_test = test['toxic']

Первая модель - логистическая регрессия

In [22]:
# First model: TF-IDF + logistic regression in one Pipeline, so the vectorizer
# is refitted inside every cross-validation fold.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(stopwords))),
    # class_weight='balanced' reweights the classes by their frequency.
    ('logreg', LogisticRegression(class_weight='balanced',
                                    solver = 'liblinear',
                                  random_state=12345)),
])
# Search space; the "<step>__<param>" keys address parameters of the pipeline steps.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logreg__C': [1,2,6]
}

# Exhaustive search over the grid, scored by F1 with 3-fold cross-validation on raw text.
grid_search_tune = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, scoring='f1', verbose=0)
grid_search_tune.fit(corpus_train, target_train)



GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=['had',
                                                                    'same',
                                                                    'themselves',
                                                                    "that'll",
                                                                    'being',
                                                                    'were',
                                                                    'yourself',
                                                                    "hadn't",
                                                                    'do',
                                                                    "mightn't",
                                                                    'this',
                                                                    'ours',
   

In [23]:
# Report the best parameter combination and its mean cross-validated F1.
print('Лучшее значение ', grid_search_tune.best_params_, 'F1:', grid_search_tune.best_score_)

Лучшее значение  {'logreg__C': 6, 'tfidf__max_df': 0.25, 'tfidf__ngram_range': (1, 2)} F1: 0.7711527136199212


Вторая модель - решающее дерево

In [25]:
# Second model: decision tree; candidate depths are 1, 11, 21, ..., 91.
parameters_tr = {'max_depth': range(1, 101, 10)}

# 2-fold grid search scored by F1, run on the precomputed TF-IDF matrix.
grid_tr = GridSearchCV(DecisionTreeClassifier(random_state=12345),
                       cv = 2,
                       param_grid = parameters_tr,
                       n_jobs = -1,
                       verbose = False,
                       scoring = 'f1').fit(tf_idf_train, target_train)

In [26]:
# Report the best tree depth and its mean cross-validated F1.
print('Лучшее значение ', grid_tr.best_params_, 'F1:', grid_tr.best_score_)

Лучшее значение  {'max_depth': 91} F1: 0.7118024475314888


Метрика 0.71

In [27]:
# Third model: LightGBM gradient boosting, tuned on the precomputed TF-IDF matrix.
# A fixed seed keeps the run reproducible, matching the other two models.
model_lgbm = LGBMClassifier(random_state=12345)

parameters_lgbm = {
    # range(), not a bare tuple: (50, 201, 50) is read as the three literal
    # candidates 50, 201 and 50 rather than 50, 100, 150, 200.
    'n_estimators': range(50, 201, 50),
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'num_leaves': [10, 20, 30, 40, 50],
}

# 3-fold grid search scored by F1; reuses model_lgbm instead of building a second estimator.
grid_lgbm = GridSearchCV(model_lgbm,
                         param_grid=parameters_lgbm,
                         cv=3,
                         n_jobs=-1,
                         scoring='f1').fit(tf_idf_train, target_train)


print('Лучшие параметры', grid_lgbm.best_params_, 'F1:', grid_lgbm.best_score_)

Метрика соответствует требованиям 0.764

Лучшая модель — логистическая регрессия. Протестируем её

In [29]:
# Score the best logistic-regression pipeline found by the grid search on the held-out test split.
prediction = grid_search_tune.predict(corpus_test)
print("F1_score:", f1_score(target_test, prediction))

F1_score: 0.7843254561770864


На тестовой выборке получена удовлетворительная метрика - 0.78

Вывод:
В ходе работы были обработаны комментарии - произведена токенизация, лемматизация, очистка. Далее они переведены в векторное представление. 
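На данных обучены три модели: логистическая регрессия, решающее дерево и LightGBM. Наилучший результат показала логистическая регрессия: F1 = 0.77 на кросс-валидации и 0.78 на тестовой выборке.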