In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('labeled.csv')
df['comment'] = df['comment'].str.lower()

In [3]:
df[~df.comment.str.contains("http")]

Unnamed: 0,comment,toxic
0,"верблюдов-то за что? дебилы, бл...\n",1.0
1,"хохлы, это отдушина затюканого россиянина, мол...",1.0
2,собаке - собачья смерть\n,1.0
3,"страницу обнови, дебил. это тоже не оскорблени...",1.0
4,"тебя не убедил 6-страничный пдф в том, что скр...",1.0
...,...,...
14407,вонючий совковый скот прибежал и ноет. а вот и...,1.0
14408,а кого любить? гоблина тупорылого что-ли? или ...,1.0
14409,"посмотрел утомленных солнцем 2. и оказалось, ч...",0.0
14410,крымотред нарушает правила раздела т.к в нем н...,1.0


In [23]:
import re
def tokenizer(text):
    return re.split('[_,][=][-][:][;][*][/][\n][)][(]', text)

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /zfs/hybrilit.jinr.ru/user/m/myakotin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('russian')

In [8]:
bad_words = pd.read_csv('bad_words.txt', header=None)
y_bad  = np.array([1 for i in range(len(bad_words))])
bad_words['y'] = y_bad
bad_words

Unnamed: 0,0,y
0,абортмахер,1
1,анал,1
2,анус,1
3,армячок,1
4,архамудия,1
...,...,...
3976,ёбарь,1
3977,ёбвашумать,1
3978,ёбкай,1
3979,ёбнуть,1


In [9]:
good_words = pd.read_csv('good_words.txt', header=None)[15:]
y_good  = np.array([0 for i in range(len(good_words))])
good_words['y'] = y_good
good_words

Unnamed: 0,0,y
15,абажур,0
16,абажура,0
17,абажурам,0
18,абажурами,0
19,абажурах,0
...,...,...
1532624,посмурневшему,0
1532625,посмурневшею,0
1532626,посмурневши,0
1532627,посмурневшие,0


In [70]:
sum(df['toxic'].values)

4826.0

In [71]:
#X = np.concatenate((df['comment'].values, bad_words[0].values, good_words[0].values), axis=0)
#y = np.concatenate((df['toxic'].values, bad_words['y'].values, good_words['y'].values), axis=0)
X = np.concatenate((df['comment'].values, bad_words[0].values), axis=0)
y = np.concatenate((df['toxic'].values, bad_words['y'].values), axis=0)

In [72]:
len(y) - sum(y)

9586.0

In [75]:
len(y)

18393

In [76]:
X_train = X[:14000]
y_train = y[:14000]
X_test = X[14000:]
y_test = y[14000:]

In [74]:
X_train = X[:1300000]
y_train = y[:1300000]
X_test = X[1300000:]
y_test = y[1300000:]

In [77]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB



tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=True,
                       preprocessor=None)


param_grid = [{'vect__ngram_range': [(1, 1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer],
              'clf__fit_prior':[True, False],
              'clf__alpha': [0.001, 0.01, 0.1, 1.0, 5.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
              'vect__stop_words': [stop, None],
              'vect__use_idf':[False],
              'vect__norm':[None],
              'clf__fit_prior':[True, False],
              'clf__alpha': [0.001, 0.01, 0.1, 1.0, 5.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', MultinomialNB())])
nb_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring='f1_weighted',
                          cv=5, verbose=2,
                          n_jobs=-1)
nb_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 115 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:    8.5s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid=[{'clf__alpha': [0.001, 0.01, 0.1, 1.0, 5.0, 10.0,
                                         100.0],
                          'clf__fit_prior': [True, False],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['и', 'в', 'во', 'не', 'что',
                                                'он', 'на', 'я', 'с', 'со',
                                                'как', 'а', 'то', 'все', 'она',
                                                'так', 'его', 'но', 'да', 'ты',
                                                '...
                         {'clf__alpha': [0.001, 0.01, 0.1, 1.0, 5.0, 10.0,
                                         100.0],
                          'clf__fit_prior': [True, False],
                          'vect__

In [86]:
from sklearn.metrics import recall_score

print('Наилучший набор параметров: %s'
     % nb_tfidf.best_params_)
print('Правильность при перекрестной проверке: %.3f'
     % nb_tfidf.best_score_)


print('Правильность при испытании: %.3f'
     % recall_score(nb_tfidf.predict(X_test), y_test))

Наилучший набор параметров: {'clf__alpha': 0.1, 'clf__fit_prior': False, 'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__stop_words': None, 'vect__use_idf': False}
Правильность при перекрестной проверке: 0.865
Правильность при испытании: 0.967


In [87]:
nb_tfidf.fit(X, y)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:    7.9s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid=[{'clf__alpha': [0.001, 0.01, 0.1, 1.0, 5.0, 10.0,
                                         100.0],
                          'clf__fit_prior': [True, False],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['и', 'в', 'во', 'не', 'что',
                                                'он', 'на', 'я', 'с', 'со',
                                                'как', 'а', 'то', 'все', 'она',
                                                'так', 'его', 'но', 'да', 'ты',
                                                '...
                         {'clf__alpha': [0.001, 0.01, 0.1, 1.0, 5.0, 10.0,
                                         100.0],
                          'clf__fit_prior': [True, False],
                          'vect__

In [117]:
test = 'хуйло пидорское'
nb_tfidf.predict([test])

array([1.])

In [106]:
nb_tfidf.predict_proba([test])

array([[0.05206134, 0.94793866]])

In [119]:
test = 'привет мама'
nb_tfidf.predict([test])

array([1.])

In [120]:
if nb_tfidf.predict_proba([test])[0][0] > 0.12:
    print(0)
else:
    print(1)

0


In [92]:
nb_tfidf.score(good_words[0].values, good_words['y'].values)

0.993845682179329

In [93]:
nb_tfidf.score(bad_words[0].values, bad_words['y'].values)

0.9992458521870287

In [122]:
import pickle
import os
s = pickle.dump(nb_tfidf, open(os.path.join('model_new.pkl'), 'wb'), protocol=4)