In [1]:
import time
import string
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.sparse import hstack

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import average_precision_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [2]:
XY = pd.read_csv('XY.csv', header = 0)
XY.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised,normal,threat,insult,obscenity
0,41127,–¥–≤–æ—Ä–Ω–∏–∫–∞ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å!,–¥–≤–æ—Ä–Ω–∏–∫ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂,–¥–≤–æ—Ä–Ω–∏–∫ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å,0,1,0,0
1,6812,"–º–æ—è —Å—Ç–∞—Ä—à–∞—è –Ω–µ–¥–µ–ª—é —à–∏–ø–µ–ª–∞, –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∞ –ø–æ–¥–∫–∏–¥...",–º–æ—è —Å—Ç–∞—Ä—à –Ω–µ–¥–µ–ª —à–∏–ø–µ–ª –Ω–µ –ø—Ä–∏–Ω–∏–º–∞ –ø–æ–¥–∫–∏–¥—ã—à –∫–æ—Ç–æ...,–º–æ–π —Å—Ç–∞—Ä—à–∏–π –Ω–µ–¥–µ–ª—è —à–∏–ø–µ—Ç—å –Ω–µ –ø—Ä–∏–Ω–∏–º–∞—Ç—å –ø–æ–¥–∫–∏–¥—ã...,1,0,0,0
2,6256,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤–∞–º–∏ —Å–æ–≥–ª–∞—Å–Ω–∞!,–ø–æ–ª–Ω–æ—Å—Ç —Å –≤–∞–º —Å–æ–≥–ª–∞—Å–Ω,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤—ã —Å–æ–≥–ª–∞—Å–Ω—ã–π,1,0,0,0


In [3]:
XY['exclamation_num'] = XY.text.str.count('!')
XY['question_num'] = XY.text.str.count('\?')

XY_train, XY_test = train_test_split(XY, test_size = 0.3, shuffle = True, random_state = 42)
XY_train.reset_index(drop = True, inplace = True)
XY_test.reset_index(drop = True, inplace = True)
XY_train_abn = XY_train.loc[XY_train.normal == 0, :].reset_index(drop = True)
XY_train.shape, XY_train_abn.shape

((104142, 10), (18547, 10))

In [4]:
def hyperopt_tdidf_logit_label(label):
    
    X = XY_train.text
    y = XY_train[label]
    
    @ignore_warnings(category=ConvergenceWarning)
    def hyperopt_tdidf_logit(params):
        
        pipe = Pipeline([('trans', TfidfVectorizer(min_df = 2)), ('clf', LogisticRegression(**params))])

        score = cross_val_score(estimator = pipe, X = X, y = y,
                                cv = StratifiedKFold(n_splits = 7), scoring = 'average_precision')

        score_mean = score.mean()

        return -score_mean

    space_tfidf_logit = {'C': hp.uniform('C', 1, 25)}
    
    best = fmin(fn = hyperopt_tdidf_logit, space = space_tfidf_logit, algo = tpe.suggest, max_evals = 50)
    
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    pipe = Pipeline([('trans', TfidfVectorizer(min_df = 2)), ('clf', LogisticRegression(**best))])
    pipe.fit(X, y)
    
    print(average_precision_score(y_score = pipe.predict_proba(X)[:, 1], y_true = y))
    
    return pipe

In [5]:
# algs_1 = dict()
# for label in ['normal', 'insult', 'obscenity', 'threat']:
#     print('----------------------------------------------------------------------------------------------------')
#     print(label)
#     print('----------------------------------------------------------------------------------------------------')
#     print()
#     algs_1[label] = hyperopt_tdidf_logit_label(label)

In [6]:
# algs_1

In [40]:
%%time
# text feature extraction - level 1
trans_1_2_11 = TfidfVectorizer(min_df = 2)
text_train_1 = trans_1_2_11.fit_transform(XY_train.text)
text_test_1 = trans_1_2_11.transform(XY_test.text)

# classifier - level 1
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# clf_1_normal = LogisticRegression(C = 5.3).fit(text_train_1, XY_train['normal'])
# clf_1_insult = LogisticRegression(C = 12).fit(text_train_1, XY_train['insult'])
# clf_1_threat = LogisticRegression(C = 10.6).fit(text_train_1, XY_train['threat'])
# clf_1_obscenity = LogisticRegression(C = 21.8).fit(text_train_1, XY_train['obscenity'])

# pred_train_normal_1 = clf_1_normal.predict_proba(text_train_1)[:, 1].reshape(-1, 1)
# pred_train_insult_1 = clf_1_insult.predict_proba(text_train_1)[:, 1].reshape(-1, 1)
# pred_train_threat_1 = clf_1_threat.predict_proba(text_train_1)[:, 1].reshape(-1, 1)
# pred_train_obscenity_1 = clf_1_obscenity.predict_proba(text_train_1)[:, 1].reshape(-1, 1)

# pred_test_normal_1 = clf_1_normal.predict_proba(text_test_1)[:, 1].reshape(-1, 1)
# pred_test_insult_1 = clf_1_insult.predict_proba(text_test_1)[:, 1].reshape(-1, 1)
# pred_test_threat_1 = clf_1_threat.predict_proba(text_test_1)[:, 1].reshape(-1, 1)
# pred_test_obscenity_1 = clf_1_obscenity.predict_proba(text_test_1)[:, 1].reshape(-1, 1)

# text feature extraction - level 2
stopWords = stopwords.words('russian') + ['—ç—Ç–æ', '–≤—Å—ë', '–≤–µ—Å—å', '–µ—â—ë', '—á–µ–ª–æ–≤–µ–∫', '—Ç–≤–æ–π', '–∫–æ—Ç–æ—Ä—ã–π', '–∏–¥—Ç–∏', '—Å—É–¥',
                                          '—Å–≤–æ–π', '—Ä—É–∫–∞', '–Ω—É–∂–Ω–æ', '—Ä–µ–±—ë–Ω–æ–∫', '–µ—ë', '–∂–∏—Ç—å', '–ø—Ä–æ—Å—Ç–æ', '–Ω–∞—à', '–≤–∞—à',
                                          '—Ä–æ—Å—Å–∏—è', '—Å—Ç—Ä–∞–Ω–∞', '–º–æ—á—å', '–Ω–∞—Ä–æ–¥', '–ø—É—Ç–∏–Ω', '–ø—É—Ç–∏–Ω—Å–∫–∏–π', '—Ä–æ—Å—Å–∏—è',
                                          '–Ω–æ–≥–∞', '–∂–µ–Ω–∞', '–º–µ—Å—Ç–æ', '–º—É–∂–∏–∫', '–¥–∞–ª—ë–∫–∏–π', '–º–∞–º–∞', '–¥–µ–Ω—å', '—Å–∫–∞–∑–∞—Ç—å',
                                          '–∫–∞–∂–¥—ã–π', '–ø—É—Å—Ç—å', '–¥–µ–ª–∞—Ç—å', '–ª—é–±–∏—Ç—å', '–∑–Ω–∞—Ç—å', '—Ö–æ—Ä–æ—à–∏–π', '–±–æ–ª—å—à–æ–π',
                                          '–∑–µ–º–ª—è', '—Å–ª–æ–≤–æ', '–Ω–∞–π—Ç–∏', '—Å—Ç–µ–Ω–∫–∞', '–≤–º–µ—Å—Ç–µ', '–≤–∑—è—Ç—å', '—Å–∞–º—ã–π', '—è–π—Ü–æ',
                                          '—Å–∫–æ–ª—å–∫–æ', '—Å–º–æ—Ç—Ä–µ—Ç—å', '—Å–¥–µ–ª–∞—Ç—å', '–≥–æ–ª–æ–≤–∞', '–≥–æ–≤–æ—Ä–∏—Ç—å', '–≤–æ–æ–±—â–µ', '–≥–æ–¥',
                                          '–¥–µ–Ω—å–≥–∞', '–ø—Ä–æ–¥–∞–∂–Ω—ã–π', '–ø–∏—Å–∞—Ç—å', '—Ä–∞–±–æ—Ç–∞—Ç—å', '–¥—É–º–∞—Ç—å', '–∂–∏–∑–Ω—å', '–º–æ–∑–≥',
                                          '—Ä—É—Å—Å–∫–∏–π', '—Å—Ä–∞–∑—É', '–º–∞–ª–æ', '–ø–ª–æ—â–∞–¥—å', '—Å–æ–±–∞–∫–∞', '–µ—Å—Ç–∏', '—Ä–æ—Ç', '—Ö–æ—Ç–µ—Ç—å',
                                          '–¥–∞–≤–∞—Ç—å', '–º–∞—Ç—å', '–≤—ã–µ—Å—Ç–∏', '—Å–∏–¥–µ—Ç—å', '–ø–æ–π—Ç–∏', '–¥–∞—Ç—å', '–¥–∞–≤–Ω–æ', '—Å–∞–∂–∞—Ç—å',
                                          '–ø–æ–ª–Ω—ã–π', '–ø–æ—Ä–∞', '—Å—Ç–∞—Ç—å', '–¥–æ–ª–∂–Ω—ã–π', '—Å—Ç–∞—Ç—å', '–≤—Ä–µ–º—è', '–ø–æ–∫–∞', '–≤–ª–∞—Å—Ç—å',
                                          '–Ω–∏–∫—Ç–æ', '–ø—Ä–∏–≤—è–∑–∞—Ç—å', '–±–æ–≥', '—Å–∫–æ—Ä–æ', '–∫–æ—Ä–º–∏—Ç—å' '–≤—Ä–∞–≥', '—à–µ—è', '–±–∞—à–∫–∞', 
                                          '–±–∞–±–∞', '–º—É–∂', '–ø–æ–∫–∞–∑–∞—Ç—å', '—É–∫—Ä–∞–∏–Ω–∞', '—Å—Ç–∞—Ä—ã–π', '—Ä–æ–¥–∏—Ç–µ–ª—å', '–ø–æ—Å–∞–¥–∏—Ç—å',
                                          '–≤–∏–¥–µ—Ç—å', '–≤—Ä–∞–≥', '—Å—É–ø–µ—Ä', '–∂–µ–Ω—â–∏–Ω–∞', '—Å—Ç–æ–∏—Ç—å', '–∫–ª–∞—Å—Å–Ω—ã–π', '–ø–µ—Ä–≤—ã–π', 
                                          '–Ω–∞—á–∞—Ç—å', '–≤–∞–ª–∏—Ç—å', '–ø—Ä–µ–¥–∞—Ç–µ–ª—å', 'fr', 'fr fr', '—Å–ª–µ–¥—Å—Ç–≤–∏–µ', '–ø—Ä–∏–¥—É—Ä–æ–∫', 
                                          '–ø—Ä–∏–≤–µ—Ç', '–Ω—É–∂–Ω—ã–π', '—Ä–µ—à–∏—Ç—å', '8oi', '—á–µ', '–∫–æ—Ä–º–∏—Ç—å', '–¥—Ä—É–≥', '–¥—É—Ä–∞–∫', 
                                          '–ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å', '–¥–µ–ª–æ', '–æ–±–∞', '–ø–æ—á–µ–º—É', '–º–∏—Ä', '—É–º–µ—Ç—å','–æ—Ç–≤–µ—Ç–∏—Ç—å', '—Å–µ–º—å—è',
                                          '—è–∑—ã–∫', '–≤–∏–¥–Ω–æ', '–±–æ—è—Ç—å—Å—è', '–¥–∞–≤–∏—Ç—å', '–≤—Å—è–∫–∏–π', '—Ö–æ–∑—è–∏–Ω', '–∫—Ä–∞—Å–∏–≤—ã–π', 
                                          '–≥–ª–∞–∑', '–¥–µ–≤–æ—á–∫–∞', '–ø–æ—Å—Ç–∞–≤–∏—Ç—å', '–º–∞–ª–µ–Ω—å–∫–∏–π', '—Ö–æ—Ç–µ—Ç—å—Å—è', '–æ—Å—Ç–∞–ª—å–Ω–æ–π',
                                          '–æ—á–µ–Ω—å', '—Å—Ç–∞–ª–∏–Ω', '–∑–∞–∫–æ–Ω', '–Ω–∞–≤–µ—Ä–Ω–æ–µ', '–ø—Ä–∏–π—Ç–∏', '–∏–º–µ—Ç—å', '–∫–ª–∞—Å—Å', '—Ç—è',
                                          '–Ω–æ—á—å', '–∂–¥–∞—Ç—å', '—Ç–∞–∫–∂–µ']
# tf = TfidfVectorizer(min_df = 0.01, ngram_range = (1, 2), stop_words = stopWords)
tf_insult = CountVectorizer(max_features = 50, ngram_range = (1, 2), 
                            stop_words = stopWords).fit(XY_train.loc[XY_train.insult == 1, 'text_lemmatised'])
tf_threat = CountVectorizer(max_features = 75, ngram_range = (1, 2), 
                            stop_words = stopWords).fit(XY_train.loc[XY_train.threat == 1, 'text_lemmatised'])
tf_obscenity = CountVectorizer(max_features = 75, ngram_range = (1, 2), 
                               stop_words = stopWords).fit(XY_train.loc[XY_train.obscenity == 1, 'text_lemmatised'])

# text_train_insult_2 = (tf_insult.transform(XY_train.text_lemmatised).toarray() > 0).astype('int')
# text_test_insult_2 = (tf_insult.transform(XY_test.text_lemmatised).toarray() > 0).astype('int')

# text_train_threat_2 = (tf_threat.transform(XY_train.text_lemmatised).toarray() > 0).astype('int')
# text_test_threat_2 = (tf_threat.transform(XY_test.text_lemmatised).toarray() > 0).astype('int')

# text_train_obscenity_2 = (tf_obscenity.transform(XY_train.text_lemmatised).toarray() > 0).astype('int')
# text_test_obscenity_2 = (tf_obscenity.transform(XY_test.text_lemmatised).toarray() > 0).astype('int')

CPU times: user 3.36 s, sys: 70.9 ms, total: 3.43 s
Wall time: 3.48 s


In [41]:
list(set(tf_insult.vocabulary_) | set(tf_threat.vocabulary_) | set(tf_obscenity.vocabulary_))

['–±—ã–¥–ª–æ',
 '–¥—Ä–æ—á–∏—Ç—å',
 '–∫–æ–Ω—á–∏—Ç—å',
 '–º–∏–Ω—É—Ç—å',
 '–æ—Ç—Ä–µ–∑–∞—Ç—å',
 '–µ–±–∞–ª',
 '–ø–∏–∑–¥–µ—Ü',
 '–≤–¥—É—Ç—å',
 '–ø–æ–ª–∏–∑–∞—Ç—å',
 '—Å–æ—Å–Ω—É—Ç—å',
 '–ø–æ–≤–µ—Å–∏—Ç—å',
 '—Å—ã–Ω',
 '–±—Ä–∞—Ç—å',
 '—É—Ç–æ–ø–∏—Ç—å',
 '–ø–∏–∑–¥–∞',
 '—Å–ø–µ—Ä–º–∞',
 '—Å–µ—Å—Ç—Ä–∞',
 '–∫–æ–∑—ë–ª',
 '–ø–∏–¥–æ—Ä–∞—Å',
 '–Ω–∞–±–∏—Ç—å',
 '–≥–æ–≤–Ω–æ',
 '–∑–∞–∫–æ–ø–∞—Ç—å',
 '—Ö–æ—Ö–æ–ª',
 '–≤–æ—Ä',
 '–¥—É—Ä–∞',
 '—É–±–ª—é–¥–æ–∫',
 '–∫–∞–∑–Ω–∏—Ç—å',
 '–≤—ã–µ–±–∞–ª',
 '–ª–∏–∑–∞—Ç—å',
 '–∑–∞—Å–∞–¥–∏—Ç—å',
 '—á–º–æ',
 '—Ä–∞—Å—Ç—Ä–µ–ª–∞',
 '–∑–∞–¥–Ω–∏—Ü–∞',
 '–ø—Ä–∏–ª—é–¥–Ω–æ',
 '–≥–Ω–∏–¥–∞',
 '—Å—É–∫–∞',
 '—Ä–∞—Å—Ç—Ä–µ–ª—è—Ç—å',
 '–Ω–∞—Å–æ—Å–∞—Ç—å',
 '—Å–æ—Å–∞—Ç—å —Å–æ—Å–∞—Ç—å',
 '–ø–∏–∑–¥–∏—Ç—å',
 '–ø—Ä–æ–∫–ª—è—Ç—ã–π',
 '–±–∏—Ç—å',
 '–æ—Ç—Å—Ç—Ä–µ–ª–∏–≤–∞—Ç—å',
 '–µ–±—É',
 '—É–±–∏–≤–∞—Ç—å',
 '–∫–æ–Ω—á–µ–Ω—ã–π',
 '–∂–∏–≤–æ—Ç–Ω–æ–µ',
 '—Å–µ–∫—Å',
 '–±–ª—è–¥—å',
 '–Ω–∞–ø–∏—Å–∞—Ç—å',
 '—à–ª—é—Ö–∞',
 '–æ—á–∫–æ',
 '—Ñ–∞—à–∏—Å—Ç',
 '–µ–±–µ—Ç–∞',
 '–æ—Ç—Ä—É–±–∏—Ç—å',
 '–ø–æ—Å–æ—Å–∞—Ç—å',
 '—Å–¥–

In [294]:
from sklearn.metrics import f1_score
def mini_hyperopt(param):
    y_pred = np.array(pred_train_normal_1 >= param['threshold'], dtype = 'int')
    return -f1_score(y_pred = y_pred, y_true = XY_train['normal'])
    
space_thres = {'threshold': hp.uniform('threshold', 0, 1)}
    
best_mini = fmin(fn = mini_hyperopt, space = space_thres, algo = tpe.suggest, max_evals = 500)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:20<00:00, 24.43trial/s, best loss: -0.9909100993058516]


In [295]:
pred_train_normal_bool_1 = np.array(pred_train_normal_1 >= round(best_mini['threshold'], 3), dtype = 'int')
pred_test_normal_bool_1 = np.array(pred_test_normal_1 >= round(best_mini['threshold'], 3), dtype = 'int')

In [296]:
train_2_insult = np.concatenate([pred_train_insult_1, pred_train_normal_bool_1, text_train_insult_2], 
                                axis = 1)
train_2_threat = np.concatenate([pred_train_threat_1, pred_train_normal_bool_1, text_train_threat_2], 
                                axis = 1)
train_2_obscenity = np.concatenate([pred_train_obscenity_1, pred_train_normal_bool_1, text_train_obscenity_2], 
                                   axis = 1)

test_2_insult = np.concatenate([pred_test_insult_1, pred_test_normal_bool_1, text_test_insult_2], 
                               axis = 1)
test_2_threat = np.concatenate([pred_test_threat_1, pred_test_normal_bool_1, text_test_threat_2], 
                               axis = 1)
test_2_obscenity = np.concatenate([pred_test_obscenity_1, pred_test_normal_bool_1, text_test_obscenity_2], 
                                  axis = 1)

In [311]:
def hyperopt_2(params):
    
    lg = LogisticRegression(**params).fit(train_2_insult, XY_train['insult'])
    score = average_precision_score(y_score = lg.predict_proba(test_2_insult)[:, 1], y_true = XY_test['insult'])
    
    return -score
    
best_insult_C_2 = fmin(fn = hyperopt_2, space = {'C': hp.uniform('C', 0.01, 10),
                                                 'class_weight': hp.choice('class_weight', [None, 'balanced'])},
                       algo = tpe.suggest, max_evals = 250)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 250/250 [03:37<00:00,  1.15trial/s, best loss: -0.9153770907605178]


In [312]:
best_insult_C_2['class_weight'] = [None, 'balanced'][best_insult_C_2['class_weight']]
best_insult_C_2

{'C': 0.03578957898023842, 'class_weight': 'balanced'}

In [313]:
lg = LogisticRegression(**best_insult_C_2).fit(train_2_insult, XY_train['insult'])

In [314]:
average_precision_score(y_score = lg.predict_proba(test_2_insult)[:, 1], y_true = XY_test['insult'])

0.9153770907605178

In [315]:
def hyperopt_2(params):
    
    lg = LogisticRegression(**params).fit(train_2_threat, XY_train['threat'])
    score = average_precision_score(y_score = lg.predict_proba(test_2_threat)[:, 1], y_true = XY_test['threat'])
    
    return -score
    
best_threat_C_2 = fmin(fn = hyperopt_2, space = {'C': hp.uniform('C', 0.01, 10),
                                                 'class_weight': hp.choice('class_weight', [None, 'balanced'])},
                       algo = tpe.suggest, max_evals = 250)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 250/250 [03:15<00:00,  1.28trial/s, best loss: -0.8853170353601449]


In [316]:
best_threat_C_2['class_weight'] = [None, 'balanced'][best_threat_C_2['class_weight']]
best_threat_C_2

{'C': 0.02527516977232648, 'class_weight': 'balanced'}

In [317]:
lg = LogisticRegression(**best_threat_C_2).fit(train_2_threat, XY_train['threat'])

In [318]:
average_precision_score(y_score = lg.predict_proba(test_2_threat)[:, 1], y_true = XY_test['threat'])

0.8853170353601449

In [319]:
def hyperopt_2(params):
    
    lg = LogisticRegression(**params).fit(train_2_obscenity, XY_train['obscenity'])
    score = average_precision_score(y_score = lg.predict_proba(test_2_obscenity)[:, 1], y_true = XY_test['obscenity'])
    
    return -score
    
best_obscenity_C_2 = fmin(fn = hyperopt_2, space = {'C': hp.uniform('C', 0.01, 10),
                                                    'class_weight': hp.choice('class_weight', [None, 'balanced'])},
                          algo = tpe.suggest, max_evals = 250)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 250/250 [02:12<00:00,  1.89trial/s, best loss: -0.7729510611264273]


In [320]:
best_obscenity_C_2['class_weight'] = [None, 'balanced'][best_obscenity_C_2['class_weight']]
best_obscenity_C_2

{'C': 0.2923206108631109, 'class_weight': None}

In [321]:
lg = LogisticRegression(**best_obscenity_C_2).fit(train_2_obscenity, XY_train['obscenity'])

In [322]:
average_precision_score(y_score = lg.predict_proba(test_2_obscenity)[:, 1], y_true = XY_test['obscenity'])

0.7729510611264273

In [323]:
X_final_test = pd.read_csv('X_final_test.csv', header = 0)
X_final_test.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised
0,167315,–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—å!!!üòç,–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—åüòç,–∫–∞–∫–æ–π –ø—Ä–µ–ª–µ—Å—Ç—åüòç
1,224546,–∫–∞–∞–ª –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å—é?,–∫–∞–∞ –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤,–∫–∞–∞–ª–∞ –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å
2,241309,–≥–Ω–æ–π–Ω—ã–µ –ø–∏–¥–æ—Ä—ã –∞–ª–ª—ã –æ–Ω–∏,–≥–Ω–æ–π–Ω –ø–∏–¥–æ—Ä –∞–ª–ª –æ–Ω–∏,–≥–Ω–æ–π–Ω—ã–π –ø–∏–¥–æ—Ä –∞–ª–ª–∞ –æ–Ω–∏


In [325]:
# text feature extraction - level 1
trans_1_2_11 = TfidfVectorizer(min_df = 2)
text_train_1 = trans_1_2_11.fit_transform(XY.text)
text_test_1 = trans_1_2_11.transform(X_final_test.text)

# classifier - level 1
warnings.filterwarnings("ignore", category=ConvergenceWarning)
clf_1_normal = LogisticRegression(C = 5.3).fit(text_train_1, XY['normal'])
clf_1_insult = LogisticRegression(C = 12).fit(text_train_1, XY['insult'])
clf_1_threat = LogisticRegression(C = 10.6).fit(text_train_1, XY['threat'])
clf_1_obscenity = LogisticRegression(C = 21.8).fit(text_train_1, XY['obscenity'])

pred_train_normal_1 = clf_1_normal.predict_proba(text_train_1)[:, 1].reshape(-1, 1)
pred_train_insult_1 = clf_1_insult.predict_proba(text_train_1)[:, 1].reshape(-1, 1)
pred_train_threat_1 = clf_1_threat.predict_proba(text_train_1)[:, 1].reshape(-1, 1)
pred_train_obscenity_1 = clf_1_obscenity.predict_proba(text_train_1)[:, 1].reshape(-1, 1)

pred_test_normal_1 = clf_1_normal.predict_proba(text_test_1)[:, 1].reshape(-1, 1)
pred_test_insult_1 = clf_1_insult.predict_proba(text_test_1)[:, 1].reshape(-1, 1)
pred_test_threat_1 = clf_1_threat.predict_proba(text_test_1)[:, 1].reshape(-1, 1)
pred_test_obscenity_1 = clf_1_obscenity.predict_proba(text_test_1)[:, 1].reshape(-1, 1)

# text feature extraction - level 2
stopWords = stopwords.words('russian') + ['—ç—Ç–æ', '–≤—Å—ë', '–≤–µ—Å—å', '–µ—â—ë', '—á–µ–ª–æ–≤–µ–∫', '—Ç–≤–æ–π', '–∫–æ—Ç–æ—Ä—ã–π', '–∏–¥—Ç–∏', '—Å—É–¥',
                                          '—Å–≤–æ–π', '—Ä—É–∫–∞', '–Ω—É–∂–Ω–æ', '—Ä–µ–±—ë–Ω–æ–∫', '–µ—ë', '–∂–∏—Ç—å', '–ø—Ä–æ—Å—Ç–æ', '–Ω–∞—à', '–≤–∞—à',
                                          '—Ä–æ—Å—Å–∏—è', '—Å—Ç—Ä–∞–Ω–∞', '–º–æ—á—å', '–Ω–∞—Ä–æ–¥', '–ø—É—Ç–∏–Ω', '–ø—É—Ç–∏–Ω—Å–∫–∏–π', '—Ä–æ—Å—Å–∏—è',
                                          '–Ω–æ–≥–∞', '–∂–µ–Ω–∞', '–º–µ—Å—Ç–æ', '–º—É–∂–∏–∫', '–¥–∞–ª—ë–∫–∏–π', '–º–∞–º–∞', '–¥–µ–Ω—å', '—Å–∫–∞–∑–∞—Ç—å',
                                          '–∫–∞–∂–¥—ã–π', '–ø—É—Å—Ç—å', '–¥–µ–ª–∞—Ç—å', '–ª—é–±–∏—Ç—å', '–∑–Ω–∞—Ç—å', '—Ö–æ—Ä–æ—à–∏–π', '–±–æ–ª—å—à–æ–π',
                                          '–∑–µ–º–ª—è', '—Å–ª–æ–≤–æ', '–Ω–∞–π—Ç–∏', '—Å—Ç–µ–Ω–∫–∞', '–≤–º–µ—Å—Ç–µ', '–≤–∑—è—Ç—å', '—Å–∞–º—ã–π', '—è–π—Ü–æ',
                                          '—Å–∫–æ–ª—å–∫–æ', '—Å–º–æ—Ç—Ä–µ—Ç—å', '—Å–¥–µ–ª–∞—Ç—å', '–≥–æ–ª–æ–≤–∞', '–≥–æ–≤–æ—Ä–∏—Ç—å', '–≤–æ–æ–±—â–µ', '–≥–æ–¥',
                                          '–¥–µ–Ω—å–≥–∞', '–ø—Ä–æ–¥–∞–∂–Ω—ã–π', '–ø–∏—Å–∞—Ç—å', '—Ä–∞–±–æ—Ç–∞—Ç—å', '–¥—É–º–∞—Ç—å', '–∂–∏–∑–Ω—å', '–º–æ–∑–≥',
                                          '—Ä—É—Å—Å–∫–∏–π', '—Å—Ä–∞–∑—É', '–º–∞–ª–æ', '–ø–ª–æ—â–∞–¥—å', '—Å–æ–±–∞–∫–∞', '–µ—Å—Ç–∏', '—Ä–æ—Ç', '—Ö–æ—Ç–µ—Ç—å',
                                          '–¥–∞–≤–∞—Ç—å', '–º–∞—Ç—å', '–≤—ã–µ—Å—Ç–∏', '—Å–∏–¥–µ—Ç—å', '–ø–æ–π—Ç–∏', '–¥–∞—Ç—å', '–¥–∞–≤–Ω–æ', '—Å–∞–∂–∞—Ç—å',
                                          '–ø–æ–ª–Ω—ã–π', '–ø–æ—Ä–∞', '—Å—Ç–∞—Ç—å', '–¥–æ–ª–∂–Ω—ã–π', '—Å—Ç–∞—Ç—å', '–≤—Ä–µ–º—è', '–ø–æ–∫–∞', '–≤–ª–∞—Å—Ç—å',
                                          '–Ω–∏–∫—Ç–æ', '–ø—Ä–∏–≤—è–∑–∞—Ç—å', '–±–æ–≥', '—Å–∫–æ—Ä–æ', '–∫–æ—Ä–º–∏—Ç—å' '–≤—Ä–∞–≥', '—à–µ—è', '–±–∞—à–∫–∞', 
                                          '–±–∞–±–∞', '–º—É–∂', '–ø–æ–∫–∞–∑–∞—Ç—å', '—É–∫—Ä–∞–∏–Ω–∞', '—Å—Ç–∞—Ä—ã–π', '—Ä–æ–¥–∏—Ç–µ–ª—å', '–ø–æ—Å–∞–¥–∏—Ç—å',
                                          '–≤–∏–¥–µ—Ç—å', '–≤—Ä–∞–≥', '—Å—É–ø–µ—Ä', '–∂–µ–Ω—â–∏–Ω–∞']

tf_insult = CountVectorizer(max_features = 25, ngram_range = (1, 2), 
                            stop_words = stopWords).fit(XY.loc[XY.insult == 1, 'text_lemmatised'])
tf_threat = CountVectorizer(max_features = 25, ngram_range = (1, 2), 
                            stop_words = stopWords).fit(XY.loc[XY.threat == 1, 'text_lemmatised'])
tf_obscenity = CountVectorizer(max_features = 25, ngram_range = (1, 2), 
                               stop_words = stopWords).fit(XY.loc[XY.obscenity == 1, 'text_lemmatised'])

text_train_insult_2 = (tf_insult.transform(XY.text_lemmatised).toarray() > 0).astype('int')
text_test_insult_2 = (tf_insult.transform(X_final_test.text_lemmatised).toarray() > 0).astype('int')

text_train_threat_2 = (tf_threat.transform(XY.text_lemmatised).toarray() > 0).astype('int')
text_test_threat_2 = (tf_threat.transform(X_final_test.text_lemmatised).toarray() > 0).astype('int')

text_train_obscenity_2 = (tf_obscenity.transform(XY.text_lemmatised).toarray() > 0).astype('int')
text_test_obscenity_2 = (tf_obscenity.transform(X_final_test.text_lemmatised).toarray() > 0).astype('int')

def mini_hyperopt(param):
    y_pred = np.array(pred_train_normal_1 >= param['threshold'], dtype = 'int')
    return -f1_score(y_pred = y_pred, y_true = XY['normal'])
    
space_thres = {'threshold': hp.uniform('threshold', 0, 1)}
    
best_mini = fmin(fn = mini_hyperopt, space = space_thres, algo = tpe.suggest, max_evals = 500)


pred_train_normal_bool_1 = np.array(pred_train_normal_1 >= round(best_mini['threshold'], 3), dtype = 'int')
pred_test_normal_bool_1 = np.array(pred_test_normal_1 >= round(best_mini['threshold'], 3), dtype = 'int')


train_2_insult = np.concatenate([pred_train_insult_1, pred_train_normal_bool_1, text_train_insult_2], 
                                axis = 1)
train_2_threat = np.concatenate([pred_train_threat_1, pred_train_normal_bool_1, text_train_threat_2], 
                                axis = 1)
train_2_obscenity = np.concatenate([pred_train_obscenity_1, pred_train_normal_bool_1, text_train_obscenity_2], 
                                   axis = 1)

test_2_insult = np.concatenate([pred_test_insult_1, pred_test_normal_bool_1, text_test_insult_2], 
                               axis = 1)
test_2_threat = np.concatenate([pred_test_threat_1, pred_test_normal_bool_1, text_test_threat_2], 
                               axis = 1)
test_2_obscenity = np.concatenate([pred_test_obscenity_1, pred_test_normal_bool_1, text_test_obscenity_2], 
                                  axis = 1)

lg_insult = LogisticRegression(**best_insult_C_2).fit(train_2_insult, XY['insult'])
lg_threat = LogisticRegression(**best_threat_C_2).fit(train_2_threat, XY['threat'])
lg_obscenity = LogisticRegression(**best_obscenity_C_2).fit(train_2_obscenity, XY['obscenity'])

pred_test_insult_2 = lg_insult.predict_proba(test_2_insult)[:, 1].reshape(-1, 1)
pred_test_threat_2 = lg_threat.predict_proba(test_2_threat)[:, 1].reshape(-1, 1)
pred_test_obscenity_2 = lg_obscenity.predict_proba(test_2_obscenity)[:, 1].reshape(-1, 1)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:29<00:00, 17.08trial/s, best loss: -0.9914620782152145]


In [326]:
predictions = np.hstack([pred_test_normal_1, pred_test_insult_2, pred_test_threat_2, pred_test_obscenity_2])

In [327]:
labels = ['normal', 'insult', 'threat', 'obscenity']

In [328]:
final_predictions = pd.concat([pd.DataFrame(X_final_test.id.values, columns = ['id']),
                               pd.DataFrame(predictions, columns = labels)], axis = 1)

In [329]:
final_predictions

Unnamed: 0,id,normal,insult,threat,obscenity
0,167315,0.998016,0.023758,0.008704,0.000562
1,224546,0.926286,0.038597,0.012242,0.000585
2,241309,0.017610,0.999779,0.105649,0.002020
3,31170,0.000691,0.999798,0.098300,0.003319
4,173358,0.957654,0.033564,0.008842,0.000577
...,...,...,...,...,...
99510,192320,0.356521,0.333875,0.085667,0.030779
99511,6646,0.099780,0.999746,0.103001,0.002165
99512,215218,0.874958,0.152845,0.008986,0.000564
99513,139806,0.773577,0.200878,0.009405,0.000571


In [330]:
result = final_predictions.loc[:, ['id', 'normal', 'insult', 'obscenity', 'threat']]

In [331]:
result.to_csv('result', index = False, header = True)