In [14]:
import time
import string
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter
from scipy.sparse import hstack

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

# Preprocessing

In [2]:
XY = pd.read_csv('XY.csv', header = 0)
XY.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised,normal,threat,insult,obscenity
0,41127,дворника надо тоже уничтожить!,дворник надо тоже уничтож,дворник надо тоже уничтожить,0,1,0,0
1,6812,"моя старшая неделю шипела, не принимала подкид...",моя старш недел шипел не принима подкидыш кото...,мой старший неделя шипеть не принимать подкиды...,1,0,0,0
2,6256,полностью с вами согласна!,полност с вам согласн,полностью с вы согласный,1,0,0,0


In [3]:
XY_train, XY_test = train_test_split(XY, test_size = 0.3, shuffle = True, random_state = 42)
XY_train.reset_index(drop = True, inplace = True)
XY_test.reset_index(drop = True, inplace = True)
XY_train_abn = XY_train.loc[XY_train.normal == 0, :].reset_index(drop = True)
XY_train.shape, XY_train_abn.shape

((104142, 8), (18547, 8))

# Label predictions
## Function manufacture

In [4]:
stopWords = stopwords.words('russian')

In [5]:
@ignore_warnings(category=ConvergenceWarning)
def search_best_tfidf_and_algo(X, y, ns = 3, num_of_evals = 10):
    
    def hyperopt_tfidf_algo_score(params):
        try:
            trans = TfidfVectorizer(stop_words = params['stop_words'], min_df = params['min_df'],
                                    ngram_range = params['ngram_range'])
            
            if params['norm'] == 'lemma':
                X_prep = trans.fit_transform(X.text_lemmatised) 
            elif params['norm'] == 'stem':
                X_prep = trans.fit_transform(X.text_stemmed)
            else:
                X_prep = trans.fit_transform(X.text)
            
            clf = LogisticRegression(C = params['C'], class_weight = params['class_weight'])

            current_score_scores = cross_val_score(clf, X_prep, y, cv = StratifiedKFold(n_splits = ns),
                                                   scoring = 'average_precision')
            
            mean_score = np.mean(current_score_scores)
            current_score = mean_score if np.std(current_score_scores * 100) < 1.5 else mean_score / 1.5
            
        except:
            current_score = 0
            print(f"Bad min_df: {algo_params['min_df']}")
        
        return -current_score
        
    space_tfidf = {
        'min_df': hp.choice('min_df', np.arange(2, 8, 1)),
        'stop_words': hp.choice('stop_words', [stopWords, None]),
        'ngram_range': hp.choice('ngram_range', [(1, 1), (1, 2)]),
        'norm': hp.choice('norm', ['lemma', 'stem', 'initial'])
    }
    
    space_algo = {'C': hp.uniform('C', 10**(0), 10**(2)), 
                  'class_weight': hp.choice('class_weight', ['balanced', None])}
    
    space = dict(**space_tfidf, **space_algo)
    
    best = fmin(fn = hyperopt_tfidf_algo_score, space = space, algo = tpe.suggest, max_evals = num_of_evals)
    
    best['min_df'] = np.arange(1, 6, 1)[best['min_df']]
    best['stop_words'] = [stopWords, None][best['stop_words']]
    best['ngram_range'] = [(1, 1), (1, 2)][best['ngram_range']]
    
    best['norm'] = ['lemma', 'stem', 'initial'][best['norm']]
    
    best['class_weight'] = ['balanced', None][best['class_weight']]
    
    return best

In [6]:
@ignore_warnings(category=ConvergenceWarning)
def build_best_tfidf_and_algo(params, X_train, y_train, X_valid, y_valid):
    
    trans = TfidfVectorizer(min_df = params['min_df'], stop_words = params['stop_words'], 
                            ngram_range = params['ngram_range'])
    
    if params['norm'] == 'lemma':
        X_train_prep = trans.fit_transform(X_train.text_lemmatised)
        X_valid_prep = trans.transform(X_valid.text_lemmatised)
    elif params['norm'] == 'stem':
        X_train_prep = trans.fit_transform(X_train.text_stemmed)
        X_valid_prep = trans.transform(X_valid.text_stemmed)
    else:
        X_train_prep = trans.fit_transform(X_train.text)
        X_valid_prep = trans.transform(X_valid.text)
    
    clf = LogisticRegression(C = params['C'], class_weight = params['class_weight'])
    clf.fit(X_train_prep, y_train)
    
    aps_train = average_precision_score(y_true = y_train, y_score = clf.predict_proba(X_train_prep)[:, 1])
    aps_valid = average_precision_score(y_true = y_valid, y_score = clf.predict_proba(X_valid_prep)[:, 1])
    
    return {'norm': params['norm'], 'transformer': trans, 'classifier': clf, 'scores': [aps_train, aps_valid]}

In [7]:
def search_build(XY, label, balancing = True, test_size = 0.2, ns = 5, num_of_evals = 10):
    
    if balancing:
        sample_num = min(XY.loc[XY[label] == 1].shape[0], XY.loc[XY[label] == 0].shape[0])
        XY_balanced = pd.concat([XY.loc[XY[label] == 1].sample(sample_num, random_state = 42),
                                 XY.loc[XY[label] == 0].sample(sample_num, random_state = 42)], axis = 0)
    else:
        XY_balanced = XY
    
    XY_train, XY_valid = train_test_split(XY_balanced, stratify = XY_balanced[label], test_size = test_size,
                                          shuffle = True, random_state = 42)
    XY_train.reset_index(drop = True, inplace = True)
    XY_valid.reset_index(drop = True, inplace = True)
    
    X_train = XY_train.drop(columns = label)
    y_train = XY_train.loc[:, label].values
    
    X_valid = XY_valid.drop(columns = label)
    y_valid = XY_valid.loc[:, label].values
    
    params = search_best_tfidf_and_algo(X_train, y_train, ns = ns, num_of_evals = num_of_evals)
    
    return build_best_tfidf_and_algo(params, X_train, y_train, X_valid, y_valid)

In [8]:
def monster_search(XY: pd.DataFrame, labels: list = ['normal'], balancing: list = [True], test_size: float = 0.2, 
                   ns: list = [10], num_of_evals: list = [10], final_algos: dict = {}) -> dict:
    
    for l, label in enumerate(labels):
        print()
        print('---------------------------------------------------------------------------------------------------')
        print(label)
        print('---------------------------------------------------------------------------------------------------')
        
        algo_result = search_build(XY, label, balancing = balancing[l], test_size = test_size, 
                                   ns = ns[l], num_of_evals = num_of_evals[l])
        
        final_algos[label] = algo_result
    
    return final_algos

In [37]:
final_algos_1 = monster_search(XY_train, labels = ['normal', 'insult', 'obscenity', 'threat'],
                               balancing = [False] * 4, test_size = 0.2, ns = [5] * 4, num_of_evals = [250] * 4)


---------------------------------------------------------------------------------------------------
normal
---------------------------------------------------------------------------------------------------
100%|██████████| 50/50 [05:49<00:00,  6.99s/trial, best loss: -0.9911795125105891]

---------------------------------------------------------------------------------------------------
insult
---------------------------------------------------------------------------------------------------
100%|██████████| 50/50 [06:14<00:00,  7.49s/trial, best loss: -0.8936379998879065]

---------------------------------------------------------------------------------------------------
obscenity
---------------------------------------------------------------------------------------------------
100%|██████████| 50/50 [05:27<00:00,  6.54s/trial, best loss: -0.7525734737778982] 

---------------------------------------------------------------------------------------------------
threat
---------------

In [10]:
# final_algos_1
# {'normal': {'norm': 'stem',
#   'transformer': TfidfVectorizer(min_df=3, ngram_range=(1, 2)),
#   'classifier': LogisticRegression(C=43.61225142163715, class_weight='balanced'),
#   'scores': [0.9997501061237812, 0.9906656653390087]},
#  'insult': {'norm': 'stem',
#   'transformer': TfidfVectorizer(min_df=4, ngram_range=(1, 2), stop_words=stopWords),
#   'classifier': LogisticRegression(C=14.163874788479188),
#   'scores': [0.9703978115236113, 0.9018463501179763]},
#  'obscenity': {'norm': 'initial',
#   'transformer': TfidfVectorizer(min_df=2),
#   'classifier': LogisticRegression(C=41.88942360620957),
#   'scores': [0.9920524059927307, 0.752306287824738]},
#  'threat': {'norm': 'initial',
#   'transformer': TfidfVectorizer(min_df=10, ngram_range=(1, 2)),
#   'classifier': LogisticRegression(C=2.047808035445657),
#   'scores': [0.9337172577508484, 0.8503346208600301]}}

{'normal': {'classifier': LogisticRegression(C = 9, class_weight = 'balanced'),
            'norm': 'stem',
            'scores': [0.9998298284695576, 0.9929094934296164],
            'transformer': TfidfVectorizer(min_df = 1, ngram_range = (1, 1))},
 'insult': {'classifier': LogisticRegression(C = 10, class_weight = None),
            'norm': 'initial',
            'scores': [0.9866985845174877, 0.9050294725010097],
            'transformer': TfidfVectorizer(min_df = 2, ngram_range = (1, 1))},
 'obscenity': {'classifier': LogisticRegression(C = 94, class_weight = None),
               'norm': 'lemma',
               'scores': [0.9901362947613137, 0.7137456145906628],
               'transformer': TfidfVectorizer(min_df = 2, ngram_range = (1, 2), stop_words = stopWords)},
 'threat': {'classifier': LogisticRegression(C = 14, class_weight = None),
            'norm': 'initial',
            'scores': [0.9896822419756792, 0.8677644706078192],
            'transformer': TfidfVectorizer(min_df = 3, ngram_range = (1, 1))}}

## Label predictions

In [11]:
def get_prob_predictions(data, algos):
    
    if algos['normal']['norm'] == 'lemma':
        X_train_n = algos['normal']['transformer'].transform(data.text_lemmatised)
    elif algos['normal']['norm'] == 'stem':
        X_train_n = algos['normal']['transformer'].transform(data.text_stemmed)
    else:
        X_train_n = algos['normal']['transformer'].transform(data.text)
    
    preds = algos['normal']['classifier'].predict_proba(X_train_n)[:, 1].reshape(-1, 1)
    predicted_labels_n = np.abs(algos['normal']['classifier'].predict(X_train_n) - 1).reshape(-1, 1)
    
    for label, algo in [(key, value) for key, value in algos.items() if key != 'normal']:
        
        if algo['norm'] == 'lemma':
            X_train_abn = algo['transformer'].transform(data.text_lemmatised)
        elif algo['norm'] == 'stem':
            X_train_abn = algo['transformer'].transform(data.text_stemmed)
        else:
            X_train_abn = algo['transformer'].transform(data.text)
        
        preds_abn = algo['classifier'].predict_proba(X_train_abn)[:, 1].reshape(-1, 1) #* predicted_labels_n
        preds = np.concatenate([preds, preds_abn], axis = 1)
    
    return preds

In [16]:
# predictions = get_prob_predictions(XY_train, final_algos_1)
# average_precision_score(y_true = XY_train.loc[:, list(final_algos_1.keys())],
#                         y_score = predictions, average = 'macro') # 0.9565494854057154

In [17]:
# predictions = get_prob_predictions(XY_test, final_algos_1)
# average_precision_score(y_true = XY_test.loc[:, list(final_algos_1.keys())], 
#                         y_score = predictions, average = 'macro') # 0.8762971108666133

In [26]:
X_final_test = pd.read_csv('X_final_test.csv', header = 0)
X_final_test.head()

Unnamed: 0,id,text,text_stemmed,text_lemmatised
0,167315,какая прелесть!!!😍,какая прелесть😍,какой прелесть😍
1,224546,каал какой не с кровью?,каа какой не с кров,каала какой не с кровь
2,241309,гнойные пидоры аллы они,гнойн пидор алл они,гнойный пидор алла они
3,31170,чё ты губы шлёшь в помаде?фу блядь,че ты губ шлеш в помадеф бляд,что ты губа слать в помадефа блядь
4,173358,матрона помогает реально это правда. сама к не...,матрон помога реальн эт правд сам к ней езд на...,матрона помогать реально это правда сам к она ...


In [184]:
predictions = get_prob_predictions(X_final_test, final_algos_1)

In [185]:
final_frame = pd.concat([pd.DataFrame(X_final_test.id.values, columns = ['id']), 
                         pd.DataFrame(predictions, columns = list(final_algos_1.keys()))], axis = 1)

In [187]:
final = final_frame.loc[:, ['id', 'normal', 'insult', 'obscenity', 'threat']]

In [189]:
final.to_csv('result', index = False, header = True)

In [None]:
# predictions = get_prob_predictions(XY, loaded_model, activates_s)
# average_precision_score(y_true = XY.loc[:, ['normal', 'insult', 'threat', 'obscenity']],
#                         y_score = predictions, average = 'macro')

In [None]:
# import pickle
# pickle.dump(final_algos, open('final_algos', 'wb'))
# pickle.dump(activates_s, open('activates_s', 'wb'))

In [None]:
# loaded_models = pickle.load(open('final_algos', 'rb'))
# loaded_activates_s = pickle.load(open('activates_s', 'rb'))