In [1]:
import time
import string
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.sparse import hstack

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [2]:
# Load the labelled text dataset (column names come from the first CSV row)
# and preview its first three rows.
XY = pd.read_csv('XY.csv', header=0)
XY.head(n=3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised,normal,threat,insult,obscenity
0,41127,дворника надо тоже уничтожить!,дворник надо тоже уничтож,дворник надо тоже уничтожить,0,1,0,0
1,6812,"моя старшая неделю шипела, не принимала подкид...",моя старш недел шипел не принима подкидыш кото...,мой старший неделя шипеть не принимать подкиды...,1,0,0,0
2,6256,полностью с вами согласна!,полност с вам согласн,полностью с вы согласный,1,0,0,0


In [3]:
# Hold out 30% of the rows (shuffled, fixed seed) and renumber both parts from zero.
XY_train, XY_test = (
    part.reset_index(drop=True)
    for part in train_test_split(XY, test_size=0.3, shuffle=True, random_state=42)
)
# Training rows whose `normal` flag is 0.
XY_train_abn = XY_train[XY_train.normal == 0].reset_index(drop=True)
XY_train.shape, XY_train_abn.shape

((104142, 8), (18547, 8))

In [9]:
def predict_easy(model, XY, label, norm = None):
    """Return the second `predict_proba` column of `model` for the rows of `XY`.

    Parameters
    ----------
    model : object exposing `predict_proba`, called on one text column of `XY`.
    XY : object with `text`, `text_lemmatised` and `text_stemmed` attributes.
    label : not used by this function; kept so call sites mirror the fit helpers.
    norm : 'lemma' selects `text_lemmatised`, 'stem' selects `text_stemmed`,
        any other value selects the raw `text` column.

    Returns
    -------
    `model.predict_proba(<selected column>)[:, 1]`.
    """
    column = ('text_lemmatised' if norm == 'lemma'
              else 'text_stemmed' if norm == 'stem'
              else 'text')
    return model.predict_proba(getattr(XY, column))[:, 1]

@ignore_warnings(category=ConvergenceWarning)
def easy_model(XY, label, norm = None):
    """Fit a TF-IDF + cross-validated logistic-regression pipeline for one label.

    Parameters
    ----------
    XY : frame holding the text columns and the target column `label`.
    label : name of the target column, read as `XY[label]`.
    norm : 'lemma' trains on `text_lemmatised`, 'stem' on `text_stemmed`,
        anything else on the raw `text` column.

    Returns
    -------
    The fitted `Pipeline` with steps 'trans' (TfidfVectorizer) and
    'clf' (LogisticRegressionCV, Cs = 50, scored by average precision).
    """
    column = ('text_lemmatised' if norm == 'lemma'
              else 'text_stemmed' if norm == 'stem'
              else 'text')
    steps = [('trans', TfidfVectorizer()),
             ('clf', LogisticRegressionCV(Cs = 50, scoring = 'average_precision'))]
    # Pipeline.fit returns the pipeline itself.
    return Pipeline(steps).fit(getattr(XY, column), XY[label])

@ignore_warnings(category=ConvergenceWarning)
def easy_model_with_bagging(XY, label, C = 1, norm = None):
    """Fit a TF-IDF + bagged logistic-regression pipeline for one label.

    Parameters
    ----------
    XY : frame holding the text columns and the target column `label`.
    label : name of the target column, read as `XY[label]`.
    C : value passed to `LogisticRegression(C = ...)` for every bagged learner.
    norm : 'lemma' trains on `text_lemmatised`, 'stem' on `text_stemmed`,
        anything else on the raw `text` column.

    Returns
    -------
    The fitted `Pipeline` with steps 'trans' (TfidfVectorizer) and
    'clf' (BaggingClassifier of 100 learners, each fitted on 80% of the samples).
    """
    # The base learner is passed positionally on purpose: its keyword was
    # `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on
    # (the old keyword was removed in 1.4), while the first positional slot is
    # the base learner in all of these versions.
    bagging = BaggingClassifier(LogisticRegression(C = C),
                                n_estimators = 100, max_samples = 0.8, random_state = 42)
    pipe = Pipeline([('trans', TfidfVectorizer()),
                     ('clf', bagging)])

    if norm == 'lemma':
        pipe.fit(XY.text_lemmatised, XY[label])
    elif norm == 'stem':
        pipe.fit(XY.text_stemmed, XY[label])
    else:
        pipe.fit(XY.text, XY[label])

    return pipe

@ignore_warnings(category=ConvergenceWarning)
def middle_model(XY, y_preds, label, norm = None, lemma = None):
    """Fit a TF-IDF + logistic-regression model that can also use extra feature columns.

    Parameters
    ----------
    XY : frame holding the text columns and the target column `label`.
    y_preds : 2-D array of extra features (one row per row of `XY`) that is
        standard-scaled and appended to the TF-IDF matrix, or None to train on
        text features only.
    label : name of the target column, read as `XY[label]`.
    norm : 'lemma' trains on `text_lemmatised`, 'stem' on `text_stemmed`,
        anything else on the raw `text` column.
    lemma : legacy alias, consulted only when `norm` is None: a truthy value
        means `norm = 'lemma'`. Accepted because `predict_middle` and the
        callers in this notebook spell the option `lemma = ...`.

    Returns
    -------
    Tuple `(trans, scaler, clf)`: the fitted TfidfVectorizer, the StandardScaler
    (left unfitted when `y_preds` is None) and the fitted LogisticRegressionCV.
    """
    if norm is None and lemma:
        norm = 'lemma'

    trans = TfidfVectorizer()
    scaler = StandardScaler()
    # class_weight = 'balanced' is deliberately not set here (left disabled in the original).
    clf = LogisticRegressionCV(Cs = 10, scoring = 'average_precision')

    if norm == 'lemma':
        X_text = trans.fit_transform(XY.text_lemmatised)
    elif norm == 'stem':
        X_text = trans.fit_transform(XY.text_stemmed)
    else:
        X_text = trans.fit_transform(XY.text)

    # Append the scaled extra columns to the right of the sparse text features.
    X = hstack([X_text, scaler.fit_transform(y_preds)]) if y_preds is not None else X_text

    clf.fit(X, XY[label])

    return (trans, scaler, clf)

def predict_middle(pipeline, XY, y_preds, label, lemma = True, norm = None):
    """Score the rows of `XY` with a `(vectorizer, scaler, classifier)` triple.

    Parameters
    ----------
    pipeline : indexable of three fitted objects, in the order returned by
        `middle_model`: `[0]` transforms text, `[1]` transforms `y_preds`,
        `[2]` exposes `predict_proba`.
    XY : frame holding the text columns.
    y_preds : 2-D array of extra features that is scaled with `pipeline[1]`
        and appended to the text features, or None to use text features only.
    label : not used by this function; kept so call sites mirror `middle_model`.
    lemma : legacy flag, consulted only when `norm` is None: truthy selects
        `text_lemmatised`, falsy selects the raw `text` column.
    norm : 'lemma' selects `text_lemmatised`, 'stem' selects `text_stemmed`.
        When given it takes precedence over `lemma`. Note that leaving it at
        None does NOT mean raw text unless `lemma` is falsy.

    The text column chosen here has to be the one the vectorizer was fitted on.

    Returns
    -------
    The second `predict_proba` column, reshaped to `(n_rows, 1)`.
    """
    # Resolve the normalisation locally. Previously the body read a name `norm`
    # that was not a parameter, so it either raised NameError or silently
    # picked up whatever global `norm` happened to exist.
    if norm is None and lemma:
        norm = 'lemma'

    if norm == 'lemma':
        X_text = pipeline[0].transform(XY.text_lemmatised)
    elif norm == 'stem':
        X_text = pipeline[0].transform(XY.text_stemmed)
    else:
        X_text = pipeline[0].transform(XY.text)

    # The scaler is only touched when extra features are supplied.
    X = hstack([X_text, pipeline[1].transform(y_preds)]) if y_preds is not None else X_text

    predictions = pipeline[2].predict_proba(X)[:, 1]

    return predictions.reshape(-1, 1)

In [5]:
# One cross-validated TF-IDF + logistic-regression pipeline per label, trained on raw text.
algs_1 = {
    label: easy_model(XY_train, label, norm = None)
    for label in tqdm(['normal', 'insult', 'obscenity', 'threat'])
}

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [7]:
# Macro-averaged average precision of the `algs_1` models: training split first,
# held-out split second. Each model contributes one column of scores.
print(average_precision_score(
    y_true = XY_train[list(algs_1)],
    y_score = np.column_stack([predict_easy(model, XY_train, label, norm = None)
                               for label, model in algs_1.items()]),
    average = 'macro'))

print(average_precision_score(
    y_true = XY_test[list(algs_1)],
    y_score = np.column_stack([predict_easy(model, XY_test, label, norm = None)
                               for label, model in algs_1.items()]),
    average = 'macro'))
# Notes kept from previous experiments:
# min_df = 1 (quality got worse as min_df increased), with Cs = 10
# 0.9989094360279609
# 0.8994449421853303
# min_df = 1 (quality got worse as min_df increased), with Cs = 25
# 0.9989016018343937
# 0.8999228739030766
# {'normal': 10, 'insult': 22, 'obscenity': 464, 'threat': 22}

0.9999995112883777
0.8842803881293029


In [8]:
# Per-label value passed as `C` to the bagged logistic regressions.
Cs = dict(normal = 10, insult = 22, obscenity = 464, threat = 22)
Cs

{'normal': 10, 'insult': 22, 'obscenity': 464, 'threat': 22}

In [46]:
# One bagged TF-IDF + logistic-regression pipeline per label, each with its own C, on raw text.
algs_2 = {
    label: easy_model_with_bagging(XY_train, label, C = Cs[label], norm = None)
    for label in tqdm(['normal', 'insult', 'obscenity', 'threat'])
}

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [47]:
# Macro-averaged average precision of the bagged `algs_2` models: training split
# first, held-out split second.
print(average_precision_score(
    y_true = XY_train[list(algs_2)],
    y_score = np.column_stack([predict_easy(model, XY_train, label, norm = None)
                               for label, model in algs_2.items()]),
    average = 'macro'))

print(average_precision_score(
    y_true = XY_test[list(algs_2)],
    y_score = np.column_stack([predict_easy(model, XY_test, label, norm = None)
                               for label, model in algs_2.items()]),
    average = 'macro'))
# 0.9977239305853635
# 0.8993253485125248 -- bagging apparently does not help much; will try lowering max_samples tomorrow

0.9977239305853635
0.8993253485125248


In [88]:
# Chain of per-label models: every model after the first is also given the
# probabilities already predicted for the labels handled before it.
algs_4 = {}
st = True
preds_train_labels = None
preds_test_labels = None
for label in tqdm(['normal', 'insult', 'threat', 'obscenity']):
    # On the first pass ('normal') both prediction matrices are still None, so
    # the model is fitted and scored on text features alone.
    algs_4[label] = middle_model(XY_train, preds_train_labels, label, lemma = st)
    train_column = predict_middle(algs_4[label], XY_train, preds_train_labels, label, lemma = st)
    test_column = predict_middle(algs_4[label], XY_test, preds_test_labels, label, lemma = st)
    if label == 'normal':
        preds_train_labels, preds_test_labels = train_column, test_column
    else:
        # Grow the matrices by one column per label, in loop order.
        preds_train_labels = np.hstack([preds_train_labels, train_column])
        preds_test_labels = np.hstack([preds_test_labels, test_column])

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [89]:
# Average precision of the chained models on the training split (columns in `algs_4` order).
# Previously noted score: 0.9863651181168624
average_precision_score(y_true = XY_train[list(algs_4)], y_score = preds_train_labels)

0.9661960724174861

In [90]:
# Average precision of the chained models on the held-out split (columns in `algs_4` order).
# Previously noted score: 0.8783407636309197
average_precision_score(y_true = XY_test[list(algs_4)], y_score = preds_test_labels)

0.8740037830570584

In [91]:
# Held-out average precision for the 'insult' label alone; its score column sits
# at the position 'insult' has among the `algs_4` keys (position 1).
average_precision_score(y_true = XY_test['insult'],
                        y_score = preds_test_labels[:, list(algs_4).index('insult')])

0.8945224269407825

In [None]:
# 0.8667308183251967 0.0001, 0.9999, (1, 2)
# 0.8698880391021648 0.00005, 0.99995, (1, 2)
# 0.8717028592584694 - - -
# 0.8753309379947046 0.00005, 0.99990, (1, 2)
# there was no class balancing before; now there is
# 0.8717310060366801 0.00005, 0.99990, (1, 2)

### Public: 0.8863923769191875 (easy model)
### Public: 0.8990635497945589 (easy model with appropriate scoring)

In [11]:
# Load the unlabelled rows to be scored for the submission and preview three of them.
X_final_test = pd.read_csv('X_final_test.csv', header=0)
X_final_test.head(n=3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised
0,167315,какая прелесть!!!😍,какая прелесть😍,какой прелесть😍
1,224546,каал какой не с кровью?,каа какой не с кров,каала какой не с кровь
2,241309,гнойные пидоры аллы они,гнойн пидор алл они,гнойный пидор алла они


In [19]:
# Refit the bagged per-label models on the full labelled frame `XY`.
Cs = {'normal': 10, 'insult': 22, 'obscenity': 464, 'threat': 22}
algs_final = dict()
for label in tqdm(['normal', 'insult', 'obscenity', 'threat']):
    # `easy_model_with_bagging` has no `stem` parameter (passing it raises
    # TypeError); the text column is chosen through `norm`, and `norm = None`
    # trains on the raw, un-stemmed `text` column.
    algs_final[label] = easy_model_with_bagging(XY, label, C = Cs[label], norm = None)

In [20]:
# Macro-averaged average precision of the final models on the data they were fitted on.
# `predict_easy` selects the text column through `norm` (it has no `stem`
# parameter); `norm = None` scores the raw `text` column.
average_precision_score(y_true = XY.loc[:, list(algs_final.keys())],
                        y_score = np.hstack([predict_easy(algs_final[label], XY, label, norm = None).reshape(-1, 1)
                                             for label in list(algs_final.keys())]),
                        average = 'macro')
# check:

In [13]:
# One probability column per label for the final test rows, in `algs_final` key order.
# `predict_easy` has no `stem` parameter; `norm = None` scores the raw `text` column.
predictions = np.hstack([predict_easy(algs_final[label], X_final_test, label, norm = None).reshape(-1, 1)
                         for label in list(algs_final.keys())])

In [15]:
# Place the test ids next to the per-label probability columns.
final_predictions = pd.concat(
    [
        pd.DataFrame({'id': X_final_test.id.values}),
        pd.DataFrame(predictions, columns = list(algs_final)),
    ],
    axis = 'columns',
)

In [16]:
# Fix the column order of the output frame.
result = final_predictions[['id', 'normal', 'insult', 'obscenity', 'threat']]

In [17]:
# Write the predictions to the file 'result' with a header row and without the index column
# (the header row is `to_csv`'s default).
result.to_csv('result', index = False)