In [1]:
import time
import string
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.sparse import hstack

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import average_precision_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [6]:
XY = pd.read_csv('XY.csv', header = 0)
XY.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised,normal,threat,insult,obscenity
0,41127,–¥–≤–æ—Ä–Ω–∏–∫–∞ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å!,–¥–≤–æ—Ä–Ω–∏–∫ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂,–¥–≤–æ—Ä–Ω–∏–∫ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å,0,1,0,0
1,6812,"–º–æ—è —Å—Ç–∞—Ä—à–∞—è –Ω–µ–¥–µ–ª—é —à–∏–ø–µ–ª–∞, –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∞ –ø–æ–¥–∫–∏–¥...",–º–æ—è —Å—Ç–∞—Ä—à –Ω–µ–¥–µ–ª —à–∏–ø–µ–ª –Ω–µ –ø—Ä–∏–Ω–∏–º–∞ –ø–æ–¥–∫–∏–¥—ã—à –∫–æ—Ç–æ...,–º–æ–π —Å—Ç–∞—Ä—à–∏–π –Ω–µ–¥–µ–ª—è —à–∏–ø–µ—Ç—å –Ω–µ –ø—Ä–∏–Ω–∏–º–∞—Ç—å –ø–æ–¥–∫–∏–¥—ã...,1,0,0,0
2,6256,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤–∞–º–∏ —Å–æ–≥–ª–∞—Å–Ω–∞!,–ø–æ–ª–Ω–æ—Å—Ç —Å –≤–∞–º —Å–æ–≥–ª–∞—Å–Ω,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤—ã —Å–æ–≥–ª–∞—Å–Ω—ã–π,1,0,0,0


In [7]:
XY['exclamation_num'] = XY.text.str.count('!')
XY['question_num'] = XY.text.str.count('\?')

XY_train, XY_test = train_test_split(XY, test_size = 0.3, shuffle = True, random_state = 42)
XY_train.reset_index(drop = True, inplace = True)
XY_test.reset_index(drop = True, inplace = True)
XY_train_abn = XY_train.loc[XY_train.normal == 0, :].reset_index(drop = True)
XY_train.shape, XY_train_abn.shape

((104142, 10), (18547, 10))

In [10]:
vocab = ['–±—ã–¥–ª–æ',
 '–¥—Ä–æ—á–∏—Ç—å',
 '–∫–æ–Ω—á–∏—Ç—å',
 '–æ—Ç—Ä–µ–∑–∞—Ç—å',
 '–µ–±–∞–ª',
 '–ø–∏–∑–¥–µ—Ü',
 '–≤–¥—É—Ç—å',
 '–ø–æ–ª–∏–∑–∞—Ç—å',
 '—Å–æ—Å–Ω—É—Ç—å',
 '–ø–æ–≤–µ—Å–∏—Ç—å',
 '—É—Ç–æ–ø–∏—Ç—å',
 '–ø–∏–∑–¥–∞',
 '—Å–ø–µ—Ä–º–∞',
 '–∫–æ–∑—ë–ª',
 '–ø–∏–¥–æ—Ä–∞—Å',
 '–Ω–∞–±–∏—Ç—å',
 '–≥–æ–≤–Ω–æ',
 '–∑–∞–∫–æ–ø–∞—Ç—å',
 '—Ö–æ—Ö–æ–ª',
 '–≤–æ—Ä',
 '–¥—É—Ä–∞',
 '—É–±–ª—é–¥–æ–∫',
 '–∫–∞–∑–Ω–∏—Ç—å',
 '–≤—ã–µ–±–∞–ª',
 '–ª–∏–∑–∞—Ç—å',
 '–∑–∞—Å–∞–¥–∏—Ç—å',
 '—á–º–æ',
 '—Ä–∞—Å—Ç—Ä–µ–ª–∞',
 '–∑–∞–¥–Ω–∏—Ü–∞',
 '–ø—Ä–∏–ª—é–¥–Ω–æ',
 '–≥–Ω–∏–¥–∞',
 '—Å—É–∫–∞',
 '—Ä–∞—Å—Ç—Ä–µ–ª—è—Ç—å',
 '–Ω–∞—Å–æ—Å–∞—Ç—å',
 '—Å–æ—Å–∞—Ç—å —Å–æ—Å–∞—Ç—å',
 '–ø–∏–∑–¥–∏—Ç—å',
 '–ø—Ä–æ–∫–ª—è—Ç—ã–π',
 '–±–∏—Ç—å',
 '–æ—Ç—Å—Ç—Ä–µ–ª–∏–≤–∞—Ç—å',
 '–µ–±—É',
 '—É–±–∏–≤–∞—Ç—å',
 '–∫–æ–Ω—á–µ–Ω—ã–π',
 '–∂–∏–≤–æ—Ç–Ω–æ–µ',
 '—Å–µ–∫—Å',
 '–±–ª—è–¥—å',
 '—à–ª—é—Ö–∞',
 '–æ—á–∫–æ',
 '—Ñ–∞—à–∏—Å—Ç',
 '–µ–±–µ—Ç–∞',
 '–æ—Ç—Ä—É–±–∏—Ç—å',
 '–ø–æ—Å–æ—Å–∞—Ç—å',
 '—Å–¥–æ—Ö–Ω—É—Ç—å',
 '—Å—Ç—Ä–µ–ª—è—Ç—å',
 '—É–Ω–∏—á—Ç–æ–∂–∞—Ç—å',
 '—Ä–∞–∫',
 '—Å–∂–µ—á—å',
 '–≥–∞–¥',
 '—Å–∏—Å—å–∫–∞',
 '—ë–±–∞–Ω—ã–π',
 '–ø—Ä–∏–±–∏—Ç—å',
 '—Ç—Ä–∞—Ö–∞—Ç',
 '–¥–µ–±–∏–ª',
 '—à–∞–ª–∞–≤—ã–π',
 '—Ä–∞—Å—Ç—Ä–µ–ª–∏–≤–∞—Ç—å',
 '–Ω–µ–≥—Ä',
 '–æ—Ç—Å–∞—Å—ã–≤–∞—Ç—å',
 '–æ—Ç–æ—Ä–≤–∞—Ç—å',
 '—Ö–µ—Ä',
 '—É—Ä–æ–¥',
 '–ø–∏–¥–∞—Ä–∞—Å',
 '–∫–∞—Å—Ç—Ä–∏—Ä–æ–≤–∞—Ç—å',
 '–ø–æ–ø',
 '–ø—Ä–∏—Å—Ç—Ä–µ–ª–∏—Ç—å',
 '–ø–∞–¥–ª–æ',
 '—Ç—Ä–∞—Ö–Ω—É—Ç—å',
 '–∑–∞—Å—É–Ω—É—Ç—å',
 '–º–æ—Ä–¥–∞',
 '–≤—ã–µ–±–∞—Ç—å',
 '–∂–æ–ø–∞',
 '—Å—Ä–∞–∫',
 '–∂–∏–≤—å—ë–º',
 '—Ö—É–π–Ω—è',
 '–º–æ—á–∏—Ç—å',
 '—Ä–∞—Å—Å—Ç—Ä–µ–ª—è—Ç—å',
 '–∞–¥',
 '—Ä–∞—Å—Å—Ç—Ä–µ–ª',
 '–µ–±—É—Ç',
 '—É–Ω–∏—á—Ç–æ–∂–∏—Ç—å',
 '–ø–æ–¥–≤–µ—Å–∏—Ç—å',
 '–≥–∞–Ω–¥–æ–Ω',
 '–º—Ä–∞–∑—å',
 '–ø–æ–¥—Ä–æ—á–∏—Ç—å',
 '—Ä–∞—Å—Å—Ç—Ä–µ–ª–∏–≤–∞—Ç—å',
 '–±–ª—è',
 '–æ—Ç–ª–∏–∑–∞—Ç—å',
 '–µ–±–∞–Ω—É—Ç—å',
 '–≤–µ—à–∞—Ç—å',
 '—Ö—É–π —Å–æ—Å–∞—Ç—å',
 '–≥–æ—Ä–µ—Ç—å',
 '–∫–æ–ª',
 '–ø–∏–¥–æ—Ä',
 '—Å–≤–æ–ª–æ—á—å',
 '–∏–¥–∏–æ—Ç',
 '–µ–±–∞—Ç—å',
 '–ø–µ—Ç—É—Ö',
 '–ø–∏—Å—è',
 '—Ä–æ—Ç–∏–∫',
 '–ø–æ–ø–∫–∞',
 '—Ö—Ä–µ–Ω',
 '—Å—É—á–∫–∞',
 '–Ω–∞–∫–∞–∑–∞—Ç—å',
 '—Ö—É–π',
 '–¥–æ–ª–±–æ–µ–±',
 '–¥—ã—Ä–∫–∞',
 '—Å–æ—Å–∞—Ç—å —Ö—É–π',
 '—Ç–æ—á–Ω–æ',
 '–Ω–∞—Ö—É–π',
 '—Ç—É–ø–æ–π',
 '–≤–æ–Ω—é—á–∏–π',
 '—Ç—Ä–∞—Ö–∞—Ç—å—Å—è',
 '—Ç—Ä–∞—Ö–∞—Ç—å',
 '—Ç–≤–∞—Ä—å',
 '—Ä–æ–∂–∞',
 '—á–ª–µ–Ω',
 '–≥–æ—Ä–µ—Ç—å –∞–¥',
 '–æ—Ç—Å–æ—Å–∞—Ç—å',
 '–¥–æ–ª–±–æ–µ—Å—Ç–∏',
 '—Å–æ—Å–∞—Ç—å',
 '–ø–∏–¥–∞—Ä',
 '—É–±–∏—Ç—å',
 '—Å–º–µ—Ä—Ç—å']

In [9]:
def hyperopt_tdidf_logit_label(label):
    
    X_u = XY_train.text
    X_l = XY_train.text_lemmatised
    y = XY_train[label]
    
    @ignore_warnings(category=ConvergenceWarning)
    def hyperopt_tdidf_logit(params):
        
        X_1 = TfidfVectorizer(min_df = int(params['min_df']), ngram_range = (1, 1)).fit_transform(X_u)
        X_2 = CountVectorizer(vocabulary = vocab, binary = True).fit_transform(X_l)
        
        X = hstack([X_1, X_2])
        
        score = cross_val_score(estimator = LogisticRegression(C = params['C']), X = X, y = y,
                                cv = StratifiedKFold(n_splits = 7), scoring = 'average_precision')

        score_mean = score.mean()
        
        return -score_mean

    space_tfidf_logit = {'C': hp.uniform('C', 0.1, 25),
                         'min_df': hp.quniform('min_df', 2, 12, 1)}
    
    best = fmin(fn = hyperopt_tdidf_logit, space = space_tfidf_logit, algo = tpe.suggest, max_evals = 75)
    
    tfidf = TfidfVectorizer(min_df = int(best['min_df']), ngram_range = (1, 1))
    count = CountVectorizer(vocabulary = vocab, binary = True)
    
    X_1 = tfidf.fit_transform(X_u)
    X_2 = count.fit_transform(X_l)
    
    X = hstack([X_1, X_2])
    
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    
    ls = LogisticRegression(C = best['C'])
    
    ls.fit(X, y)
    
    print(average_precision_score(y_score = ls.predict_proba(X)[:, 1], y_true = y))
    
    return {'tf': tfidf, 'coun': count, 'logit': ls}

In [11]:
algs_1 = dict()
for label in ['normal', 'insult', 'obscenity', 'threat']:
    print('----------------------------------------------------------------------------------------------------')
    print(label)
    print('----------------------------------------------------------------------------------------------------')
    print()
    algs_1[label] = hyperopt_tdidf_logit_label(label)

----------------------------------------------------------------------------------------------------
normal
----------------------------------------------------------------------------------------------------

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [11:29<00:00, 13.78s/trial, best loss: -0.9939286414522565]
0.9991902153444302
----------------------------------------------------------------------------------------------------
insult
----------------------------------------------------------------------------------------------------

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [11:59<00:00, 14.38s/trial, best loss: -0.9110842515877565]
0.9896949743792532
----------------------------------------------------------------------------------------------------
obscenity
----------------------------------------------------------------------------------------------------

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [13:00<00:00, 15.61s/trial, best loss: -0.8042385337524033]
0.9889213308636742
-

In [12]:
algs_1
# 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [32:12<00:00, 19.33s/trial, best loss: -0.9052236649167692] - insult

# 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [36:35<00:00, 21.95s/trial, best loss: -0.8018728019201194] - obscenity

# 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [34:22<00:00, 20.63s/trial, best loss: -0.8884257267154007] - threat

{'normal': {'tf': TfidfVectorizer(min_df=2),
  'coun': CountVectorizer(binary=True,
                  vocabulary=['–±—ã–¥–ª–æ', '–¥—Ä–æ—á–∏—Ç—å', '–∫–æ–Ω—á–∏—Ç—å', '–æ—Ç—Ä–µ–∑–∞—Ç—å', '–µ–±–∞–ª',
                              '–ø–∏–∑–¥–µ—Ü', '–≤–¥—É—Ç—å', '–ø–æ–ª–∏–∑–∞—Ç—å', '—Å–æ—Å–Ω—É—Ç—å',
                              '–ø–æ–≤–µ—Å–∏—Ç—å', '—É—Ç–æ–ø–∏—Ç—å', '–ø–∏–∑–¥–∞', '—Å–ø–µ—Ä–º–∞', '–∫–æ–∑—ë–ª',
                              '–ø–∏–¥–æ—Ä–∞—Å', '–Ω–∞–±–∏—Ç—å', '–≥–æ–≤–Ω–æ', '–∑–∞–∫–æ–ø–∞—Ç—å', '—Ö–æ—Ö–æ–ª',
                              '–≤–æ—Ä', '–¥—É—Ä–∞', '—É–±–ª—é–¥–æ–∫', '–∫–∞–∑–Ω–∏—Ç—å', '–≤—ã–µ–±–∞–ª',
                              '–ª–∏–∑–∞—Ç—å', '–∑–∞—Å–∞–¥–∏—Ç—å', '—á–º–æ', '—Ä–∞—Å—Ç—Ä–µ–ª–∞', '–∑–∞–¥–Ω–∏—Ü–∞',
                              '–ø—Ä–∏–ª—é–¥–Ω–æ', ...]),
  'logit': LogisticRegression(C=5.123683620913078)},
 'insult': {'tf': TfidfVectorizer(min_df=2),
  'coun': CountVectorizer(binary=True,
                  vocabulary=['–±—ã–¥–ª–æ', '–¥—Ä–æ—á–∏—Ç—å', '–∫–æ–Ω—

In [13]:
X_final_test = pd.read_csv('X_final_test.csv', header = 0)
X_final_test.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised
0,167315,–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—å!!!üòç,–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—åüòç,–∫–∞–∫–æ–π –ø—Ä–µ–ª–µ—Å—Ç—åüòç
1,224546,–∫–∞–∞–ª –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å—é?,–∫–∞–∞ –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤,–∫–∞–∞–ª–∞ –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å
2,241309,–≥–Ω–æ–π–Ω—ã–µ –ø–∏–¥–æ—Ä—ã –∞–ª–ª—ã –æ–Ω–∏,–≥–Ω–æ–π–Ω –ø–∏–¥–æ—Ä –∞–ª–ª –æ–Ω–∏,–≥–Ω–æ–π–Ω—ã–π –ø–∏–¥–æ—Ä –∞–ª–ª–∞ –æ–Ω–∏


In [14]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# trans = TfidfVectorizer(min_df = 2)
# text_train = trans.fit_transform(XY.text)
# text_test = trans.transform(X_final_test.text)


# clf_normal = LogisticRegression(C = 6.4).fit(text_train, XY['normal'])
# clf_insult = LogisticRegression(C = 11.2).fit(text_train, XY['insult'])
# clf_threat = LogisticRegression(C = 10.6).fit(text_train, XY['threat'])

# pred_test_normal = clf_normal.predict_proba(text_test)[:, 1].reshape(-1, 1)
# pred_test_insult = clf_insult.predict_proba(text_test)[:, 1].reshape(-1, 1)
# pred_test_threat = clf_threat.predict_proba(text_test)[:, 1].reshape(-1, 1)


trans_1_normal = TfidfVectorizer(min_df = 2, ngram_range = (1, 1))
text_1_train_normal = trans_1_normal.fit_transform(XY.text)
text_1_test_normal = trans_1_normal.transform(X_final_test.text)

trans_1_insult = TfidfVectorizer(min_df = 2, ngram_range = (1, 1))
text_1_train_insult = trans_1_insult.fit_transform(XY.text)
text_1_test_insult = trans_1_insult.transform(X_final_test.text)

trans_1_threat = TfidfVectorizer(min_df = 3, ngram_range = (1, 1))
text_1_train_threat = trans_1_threat.fit_transform(XY.text)
text_1_test_threat = trans_1_threat.transform(X_final_test.text)

trans_1_obscenity = TfidfVectorizer(min_df = 2, ngram_range = (1, 1))
text_1_train_obscenity = trans_1_obscenity.fit_transform(XY.text)
text_1_test_obscenity = trans_1_obscenity.transform(X_final_test.text)


trans_2 = CountVectorizer(vocabulary = vocab, binary = True)
text_2_train = trans_2.fit_transform(XY.text_lemmatised)
text_2_test = trans_2.transform(X_final_test.text_lemmatised)


text_train_normal = hstack([text_1_train_normal, text_2_train])
text_train_insult = hstack([text_1_train_insult, text_2_train])
text_train_threat = hstack([text_1_train_threat, text_2_train])
text_train_obscenity = hstack([text_1_train_obscenity, text_2_train])

text_test_normal = hstack([text_1_test_normal, text_2_test])
text_test_insult = hstack([text_1_test_insult, text_2_test])
text_test_threat = hstack([text_1_test_threat, text_2_test])
text_test_obscenity = hstack([text_1_test_obscenity, text_2_test])


clf_normal = LogisticRegression(C = 5.12).fit(text_train_normal, XY['normal'])
clf_insult = LogisticRegression(C = 18.19).fit(text_train_insult, XY['insult'])
clf_threat = LogisticRegression(C = 7.63).fit(text_train_threat, XY['threat']) # 19.775484
clf_obscenity = LogisticRegression(C = 13.82).fit(text_train_obscenity, XY['obscenity']) # 20.8466131

pred_test_normal = clf_normal.predict_proba(text_test_normal)[:, 1].reshape(-1, 1)
pred_test_insult = clf_insult.predict_proba(text_test_insult)[:, 1].reshape(-1, 1)
pred_test_threat = clf_threat.predict_proba(text_test_threat)[:, 1].reshape(-1, 1)
pred_test_obscenity = clf_obscenity.predict_proba(text_test_obscenity)[:, 1].reshape(-1, 1)

In [15]:
predictions = np.hstack([pred_test_normal, pred_test_insult, pred_test_threat, pred_test_obscenity])

In [16]:
labels = ['normal', 'insult', 'threat', 'obscenity']

In [17]:
final_predictions = pd.concat([pd.DataFrame(X_final_test.id.values, columns = ['id']),
                               pd.DataFrame(predictions, columns = labels)], axis = 1)

In [18]:
result = final_predictions.loc[:, ['id', 'normal', 'insult', 'obscenity', 'threat']]

In [19]:
result.head()

Unnamed: 0,id,normal,insult,obscenity,threat
0,167315,0.998261,0.000232,0.00015,8e-05
1,224546,0.946274,0.033348,0.002244,0.030082
2,241309,0.003021,0.999378,0.001123,0.019074
3,31170,0.000455,0.999731,0.014532,0.015765
4,173358,0.947059,0.061836,0.004384,0.001584


In [20]:
result.to_csv('result', index = False, header = True)