In [1]:
import time
import string
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.sparse import hstack

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample as ho_sample

In [2]:
XY = pd.read_csv('XY.csv', header = 0)
XY.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised,normal,threat,insult,obscenity
0,41127,–¥–≤–æ—Ä–Ω–∏–∫–∞ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å!,–¥–≤–æ—Ä–Ω–∏–∫ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂,–¥–≤–æ—Ä–Ω–∏–∫ –Ω–∞–¥–æ —Ç–æ–∂–µ —É–Ω–∏—á—Ç–æ–∂–∏—Ç—å,0,1,0,0
1,6812,"–º–æ—è —Å—Ç–∞—Ä—à–∞—è –Ω–µ–¥–µ–ª—é —à–∏–ø–µ–ª–∞, –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∞ –ø–æ–¥–∫–∏–¥...",–º–æ—è —Å—Ç–∞—Ä—à –Ω–µ–¥–µ–ª —à–∏–ø–µ–ª –Ω–µ –ø—Ä–∏–Ω–∏–º–∞ –ø–æ–¥–∫–∏–¥—ã—à –∫–æ—Ç–æ...,–º–æ–π —Å—Ç–∞—Ä—à–∏–π –Ω–µ–¥–µ–ª—è —à–∏–ø–µ—Ç—å –Ω–µ –ø—Ä–∏–Ω–∏–º–∞—Ç—å –ø–æ–¥–∫–∏–¥—ã...,1,0,0,0
2,6256,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤–∞–º–∏ —Å–æ–≥–ª–∞—Å–Ω–∞!,–ø–æ–ª–Ω–æ—Å—Ç —Å –≤–∞–º —Å–æ–≥–ª–∞—Å–Ω,–ø–æ–ª–Ω–æ—Å—Ç—å—é —Å –≤—ã —Å–æ–≥–ª–∞—Å–Ω—ã–π,1,0,0,0


In [3]:
XY_train, XY_test = train_test_split(XY, test_size = 0.3, shuffle = True, random_state = 42)
XY_train.reset_index(drop = True, inplace = True)
XY_test.reset_index(drop = True, inplace = True)
XY_train_abn = XY_train.loc[XY_train.normal == 0, :].reset_index(drop = True)
XY_train.shape, XY_train_abn.shape

((104142, 8), (18547, 8))

# Let's go

In [8]:
def hyperopt_tdidf_logit_label(label):
    @ignore_warnings(category=ConvergenceWarning)
    def hyperopt_tdidf_logit(params):
        space_trans = {key: value for key, value in params.items() if (key != 'C') and (key != 'stemming')}

        trans = TfidfVectorizer(**space_trans)
        clf = LogisticRegression(class_weight = 'balanced', C = params['C'])

        pipe = Pipeline([('trans', trans), ('clf', clf)])

        X = XY_train.text_stemmed if params['stemming'] else XY_train.text

        score = cross_val_score(estimator = pipe, X = X, y = XY_train[label],
                                cv = StratifiedKFold(n_splits = 5), scoring = 'average_precision')

        score_mean = score.mean()

        return -score_mean

    space_tfidf_logit = {
        'stemming': hp.choice('stemming', [True, False]),    
#         'stop_words': hp.choice('stop_words', [stopWords, None]),
        'ngram_range': hp.choice('ngram_range', [(1, 1), (1, 2)]),
        'min_df': hp.uniform('min_df', 0.00, 0.05),
        'max_df': hp.uniform('max_df', 0.95, 1.00),
        'C': hp.uniform('C', 0.01, 100)
    }
    
    best = fmin(fn = hyperopt_tdidf_logit, space = space_tfidf_logit, algo = tpe.suggest, max_evals = 30)
    
    best['stemming'] = [True, False][best['stemming']]
#     best['stop_words'] = [stopWords, None][best['stop_words']]
    best['ngram_range'] = [(1, 1), (1, 2)][best['ngram_range']]
    
    return best

In [9]:
best_normal = hyperopt_tdidf_logit_label('normal')
best_insult = hyperopt_tdidf_logit_label('insult')
best_threat = hyperopt_tdidf_logit_label('threat')
best_obscenity = hyperopt_tdidf_logit_label('obscenity')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [10:09<00:00, 20.32s/trial, best loss: -0.9899153336125238]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [08:12<00:00, 16.42s/trial, best loss: -0.8455040744324431]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [09:36<00:00, 19.21s/trial, best loss: -0.7539220637762643] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [12:21<00:00, 24.70s/trial, best loss: -0.7202495063035603] 


In [18]:
best_normal

{'C': 26.69055673760855,
 'max_df': 0.9628048616138088,
 'min_df': 0.00011950169636565193,
 'ngram_range': (1, 1),
 'stemming': True,
 'stop_words': None}

In [19]:
best_insult

{'C': 38.43711032544301,
 'max_df': 0.9761293205934343,
 'min_df': 0.0001306429356985548,
 'ngram_range': (1, 1),
 'stemming': False,
 'stop_words': ['–∏',
  '–≤',
  '–≤–æ',
  '–Ω–µ',
  '—á—Ç–æ',
  '–æ–Ω',
  '–Ω–∞',
  '—è',
  '—Å',
  '—Å–æ',
  '–∫–∞–∫',
  '–∞',
  '—Ç–æ',
  '–≤—Å–µ',
  '–æ–Ω–∞',
  '—Ç–∞–∫',
  '–µ–≥–æ',
  '–Ω–æ',
  '–¥–∞',
  '—Ç—ã',
  '–∫',
  '—É',
  '–∂–µ',
  '–≤—ã',
  '–∑–∞',
  '–±—ã',
  '–ø–æ',
  '—Ç–æ–ª—å–∫–æ',
  '–µ–µ',
  '–º–Ω–µ',
  '–±—ã–ª–æ',
  '–≤–æ—Ç',
  '–æ—Ç',
  '–º–µ–Ω—è',
  '–µ—â–µ',
  '–Ω–µ—Ç',
  '–æ',
  '–∏–∑',
  '–µ–º—É',
  '—Ç–µ–ø–µ—Ä—å',
  '–∫–æ–≥–¥–∞',
  '–¥–∞–∂–µ',
  '–Ω—É',
  '–≤–¥—Ä—É–≥',
  '–ª–∏',
  '–µ—Å–ª–∏',
  '—É–∂–µ',
  '–∏–ª–∏',
  '–Ω–∏',
  '–±—ã—Ç—å',
  '–±—ã–ª',
  '–Ω–µ–≥–æ',
  '–¥–æ',
  '–≤–∞—Å',
  '–Ω–∏–±—É–¥—å',
  '–æ–ø—è—Ç—å',
  '—É–∂',
  '–≤–∞–º',
  '–≤–µ–¥—å',
  '—Ç–∞–º',
  '–ø–æ—Ç–æ–º',
  '—Å–µ–±—è',
  '–Ω–∏—á–µ–≥–æ',
  '–µ–π',
  '–º–æ–∂–µ—Ç',
  '–æ–Ω–∏',
  '—Ç—É—Ç',
  '–≥–¥–µ',
  '–µ—Å—Ç—å',
  '–Ω–∞–¥–æ',
  '–Ω–µ–π',
  '–¥–ª—è',
  

In [20]:
best_threat

{'C': 12.153337935599467,
 'max_df': 0.9699701370487585,
 'min_df': 0.00020556610553198323,
 'ngram_range': (1, 2),
 'stemming': True,
 'stop_words': ['–∏',
  '–≤',
  '–≤–æ',
  '–Ω–µ',
  '—á—Ç–æ',
  '–æ–Ω',
  '–Ω–∞',
  '—è',
  '—Å',
  '—Å–æ',
  '–∫–∞–∫',
  '–∞',
  '—Ç–æ',
  '–≤—Å–µ',
  '–æ–Ω–∞',
  '—Ç–∞–∫',
  '–µ–≥–æ',
  '–Ω–æ',
  '–¥–∞',
  '—Ç—ã',
  '–∫',
  '—É',
  '–∂–µ',
  '–≤—ã',
  '–∑–∞',
  '–±—ã',
  '–ø–æ',
  '—Ç–æ–ª—å–∫–æ',
  '–µ–µ',
  '–º–Ω–µ',
  '–±—ã–ª–æ',
  '–≤–æ—Ç',
  '–æ—Ç',
  '–º–µ–Ω—è',
  '–µ—â–µ',
  '–Ω–µ—Ç',
  '–æ',
  '–∏–∑',
  '–µ–º—É',
  '—Ç–µ–ø–µ—Ä—å',
  '–∫–æ–≥–¥–∞',
  '–¥–∞–∂–µ',
  '–Ω—É',
  '–≤–¥—Ä—É–≥',
  '–ª–∏',
  '–µ—Å–ª–∏',
  '—É–∂–µ',
  '–∏–ª–∏',
  '–Ω–∏',
  '–±—ã—Ç—å',
  '–±—ã–ª',
  '–Ω–µ–≥–æ',
  '–¥–æ',
  '–≤–∞—Å',
  '–Ω–∏–±—É–¥—å',
  '–æ–ø—è—Ç—å',
  '—É–∂',
  '–≤–∞–º',
  '–≤–µ–¥—å',
  '—Ç–∞–º',
  '–ø–æ—Ç–æ–º',
  '—Å–µ–±—è',
  '–Ω–∏—á–µ–≥–æ',
  '–µ–π',
  '–º–æ–∂–µ—Ç',
  '–æ–Ω–∏',
  '—Ç—É—Ç',
  '–≥–¥–µ',
  '–µ—Å—Ç—å',
  '–Ω–∞–¥–æ',
  '–Ω–µ–π',
  '–¥–ª—è',
 

In [21]:
best_obscenity

{'C': 76.39223651587547,
 'max_df': 0.9690781859494049,
 'min_df': 1.2200877751358113e-05,
 'ngram_range': (1, 1),
 'stemming': False,
 'stop_words': None}

In [10]:
# best_normal = {'C': 73.52580652715072,
#                'max_df': 0.9696315615375243,
#                'min_df': 2.1701497893422727e-05,
#                'ngram_range': (1, 2),
#                'stemming': False,
#                'stop_words': None
#               }
# Average precision score train: 0.9996649882629408 from another notebook
# Average precision score test: 0.9924950275198423

In [11]:
@ignore_warnings(category=ConvergenceWarning)
def cook_algo(XY, label, params):
    space_trans = {key: value for key, value in params.items() if (key != 'C') and (key != 'stemming')}
    
    trans = TfidfVectorizer(**space_trans)
    clf = LogisticRegression(class_weight = 'balanced', C = params['C'])
    
    X = XY.text_stemmed if params['stemming'] else XY.text
    y = XY[label]
        
    pipe = Pipeline([('trans', TfidfVectorizer(**space_trans)), 
                     ('clf', LogisticRegression(C = params['C']))])
    pipe.fit(X, y)
    
    print(f'Average precision score: {average_precision_score(y_score = pipe.predict_proba(X)[:, 1], y_true = y)}')
    
    return pipe

In [12]:
def my_predict(XY, label, params, pipeline):
    X = XY.text_stemmed if params['stemming'] else XY.text
    y = XY[label]
    
    y_pred = pipeline.predict_proba(X)[:, 1]
    print(f'Average precision score: {average_precision_score(y_score = y_pred, y_true = y)}')
    
    return y_pred.reshape(-1, 1)

In [13]:
pipe_normal = cook_algo(XY_train, 'normal', best_normal)
pipe_insult = cook_algo(XY_train, 'insult', best_insult)
pipe_threat = cook_algo(XY_train, 'threat', best_threat)
pipe_obscenity = cook_algo(XY_train, 'obscenity', best_obscenity)

Average precision score: 0.9962720089665845
Average precision score: 0.9286916279788466
Average precision score: 0.8754930896826478
Average precision score: 0.9933882121816242


In [14]:
preds_normal_train = my_predict(XY_train, 'normal', best_normal, pipe_normal)
preds_normal_test = my_predict(XY_test, 'normal', best_normal, pipe_normal)

Average precision score: 0.9962720089665845
Average precision score: 0.9909018049370828


In [15]:
preds_insult_train = my_predict(XY_train, 'insult', best_insult, pipe_insult)
preds_insult_test = my_predict(XY_test, 'insult', best_insult, pipe_insult)

Average precision score: 0.9286916279788466
Average precision score: 0.8672494906009367


In [16]:
preds_threat_train = my_predict(XY_train, 'threat', best_threat, pipe_threat)
preds_threat_test = my_predict(XY_test, 'threat', best_threat, pipe_threat)

Average precision score: 0.8754930896826478
Average precision score: 0.7869322209839801


In [17]:
preds_obscenity_train = my_predict(XY_train, 'obscenity', best_obscenity, pipe_obscenity)
preds_obscenity_test = my_predict(XY_test, 'obscenity', best_obscenity, pipe_obscenity)

Average precision score: 0.9933882121816242
Average precision score: 0.7647738738449493


In [4]:
X_final_test = pd.read_csv('X_final_test.csv', header = 0)
X_final_test.head(3)

Unnamed: 0,id,text,text_stemmed,text_lemmatised
0,167315,–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—å!!!üòç,–∫–∞–∫–∞—è –ø—Ä–µ–ª–µ—Å—Ç—åüòç,–∫–∞–∫–æ–π –ø—Ä–µ–ª–µ—Å—Ç—åüòç
1,224546,–∫–∞–∞–ª –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å—é?,–∫–∞–∞ –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤,–∫–∞–∞–ª–∞ –∫–∞–∫–æ–π –Ω–µ —Å –∫—Ä–æ–≤—å
2,241309,–≥–Ω–æ–π–Ω—ã–µ –ø–∏–¥–æ—Ä—ã –∞–ª–ª—ã –æ–Ω–∏,–≥–Ω–æ–π–Ω –ø–∏–¥–æ—Ä –∞–ª–ª –æ–Ω–∏,–≥–Ω–æ–π–Ω—ã–π –ø–∏–¥–æ—Ä –∞–ª–ª–∞ –æ–Ω–∏


In [None]:
# predictions = np.hstack([predict_with_aka_bagging(algs_final[label], X_final_test, label).reshape(-1, 1)
#                          for label in list(algs_final.keys())])

In [None]:
# final_predictions = pd.concat([pd.DataFrame(X_final_test.id.values, columns = ['id']),
#                                pd.DataFrame(predictions, columns = list(algs_final.keys()))], axis = 1)

In [None]:
# result = final_predictions.loc[:, ['id', 'normal', 'insult', 'obscenity', 'threat']]

In [None]:
# result.to_csv('result', index = False, header = True)