In [1]:
TRAIN_DATA_PATH = "../../train.csv"
TEST_DATA_PATH = "../../test.csv"
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comment_col = 'comment_text'

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from lightgbm import LGBMClassifier

from sklearn.base import clone
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [3]:
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)

In [4]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [5]:
light_gbm_pipeline = Pipeline([
    ('vect', FeatureUnion([
        ('word_vect', TfidfVectorizer()),
        ('char_vect', TfidfVectorizer())
    ])),
    ('selection', SelectFromModel(LogisticRegression(solver = 'sag'))),
    ('clf', LGBMClassifier())
])

In [6]:
params = {
    'vect__word_vect__sublinear_tf': True,
    'vect__word_vect__strip_accents': 'unicode',
    'vect__word_vect__analyzer': 'word',
    'vect__word_vect__token_pattern': r'\w{1,}',
    'vect__word_vect__max_features': 50000,
    'vect__word_vect__ngram_range': (1, 2),
    'vect__char_vect__sublinear_tf': True,
    'vect__char_vect__strip_accents': 'unicode',
    'vect__char_vect__analyzer': 'char',
    'vect__char_vect__max_features': 50000,
    'vect__char_vect__ngram_range': (2, 6),
    'selection__threshold': 0.2,
    'clf__learning_rate': 0.2,
    'clf__application': 'binary',
    'clf__num_leaves': 20,
    'clf__verbosity': -1,
    'clf__metric': 'auc',
    'clf__bagging_fraction': 0.8,
    'clf__feature_fraction': 0.6,
    'clf__lamba_l1': 1,
    'clf__lambda_l2': 1,
    'clf__num_boost_rounds': 10,
    'clf__verbose_eval': 10
}

light_gbm_pipeline.set_params(**params)

Pipeline(memory=None,
     steps=[('vect', FeatureUnion(n_jobs=1,
       transformer_list=[('word_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_... subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
        verbose_eval=10, verbosity=-1))])

In [8]:
import pickle
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
labels = ['insult', 'identity_hate']
cv = {}
for label in labels:
    print ('Running for ' + label)
    cv[label] = cross_validate(light_gbm_pipeline, train[comment_col], train[label], 
                               cv = 5, verbose = 10, scoring = ('accuracy', 'roc_auc', 'neg_log_loss'))
    save_obj(cv[label],'run2_'+str(label))
    print (cv[label])

Running for insult
[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9737427541908193, roc_auc=0.9817415572892643, neg_log_loss=-0.06787993064736134, total= 9.6min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 23.1min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9730534231552561, roc_auc=0.9815289219970007, neg_log_loss=-0.07000396401912409, total=11.2min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 46.2min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.974274613022498, roc_auc=0.9814442409241184, neg_log_loss=-0.0670858661017691, total=11.8min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 71.1min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.974274613022498, roc_auc=0.9809747734201407, neg_log_loss=-0.06870037649414037, total= 9.6min


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 92.6min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9748378403785292, roc_auc=0.9795579957871474, neg_log_loss=-0.068900930153661, total= 9.3min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 114.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 114.0min finished


{'fit_time': array([389.28029895, 468.8664248 , 514.00592899, 409.47734499,
       384.404737  ]), 'score_time': array([189.0172379 , 200.69914317, 192.01856899, 168.02803898,
       176.30765486]), 'test_accuracy': array([0.97374275, 0.97305342, 0.97427461, 0.97427461, 0.97483784]), 'train_accuracy': array([0.98351037, 0.98352604, 0.98338516, 0.9834635 , 0.98330696]), 'test_roc_auc': array([0.98174156, 0.98152892, 0.98144424, 0.98097477, 0.979558  ]), 'train_roc_auc': array([0.99470333, 0.99421594, 0.99449614, 0.99450524, 0.99466388]), 'test_neg_log_loss': array([-0.06787993, -0.07000396, -0.06708587, -0.06870038, -0.06890093]), 'train_neg_log_loss': array([-0.04314391, -0.04367502, -0.04352074, -0.04355107, -0.04325041])}
Running for identity_hate
[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9929813567288109, roc_auc=0.979990053046668, neg_log_loss=-0.023017114469877538, total= 9.0min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.6min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9927931315410164, roc_auc=0.9753051933580331, neg_log_loss=-0.023972960603169943, total= 8.6min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 39.6min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9927617973303252, roc_auc=0.9725811134887403, neg_log_loss=-0.024498972271868365, total= 8.0min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 58.1min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9925424578554867, roc_auc=0.9779115980169816, neg_log_loss=-0.023219180018141057, total= 8.1min


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 77.1min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9930438052265463, roc_auc=0.9817573611412831, neg_log_loss=-0.02267851865148976, total= 7.9min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 95.5min remaining:    0.0s


{'fit_time': array([366.55602098, 343.80965781, 321.98435283, 326.49450707,
       314.92090702]), 'score_time': array([174.26789975, 174.13587713, 155.48625517, 160.60081005,
       156.80085921]), 'test_accuracy': array([0.99298136, 0.99279313, 0.9927618 , 0.99254246, 0.99304381]), 'train_accuracy': array([0.99840979, 0.99828447, 0.99841763, 0.9982923 , 0.9983628 ]), 'test_roc_auc': array([0.97999005, 0.97530519, 0.97258111, 0.9779116 , 0.98175736]), 'train_roc_auc': array([0.99972727, 0.99972156, 0.99970015, 0.99975848, 0.99971652]), 'test_neg_log_loss': array([-0.02301711, -0.02397296, -0.02449897, -0.02321918, -0.02267852]), 'train_neg_log_loss': array([-0.00640658, -0.00639257, -0.00632672, -0.00631057, -0.0063767 ])}


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 95.5min finished


In [11]:
print("Stats from insult")
for i in cv['insult']:
    print("Average "+str(i)+" = " + str(np.mean(cv['insult'][i])))
    
print("Stats from identity hate")
for i in cv['identity_hate']:
    print("Average "+str(i)+" = " + str(np.mean(cv['identity_hate'][i])))

Stats from insult
Average fit_time = 433.20694694519045
Average score_time = 185.214128780365
Average test_accuracy = 0.9740366487539202
Average train_accuracy = 0.9834384071840159
Average test_roc_auc = 0.9810494978835342
Average train_roc_auc = 0.9945169063701613
Average test_neg_log_loss = -0.06851421348321117
Average train_neg_log_loss = -0.04342823116954435
Stats from identity hate
Average fit_time = 334.7530891418457
Average score_time = 164.2583402633667
Average test_accuracy = 0.9928245097364371
Average train_accuracy = 0.9983533976355214
Average test_roc_auc = 0.9775090638103412
Average train_roc_auc = 0.9997247953144613
Average test_neg_log_loss = -0.02347734920290933
Average train_neg_log_loss = -0.006362625019995178


