In [4]:
TRAIN_DATA_PATH = "../../train.csv"
TEST_DATA_PATH = "../../test.csv"
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comment_col = 'comment_text'

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from lightgbm import LGBMClassifier

from sklearn.base import clone
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [6]:
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)

In [7]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [8]:
light_gbm_pipeline = Pipeline([
    ('vect', FeatureUnion([
        ('word_vect', TfidfVectorizer()),
        ('char_vect', TfidfVectorizer())
    ])),
    ('selection', SelectFromModel(LogisticRegression(solver = 'sag'))),
    ('clf', LGBMClassifier())
])

In [9]:
params = {
    'vect__word_vect__sublinear_tf': True,
    'vect__word_vect__strip_accents': 'unicode',
    'vect__word_vect__analyzer': 'word',
    'vect__word_vect__token_pattern': r'\w{1,}',
    'vect__word_vect__max_features': 50000,
    'vect__word_vect__ngram_range': (1, 2),
    'vect__char_vect__sublinear_tf': True,
    'vect__char_vect__strip_accents': 'unicode',
    'vect__char_vect__analyzer': 'char',
    'vect__char_vect__max_features': 50000,
    'vect__char_vect__ngram_range': (2, 6),
    'selection__threshold': 0.2,
    'clf__learning_rate': 0.2,
    'clf__application': 'binary',
    'clf__num_leaves': 20,
    'clf__verbosity': -1,
    'clf__metric': 'auc',
    'clf__bagging_fraction': 0.8,
    'clf__feature_fraction': 0.6,
    'clf__lamba_l1': 1,
    'clf__lambda_l2': 1,
    'clf__num_boost_rounds': 10,
    'clf__verbose_eval': 10
}

light_gbm_pipeline.set_params(**params)

Pipeline(memory=None,
     steps=[('vect', FeatureUnion(n_jobs=1,
       transformer_list=[('word_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_... subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
        verbose_eval=10, verbosity=-1))])

In [13]:
labels = ['toxic', 'severe_toxic']
cv = {}
for label in labels:
    print ('Running for ' + label)
    cv[label] = cross_validate(light_gbm_pipeline, train[comment_col], train[label], 
                               cv = 5, verbose = 10, scoring = ('accuracy', 'roc_auc', 'neg_log_loss'))
    print (cv[label])

Running for toxic
[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9614914616951277, roc_auc=0.9741697464421412, neg_log_loss=-0.10542004547790662, total= 9.7min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.7min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9623374588751371, roc_auc=0.9765451718730415, neg_log_loss=-0.10166939194149983, total=10.0min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 41.9min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.963370307701949, roc_auc=0.9747658097501293, neg_log_loss=-0.10225583892648642, total=10.1min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 63.8min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9625869524346682, roc_auc=0.9765828896486127, neg_log_loss=-0.10327577485038417, total=10.3min


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 87.6min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.961771065083195, roc_auc=0.9735143489940172, neg_log_loss=-0.10632890748537532, total=18.2min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 119.4min remaining:    0.0s


{'fit_time': array([418.50400519, 441.79352236, 427.46366787, 454.63654971,
       883.03019595]), 'score_time': array([163.74072862, 158.98369193, 179.50701404, 162.4488461 ,
       207.98612094]), 'test_accuracy': array([0.96149146, 0.96233746, 0.96337031, 0.96258695, 0.96177107]), 'train_accuracy': array([0.9712822 , 0.97124303, 0.97161143, 0.97119625, 0.97124348]), 'test_roc_auc': array([0.97416975, 0.97654517, 0.97476581, 0.97658289, 0.97351435]), 'train_roc_auc': array([0.98958266, 0.98922174, 0.98948577, 0.98940798, 0.98976177]), 'test_neg_log_loss': array([-0.10542005, -0.10166939, -0.10225584, -0.10327577, -0.10632891]), 'train_neg_log_loss': array([-0.07761468, -0.07851727, -0.07786666, -0.07812039, -0.07705607])}
Running for severe_toxic
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 119.4min finished
  if diff:
  if diff:


[CV]  , accuracy=0.9897853673821088, roc_auc=0.9823913268653108, neg_log_loss=-0.025611433749596308, total= 9.8min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 22.1min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9902237262643354, roc_auc=0.9858541761647338, neg_log_loss=-0.025286920114171236, total=10.0min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 46.6min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9893463683649809, roc_auc=0.9843537998800453, neg_log_loss=-0.028118389236971242, total= 8.1min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 65.6min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9899417183681143, roc_auc=0.9851398553697588, neg_log_loss=-0.02645488570922424, total= 8.2min


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 84.5min remaining:    0.0s


[CV]  ................................................................


  if diff:
  if diff:


[CV]  , accuracy=0.9900670552108792, roc_auc=0.9862547692906054, neg_log_loss=-0.025476337631086567, total= 8.4min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 103.8min remaining:    0.0s


{'fit_time': array([400.43777013, 398.24313116, 318.95824385, 334.04399323,
       343.94788313]), 'score_time': array([189.90395379, 203.62284184, 164.06704807, 160.68782473,
       161.57937884]), 'test_accuracy': array([0.98978537, 0.99022373, 0.98934637, 0.98994172, 0.99006706]), 'train_accuracy': array([0.99744626, 0.99760295, 0.99750112, 0.99761079, 0.99771262]), 'test_roc_auc': array([0.98239133, 0.98585418, 0.9843538 , 0.98513986, 0.98625477]), 'train_roc_auc': array([0.99939876, 0.99951552, 0.99953252, 0.99954034, 0.99947988]), 'test_neg_log_loss': array([-0.02561143, -0.02528692, -0.02811839, -0.02645489, -0.02547634]), 'train_neg_log_loss': array([-0.00883899, -0.00872426, -0.00855632, -0.00846592, -0.00862764])}


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 103.9min finished


In [15]:
import pickle
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
save_obj(cv,'toxic and severe toxic')

In [20]:
print("Stats from toxic")
for i in cv['toxic']:
    print("Average "+str(i)+" = " + str(np.mean(cv['toxic'][i])))
    
print("Stats from severe_toxic")
for i in cv['severe_toxic']:
    print("Average "+str(i)+" = " + str(np.mean(cv['severe_toxic'][i])))

Stats from toxic
Average fit_time = 525.0855882167816
Average score_time = 174.5332803249359
Average test_accuracy = 0.9623114491580154
Average train_accuracy = 0.9713152765328111
Average test_roc_auc = 0.9751155933415884
Average train_roc_auc = 0.9894919847527806
Average test_neg_log_loss = -0.10378999173633048
Average train_neg_log_loss = -0.07783501451848558
Stats from severe_toxic
Average fit_time = 359.12620429992677
Average score_time = 175.97220945358276
Average test_accuracy = 0.9898728471180839
Average train_accuracy = 0.9975747470898766
Average test_roc_auc = 0.984798785514091
Average train_roc_auc = 0.9994934053006382
Average test_neg_log_loss = -0.02618959328820992
Average train_neg_log_loss = -0.008642625322111559


