In [1]:
TRAIN_DATA_PATH = "../../../data/wikipedia/train.csv"
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comment_col = 'comment_text'

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from lightgbm import LGBMClassifier

from sklearn.base import clone
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [3]:
train = pd.read_csv(TRAIN_DATA_PATH)

In [5]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)

In [6]:
light_gbm_pipeline = Pipeline([
    ('vect', FeatureUnion([
        ('word_vect', TfidfVectorizer()),
        ('char_vect', TfidfVectorizer())
    ])),
    ('selection', SelectFromModel(LogisticRegression(solver = 'sag'))),
    ('clf', LGBMClassifier())
])

In [7]:
params = {
    'vect__word_vect__sublinear_tf': True,
    'vect__word_vect__strip_accents': 'unicode',
    'vect__word_vect__analyzer': 'word',
    'vect__word_vect__token_pattern': r'\w{1,}',
    'vect__word_vect__max_features': 50000,
    'vect__word_vect__ngram_range': (1, 2),
    'vect__char_vect__sublinear_tf': True,
    'vect__char_vect__strip_accents': 'unicode',
    'vect__char_vect__analyzer': 'char',
    'vect__char_vect__max_features': 50000,
    'vect__char_vect__ngram_range': (2, 6),
    'selection__threshold': 0.2,
    'clf__learning_rate': 0.2,
    'clf__application': 'binary',
    'clf__num_leaves': 20,
    'clf__verbosity': -1,
    'clf__metric': 'auc',
    'clf__bagging_fraction': 0.8,
    'clf__feature_fraction': 0.6,
    'clf__lamba_l1': 1,
    'clf__lambda_l2': 1,
    'clf__num_boost_rounds': 10,
    'clf__verbose_eval': 10
}

light_gbm_pipeline.set_params(**params)

Pipeline(memory=None,
     steps=[('vect', FeatureUnion(n_jobs=1,
       transformer_list=[('word_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_... subsample=1.0, subsample_for_bin=200000, subsample_freq=1,
        verbose_eval=10, verbosity=-1))])

In [8]:
labels = label_cols
cv = {}
for label in labels:
    print ('Running for ' + label)
    cv[label] = cross_validate(light_gbm_pipeline, train[comment_col], train[label], 
                               cv = 10, verbose = 10, scoring = ('accuracy', 'roc_auc', 'neg_log_loss'))
    print (cv[label])

Running for insult
[CV]  ................................................................


KeyboardInterrupt: 

In [11]:
print("Stats from insult")
for i in cv['insult']:
    print("Average "+str(i)+" = " + str(np.mean(cv['insult'][i])))
    
print("Stats from identity hate")
for i in cv['identity_hate']:
    print("Average "+str(i)+" = " + str(np.mean(cv['identity_hate'][i])))

Stats from insult
Average fit_time = 433.20694694519045
Average score_time = 185.214128780365
Average test_accuracy = 0.9740366487539202
Average train_accuracy = 0.9834384071840159
Average test_roc_auc = 0.9810494978835342
Average train_roc_auc = 0.9945169063701613
Average test_neg_log_loss = -0.06851421348321117
Average train_neg_log_loss = -0.04342823116954435
Stats from identity hate
Average fit_time = 334.7530891418457
Average score_time = 164.2583402633667
Average test_accuracy = 0.9928245097364371
Average train_accuracy = 0.9983533976355214
Average test_roc_auc = 0.9775090638103412
Average train_roc_auc = 0.9997247953144613
Average test_neg_log_loss = -0.02347734920290933
Average train_neg_log_loss = -0.006362625019995178


