In [None]:
# Locations of the zipped CSV inputs, relative to the notebook's working directory.
TRAIN_DATA_PATH = "downloads/train.csv.zip"
TEST_DATA_PATH = "downloads/test.csv.zip"

# Binary target columns; a model is cross-validated against one of these at a time.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Column fed to the text vectorizers.
comment_col = 'comment_text'

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from lightgbm import LGBMClassifier

from sklearn.base import clone
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [None]:
# Load both splits with pandas defaults, train first and then test.
# The paths end in ".zip"; read_csv's default compression='infer' unpacks them.
train, test = map(pd.read_csv, (TRAIN_DATA_PATH, TEST_DATA_PATH))

In [None]:
COMMENT = 'comment_text'

# Replace missing comments with a placeholder token so the vectorizers never see NaN.
# Assign the result back to the column instead of `df[col].fillna(..., inplace=True)`:
# the in-place form operates on a temporary Series, which warns in pandas 2.2+ and
# leaves the frame untouched once Copy-on-Write is enabled.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [None]:
# Text -> TF-IDF features -> feature selection -> gradient-boosted classifier.
# The step names ('vect', 'word_vect', 'char_vect', 'selection', 'clf') are the
# prefixes used by the `step__param` keys passed to set_params().
light_gbm_pipeline = Pipeline(
    steps=[
        (
            'vect',
            FeatureUnion(
                transformer_list=[
                    # Both vectorizers start with defaults and are configured via set_params().
                    ('word_vect', TfidfVectorizer()),
                    ('char_vect', TfidfVectorizer()),
                ]
            ),
        ),
        # Keep only the features a logistic regression weights above the selection threshold.
        ('selection', SelectFromModel(estimator=LogisticRegression(solver='sag'))),
        ('clf', LGBMClassifier()),
    ]
)

In [None]:
# Hyper-parameters for `light_gbm_pipeline`. Each key is `<step>__<param>`;
# the FeatureUnion members are addressed as `vect__<member>__<param>`.
params = {
    # Word-level TF-IDF: unigrams and bigrams, any run of word characters is a token.
    'vect__word_vect__sublinear_tf': True,
    'vect__word_vect__strip_accents': 'unicode',
    'vect__word_vect__analyzer': 'word',
    'vect__word_vect__token_pattern': r'\w{1,}',
    'vect__word_vect__max_features': 50000,
    'vect__word_vect__ngram_range': (1, 2),

    # Character-level TF-IDF over 2- to 6-character n-grams.
    'vect__char_vect__sublinear_tf': True,
    'vect__char_vect__strip_accents': 'unicode',
    'vect__char_vect__analyzer': 'char',
    'vect__char_vect__max_features': 50000,
    'vect__char_vect__ngram_range': (2, 6),

    # SelectFromModel importance cut-off.
    'selection__threshold': 0.2,

    # LightGBM classifier.
    'clf__learning_rate': 0.2,
    'clf__application': 'binary',
    'clf__num_leaves': 20,
    'clf__verbosity': -1,
    'clf__metric': 'auc',
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0,
    # and no bagging_freq is set here -- confirm whether bagging was intended.
    'clf__bagging_fraction': 0.8,
    'clf__feature_fraction': 0.6,
    # Was misspelled 'lamba_l1'; LightGBM ignores unknown keys, so no L1 penalty was applied.
    'clf__lambda_l1': 1,
    'clf__lambda_l2': 1,
    # Was 'num_boost_rounds', which is not a recognised alias of num_iterations, so the
    # default number of trees was trained instead of 10. `n_estimators` is the
    # scikit-learn wrapper's name for the number of boosting rounds.
    'clf__n_estimators': 10,
    # 'verbose_eval' was removed: it is a keyword of lgb.train(), not an estimator
    # parameter, and was ignored.
}

# Apply the hyper-parameters; each `step__param` key is routed to the pipeline step it names.
light_gbm_pipeline.set_params(**params)

In [None]:
label = 'severe_toxic'
print('Running for ' + label)

# `cv` maps label -> cross_validate() result dict. No earlier cell creates it, so
# build it on first use; this keeps the cell runnable on a fresh kernel while
# preserving results already stored for other labels.
if 'cv' not in globals():
    cv = {}

# 5-fold cross-validation of the full pipeline on the raw comment text,
# scored on accuracy, ROC AUC and negative log-loss.
cv[label] = cross_validate(
    light_gbm_pipeline, train[comment_col], train[label],
    cv=5, n_jobs=3, verbose=10,
    scoring=('accuracy', 'roc_auc', 'neg_log_loss'),
)
print(cv[label])