In [1]:
# Paths to the zipped CSV files holding the train and test splits.
TRAIN_DATA_PATH = "downloads/train.csv.zip"
TEST_DATA_PATH = "downloads/test.csv.zip"

# Column that holds the comment text.
comment_col = 'comment_text'

# Label columns; later cells select them from the training frame.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from lightgbm import LGBMClassifier

from sklearn.base import clone
from sklearn.model_selection import learning_curve

In [3]:
# Load the train and test CSV files (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

In [4]:
# Replace missing comment text with the placeholder "unknown" in both frames.
#
# The filled column is assigned back rather than calling
# `frame[COMMENT].fillna(..., inplace=True)`: the inplace form operates on the
# Series returned by `frame[COMMENT]`, which pandas reports as chained
# assignment and which does not update the DataFrame under Copy-on-Write.
# Assigning back is also safe to re-run.
COMMENT = 'comment_text'
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [5]:
def plot_learning_curve(pipeline, X, y, **kwargs):
    """Compute a learning curve for `pipeline` and plot train vs. validation scores.

    Parameters
    ----------
    pipeline : estimator
        Passed to `learning_curve` as `estimator`.
    X, y
        Samples and targets, forwarded to `learning_curve` unchanged.
    **kwargs
        Extra keyword arguments forwarded to `learning_curve`
        (e.g. `train_sizes`, `cv`, `scoring`, `n_jobs`, `verbose`).

    Returns
    -------
    None
        The mean validation score per training-set size is printed, followed
        by the mean and 2x standard deviation of those values, and both
        curves are drawn on a new 8x6 matplotlib figure.
    """
    # The first returned value holds the training-set sizes that were used;
    # it must be bound to the same name that is plotted on the x-axis below.
    train_sizes, train_score_folds, val_score_folds = learning_curve(
        estimator=pipeline, X=X, y=y, **kwargs
    )

    # Label the output with the metric that was actually requested instead of
    # always claiming "Accuracy"; fall back to a neutral name when `scoring`
    # is absent or not a string.
    scoring = kwargs.get("scoring")
    metric_label = scoring if isinstance(scoring, str) else "score"

    # Average across axis 1 (the per-fold scores) to get one value per
    # training-set size.
    train_scores = np.mean(train_score_folds, axis=1)
    val_scores = np.mean(val_score_folds, axis=1)

    print([' %.4f' % score for score in val_scores])
    print("\n%s: %0.4f (+/- %0.4f)"
          % (metric_label, np.mean(val_scores), np.std(val_scores) * 2))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
    ax.plot(train_sizes, train_scores, lw=3, color="steelblue", label="training")
    ax.plot(train_sizes, val_scores, lw=3, color="green", label="validation")

    ax.grid(alpha=0.25)
    ax.set_xlabel("training set size", fontsize=16)
    ax.set_ylabel(metric_label, fontsize=16)
    ax.legend(loc="upper right", fontsize=16)


For a better understanding of `LGBMClassifier`, refer to the [LightGBM scikit-learn API documentation](http://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api).

In [14]:
# Base pipeline shared by the experiments: a feature union (currently a single
# word-level count + TF-IDF branch) feeding an LGBMClassifier with balanced
# class weights.
lightgbm_pipline = Pipeline(steps=[
    ('vect', FeatureUnion(transformer_list=[
        # Word-level token counts re-weighted by TF-IDF.
        ('word_vect', Pipeline(steps=[
            ('vect', CountVectorizer(analyzer='word')),
            ('tfidf', TfidfTransformer()),
        ])),
        # Disabled: a 'char_vect' branch with the same two steps but
        # CountVectorizer(analyzer='char').
        # Other relevant feature extractors can be added to this union.
    ])),
    ('clf', LGBMClassifier(class_weight='balanced')),
])
        

`clone` returns a new, unfitted copy of the base pipeline, so each experiment can set its own parameters without affecting the others.

In [None]:
## No char_vect
# Overrides for this run, addressed as `<step>__<param>` through the nested
# pipeline steps.
params = dict(
    clf__boosting_type='goss',
    vect__word_vect__vect__min_df=10,
    vect__word_vect__vect__strip_accents='unicode',
    vect__word_vect__vect__max_features=100,
    clf__silent=False,
)

# Work on a clone so the shared base pipeline keeps its original settings.
p1 = clone(lightgbm_pipline).set_params(**params)

# Learning curve for the 'toxic' label on 10%-50% of the training data.
plot_learning_curve(
    p1,
    train[comment_col].values,
    train['toxic'],
    train_sizes=np.linspace(0.1, 0.5, 5),
    verbose=1,
    cv=3,
    scoring='neg_log_loss',
    n_jobs=3,
)


[learning_curve] Training set sizes: [10638 21276 31914 42552 53190]


In [None]:
# Second copy of the base pipeline; no parameters are set on it in the cells shown here.
p2 = clone(lightgbm_pipline)

In [None]:
# Display the shape of the comment column selected from the training frame.
train[comment_col].shape

In [None]:
# Display the label columns of the training frame as a raw array.
train[label_cols].values