In [1]:
import pandas as pd
import numpy as np
import re, string
from sklearn.model_selection import GridSearchCV

In [9]:
import lightgbm as lgb

In [4]:
from nlp_pipeline import *

In [5]:
pretrained = "data\\crawl-300d-2M.vec"

In [6]:
word_vector = get_pretrained(pretrained)

In [7]:
train = pd.read_csv("data\\train.csv").fillna(' ')
test = pd.read_csv("data\\test.csv").fillna(' ')

In [8]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(row):
    return re_tok.sub(r' \1 ', row).lower().split()

In [11]:
class_labels = [column for column in train.columns[2:8]]
feature_funcs = [len, asterix_freq, uppercase_freq, line_change_freq, rep_freq, question_freq]
transforms = [tokenize]
gbm = lgb.LGBMClassifier(max_depth=3, metric="auc", num_leaves=20, boosting_type="gbdt", learning_rate=0.1, feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)
gbm.name = "LightGBM"
models = [gbm]

In [13]:
pipe = NlpPipeline(train, test, "comment_text", class_labels, feature_funcs, transforms, models, word_index=word_vector, pretrained=pretrained)
print(pipe)

Train: (159571, 8)
Test: (153164, 2)
Train features: (0,)
Test features: (0,)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: LightGBM: gbdt None 1.0 0.1 3 20 0.001 0.0 100 -1 20 None None 0.0 0.2 True 1.0 200000 1 auc 0.9 0.8 5 | 
Transforms:  tokenize
Feature functions:  function asterix_freq uppercase_freq line_change_freq rep_freq question_freq
Metric: roc_auc
CV scores: {'LightGBM': -1}


In [14]:
pipe.engineer_features()

Engineering features


In [15]:
pipe.apply_transforms()

Applying transforms


In [16]:
pipe.create_embeddings()

Creating embeddings


In [17]:
param_grid = [
  {'num_leaves': [10, 15, 20], 
   'max_depth': [2,3,5],
   'learning_rate': [0.08,0.1,0.12]}
 ]

In [18]:
clf = GridSearchCV(gbm, param_grid)
clf.fit(pipe.train_features, train["severe_toxic"])

GridSearchCV(cv=None, error_score='raise',
       estimator=LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=20, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'num_leaves': [10, 15, 20], 'max_depth': [2, 3, 5], 'learning_rate': [0.08, 0.1, 0.12]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
clf.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'num_leaves': 10}

In [25]:
pipe.cross_val()

Cross-validating
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
Cross-validating toxic
roc_auc: 0.95795622029
Cross-validating severe_toxic
roc_auc: 0.983969351099
Cross-validating obscene
roc_auc: 0.971628670957
Cross-validating threat
roc_auc: 0.974643685149
Cross-validating insult
roc_auc: 0.969079826691
Cross-validating identity_hate
roc_auc: 0.974837895119


In [26]:
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
toxic
AUC: 0.956549533623
AUC: 0.957561008854
AUC: 0.960646629136
AUC: 0.9567841933
AUC: 0.958794141654
severe_toxic
AUC: 0.983588656074
AUC: 0.984221508909
AUC: 0.983461670074
AUC: 0.985227479713
AUC: 0.984391548349
obscene
AUC: 0.970237136293
AUC: 0.971839531338
AUC: 0.975834772677
AUC: 0.9707452455
AUC: 0.970833380932
threat
AUC: 0.97918670217
AUC: 0.970884365001
AUC: 0.969385609206
AUC: 0.977309996698
AUC: 0.970492082159
insult
AUC: 0.967139970311
AUC: 0.968618

In [27]:
pipe.create_submission()

Creating submissions
