In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from nlp_pipeline import *

In [2]:
# Load the raw train/test tables. Forward slashes are used as the path
# separator because they are accepted on Windows as well as POSIX systems,
# whereas a literal backslash is only a separator on Windows.
# fillna(' ') replaces every missing value (in any column) with a single space.
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

In [3]:
# Target column names. The same list is used to select the prediction columns
# from the per-model CSV files and is passed to NlpPipeline.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [17]:
# Numeric ids of the first-level runs to stack; each id selects one
# "oof_train<id>.csv" / "submission<id>.csv" pair in the loading cell.
subnums = [
    21, 22, 29, 33, 37, 44,
    45, 51, 52, 57, 59, 62, 66,
]
# Alternative subsets (disabled):
# subnums = [21,22,29,33,37,44]
# subnums = [45,51,52,57,59,62,66]

In [18]:
# For every selected run id, load its out-of-fold train predictions and its
# test-set predictions, keeping only the label columns, as 2-D arrays
# (one column per label). Paths use forward slashes so the cell also runs on
# non-Windows systems.
# NOTE(review): rows are assumed to be in the same order as train.csv /
# test.csv — nothing here checks that; confirm against how the files were written.
oofs = [
    np.array(pd.read_csv("submissions/oof_train" + str(num) + ".csv")[labels])
    for num in subnums
]
subs = [
    np.array(pd.read_csv("submissions/submission" + str(num) + ".csv")[labels])
    for num in subnums
]

In [19]:
# Place the per-run prediction matrices side by side: the result has one
# column per (run, label) pair, with runs in the order given by subnums.
# Each element of oofs/subs is a 2-D array, so joining along axis 1 is the
# column-wise stack.
train_meta = np.concatenate(oofs, axis=1)
test_meta = np.concatenate(subs, axis=1)

In [20]:
# Path of the pretrained vectors file handed to NlpPipeline.
# NOTE(review): Windows-style separator; confirm how NlpPipeline opens this
# path before running on another OS.
pretrained = "data\\crawl-300d-2M.vec"

# Callables passed to the pipeline as feature functions: the built-in len plus
# helpers that are presumably provided by the star import from nlp_pipeline.
feature_funcs = [
    len,
    asterix_freq,
    uppercase_freq,
    line_change_freq,
    rep_freq,
    question_freq,
    has_ip,
    has_talk_tag,
    link_count,
    starts_with_i,
    starts_with_you,
    about_image,
]

# Text transforms passed to the pipeline.
transforms = [tokenize]

# LightGBM classifier used as the single model in this pipeline.
# NOTE(review): LightGBM documents colsample_bytree as an alias of
# feature_fraction; both are given the same value here.
gbm = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    n_estimators=125,
    num_leaves=10,
    boosting_type="gbdt",
    learning_rate=0.1,
    feature_fraction=0.45,
    colsample_bytree=0.45,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)
# The name attribute shows up as the key of pipe.cv_scores in the output below.
gbm.name = "LightGBM stacker"
models = [gbm]

In [21]:
# Build the pipeline object from the raw frames, the text column name, the
# target labels and the feature/transform/model lists configured above.
pipe = NlpPipeline(
    train,
    test,
    "comment_text",
    labels,
    feature_funcs,
    transforms,
    models,
    word_index=None,
    pretrained=pretrained,
)

In [22]:
# Run the pipeline's feature-engineering step (prints "Engineering features").
# Presumably this populates pipe.train_features / pipe.test_features, which the
# next cell reads — confirm in nlp_pipeline.
pipe.engineer_features()

Engineering features


In [23]:
# Append the first-level model predictions as extra columns to the pipeline's
# feature matrices. Both stacked matrices are computed before either attribute
# is assigned, so a failure while stacking the test side (e.g. a row-count
# mismatch) cannot leave pipe with only its train features extended.
# NOTE(review): this cell is not idempotent — running it twice appends the meta
# columns twice. Re-run pipe.engineer_features() before re-running this cell.
pipe.train_features, pipe.test_features = (
    np.hstack([pipe.train_features, train_meta]),
    np.hstack([pipe.test_features, test_meta]),
)

In [24]:
# Cross-validate the configured model(s); the captured output reports one
# roc_auc value per label.
pipe.cross_val()

Cross-validating
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=0.45, feature_fraction=0.45,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=125,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
Cross-validating toxic
roc_auc: 0.9881824341380547
Cross-validating severe_toxic
roc_auc: 0.9919512575891855
Cross-validating obscene
roc_auc: 0.9953419530149648
Cross-validating threat
roc_auc: 0.9932315401027237
Cross-validating insult
roc_auc: 0.9897935938419493
Cross-validating identity_hate
roc_auc: 0.9915402222995928


In [25]:
# Display the cross-validation scores; the output is a dict keyed by model name.
# NOTE(review): the number in the trailing comment differs from the displayed
# output, so it is presumably a score noted from an earlier run or model
# selection — confirm or remove.
pipe.cv_scores # 0.9915737493264863

{'LightGBM stacker': 0.9916735001644117}

In [26]:
# Fit the model(s) and produce out-of-fold predictions (the method prints
# "Creating out-of-fold meta training set for stacker" and a series of AUC
# values under each label).
# NOTE(review): the output shows five AUC lines per label, which suggests a
# 5-fold split — confirm in nlp_pipeline.
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=0.45, feature_fraction=0.45,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=125,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
toxic
AUC: 0.9883224656657963
AUC: 0.9869160164875278
AUC: 0.988839578249936
AUC: 0.9882628188914584
AUC: 0.9887312495385802
severe_toxic
AUC: 0.9917266123916032
AUC: 0.9919247359509796
AUC: 0.9924514688000978
AUC: 0.9916473244603421
AUC: 0.9912926617659128
obscene
AUC: 0.9954411213870601
AUC: 0.995289528634506
AUC: 0.9954051003761569
AUC: 0.9948869104217979
AUC: 0.9955459348614526
threat
AUC: 0.9971416251526801
AUC: 0.9936400750303431
AUC: 0.994832682008624
AUC:

In [27]:
# Create the submission output (prints "Creating submissions").
# NOTE(review): oof=False presumably skips writing the out-of-fold train
# predictions; the output location is decided inside nlp_pipeline — confirm there.
pipe.create_submission(oof=False)

Creating submissions
