In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from nlp_pipeline import *

In [2]:
# Read the train/test CSVs and replace every missing cell with a single space.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS
# (the previous "data\\..." form only resolved on Windows).
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

In [3]:
# Label columns: handed to NlpPipeline and selected from every oof/submission CSV.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [224]:
# Ids of the saved runs to stack; each id picks one oof_train<id>.csv and one
# submission<id>.csv from the submissions folder.
subnums = [
    21, 22, 29, 33,
    37, 44, 45, 51,
    52, 57, 59, 62,
    66, 68, 74, 76,
]

In [237]:
def _read_label_matrix(stem, num):
    """Return the `labels` columns of ``submissions/<stem><num>.csv`` as a NumPy array.

    stem: file-name prefix, e.g. "oof_train" or "submission".
    num:  run id appended to the prefix.
    """
    # Forward slashes keep the relative path valid on every OS; the previous
    # "submissions\\..." form only resolved on Windows.
    path = "submissions/" + stem + str(num) + ".csv"
    return np.array(pd.read_csv(path)[labels])


# One matrix per run id, in `subnums` order for both lists so that the columns
# of the train-side and test-side stacks line up model by model.
oofs = [_read_label_matrix("oof_train", num) for num in subnums]
subs = [_read_label_matrix("submission", num) for num in subnums]

In [226]:
# Lay the per-run label matrices side by side (column-wise): one wide matrix
# built from the oof_train files and one built from the submission files.
train_meta = np.concatenate(oofs, axis=1)
test_meta = np.concatenate(subs, axis=1)

In [227]:
# Path handed to NlpPipeline as `pretrained` (presumably pre-trained word
# vectors, judging by the file name — confirm in nlp_pipeline).
# Forward slashes keep the relative path valid on every OS; the previous
# "data\\..." form only resolved on Windows.
pretrained = "data/crawl-300d-2M.vec"

# Longer feature list kept for reference; only the three functions below are active.
# feature_funcs = [len, asterix_freq, uppercase_freq, line_change_freq, rep_freq, question_freq, has_ip, has_talk_tag, link_count, starts_with_i, starts_with_you, about_image]
feature_funcs = [len, asterix_freq, uppercase_freq]
transforms = [tokenize]

# NOTE(review): LightGBM documents `colsample_bytree` as an alias of
# `feature_fraction`; both are set to the same value here, so one is redundant.
# NOTE(review): no random_state is given although bagging and feature
# sub-sampling are enabled, so scores can differ from run to run.
gbm = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    n_estimators=125,
    num_leaves=10,
    boosting_type="gbdt",
    learning_rate=0.1,
    feature_fraction=0.45,
    colsample_bytree=0.45,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)
# This name is the key under which the score shows up in pipe.cv_scores.
gbm.name = "LightGBM stacker"
models = [gbm]

In [228]:
# Assemble the pipeline over the "comment_text" column with the feature
# functions, transforms and models configured above. No word index is supplied.
pipe = NlpPipeline(
    train,
    test,
    "comment_text",
    labels,
    feature_funcs,
    transforms,
    models,
    word_index=None,
    pretrained=pretrained,
)

In [229]:
# Run the pipeline's feature-engineering step (it logs "Engineering features").
# pipe.train_features / pipe.test_features are read right afterwards, so this
# call presumably populates them — TODO confirm in nlp_pipeline.
pipe.engineer_features()

Engineering features


In [230]:
# Append the stacked model predictions to the engineered features.
# The engineered matrices are stashed on `pipe` the first time this cell runs,
# so re-running the cell rebuilds the same result instead of appending the meta
# columns again (the previous `x = hstack([x, meta])` form grew on every run).
# NOTE(review): if features are re-engineered on the same `pipe` object, delete
# the two stashed attributes (or re-create `pipe`) before running this cell.
if not hasattr(pipe, "_stacking_base_train_features"):
    pipe._stacking_base_train_features = pipe.train_features
    pipe._stacking_base_test_features = pipe.test_features
pipe.train_features = np.hstack([pipe._stacking_base_train_features, train_meta])
pipe.test_features = np.hstack([pipe._stacking_base_test_features, test_meta])

In [231]:
# Cross-validate the configured model(s); the log below prints the estimator
# and then one roc_auc value per label.
pipe.cross_val()

Cross-validating
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=0.45, feature_fraction=0.45,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=125,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
Cross-validating toxic
roc_auc: 0.9869390538830242
Cross-validating severe_toxic
roc_auc: 0.9915202966920938
Cross-validating obscene
roc_auc: 0.9945908245033449
Cross-validating threat
roc_auc: 0.9927009960709448
Cross-validating insult
roc_auc: 0.9889341845948636
Cross-validating identity_hate
roc_auc: 0.9899910014474038


In [232]:
# Display the recorded cross-validation score, keyed by model name.
# NOTE(review): the number in the trailing comment differs from the value shown
# in this cell's output — presumably noted from an earlier run; confirm or remove.
pipe.cv_scores # 0.9919742115892705

{'LightGBM stacker': 0.9907793928652792}

In [233]:
# Build the out-of-fold meta training set (per the log line this call prints).
# The output lists several AUC values under each label, presumably one per
# fold — TODO confirm in nlp_pipeline.
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=0.45, feature_fraction=0.45,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=125,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
toxic
AUC: 0.987091774204931
AUC: 0.985256407537181
AUC: 0.9879533344130205
AUC: 0.9869851922058431
AUC: 0.9875452107328448
severe_toxic
AUC: 0.9910858897653386
AUC: 0.9918630069338064
AUC: 0.9920721264231176
AUC: 0.9916423585136191
AUC: 0.9909362433024984
obscene
AUC: 0.9946070125306509
AUC: 0.9942422353226185
AUC: 0.994826906051163
AUC: 0.9943617151459986
AUC: 0.9948295437884718
threat
AUC: 0.9969952050602784
AUC: 0.9929965243296922
AUC: 0.9950298578020667
AUC:

In [194]:
# Ask the pipeline to create the submission output (it logs "Creating submissions").
# NOTE(review): the meaning of oof=False is defined in nlp_pipeline — presumably
# it skips the out-of-fold output; confirm.
# NOTE(review): this cell's execution count (194) is lower than its neighbours',
# so the recorded output may stem from an earlier kernel state.
pipe.create_submission(oof=False)

Creating submissions


In [234]:
from sklearn.preprocessing import StandardScaler

# Shared module-level scaler; every z_normalize() call refits it, so it always
# carries the statistics of the most recently normalized input.
scaler = StandardScaler()


def z_normalize(data):
    """Standardize `data` using the shared module-level `scaler`.

    The scaler is fitted on `data` and then applied to that same `data`;
    the transformed result is returned. As a side effect, `scaler` stays
    fitted on this input after the call.
    """
    # fit() returns the estimator itself, so the transform can be chained.
    return scaler.fit(data).transform(data)