In [1]:
import os
import re
import string

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

In [2]:
import lightgbm as lgb

In [3]:
from nlp_pipeline import *

In [32]:
# Names of the six toxicity label columns, in dataset order.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [4]:
# Location of the pretrained word-vector file. Built with os.path.join so the
# path resolves on every OS; a hard-coded backslash separator only works on
# Windows (where os.path.join yields the very same string as before).
pretrained = os.path.join("data", "crawl-300d-2M.vec")

In [5]:
# Load the pretrained word vectors from the file at `pretrained`; the result is
# later handed to NlpPipeline as its `word_index` argument.
# NOTE(review): `get_pretrained` is presumably provided by the star import from
# nlp_pipeline; its return type is not visible here — confirm.
word_vector = get_pretrained(pretrained)

In [6]:
# Read the train/test CSVs from the data directory and replace every missing
# value with a single space. Paths are built with os.path.join so the cell runs
# on any OS (the previous backslash-separated literals only worked on Windows).
# NOTE(review): fillna(' ') applies to all columns, not only the text column.
train = pd.read_csv(os.path.join("data", "train.csv")).fillna(' ')
test = pd.read_csv(os.path.join("data", "test.csv")).fillna(' ')

In [7]:
# Characters that are split off as standalone tokens: all ASCII punctuation
# plus a handful of typographic symbols.
# re.escape keeps every ASCII punctuation mark literal inside the character
# class. Without it the `\]` pair in string.punctuation is parsed as an escaped
# bracket (so a backslash is never matched) and the bare `[` raises a
# "possible nested set" FutureWarning on newer Python versions.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(row):
    """Split a text string into lowercase tokens.

    Every punctuation character matched by ``re_tok`` is padded with spaces so
    that it becomes its own token; the text is then lowercased and split on
    whitespace.

    Args:
        row: The text to tokenize (a ``str``).

    Returns:
        A list of lowercase tokens; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', row).lower().split()

In [98]:
# Label columns are picked positionally: columns 2 through 7 of the training frame.
class_labels = list(train.columns[2:8])

# Feature functions passed to NlpPipeline as `feature_funcs`.
feature_funcs = [
    len,
    asterix_freq,
    uppercase_freq,
    line_change_freq,
    rep_freq,
    question_freq,
    has_ip,
    has_talk_tag,
    link_count,
    starts_with_i,
    starts_with_you,
    about_image,
]

# Text transforms passed to NlpPipeline as `transforms`.
transforms = [tokenize]

# LightGBM classifier; configured here but NOT included in `models` below.
gbm = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    num_leaves=20,
    boosting_type="gbdt",
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)
gbm.name = "LightGBM"

# Logistic regression with balanced class weights and the newton-cg solver.
logreg = LogisticRegression(
    C=0.2,
    class_weight='balanced',
    solver='newton-cg',
    max_iter=10,
)
logreg.name = "Logistic regression newton"

# Only the logistic regression is handed to the pipeline.
models = [logreg]

In [99]:
# Assemble the pipeline from the data frames, the text column name, the label
# columns, the feature functions, the transforms, the models and the
# pretrained word vectors.
pipe = NlpPipeline(
    train,
    test,
    "comment_text",
    class_labels,
    feature_funcs,
    transforms,
    models,
    word_index=word_vector,
    pretrained=pretrained,
)
# Print the pipeline's textual summary of its configuration.
print(pipe)

Train: (159571, 8)
Test: (153164, 2)
Train features: (0,)
Test features: (0,)
Input column: comment_text
Class labels: toxic severe_toxic obscene threat insult identity_hate
Models: Logistic regression newton: 0.2 balanced False True 1 10 ovr 1 l2 None newton-cg 0.0001 0 False | 
Transforms:  tokenize
Feature functions:  function asterix_freq uppercase_freq line_change_freq rep_freq question_freq has_ip has_talk_tag link_count starts_with_i starts_with_you about_image
Metric: roc_auc
CV scores: {'Logistic regression newton': -1}


In [100]:
# Run the pipeline's feature-engineering stage (it logs "Engineering features").
# NOTE(review): presumably evaluates each function in `feature_funcs` on the
# input column for train and test — confirm in nlp_pipeline.
pipe.engineer_features()

Engineering features


In [101]:
# Run the pipeline's transform stage (it logs "Applying transforms").
# NOTE(review): presumably applies every function in `transforms` (here only
# `tokenize`) to the input column — confirm in nlp_pipeline.
pipe.apply_transforms()

Applying transforms


In [102]:
# Run the pipeline's embedding stage (it logs "Creating embeddings").
# NOTE(review): presumably builds features from the pretrained vectors given
# to NlpPipeline as `word_index` — confirm in nlp_pipeline.
pipe.create_embeddings()

Creating embeddings


In [103]:
# Cross-validate the configured model(s); the run prints the estimator and one
# roc_auc value per class label.
# NOTE(review): fold count and splitting strategy are not visible here — confirm
# in nlp_pipeline.
pipe.cross_val()

Cross-validating
LogisticRegression(C=0.2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=10,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
Cross-validating toxic




roc_auc: 0.9665814543234486
Cross-validating severe_toxic




roc_auc: 0.9844594071363023
Cross-validating obscene




roc_auc: 0.9773101556099288
Cross-validating threat




roc_auc: 0.9757080871827837
Cross-validating insult




roc_auc: 0.9726760268354722
Cross-validating identity_hate




roc_auc: 0.9711612691235416




In [105]:
# Display the per-model cross-validation score dictionary (keyed by model name).
# The value shown for the logistic regression equals the mean of the six
# per-label roc_auc values printed by the cross-validation run.
pipe.cv_scores

{'Logistic regression newton': 0.9746494000352462}

In [26]:
# Build the out-of-fold meta training set for the stacker (the run logs
# "Creating out-of-fold meta training set for stacker" and per-fold AUCs).
# NOTE(review): the saved output for this cell is stale — it has execution
# count 26, shows an LGBMClassifier with num_leaves=10 rather than the current
# `models` list, and the log is cut off mid-run. Re-run after a kernel restart.
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
toxic
AUC: 0.956549533623
AUC: 0.957561008854
AUC: 0.960646629136
AUC: 0.9567841933
AUC: 0.958794141654
severe_toxic
AUC: 0.983588656074
AUC: 0.984221508909
AUC: 0.983461670074
AUC: 0.985227479713
AUC: 0.984391548349
obscene
AUC: 0.970237136293
AUC: 0.971839531338
AUC: 0.975834772677
AUC: 0.9707452455
AUC: 0.970833380932
threat
AUC: 0.97918670217
AUC: 0.970884365001
AUC: 0.969385609206
AUC: 0.977309996698
AUC: 0.970492082159
insult
AUC: 0.967139970311
AUC: 0.968618

In [27]:
# Produce the submission output (the run logs "Creating submissions").
# NOTE(review): the output location and file format are not visible here —
# confirm in nlp_pipeline.
pipe.create_submission()

Creating submissions


In [109]:
# Character count of every training comment, as a Series named "comment_text".
train["comment_text"].map(len)

0          264
1          112
2          233
3          622
4           67
5           65
6           44
7          115
8          472
9           70
10        2875
11          56
12         319
13         819
14         219
15         610
16          57
17          48
18         118
19         440
20         266
21          58
22         543
23          97
24        1409
25         172
26         288
27         343
28         119
29          78
          ... 
159541     346
159542     164
159543      80
159544      19
159545     105
159546     534
159547      30
159548      27
159549      25
159550     173
159551     196
159552      92
159553     888
159554     182
159555     119
159556      57
159557     429
159558     134
159559     357
159560    1889
159561    1147
159562    1015
159563     394
159564     369
159565     653
159566     295
159567      99
159568      81
159569     116
159570     189
Name: comment_text, Length: 159571, dtype: int64