In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from nlp_pipeline import *

In [81]:
# Load the train/test CSVs and replace every missing value with a single space.
# Forward slashes keep these relative paths valid on every OS; backslash
# separators only resolve on Windows.
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

In [82]:
# Target columns; the same names are used to select columns from the
# prediction CSVs loaded further down.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [145]:
# Numeric suffixes of the prediction files to load; each number selects one
# "oof_train<N>.csv" / "submission<N>.csv" pair.
subnums = [
    21, 22, 29, 51,
    33, 37, 44, 45,
]

In [146]:
# One array per entry in `subnums`, restricted to the `labels` columns:
#   oofs -> read from submissions/oof_train<N>.csv (presumably out-of-fold
#           predictions on the training set — confirm against the producer)
#   subs -> read from submissions/submission<N>.csv
# Forward slashes keep the relative paths valid on every OS; backslash
# separators only resolve on Windows.
oofs = [np.array(pd.read_csv("submissions/oof_train" + str(num) + ".csv")[labels]) for num in subnums]
subs = [np.array(pd.read_csv("submissions/submission" + str(num) + ".csv")[labels]) for num in subnums]

In [147]:
# Place the per-file prediction arrays side by side so that each file
# contributes one group of `labels` columns to the meta-feature matrix.
train_meta, test_meta = (np.hstack(blocks) for blocks in (oofs, subs))

In [148]:
# Stacker configuration.

# Path handed to NlpPipeline as `pretrained`. Forward slashes keep the
# relative path valid on every OS; backslash separators only resolve on Windows.
pretrained = "data/crawl-300d-2M.vec"

# Per-comment feature functions passed to the pipeline (all but `len` come
# from the nlp_pipeline star import).
feature_funcs = [
    len,
    asterix_freq,
    uppercase_freq,
    line_change_freq,
    rep_freq,
    question_freq,
    has_ip,
    has_talk_tag,
    link_count,
    starts_with_i,
    starts_with_you,
    about_image,
]
transforms = [tokenize]

gbm = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    n_estimators=125,
    num_leaves=10,
    boosting_type="gbdt",
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)
# The `name` attribute shows up as the key of `pipe.cv_scores`.
gbm.name = "LightGBM stacker"

# NOTE(review): `logreg` is configured here but is not part of `models` below,
# so only the LightGBM stacker is handed to the pipeline by this cell.
logreg = LogisticRegression(C=0.2, class_weight='balanced', solver='newton-cg', max_iter=10)
logreg.name = "Logistic regression newton"

models = [gbm]

In [149]:
# Build the pipeline object from the data frames, the text column name, the
# target labels and the feature/transform/model configuration defined above.
pipe = NlpPipeline(
    train,
    test,
    "comment_text",
    labels,
    feature_funcs,
    transforms,
    models,
    word_index=None,
    pretrained=pretrained,
)

In [150]:
# Run the pipeline's feature-engineering step (it logs "Engineering features").
# The next cell reads pipe.train_features / pipe.test_features, so this is
# presumably where they get populated — confirm against NlpPipeline.
pipe.engineer_features()

Engineering features


In [151]:
def _stack_meta_features(pipeline, attr, meta):
    """Append the `meta` columns to `pipeline.<attr>` in a re-run-safe way.

    A plain `x = np.hstack([x, meta])` appends the meta columns again every
    time the cell is executed. To keep the cell idempotent, the array the
    columns were appended to and the resulting stacked array are remembered
    on the pipeline object:

    * if `pipeline.<attr>` is still the array produced by the previous run of
      this cell, stacking restarts from the remembered un-stacked array;
    * otherwise (first run, or the features were rebuilt since) the current
      array is taken as the new starting point.

    Parameters
    ----------
    pipeline : object exposing the attribute named by `attr`
    attr : str, name of the feature-matrix attribute to extend
    meta : array of columns to append; must have the same number of rows
    """
    current = getattr(pipeline, attr)
    if current is getattr(pipeline, "_stacked_" + attr, None):
        # Cell re-run: do not stack on top of already-stacked features.
        current = getattr(pipeline, "_base_" + attr)
    stacked = np.hstack([current, meta])
    setattr(pipeline, "_base_" + attr, current)
    setattr(pipeline, "_stacked_" + attr, stacked)
    setattr(pipeline, attr, stacked)


# Extend the engineered features with the base-model predictions.
_stack_meta_features(pipe, "train_features", train_meta)
_stack_meta_features(pipe, "test_features", test_meta)

In [152]:
# Cross-validate the configured models; the run logs one roc_auc value per
# label and the aggregate is afterwards available in pipe.cv_scores.
pipe.cross_val()

Cross-validating
LGBMClassifier(bagging_fraction=0.8, bagging_freq=5, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
        learning_rate=0.1, max_depth=3, metric='auc', min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=125,
        n_jobs=-1, num_leaves=10, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.2, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)
Cross-validating toxic
roc_auc: 0.9864835801875781
Cross-validating severe_toxic
roc_auc: 0.9914568952918872
Cross-validating obscene
roc_auc: 0.9948485908648828
Cross-validating threat
roc_auc: 0.9912946545225154
Cross-validating insult
roc_auc: 0.9887202405075779
Cross-validating identity_hate
roc_auc: 0.9895615340741799


In [154]:
# Scores recorded by the author for comparison with the value displayed below:
#   0.9904823005486548 (unlabeled; presumably an earlier LightGBM stacker run — confirm)
#   LR: 0.9878702206951869
pipe.cv_scores # 0.9904823005486548 # LR: 0.9878702206951869

{'LightGBM stacker': 0.9903942492414369}

In [35]:
# Build out-of-fold predictions per label (logging an AUC per fold and an
# overall CV score), then fit the submission classifiers, as the log below shows.
# NOTE(review): the recorded output comes from a LogisticRegression run with an
# earlier execution count, i.e. not from the `models = [gbm]` configuration above.
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LogisticRegression(C=0.2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=10,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
toxic




AUC: 0.9847995938548666
AUC: 0.9825052321689869
AUC: 0.9856223945158116
AUC: 0.9845276783930855
AUC: 0.984483928183934
severe_toxic
AUC: 0.9906844767441746
AUC: 0.9885144723050875
AUC: 0.991625597777355
AUC: 0.9909140977418247
AUC: 0.9906860273801289
obscene
AUC: 0.9932663487343849
AUC: 0.9926398558728324
AUC: 0.9931431614692452
AUC: 0.9926856960106076
AUC: 0.9933615432372709
threat
AUC: 0.9897085773314535
AUC: 0.9796965684651883
AUC: 0.9862135790238106
AUC: 0.9872965531945088
AUC: 0.9867598365889798
insult
AUC: 0.9875131819705527
AUC: 0.987464568831541
AUC: 0.9869834809796668
AUC: 0.9866348386379455
AUC: 0.9860848503047168
identity_hate
AUC: 0.9858949113942405
AUC: 0.9907937508070871
AUC: 0.9835491432897837
AUC: 0.9865327646104701
AUC: 0.9874606629203406
CV score: 0.9879349124246628
Fitting and predicting
Fitting submission classifier for toxic
Fitting submission classifier for severe_toxic
Fitting submission classifier for obscene
Fitting submission classifier for threat
Fitting subm

In [98]:
# Show the cross-validation scores stored on the pipeline, keyed by model name.
# NOTE(review): the recorded value has an out-of-order execution count, so it may
# not correspond to the feature set built by the cells above — re-run to confirm.
pipe.cv_scores

{'LightGBM stacker': 0.9954510654519456}

In [132]:
# Produce the submission output (logs "Creating submissions").
# NOTE(review): oof=False presumably skips writing out-of-fold predictions —
# confirm against NlpPipeline.create_submission.
pipe.create_submission(oof=False)

Creating submissions


In [542]:
# Average value of the `about_image` feature over all training comments.
train["comment_text"].apply(about_image).mean()

0.006987485194678231

In [531]:
# Step to the next training comment for manual inspection.
# `i` is a cursor that persists between runs of this cell. On a fresh kernel
# it does not exist yet, so start at 0 instead of failing with NameError.
try:
    i += 1
except NameError:
    i = 0
train.comment_text[i]

'Windows Phone upgradeability \n\nHi Gregory.  I reverted your change on the Windows phone article.  Neither Microsoft nor its partners have made any official statements on whether devices running Windows Phone 7.5 will be upgradeable to Windows Phone 8.  The references given by another user are blogs that are relying on Hearsay.'

In [435]:
# Show the current position of the comment-browsing cursor.
i

229

In [550]:
# Preview the GRU out-of-fold template file (id, comment text and one
# prediction column per label). Forward slashes keep the relative path valid
# on every OS; backslash separators only resolve on Windows.
pd.read_csv("submissions/gru_ft_oof_template.csv")

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\r\r\r\r\nWhy the edits made unde...,0.000458,4.521454e-10,1.486791e-06,2.139895e-07,3.475134e-07,8.514877e-08
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.006773,1.933469e-06,6.015128e-04,5.319089e-06,4.312414e-04,2.021607e-05
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.000762,5.905761e-08,2.445611e-04,3.001501e-07,1.003712e-04,3.885808e-06
3,0001b41b1c6bb37e,"""\r\r\r\r\r\nMore\r\r\r\r\r\nI can't make any ...",0.000100,4.418383e-10,1.727175e-05,6.038349e-10,1.071730e-05,5.431227e-08
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.020912,1.077028e-07,3.585896e-04,4.841351e-05,3.066965e-04,4.787587e-06
5,00025465d4725e87,"""\r\r\r\r\r\n\r\r\r\r\r\nCongratulations from ...",0.000032,3.209740e-10,4.084681e-07,4.063975e-07,3.510345e-07,1.296002e-07
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.999352,2.451633e-01,9.727824e-01,6.576371e-04,5.310336e-01,6.138357e-03
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0.105072,7.341976e-07,5.869922e-04,1.906692e-05,1.788136e-03,1.332287e-05
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0.001671,2.037035e-09,2.204214e-05,4.103335e-08,3.481725e-05,1.445621e-06
9,00040093b2687caa,alignment on this subject and which are contra...,0.003130,1.480860e-07,3.295744e-04,5.160733e-07,4.447330e-04,7.842073e-06


In [570]:
# Inspect the training feature matrix held by the pipeline (NumPy abbreviates
# the display of large arrays with "...").
pipe.train_features

array([[-2.20194960e-01, -1.33447432e-01,  1.37928568e-01, ...,
         2.13989450e-07,  3.47513400e-07,  8.51487700e-08],
       [-4.77508753e-01,  3.86481142e-01,  2.13714511e-01, ...,
         5.31908881e-06,  4.31241409e-04,  2.02160718e-05],
       [-2.72673431e-01, -1.33447432e-01, -3.70855575e-01, ...,
         3.00150077e-07,  1.00371151e-04,  3.88580838e-06],
       ...,
       [-5.29987224e-01, -1.33447432e-01, -2.37912952e-02, ...,
         5.56331340e-08,  3.28401320e-06,  1.73218090e-06],
       [-4.70737337e-01, -1.33447432e-01, -3.70058382e-01, ...,
         2.59967692e-06,  2.67140538e-04,  1.03846241e-05],
       [-3.47159003e-01, -1.33447432e-01, -3.27798728e-01, ...,
         2.32880600e-04,  1.56460820e-04,  3.23952550e-05]])

In [15]:
# Correlation-coefficient matrix between feature column 2 and feature column 1
# of the training matrix.
np.corrcoef(
    pipe.train_features[:, 2],
    pipe.train_features[:, 1],
)

array([[1.        , 0.16370526],
       [0.16370526, 1.        ]])