In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from scipy.sparse import hstack
from scipy.special import logit, expit

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
train = pd.read_csv('data\\train.csv').fillna(' ')
test = pd.read_csv('data\\test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [6]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    stop_words='english',
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [7]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [9]:
train_features = train_features.tocsr()
test_features = test_features.tocsr()

In [10]:
test_features.shape

(153164, 60000)

In [22]:
def pr(y_i, y):
    p = train_features[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl(x, y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [23]:
preds = np.zeros((len(test), len(class_names)))  

In [26]:
subm = pd.read_csv('data\\sample_submission.csv')

In [31]:
preds = np.zeros((len(train), len(class_names)))

In [32]:
folds = KFold(n_splits=5, shuffle=True, random_state=42)
for i, label in enumerate(class_names):
    for train_idx, pred_idx in folds.split(train[label]):
        print('fit', label)
        m,r = get_mdl(train_features[train_idx], train[label][train_idx])
        preds[:,i][pred_idx] = m.predict_proba(train_features[pred_idx].multiply(r))[:,1]
        print(roc_auc_score(train[label][pred_idx], preds[:,i][pred_idx]))

print("Saving out-of-fold")
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = labels)], axis=1)
submission.to_csv('oof_train_nblogreg.csv', index=False)

fit toxic
0.962930469030776
fit toxic
0.9618735741865077
fit toxic
0.9639397137864535
fit toxic
0.9662076463464705
fit toxic
0.9632339798265157
fit severe_toxic
0.9822216726745506
fit severe_toxic
0.9849048617270015
fit severe_toxic
0.986091274828139
fit severe_toxic
0.9824602761740998
fit severe_toxic
0.9814096213444636
fit obscene
0.9854751993512636
fit obscene
0.9783850420512978
fit obscene
0.9875120702685147
fit obscene
0.9822573562703825
fit obscene
0.9795367148298655
fit threat
0.9873849541259485
fit threat
0.9885523557320977
fit threat
0.9894933963375029
fit threat
0.986381850589502
fit threat
0.9758446066720414
fit insult
0.9801475648682588
fit insult
0.9784588072111013
fit insult
0.9804492089994591
fit insult
0.9796015232396261
fit insult
0.9762497486472347
fit identity_hate
0.9797239821895679
fit identity_hate
0.9824519198657007
fit identity_hate
0.9815120226067415
fit identity_hate
0.9667270059370491
fit identity_hate
0.9791796939333319
Saving out-of-fold


NameError: name 'labels' is not defined

In [33]:
print("Saving out-of-fold")
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = class_names)], axis=1)
submission.to_csv('oof_train_nblogreg.csv', index=False)

Saving out-of-fold


In [35]:
preds = np.zeros((len(test), len(class_names)))  

for i, label in enumerate(class_names):
    print('fit', label)
    m,r = get_mdl(train_features, train[label])
    preds[:,i] = m.predict_proba(test_features.multiply(r))[:,1]

print("Saving submission")
final_submid = pd.DataFrame({'id': subm["id"]})
final_submission = pd.concat([final_submid, pd.DataFrame(preds, columns = class_names)], axis=1)
final_submission.to_csv('nblogreg_preds.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
Saving submission


In [11]:
from nlp_pipeline import *

In [12]:
import lightgbm as lgb

In [16]:
feature_funcs = []
transforms = []
logreg = LogisticRegression(solver='sag')
logreg.name = "Logistic regression sag"
gbm = lgb.LGBMClassifier(max_depth=3, metric="auc", n_estimators=125, num_leaves=10, boosting_type="gbdt", learning_rate=0.1, feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)
gbm.name = "LightGBM stacker"
models = [logreg]

pipe = NlpPipeline(train, test, "comment_text", class_names, feature_funcs, transforms, models, word_index=None, pretrained="char n-gram")

In [17]:
pipe.train_features = train_features
pipe.test_features = test_features

In [18]:
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)
toxic
AUC: 0.9789751815516097
AUC: 0.9774398964774006
AUC: 0.9802334294472687
AUC: 0.979530431066261
AUC: 0.9795140199028408
severe_toxic
AUC: 0.9879613562810241
AUC: 0.9895188664032896
AUC: 0.9891020263293753
AUC: 0.9894281553063295
AUC: 0.9863817956924953
obscene
AUC: 0.991222385264418
AUC: 0.9891188465187506
AUC: 0.9918137457036066
AUC: 0.9893193268028236
AUC: 0.9905650626886647
threat
AUC: 0.9940803842063225
AUC: 0.9905287984111222
AUC: 0.992184448803875
AUC: 0.9900140915396003
AUC: 0.9849107378397103
insult
AUC: 0.9833657711944025
AUC: 0.9831334299659442
AUC: 0.9840607077470257
AUC: 0.9827373877140632
AUC: 0.9817399781701382
identity_hate
AUC: 0.9831047975307894
AUC: 0.9877746

In [20]:
pipe.cv_scores

{'Logistic regression sag': 0.9858489262008926}

In [21]:
pipe.create_submission()

Creating submissions
