In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit

In [3]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
train = pd.read_csv('data\\train.csv').fillna(' ')
test = pd.read_csv('data\\test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [6]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=15000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 5),
    max_features=20000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [7]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [12]:
from nlp_pipeline import *

In [13]:
feature_funcs = []
transforms = []
logreg = LogisticRegression(solver='sag')
logreg.name = "Logistic regression sag"

models = [logreg]

pipe = NlpPipeline(train, test, "comment_text", class_names, feature_funcs, transforms, models, word_index=None, pretrained=None)

In [24]:
pipe.train_features = train_features.tocsr()
pipe.test_features = test_features.tocsr()

In [25]:
pipe.fit_predict_oof()

Creating out-of-fold meta training set for stacker
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)
toxic
AUC: 0.9788665562785953
AUC: 0.9775498870065832
AUC: 0.9800778938259942
AUC: 0.9793231272962378
AUC: 0.9795399967520529
severe_toxic
AUC: 0.9879942897001028
AUC: 0.9892594533580586
AUC: 0.9890235892794066
AUC: 0.9891166124222875
AUC: 0.9865625451873737
obscene
AUC: 0.9909348174463731
AUC: 0.9891630531558426
AUC: 0.9918866975657054
AUC: 0.9887526467236452
AUC: 0.9905346573366114
threat
AUC: 0.9938223453188436
AUC: 0.9902763985435286
AUC: 0.9914478884406296
AUC: 0.9886331271391259
AUC: 0.9848679003853521
insult
AUC: 0.9833529710802892
AUC: 0.983091103365145
AUC: 0.9839895623451606
AUC: 0.9824383248534562
AUC: 0.9811303216172121
identity_hate
AUC: 0.9839519913464895
AUC: 0.98776

In [26]:
pipe.create_submission()

Creating submissions
