In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from scipy.sparse import hstack
from sklearn.metrics import roc_auc_score
from datetime import datetime

from utils import tokenize

In [2]:
def timer(start_time=None):
    """Tiny stopwatch used around the long-running notebook cells.

    Called without a (truthy) ``start_time`` it starts the clock and returns
    the current ``datetime``. Called with a previously returned ``datetime``
    it prints the elapsed wall-clock time and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        # Break the elapsed seconds down into hours / minutes / seconds.
        thour, leftover = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(leftover, 60)
        message = '\n Time taken: %i hours %i minutes and %s seconds.' % (
            thour, tmin, round(tsec, 2))
        print(message)
        return None

    # No start time supplied: begin timing now.
    return datetime.now()

In [3]:
# Two stopwatches are started here: `traintime` is read after the feature
# matrices are stacked, `train_time` at the end of the preprocessing cell.
traintime = timer(None)
train_time = timer(None)

# Read both splits; missing values are replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('../data/toxic_comments/train.csv',
                     '../data/toxic_comments/test.csv')
)

# The six binary label columns that are modelled one at a time.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Store the labels compactly as int8, then keep the ids and the label matrix
# under their own names.
train[class_names] = train[class_names].astype(np.int8)
tr_ids = train[['id']]
target = train[class_names]

### Cleaning and preprocessing

In [4]:
# Tokenise every comment with the project helper, then glue the tokens back
# together with single spaces so `comment_text` holds the normalised string.
train_tokens = train['comment_text'].map(tokenize)
test_tokens = test['comment_text'].map(tokenize)

train['comment_text'] = train_tokens.map(' '.join)
test['comment_text'] = test_tokens.map(' '.join)

# The token lists are only an intermediate; release them straight away.
del train_tokens, test_tokens

train_text = train['comment_text']
test_text = test['comment_text']
# Train and test text stacked together (used to fit the vectorizers).
all_text = pd.concat([train_text, test_text])
timer(train_time)


 Time taken: 0 hours 17 minutes and 15.89 seconds.


In [5]:
train_time = timer(None)
print(' Part 1/2 of vectorizing ...')

# Word-level TF-IDF: unigrams only, English stop words removed, vocabulary
# capped at 10000 terms. The vocabulary is learnt on train + test text.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
).fit(all_text)

# Project each split onto the fitted vocabulary.
train_word_features, test_word_features = map(
    word_vectorizer.transform, (train_text, test_text))
timer(train_time)

 Part 1/2 of vectorizing ...

 Time taken: 0 hours 0 minutes and 56.44 seconds.


In [6]:
train_time = timer(None)
print(' Part 2/2 of vectorizing ...')
# Character-level TF-IDF over 2- to 6-grams, vocabulary capped at 50000.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect (and newer
# versions emit a warning about it).
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)
# The vocabulary is learnt on train + test text, then each split is projected.
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
timer(train_time)

 Part 2/2 of vectorizing ...

 Time taken: 0 hours 45 minutes and 23.35 seconds.


### Union char and word features

In [7]:
# Put the char n-gram columns first and the word columns after them, and keep
# the result in CSR form so rows can be sliced per fold later on.
train_features, test_features = (
    hstack(feature_blocks).tocsr()
    for feature_blocks in (
        [train_char_features, train_word_features],
        [test_char_features, test_word_features],
    )
)
timer(traintime)


 Time taken: 1 hours 41 minutes and 32.83 seconds.


In [8]:
# Per-label LogisticRegression settings. Position k in every list is the value
# used for the k-th entry of `class_names` in the training loop.
all_parameters = dict(
    C=[1.048113, 0.1930, 0.596362, 0.25595, 0.449843, 0.25595],
    tol=[0.1, 0.1, 0.046416, 0.0215443, 0.1, 0.01],
    solver=['lbfgs', 'newton-cg', 'lbfgs', 'newton-cg', 'newton-cg', 'lbfgs'],
    fit_intercept=[True] * 6,
    penalty=['l2'] * 6,
    class_weight=[None] + ['balanced'] * 5,
)

scores = []
folds = 5
# One row per label, one column per CV fold; filled with the fold AUCs.
scores_classes = np.zeros(shape=(len(class_names), folds))


Time taken: 0 hours 30 minutes and 54.47 seconds.

In [9]:
# Test-set submission frame: starts with the ids only; one probability column
# per label is added by the training loop.
submission = pd.DataFrame.from_dict({'id': test['id']})

# Out-of-fold frame: ids plus the true labels. It is taken as an explicit
# copy because prediction columns are assigned to it later; assigning into a
# bare column subset of `train` raises pandas' SettingWithCopyWarning.
submission_oof = train[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].copy()

# Shuffled, seeded stratified splitter so the folds are reproducible.
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

idpred = tr_ids

In [10]:
# Restart the overall stopwatch for the modelling stage.
traintime = timer(None)
for j, class_name in enumerate(class_names):
    # One binary logistic regression per label, configured with the j-th
    # entry of every list in `all_parameters`.
    classifier = LogisticRegression(
        C=all_parameters['C'][j],
        max_iter=200,
        tol=all_parameters['tol'][j],
        solver=all_parameters['solver'][j],
        fit_intercept=all_parameters['fit_intercept'][j],
        penalty=all_parameters['penalty'][j],
        dual=False,
        class_weight=all_parameters['class_weight'][j],
        verbose=0)

    avreal = target[class_name]
    # StratifiedKFold yields *positional* row indices, so the labels are
    # sliced as a plain NumPy array instead of through label-based `.loc`
    # (which is only correct when the frame has a default RangeIndex).
    class_labels = avreal.values

    lr_cv_sum = 0.0                               # running sum of fold AUCs
    lr_pred = np.zeros(test_features.shape[0])    # summed test predictions
    lr_avpred = np.zeros(train.shape[0])          # out-of-fold predictions

    train_time = timer(None)
    for i, (train_index, val_index) in enumerate(skf.split(train_features, class_labels)):
        X_train, X_val = train_features[train_index], train_features[val_index]
        y_train, y_val = class_labels[train_index], class_labels[val_index]

        classifier.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        scores_val = classifier.predict_proba(X_val)[:, 1]
        lr_avpred[val_index] = scores_val

        scores_classes[j][i] = roc_auc_score(y_val, scores_val)
        lr_cv_sum += scores_classes[j][i]
        print('\n Fold %02d class %s AUC: %.6f' % ((i + 1), class_name, scores_classes[j][i]))

        # Accumulate this fold's test predictions; averaged after the loop.
        lr_pred += classifier.predict_proba(test_features)[:, 1]

    # Mean fold AUC (previously always 0 because the sum was never updated).
    lr_cv_score = lr_cv_sum / folds
    lr_oof_auc = roc_auc_score(avreal, lr_avpred)
    print('\n Average class %s AUC:\t%.6f' % (class_name, np.mean(scores_classes[j])))
    print(' Out-of-fold class %s AUC:\t%.6f' % (class_name, lr_oof_auc))
    timer(train_time)

    # Test prediction = mean over the fold models; OOF predictions are stored
    # next to the true labels.
    submission[class_name] = lr_pred / folds
    submission_oof['prediction_' + class_name] = lr_avpred


 Fold 01 class toxic AUC: 0.978427

 Fold 02 class toxic AUC: 0.977971

 Fold 03 class toxic AUC: 0.979633

 Fold 04 class toxic AUC: 0.978673

 Fold 05 class toxic AUC: 0.981027

 Average class toxic AUC:	0.979146
 Out-of-fold class toxic AUC:	0.979142

 Time taken: 0 hours 14 minutes and 27.11 seconds.

 Fold 01 class severe_toxic AUC: 0.987972

 Fold 02 class severe_toxic AUC: 0.988784

 Fold 03 class severe_toxic AUC: 0.990986

 Fold 04 class severe_toxic AUC: 0.987391

 Fold 05 class severe_toxic AUC: 0.989689

 Average class severe_toxic AUC:	0.988965
 Out-of-fold class severe_toxic AUC:	0.988953

 Time taken: 0 hours 22 minutes and 58.8 seconds.

 Fold 01 class obscene AUC: 0.991359

 Fold 02 class obscene AUC: 0.991048

 Fold 03 class obscene AUC: 0.990723

 Fold 04 class obscene AUC: 0.990488

 Fold 05 class obscene AUC: 0.991101

 Average class obscene AUC:	0.990944
 Out-of-fold class obscene AUC:	0.990943

 Time taken: 0 hours 10 minutes and 33.42 seconds.

 Fold 01 class t

In [13]:
# Mean AUC over every (label, fold) cell of the score matrix.
print('\n Overall AUC:\t%.6f' % scores_classes.mean())

# Persist the test predictions and the out-of-fold predictions, without the
# pandas index column.
for frame, csv_name in (
        (submission, 'submission_tuned_LR02_2_vectorizers.csv'),
        (submission_oof, 'oof_tuned_LR02.csv')):
    frame.to_csv(csv_name, index=False)
timer(traintime)


 Overall AUC:	0.986238

 Time taken: 7 hours 46 minutes and 58.77 seconds.


Fold 01 class toxic AUC: 0.977868

 Fold 02 class toxic AUC: 0.979100

 Fold 03 class toxic AUC: 0.977227

 Fold 04 class toxic AUC: 0.980188

 Fold 05 class toxic AUC: 0.978339
 
 Average class toxic AUC:	0.978544
 Out-of-fold class toxic AUC:	0.978549

 Time taken: 0 hours 13 minutes and 3.29 seconds.

 Fold 01 class severe_toxic AUC: 0.990015

 Fold 02 class severe_toxic AUC: 0.988798

 Fold 03 class severe_toxic AUC: 0.988556

 Fold 04 class severe_toxic AUC: 0.989866

 Fold 05 class severe_toxic AUC: 0.988006

 Average class severe_toxic AUC:	0.989048
 Out-of-fold class severe_toxic AUC:	0.989038

 Time taken: 0 hours 18 minutes and 51.72 seconds.

 Fold 01 class obscene AUC: 0.990285
 
 Fold 02 class obscene AUC: 0.990708

 Fold 03 class obscene AUC: 0.990812

 Fold 04 class obscene AUC: 0.991840

 Fold 05 class obscene AUC: 0.991806

 Average class obscene AUC:	0.991090
 Out-of-fold class obscene AUC:	0.991086

 Time taken: 0 hours 12 minutes and 14.55 seconds.

 Fold 01 class threat AUC: 0.991979

 Fold 02 class threat AUC: 0.992755

 Fold 03 class threat AUC: 0.992445

 Fold 04 class threat AUC: 0.985758
 
 Fold 05 class threat AUC: 0.992098

 Average class threat AUC:	0.991007
 Out-of-fold class threat AUC:	0.990981

 Time taken: 0 hours 19 minutes and 32.27 seconds.

 Fold 01 class insult AUC: 0.983164

 Fold 02 class insult AUC: 0.984527
 
 Fold 03 class insult AUC: 0.981855
 
 Fold 04 class insult AUC: 0.982395
 
 Fold 05 class insult AUC: 0.987513
 
 Average class insult AUC:	0.9861212
 Out-of-fold class insult AUC:	0.985211
 
 Fold 01 class identity_hate AUC: 0.992364

 Fold 02 class identity_hate AUC: 0.991254
 
 Fold 03 class identity_hate AUC: 0.993211
 
 Fold 04 class identity_hate AUC: 0.993521
 
 Fold 05 class identity_hate AUC: 0.990023
 
 Average class identity_hate AUC:	0.992107
 Out-of-fold class identity_hate AUC:	0.991901
 