Updates to my first Kaggle kernel, in the hope of making it better.


In [None]:
import numpy as np 
import pandas as pd
import os
from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, HuberRegressor, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split,  cross_val_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import re

import time

In [None]:
# Show which files are present in the ../input directory.
print(os.listdir("../input"))

In [None]:
# Load the competition CSVs into DataFrames.
# `train` and `test` are used throughout the notebook.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# NOTE(review): sample_submission is not referenced again in the visible cells.
sample_submission = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Report the (rows, columns) shape of the training frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# for testing purposes
#train = train.iloc[:10000,:]
#test = test.iloc[:10000,:]

In [None]:
# Replace missing comments with a placeholder token so the vectorizers only
# ever see strings.
# The result is assigned back to the column rather than using `inplace=True`
# on a column selection: that form is chained assignment, which pandas
# deprecates and which does not modify the parent frame under Copy-on-Write.
train['comment_text'] = train['comment_text'].fillna("_na_")
test['comment_text'] = test['comment_text'].fillna("_na_")

In [None]:
# Note, preprocessing the URLs to be uniform had minimal effect on my CV scores.
# As does processing internal wikipedia references (though it saw a teeny-tiny improvement)
# Adding this step did raise my competition result by .0029, which will matter more if I get my rank way up

URLReg = re.compile(r'(http|https)://[^\s]*')
WikiReg = re.compile(r'(Wikipedia|Image):[^\s]*') #finds all reference to internal wikipedia tags


def normalize_links(comment):
    """Return *comment* with URLs and internal wiki tags replaced by fixed tokens.

    Every http/https URL becomes the token 'httpaddr' and every
    'Wikipedia:...' / 'Image:...' reference becomes 'wikitag'.
    """
    comment = URLReg.sub('httpaddr', comment)
    return WikiReg.sub('wikitag', comment)


# Apply the same normalisation to BOTH frames. Rewriting only the training
# comments means the 'httpaddr' / 'wikitag' features learned from train can
# never fire on the test comments, which still contain the raw URLs and tags.
train['comment_text'] = train['comment_text'].map(normalize_links)
test['comment_text'] = test['comment_text'].map(normalize_links)


# Note that this kernel has less preprocessing than my other kernel. That might account for the lower error. 
# It was definitely an oversight from mismanaging how I coordinated testing..

In [None]:
# Raw comment strings for each split.
train_text, test_text = train['comment_text'], test['comment_text']
# Everything from the third column onward of the training frame
# (presumably the label columns; the first two look like id and
# comment_text -- confirm against train.csv).
all_data = train.iloc[:, 2:]

In [None]:
# All column names of the training frame, in order.
headings = train.columns.tolist()
# Names from the third column onward; these line up with the columns of
# `train.iloc[:, 2:]`.
comment_headings = headings[2:]

To start, I'll explore the data a little bit to get a sense of what I'm dealing with.

In [None]:
# Display the first 15 rows of the training frame.
train.head(15)

In [None]:
# Display pandas' summary statistics for the training frame.
train.describe()

In [None]:
# Print the comments labelled 0, 1 and 2, each followed by a blank line.
for row_label in (0, 1, 2):
    sample_comment = train.loc[row_label, 'comment_text']
    print(sample_comment + '\n')

General impressions of the training data:

    Length varies significantly.
    On average the comments are fine.
    My model should predict a comment is toxic if it also predicts it to be severe_toxic.

Time to vectorize the data and start learning!

# Comment Processing

Inspiration from this part comes from Bojan Tunguz's kernel: Logistic Regression with words and char n-grams.

Vectorize the comments into word and char n-grams. The rationale is that these can encode information differently. For example, users might obfuscate swear words.

Bojan's justification for this approach: "People often try to obfuscate bad words with additional characters. Using character n-grams can potentially detect those."


In [None]:
# TODO play with settings of vectorizer further

#all_text = pd.concat([train_text, test_text])

# Word-level TF-IDF features: unigrams only, capped at the 10,000 most
# frequent terms of the training comments.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    stop_words='english',
    lowercase=False, #because usage of all caps is likely indicative of naughty behavior
    sublinear_tf=True,
    ngram_range=(1,1),
    max_features=10000)
#word_vectorizer.fit(all_text)
# The vocabulary is learned from the training comments only; fit_transform
# does the fit and the transform of the training text in a single pass.
train_text_word_transform = word_vectorizer.fit_transform(train_text)
test_text_word_transform = word_vectorizer.transform(test_text)

# Character-level TF-IDF features: 2- to 6-character n-grams, capped at
# 50,000 features. `stop_words` is deliberately not passed here: scikit-learn
# only applies it when analyzer == 'word', so for the char analyzer it had no
# effect other than triggering a warning.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    strip_accents='unicode',
    lowercase=False, #because usage of all caps is likely indicative of naughty behavior
    sublinear_tf=True,
    ngram_range=(2,6), #TODO I want to set the upper bound based off average word length, I think
    max_features=50000)
#char_vectorizer.fit(all_text)
train_text_char_transform = char_vectorizer.fit_transform(train_text)
test_text_char_transform = char_vectorizer.transform(test_text)

# Stack word and char features side by side. CSR output (instead of hstack's
# default COO) keeps the matrices row-indexable, which the train/validation
# splitting done later relies on.
complete_train_text = hstack((train_text_word_transform, train_text_char_transform), format='csr')
complete_test_text = hstack((test_text_word_transform, test_text_char_transform), format='csr')

In [None]:
# Sanity check: print the shapes of the word features, the char features,
# the stacked feature matrix and the label frame, in that order.
for matrix in (
        train_text_word_transform,
        train_text_char_transform,
        complete_train_text,
        all_data):
    print(matrix.shape)

# Model


I'm going to use an ensemble model using a voting classifier. The models will be logistic regression, multinomial Naive Bayes, and Random Forest with two different criteria.


In [None]:
# Comment/uncomment this if running for testing.
# The cell is kept disabled inside a string. Its contents were fixed so that
# it runs once the quotes are removed: `est` is now defined, the stray '}' in
# the f-string is gone, and the unused hold-out split was dropped
# (cross_val_score does its own 5-fold splitting).
"""
start = time.time()

#for vote in ['hard', 'soft']:
#for est in [20,30, 40]:
est = 20  # number of trees per random forest; loop over values to tune
cv_scores =[]
for category in comment_headings:
    clf1 = LogisticRegression(
            C=1.0,
            solver='sag',
            max_iter=1000)
    clf2 = MultinomialNB(
        alpha=.3)
    #clf3 = Ridge(alpha=1.0,solver='sag',max_iter=1000)
    #clf4 = HuberRegressor(epislon=1.35, alpha=0.0001, max_iter=100)
    clf3 = RandomForestClassifier(
        n_estimators=est,
        criterion='gini')
    clf4 = RandomForestClassifier(
        n_estimators=est,
        criterion='entropy')

    ensemble_clf = VotingClassifier(
        estimators=[('lr', clf1), ('mNB', clf2), ('rf1',clf3), ('rf2', clf4)],
        voting='hard')
    cv_score = cross_val_score(ensemble_clf, complete_train_text, all_data[category], cv=5)
    print(f'Cross-validation score for {category}: {cv_score}')
    cv_scores.append(cv_score.mean())
print(f'Overall cross-validation score for {est}: {sum(cv_scores)/len(cv_scores)}')

end = time.time()
print(end-start)
"""

In [None]:
# Comment/uncomment this depending on if running for submission

# For every label column, fit a soft-voting ensemble on the full training
# matrix and store its positive-class probability for each test comment.
pred = {}
cv_scores =[]
for category in comment_headings:
    clf1 = LogisticRegression(
            C=1.0,
            solver='sag',
            max_iter=1000)
    clf2 = MultinomialNB(
        alpha=.3)
    clf3 = RandomForestClassifier(
        n_estimators=20,
        criterion='gini')
    clf4 = RandomForestClassifier(
        n_estimators=20,
        criterion='entropy')

    ensemble_clf = VotingClassifier(
        estimators=[('lr', clf1), ('mNB', clf2), ('rf1',clf3), ('rf2', clf4)],
        voting='soft')
    ensemble_clf.fit(complete_train_text, all_data[category])
    # NOTE: despite the variable name this is the score on the very data the
    # model was just fitted on, not a cross-validated estimate.
    cv_score = ensemble_clf.score(complete_train_text, all_data[category])
    cv_scores.append(cv_score)
    print(f'Score for {category} on entire training set: {cv_score}')
    # Keep only the second predict_proba column (assumes binary 0/1 labels,
    # so that column is the probability of the positive class).
    pred[category] = ensemble_clf.predict_proba(complete_test_text)[:, 1]
# Average over however many label columns were trained instead of a
# hard-coded 6; max(..., 1) keeps the empty case at 0 as before.
print(f'Overall score on entire training set: {sum(cv_scores)/max(len(cv_scores), 1)}')

In [None]:
# One-column frame holding the test ids (keeps the test frame's index).
submission_id = test[['id']]
# Per-label predicted probabilities, with columns in the training label order.
label_predictions = pd.DataFrame(pred, columns=headings[2:])
# Place ids and predictions side by side, then show summary statistics.
submission = pd.concat([submission_id, label_predictions], axis=1)
submission.describe()

In [None]:
# Write the submission to submission.csv, omitting the DataFrame index.
submission.to_csv('submission.csv', index=False)

Final result: 0.9772 where first place was 0.9885