In [61]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from scipy.sparse import hstack
from scipy.special import logit, expit

pd.options.display.max_colwidth = 200

In [37]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [39]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [49]:
print(train.shape)
print(test.shape)

train = train.sample(5000)
test = test.sample(1000)

print(train.shape)
print(test.shape)

display(train.sample(100))

(159571, 8)
(153164, 2)
(5000, 8)
(1000, 2)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
34513,5c2cb0ca34f4860c,""" Untold Story"""" appeared, in November of 2009...",0,0,0,0,0,0
82707,dd349039e760b747,Electronic Music article disambiguation revers...,0,0,0,0,0,0
93996,fb5561a3908da18c,Please don't add indiscriminately fuill tables...,0,0,0,0,0,0


In [51]:
train_text = train['comment_text']
test_text = test['comment_text']

all_text = pd.concat([train_text, test_text])

In [52]:
all_text.shape

(6000,)

In [53]:
# build word vectorizer

word_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word', token_pattern='\w{1,}', 
                                  ngram_range=(1,1), max_features=5000)

In [54]:
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [55]:
# buil char vectorizer
char_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', ngram_range=(1, 5),
    max_features=30000)

In [56]:
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [57]:
# merger word and char features

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [73]:
# losses = []
predictions = {'id': test['id'], 'comment' : test['comment_text']}
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(max_iter= 200, solver ='sag')
    
#     cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
#     losses.append(cv_loss)
#     print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    classifier.fit(train_features, train_target)
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
    
# print('Total CV scor {}'.format(np.mean(losses)))

submission = pd.DataFrame.from_dict(predictions)
submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.9493670505075295
CV score for class severe_toxic is 0.9734099567410658
CV score for class obscene is 0.9620387331280331
CV score for class threat is 0.9736508405133893
CV score for class insult is 0.949419969852309
CV score for class identity_hate is 0.903173731270408
Total CV scor 0.9518433803354558


In [74]:
submission.head()

Unnamed: 0,comment,id,identity_hate,insult,obscene,severe_toxic,threat,toxic
137164,"the only bloody commie regime from Europe, strongly supported by Russia \n\n *",e53cafb5daffab9f,0.007739,0.021211,0.02691,0.007991,0.002235,0.040529
43296,""" \n\n ==Speedy deletion of Weblogtr.com== \n A tag has been placed on Weblogtr.com, requesting that it be speedily deleted from Wikipedia. This has been done under section G4 of the criteria for...",47b53d786284ea86,0.002277,0.004374,0.002529,0.001855,0.001337,0.004045
66962,""" \n ::The postseason has been pretty pathetic. Go Rays! ''' ' """,6f7f9b24882299ab,0.006754,0.108825,0.069433,0.010575,0.002078,0.199759
105148,"I think my article on Chuck Parsons is close to ready for posting, and would appreciate a review. It has references and is based on reading and interviews, in my own words. I would like to post ...",af88a660f46bb293,0.008004,0.071542,0.02579,0.005529,0.002568,0.099327
106532,":It would have been better to leave it as a stub, since that encourages others to develop it. I have now restored the article and expanded a bit. I hope to make it a B-class article in the future,...",b1cb2880698df604,0.004189,0.015758,0.018225,0.00425,0.002236,0.022075


## use random forest classsifier

In [77]:
from sklearn.ensemble import RandomForestClassifier


In [79]:
predictions = {'id': test['id'], 'comment' : test['comment_text']}
for class_name in class_names:
    train_target = train[class_name]
    classifier = RandomForestClassifier(n_estimators=100, max_depth=10)
    
    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    classifier.fit(train_features, train_target)
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]
    
print('Total CV scor {}'.format(np.mean(losses)))

submission = pd.DataFrame.from_dict(predictions)
submission.to_csv('submission_rf.csv', index=False)

CV score for class toxic is 0.9233647965245019
CV score for class severe_toxic is 0.9641951617195179
CV score for class obscene is 0.9585807700161327
CV score for class threat is 0.7026436107692037
CV score for class insult is 0.9330290798572461
CV score for class identity_hate is 0.866195947389301
Total CV scor 0.9215891373573865
