In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
# Names of the label columns in train.csv; one model is trained per column.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both splits (train first), replacing every missing value with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('datasets/train.csv', 'datasets/test.csv')
)

# Raw comment text per split, plus their concatenation (train rows first).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [3]:
# Preview the first rows of the training frame: id, comment text and the label columns.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Preview the first rows of the test frame (id and comment text only, no labels).
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
# Word-level TF-IDF: unigrams only, English stop words dropped, vocabulary
# limited to 10,000 features. The vocabulary and IDF weights are learned from
# all_text; fit() returns the vectorizer itself, so it can be chained.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
).fit(all_text)

# Project each split onto the shared vocabulary.
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Sanity check: both matrices must have the same number of columns.
print(train_word_features.shape, test_word_features.shape)

(159571, 10000) (153164, 10000)


In [9]:
# Character-level TF-IDF over 2- and 3-character n-grams, vocabulary limited
# to 50,000 features.
# `stop_words` is deliberately not passed: scikit-learn applies it only when
# analyzer='word', so with analyzer='char' it had no effect (and newer
# releases warn about the unused parameter).
char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                  strip_accents='unicode',
                                  analyzer='char',
                                  ngram_range=(2, 3),  # (2, 6) was noted as an alternative range - presumably tried earlier; TODO confirm
                                  max_features=50000)


In [10]:
# Learn the character n-gram vocabulary and IDF weights from all_text.
# fit() returns the vectorizer, so this cell displays its repr.
char_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
# Vectorize the training comments with the fitted character n-gram model.
train_char_features = char_vectorizer.transform(train_text)

In [12]:
# Vectorize the test comments with the same fitted character n-gram model.
test_char_features = char_vectorizer.transform(test_text)

In [13]:
# Sanity check: both matrices come from the same vectorizer, so their column counts must match.
print(train_char_features.shape, test_char_features.shape)

(159571, 50000) (153164, 50000)


In [14]:
# Column-wise concatenation of the two sparse feature sets for each split:
# character n-gram columns first, word columns after them.
train_features = hstack((train_char_features, train_word_features))
test_features = hstack((test_char_features, test_word_features))

# Row counts are unchanged; column counts are the sum of both vocabularies.
print(train_features.shape, test_features.shape)

(159571, 60000) (153164, 60000)


In [15]:
# Accumulator for the per-class cross-validation scores.
scores = list()

# The submission frame starts with only the test ids; prediction columns are
# added to it later, one per label.
submission = pd.DataFrame({'id': test['id']})
submission

Unnamed: 0,id
0,00001cee341fdb12
1,0000247867823ef7
2,00013b17ad220c46
3,00017563c3f7919a
4,00017695ad8997eb
5,0001ea8717f6de06
6,00024115d4cbde0f
7,000247e83dcc1211
8,00025358d4737918
9,00026d1092fe71cc


In [16]:
# For every label column: estimate the 3-fold cross-validated ROC AUC of a
# logistic-regression model, then refit on the full training set and store the
# predicted positive-class probability for each test row in `submission`.

# Start from an empty list so that re-running this cell does not leave stale
# or duplicate entries in `scores`.
scores = []

for class_name in class_names:
    train_target = train[class_name]
    # The `sag` solver shuffles the data; pinning random_state makes both the
    # CV scores and the final predictions reproducible across runs.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)

    cv_score = cross_val_score(classifier, train_features, train_target,
                               cv=3, scoring='roc_auc').mean()
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # cross_val_score only fits clones, so the estimator still has to be
    # fitted on the full training set before predicting.
    classifier.fit(train_features, train_target)
    # Column 1 of predict_proba corresponds to the larger class label
    # (1, assuming the labels are 0/1 as in the train preview).
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]



CV score for class toxic is 0.9675006156897931
CV score for class severe_toxic is 0.986873636398505
CV score for class obscene is 0.9833487687212067
CV score for class threat is 0.9805702446369825
CV score for class insult is 0.9758870087842437
CV score for class identity_hate is 0.9716034436396366


In [17]:
# Mean cross-validated ROC AUC values, one entry appended per class by the training loop.
scores

[0.9675006156897931,
 0.986873636398505,
 0.9833487687212067,
 0.9805702446369825,
 0.9758870087842437,
 0.9716034436396366]

In [18]:
# Unweighted average of the per-class CV scores.
mean_cv_score = np.mean(scores)
print('Total CV score is {}'.format(mean_cv_score))


Total CV score is 0.9776306196450614


In [19]:
# Preview the predicted probabilities: one column per label next to the test id.
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.992186,0.117709,0.982184,0.01492,0.928548,0.098872
1,0000247867823ef7,0.024919,0.005543,0.017496,0.002269,0.015299,0.006916
2,00013b17ad220c46,0.032157,0.006484,0.01817,0.002647,0.015068,0.006313
3,00017563c3f7919a,0.01478,0.003659,0.01033,0.002565,0.010826,0.00287
4,00017695ad8997eb,0.077876,0.00375,0.021442,0.002045,0.024005,0.004708


In [None]:
# Write the predictions to CSV; index=False keeps the row index out of the file.
# The trailing 0.9736 is presumably the leaderboard score of this submission - TODO confirm.
submission.to_csv('submission_lr.csv', index=False) # 0.9736